File Coverage

blib/lib/Mail/SpamAssassin/BayesStore/DBM.pm
Criterion Covered Total %
statement 418 1003 41.6
branch 124 390 31.7
condition 51 121 42.1
subroutine 57 74 77.0
pod 33 45 73.3
total 683 1633 41.8


line stmt bran cond sub pod time code
1             # <@LICENSE>
2             # Licensed to the Apache Software Foundation (ASF) under one or more
3             # contributor license agreements. See the NOTICE file distributed with
4             # this work for additional information regarding copyright ownership.
5             # The ASF licenses this file to you under the Apache License, Version 2.0
6             # (the "License"); you may not use this file except in compliance with
7             # the License. You may obtain a copy of the License at:
8             #
9             # http://www.apache.org/licenses/LICENSE-2.0
10             #
11             # Unless required by applicable law or agreed to in writing, software
12             # distributed under the License is distributed on an "AS IS" BASIS,
13             # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14             # See the License for the specific language governing permissions and
15             # limitations under the License.
16             # </@LICENSE>
17              
18              
19             use strict;
20 21     21   142 use warnings;
  21         53  
  21         617  
21 21     21   97 # use bytes;
  21         48  
  21         515  
22             use re 'taint';
23 21     21   126  
  21         38  
  21         517  
24             use Fcntl;
25 21     21   98 use Errno qw(EBADF);
  21         54  
  21         4985  
26 21     21   130 use File::Basename;
  21         38  
  21         772  
27 21     21   125 use File::Spec;
  21         40  
  21         1149  
28 21     21   115 use File::Path;
  21         46  
  21         813  
29 21     21   105  
  21         49  
  21         1731  
30             BEGIN {
31             eval { require Digest::SHA; import Digest::SHA qw(sha1); 1 }
32 21         94 or do { require Digest::SHA1; import Digest::SHA1 qw(sha1) }
  21         471  
  21         411  
33 21 50   21   76 }
  0         0  
  0         0  
34              
35             use Mail::SpamAssassin;
36 21     21   117 use Mail::SpamAssassin::Util qw(untaint_var am_running_on_windows);
  21         931  
  21         678  
37 21     21   116 use Mail::SpamAssassin::BayesStore;
  21         37  
  21         924  
38 21     21   5938 use Mail::SpamAssassin::Logger;
  21         338  
  21         652  
39 21     21   683  
  21         55  
  21         1967  
40             use constant MAGIC_RE => qr/^\015\001\007\011\003/;
41 21     21   136  
  21         43  
  21         141579  
42             our ( @DBNAMES,
43             $NSPAM_MAGIC_TOKEN, $NHAM_MAGIC_TOKEN, $LAST_EXPIRE_MAGIC_TOKEN, $LAST_JOURNAL_SYNC_MAGIC_TOKEN,
44             $NTOKENS_MAGIC_TOKEN, $OLDEST_TOKEN_AGE_MAGIC_TOKEN, $LAST_EXPIRE_REDUCE_MAGIC_TOKEN,
45             $RUNNING_EXPIRE_MAGIC_TOKEN, $DB_VERSION_MAGIC_TOKEN, $LAST_ATIME_DELTA_MAGIC_TOKEN,
46             $NEWEST_TOKEN_AGE_MAGIC_TOKEN
47             );
48              
49             our @ISA = qw( Mail::SpamAssassin::BayesStore );
50              
51             # db layout (quoting Matt):
52             #
53             # > need five db files though to make it real fast:
54             # [probs] 1. ngood and nbad (two entries, so could be a flat file rather
55             # than a db file). (now 2 entries in db_toks)
56             # [toks] 2. good token -> number seen
57             # [toks] 3. bad token -> number seen (both are packed into 1 entry in 1 db)
58             # [probs] 4. Consolidated good token -> probability
59             # [probs] 5. Consolidated bad token -> probability
60             # > As you add new mails, you update the entry in 2 or 3, then regenerate
61             # > the entry for that token in 4 or 5.
62             # > Then as you test a new mail, you just need to pull the probability
63             # > direct from 4 and 5, and generate the overall probability. A simple and
64             # > very fast operation.
65             #
66             # jm: we use probs as overall probability. <0.5 = ham, >0.5 = spam
67             #
68             # update: probs is no longer maintained as a db, to keep on-disk and in-core
69             # usage down.
70             #
71             # also, added a new one to support forgetting, auto-learning, and
72             # auto-forgetting for refiled mails:
73             # [seen] 6. a list of Message-IDs of messages already learnt from. values
74             # are 's' for learnt-as-spam, 'h' for learnt-as-ham.
75             #
76             # and another, called [scancount] to model the scan-count for expiry.
77             # This is not a database. Instead it increases by one byte for each
78             # message scanned (note: scanned, not learned).
79              
80             @DBNAMES = qw(toks seen);
81              
82             # These are the magic tokens we use to track stuff in the DB.
83             # The format is '^M^A^G^I^C' followed by any string you want.
84             # None of the control chars will be in a real token.
85             $DB_VERSION_MAGIC_TOKEN = "\015\001\007\011\003DBVERSION";
86             $LAST_ATIME_DELTA_MAGIC_TOKEN = "\015\001\007\011\003LASTATIMEDELTA";
87             $LAST_EXPIRE_MAGIC_TOKEN = "\015\001\007\011\003LASTEXPIRE";
88             $LAST_EXPIRE_REDUCE_MAGIC_TOKEN = "\015\001\007\011\003LASTEXPIREREDUCE";
89             $LAST_JOURNAL_SYNC_MAGIC_TOKEN = "\015\001\007\011\003LASTJOURNALSYNC";
90             $NEWEST_TOKEN_AGE_MAGIC_TOKEN = "\015\001\007\011\003NEWESTAGE";
91             $NHAM_MAGIC_TOKEN = "\015\001\007\011\003NHAM";
92             $NSPAM_MAGIC_TOKEN = "\015\001\007\011\003NSPAM";
93             $NTOKENS_MAGIC_TOKEN = "\015\001\007\011\003NTOKENS";
94             $OLDEST_TOKEN_AGE_MAGIC_TOKEN = "\015\001\007\011\003OLDESTAGE";
95             $RUNNING_EXPIRE_MAGIC_TOKEN = "\015\001\007\011\003RUNNINGEXPIRE";
96              
97             my ($self) = @_;
98             if (exists($self->{has_dbm_module})) {
99 136     136 0 250 return $self->{has_dbm_module};
100 136 100       379 }
101 79         392 $self->{has_dbm_module} = eval { require DB_File; };
102             }
103 57         113  
  57         9684  
104             return "DB_File";
105             }
106              
107 136     136 0 547 # Possible file extensions used by the kinds of database files DB_File
108             # might create. We need these so we can create a new file and rename
109             # it into place.
110             return ('', '.db');
111             }
112              
113             ###########################################################################
114 0     0 0 0  
115             my $class = shift;
116             $class = ref($class) || $class;
117              
118             my $self = $class->SUPER::new(@_);
119              
120 63     63 1 174 $self->{supported_db_version} = 3;
121 63   33     402  
122             $self->{already_tied} = 0;
123 63         461 $self->{is_locked} = 0;
124             $self->{string_to_journal} = '';
125 63         237  
126             $self;
127 63         184 }
128 63         150  
129 63         151 ###########################################################################
130              
131 63         1159 my ($self) = @_;
132              
133             if (!$self->HAS_DBM_MODULE) {
134             dbg("bayes: %s module not installed, cannot use bayes", $self->DBM_MODULE);
135             return 0;
136             }
137 148     148 1 331  
138             # return if we've already tied to the db's, using the same mode
139 148 100       540 # (locked/unlocked) as before.
140 134         517 return 1 if ($self->{already_tied} && $self->{is_locked} == 0);
141 134         614  
142             my $main = $self->{bayes}->{main};
143             if (!defined($main->{conf}->{bayes_path})) {
144             dbg("bayes: bayes_path not defined");
145             return 0;
146 14 100 66     99 }
147              
148 12         30 $self->read_db_configs();
149 12 50       64  
150 0         0 my $path = $main->sed_path($main->{conf}->{bayes_path});
151 0         0  
152             my $found = 0;
153             for my $ext ($self->DB_EXTENSIONS) {
154 12         103 if (-f $path.'_toks'.$ext) {
155             $found = 1;
156 12         63 last;
157             }
158 12         33 }
159 12         49  
160 20 100       502 if (!$found) {
161 4         15 dbg("bayes: no dbs present, cannot tie DB R/O: %s", $path.'_toks');
162 4         15 return 0;
163             }
164              
165             foreach my $dbname (@DBNAMES) {
166 12 100       56 my $name = $path.'_'.$dbname;
167 8         50 my $db_var = 'db_'.$dbname;
168 8         78 dbg("bayes: tie-ing to DB file R/O $name");
169              
170             # Bug 6901, [rt.cpan.org #83060]
171 4         24 # DB_File: Repeated tie to the same hash with no untie causes corruption
172 8         32 untie %{$self->{$db_var}}; # has no effect if the variable is not tied
173 8         16  
174 8         35 if (!tie %{$self->{$db_var}}, $self->DBM_MODULE, $name, O_RDONLY,
175             (oct($main->{conf}->{bayes_file_mode}) & 0666))
176             {
177             # bug 2975: it's acceptable for the db_seen to not be present,
178 8         17 # to allow it to be recycled. if that's the case, just create
  8         45  
179             # a new, empty one. we don't need to lock it, since we won't
180 8 100       20 # be writing to it; let the R/W api deal with that case.
  8         42  
181              
182             if ($dbname eq 'seen') {
183             # Bug 6901, [rt.cpan.org #83060]
184             untie %{$self->{$db_var}}; # has no effect if the variable is not tied
185             tie %{$self->{$db_var}}, $self->DBM_MODULE, $name, O_RDWR|O_CREAT,
186             (oct($main->{conf}->{bayes_file_mode}) & 0666)
187             or goto failed_to_tie;
188 1 50       11 }
189             else {
190 1         3 goto failed_to_tie;
  1         5  
191 1         7 }
192 1 50       4 }
193             }
194              
195             $self->{db_version} = ($self->get_storage_variables())[6];
196 0         0 dbg("bayes: found bayes db version %s", $self->{db_version});
197              
198             # If the DB version is one we don't understand, abort!
199             if ($self->_check_db_version() != 0) {
200             warn("bayes: bayes db version ".$self->{db_version}." is not able to be used, aborting!");
201 4         49 $self->untie_db();
202 4         25 return 0;
203             }
204              
205 4 50       37 $self->{already_tied} = 1;
206 0         0 return 1;
207 0         0  
208 0         0 failed_to_tie:
209             warn "bayes: cannot open bayes databases ${path}_* R/O: tie failed: $!\n";
210             foreach my $dbname (@DBNAMES) {
211 4         11 my $db_var = 'db_'.$dbname;
212 4         21 next unless exists $self->{$db_var};
213             dbg("bayes: untie-ing DB file $dbname");
214 0         0 untie %{$self->{$db_var}};
215             }
216 0         0  
217 0         0 return 0;
218 0 0       0 }
219 0         0  
220 0         0 # tie() to the databases, read-write and locked. Any callers of
  0         0  
221             # this should ensure they call untie_db() afterwards!
222             #
223 0         0 my ($self) = @_;
224              
225             if (!$self->HAS_DBM_MODULE) {
226             dbg("bayes: %s module not installed, cannot use bayes", $self->DBM_MODULE);
227             return 0;
228             }
229              
230 30     30 1 85 # Useful shortcut ...
231             my $main = $self->{bayes}->{main};
232 30 100       243  
233 2         7 # if we've already tied the db's using the same mode
234 2         7 # (locked/unlocked) as we want now, freshen the lock and return.
235             if ($self->{already_tied} && $self->{is_locked} == 1) {
236             $main->{locker}->refresh_lock($self->{locked_file});
237             return 1;
238 28         96 }
239              
240             if (!defined($main->{conf}->{bayes_path})) {
241             dbg("bayes: bayes_path not defined");
242 28 100 66     135 return 0;
243 2         17 }
244 2         7  
245             $self->read_db_configs();
246              
247 26 50       150 my $path = $main->sed_path($main->{conf}->{bayes_path});
248 0         0  
249 0         0 my $found = 0;
250             for my $ext ($self->DB_EXTENSIONS) {
251             if (-f $path.'_toks'.$ext) {
252 26         169 $found = 1;
253             last;
254 26         123 }
255             }
256 26         55  
257 26         110 my $parentdir = dirname($path);
258 30 100       754 if (!-d $parentdir) {
259 22         74 # run in an eval(); if mkpath has no perms, it calls die()
260 22         62 eval {
261             mkpath($parentdir, 0, (oct($main->{conf}->{bayes_file_mode}) & 0777));
262             };
263             }
264 26         1773  
265 26 50       446 my $tout;
266             if ($main->{learn_wait_for_lock}) {
267 0         0 $tout = 300; # TODO: Dan to write better lock code
268 0         0 } else {
269             $tout = 10;
270             }
271             if ($main->{locker}->safe_lock($path, $tout, $main->{conf}->{bayes_file_mode}))
272 26         77 {
273 26 50       91 $self->{locked_file} = $path;
274 0         0 $self->{is_locked} = 1;
275             } else {
276 26         74 warn "bayes: cannot open bayes databases ${path}_* R/W: lock failed: $!\n";
277             return 0;
278 26 50       241 }
279              
280 26         90 my $umask = umask 0;
281 26         71 foreach my $dbname (@DBNAMES) {
282             my $name = $path.'_'.$dbname;
283 0         0 my $db_var = 'db_'.$dbname;
284 0         0 dbg("bayes: tie-ing to DB file R/W $name");
285              
286             ($self->DBM_MODULE eq 'DB_File') and
287 26         138 Mail::SpamAssassin::Util::avoid_db_file_locking_bug ($name);
288 26         128  
289 52         174 # Bug 6901, [rt.cpan.org #83060]
290 52         97 untie %{$self->{$db_var}}; # has no effect if the variable is not tied
291 52         180 tie %{$self->{$db_var}}, $self->DBM_MODULE, $name, O_RDWR|O_CREAT,
292             (oct($main->{conf}->{bayes_file_mode}) & 0666)
293 52 50       159 or goto failed_to_tie;
294             }
295             umask $umask;
296              
297 52         86 # set our cache to what version DB we're using
  52         181  
298 52         152 $self->{db_version} = ($self->get_storage_variables())[6];
299 52 50       73 # don't bother printing this unless found since it would be bogus anyway
300             dbg("bayes: found bayes db version %s", $self->{db_version}) if $found;
301              
302 26         256 # figure out if we can read the current DB and if we need to do a
303             # DB version update and do it if necessary if either has a problem,
304             # fail immediately
305 26         192 #
306             if ($found && !$self->_upgrade_db()) {
307 26 100       131 $self->untie_db();
308             return 0;
309             }
310             elsif (!$found) { # new DB, make sure we know that ...
311             $self->{db_version} = $self->{db_toks}->{$DB_VERSION_MAGIC_TOKEN} = $self->DB_VERSION;
312             $self->{db_toks}->{$NTOKENS_MAGIC_TOKEN} = 0; # no tokens in the db ...
313 26 50 66     205 dbg("bayes: new db, set db version %s and 0 tokens", $self->{db_version});
    100          
314 0         0 }
315 0         0  
316             $self->{already_tied} = 1;
317             return 1;
318 4         36  
319 4         75 failed_to_tie:
320 4         31 my $err = $!;
321             umask $umask;
322              
323 26         55 foreach my $dbname (@DBNAMES) {
324 26         165 my $db_var = 'db_'.$dbname;
325             next unless exists $self->{$db_var};
326 0         0 dbg("bayes: untie-ing DB file $dbname");
327             untie %{$self->{$db_var}};
328 0         0 }
329              
330 0         0 if ($self->{is_locked}) {
331 0         0 $self->{bayes}->{main}->{locker}->safe_unlock($self->{locked_file});
332 0 0       0 $self->{is_locked} = 0;
333 0         0 }
334 0         0 warn "bayes: cannot open bayes databases ${path}_* R/W: tie failed: $err\n";
  0         0  
335             return 0;
336             }
337 0 0       0  
338 0         0 # Do we understand how to deal with this DB version?
339 0         0 my ($self) = @_;
340              
341 0         0 # return -1 if older, 0 if current, 1 if newer
342 0         0 return $self->{db_version} <=> $self->DB_VERSION;
343             }
344              
345             # Check to see if we need to upgrade the DB, and do so if necessary
346             my ($self) = @_;
347 26     26   56  
348             my $verschk = $self->_check_db_version();
349             my $res = 0; # used later on for tie() checks
350 26         123 my $umask; # used later for umask modifications
351              
352             # If the DB is the latest version, no problem.
353             return 1 if ($verschk == 0);
354              
355 22     22   58 # If the DB is a newer version that we know what to do with ... abort!
356             if ($verschk == 1) {
357 22         86 warn("bayes: bayes db version ".$self->{db_version}." is newer than we understand, aborting!");
358 22         43 return 0;
359 22         35 }
360              
361             # If the current DB version is lower than the new version, upgrade!
362 22 50       142 # Do conversions in order so we can go 1 -> 3, make sure to update
363             # $self->{db_version} along the way
364              
365 0 0       0 dbg("bayes: detected bayes db format %s, upgrading", $self->{db_version});
366 0         0  
367 0         0 # since DB_File will not shrink a database (!!), we need to *create*
368             # a new one instead.
369             my $main = $self->{bayes}->{main};
370             my $path = $main->sed_path($main->{conf}->{bayes_path});
371             my $name = $path.'_toks';
372              
373             # older version's journal files are likely not in the same format as the new ones, so remove it.
374 0         0 my $jpath = $self->_get_journal_filename();
375             if (-f $jpath) {
376             dbg("bayes: old journal file found, removing");
377             warn "bayes: couldn't remove $jpath: $!" if (!unlink $jpath);
378 0         0 }
379 0         0  
380 0         0 if ($self->{db_version} < 2) {
381             dbg("bayes: upgrading database format from v%s to v2", $self->{db_version});
382             $self->set_running_expire_tok();
383 0         0  
384 0 0       0 my ($DB_NSPAM_MAGIC_TOKEN, $DB_NHAM_MAGIC_TOKEN, $DB_NTOKENS_MAGIC_TOKEN);
385 0         0 my ($DB_OLDEST_TOKEN_AGE_MAGIC_TOKEN, $DB_LAST_EXPIRE_MAGIC_TOKEN);
386 0 0       0  
387             # Magic tokens for version 0, defined as '**[A-Z]+'
388             if ($self->{db_version} == 0) {
389 0 0       0 $DB_NSPAM_MAGIC_TOKEN = '**NSPAM';
390 0         0 $DB_NHAM_MAGIC_TOKEN = '**NHAM';
391 0         0 $DB_NTOKENS_MAGIC_TOKEN = '**NTOKENS';
392             #$DB_OLDEST_TOKEN_AGE_MAGIC_TOKEN = '**OLDESTAGE';
393 0         0 #$DB_LAST_EXPIRE_MAGIC_TOKEN = '**LASTEXPIRE';
394 0         0 #$DB_SCANCOUNT_BASE_MAGIC_TOKEN = '**SCANBASE';
395             #$DB_RUNNING_EXPIRE_MAGIC_TOKEN = '**RUNNINGEXPIRE';
396             }
397 0 0       0 else {
398 0         0 $DB_NSPAM_MAGIC_TOKEN = "\015\001\007\011\003NSPAM";
399 0         0 $DB_NHAM_MAGIC_TOKEN = "\015\001\007\011\003NHAM";
400 0         0 $DB_NTOKENS_MAGIC_TOKEN = "\015\001\007\011\003NTOKENS";
401             #$DB_OLDEST_TOKEN_AGE_MAGIC_TOKEN = "\015\001\007\011\003OLDESTAGE";
402             #$DB_LAST_EXPIRE_MAGIC_TOKEN = "\015\001\007\011\003LASTEXPIRE";
403             #$DB_SCANCOUNT_BASE_MAGIC_TOKEN = "\015\001\007\011\003SCANBASE";
404             #$DB_RUNNING_EXPIRE_MAGIC_TOKEN = "\015\001\007\011\003RUNNINGEXPIRE";
405             }
406              
407 0         0 # remember when we started ...
408 0         0 my $started = time;
409 0         0 my $newatime = $started;
410              
411             # use O_EXCL to avoid races (bonus paranoia, since we should be locked
412             # anyway)
413             my %new_toks;
414             $umask = umask 0;
415              
416             $res = tie %new_toks, $self->DBM_MODULE, "${name}.new",
417 0         0 O_RDWR|O_CREAT|O_EXCL,
418 0         0 (oct($main->{conf}->{bayes_file_mode}) & 0666);
419             umask $umask;
420             return 0 unless $res;
421             undef $res;
422 0         0  
423 0         0 # add the magic tokens to the new db.
424             $new_toks{$NSPAM_MAGIC_TOKEN} = $self->{db_toks}->{$DB_NSPAM_MAGIC_TOKEN};
425             $new_toks{$NHAM_MAGIC_TOKEN} = $self->{db_toks}->{$DB_NHAM_MAGIC_TOKEN};
426             $new_toks{$NTOKENS_MAGIC_TOKEN} = $self->{db_toks}->{$DB_NTOKENS_MAGIC_TOKEN};
427 0         0 $new_toks{$DB_VERSION_MAGIC_TOKEN} = 2; # we're now a DB version 2 file
428 0         0 $new_toks{$OLDEST_TOKEN_AGE_MAGIC_TOKEN} = $newatime;
429 0 0       0 $new_toks{$LAST_EXPIRE_MAGIC_TOKEN} = $newatime;
430 0         0 $new_toks{$NEWEST_TOKEN_AGE_MAGIC_TOKEN} = $newatime;
431             $new_toks{$LAST_JOURNAL_SYNC_MAGIC_TOKEN} = $newatime;
432             $new_toks{$LAST_ATIME_DELTA_MAGIC_TOKEN} = 0;
433 0         0 $new_toks{$LAST_EXPIRE_REDUCE_MAGIC_TOKEN} = 0;
434 0         0  
435 0         0 # deal with the data tokens
436 0         0 my ($tok, $packed);
437 0         0 my $count = 0;
438 0         0 while (($tok, $packed) = each %{$self->{db_toks}}) {
439 0         0 next if ($tok =~ /^(?:\*\*[A-Z]+$|\015\001\007\011\003)/); # skip magic tokens
440 0         0  
441 0         0 my ($ts, $th, $atime) = $self->tok_unpack($packed);
442 0         0 $new_toks{$tok} = $self->tok_pack($ts, $th, $newatime);
443              
444             # Refresh the lock every so often...
445 0         0 if (($count++ % 1000) == 0) {
446 0         0 $self->set_running_expire_tok();
447 0         0 }
  0         0  
448 0 0       0 }
449              
450 0         0  
451 0         0 # now untie so we can do renames
452             untie %{$self->{db_toks}};
453             untie %new_toks;
454 0 0       0  
455 0         0 # This is the critical phase (moving files around), so don't allow
456             # it to be interrupted.
457             local $SIG{'INT'} = 'IGNORE';
458             local $SIG{'TERM'} = 'IGNORE';
459             local $SIG{'HUP'} = 'IGNORE' if !am_running_on_windows();
460              
461 0         0 # older versions used scancount, so kill the stupid little file ...
  0         0  
462 0         0 my $msgc = $path.'_msgcount';
463             if (-f $msgc) {
464             dbg("bayes: old msgcount file found, removing");
465             if (!unlink $msgc) {
466 0         0 warn "bayes: couldn't remove $msgc: $!";
467 0         0 }
468 0 0       0 }
469              
470             # now rename in the new one. Try several extensions
471 0         0 for my $ext ($self->DB_EXTENSIONS) {
472 0 0       0 my $newf = $name.'.new'.$ext;
473 0         0 my $oldf = $name.$ext;
474 0 0       0 next unless (-f $newf);
475 0         0 if (!rename ($newf, $oldf)) {
476             warn "bayes: rename $newf to $oldf failed: $!\n";
477             return 0;
478             }
479             }
480 0         0  
481 0         0 # re-tie to the new db in read-write mode ...
482 0         0 $umask = umask 0;
483 0 0       0 # Bug 6901, [rt.cpan.org #83060]
484 0 0       0 untie %{$self->{db_toks}}; # has no effect if the variable is not tied
485 0         0 $res = tie %{$self->{db_toks}}, $self->DBM_MODULE, $name, O_RDWR|O_CREAT,
486 0         0 (oct($main->{conf}->{bayes_file_mode}) & 0666);
487             umask $umask;
488             return 0 unless $res;
489             undef $res;
490              
491 0         0 dbg("bayes: upgraded database format from v%s to v2 in %d seconds",
492             $self->{db_version}, time - $started);
493 0         0 $self->{db_version} = 2; # need this for other functions which check
  0         0  
494 0         0 }
495 0         0  
496 0         0 # Version 3 of the database converts all existing tokens to SHA1 hashes
497 0 0       0 if ($self->{db_version} == 2) {
498 0         0 dbg("bayes: upgrading database format from v%s to v3", $self->{db_version});
499             $self->set_running_expire_tok();
500              
501 0         0 my $DB_NSPAM_MAGIC_TOKEN = "\015\001\007\011\003NSPAM";
502 0         0 my $DB_NHAM_MAGIC_TOKEN = "\015\001\007\011\003NHAM";
503             my $DB_NTOKENS_MAGIC_TOKEN = "\015\001\007\011\003NTOKENS";
504             my $DB_OLDEST_TOKEN_AGE_MAGIC_TOKEN = "\015\001\007\011\003OLDESTAGE";
505             my $DB_LAST_EXPIRE_MAGIC_TOKEN = "\015\001\007\011\003LASTEXPIRE";
506 0 0       0 my $DB_NEWEST_TOKEN_AGE_MAGIC_TOKEN = "\015\001\007\011\003NEWESTAGE";
507 0         0 my $DB_LAST_JOURNAL_SYNC_MAGIC_TOKEN = "\015\001\007\011\003LASTJOURNALSYNC";
508 0         0 my $DB_LAST_ATIME_DELTA_MAGIC_TOKEN = "\015\001\007\011\003LASTATIMEDELTA";
509             my $DB_LAST_EXPIRE_REDUCE_MAGIC_TOKEN = "\015\001\007\011\003LASTEXPIREREDUCE";
510 0         0  
511 0         0 # remember when we started ...
512 0         0 my $started = time;
513 0         0  
514 0         0 # use O_EXCL to avoid races (bonus paranoia, since we should be locked
515 0         0 # anyway)
516 0         0 my %new_toks;
517 0         0 $umask = umask 0;
518 0         0 $res = tie %new_toks, $self->DBM_MODULE, "${name}.new", O_RDWR|O_CREAT|O_EXCL,
519             (oct($main->{conf}->{bayes_file_mode}) & 0666);
520             umask $umask;
521 0         0 return 0 unless $res;
522             undef $res;
523              
524             # add the magic tokens to the new db.
525 0         0 $new_toks{$NSPAM_MAGIC_TOKEN} = $self->{db_toks}->{$DB_NSPAM_MAGIC_TOKEN};
526 0         0 $new_toks{$NHAM_MAGIC_TOKEN} = $self->{db_toks}->{$DB_NHAM_MAGIC_TOKEN};
527             $new_toks{$NTOKENS_MAGIC_TOKEN} = $self->{db_toks}->{$DB_NTOKENS_MAGIC_TOKEN};
528 0         0 $new_toks{$DB_VERSION_MAGIC_TOKEN} = 3; # we're now a DB version 3 file
529 0         0 $new_toks{$OLDEST_TOKEN_AGE_MAGIC_TOKEN} = $self->{db_toks}->{$DB_OLDEST_TOKEN_AGE_MAGIC_TOKEN};
530 0 0       0 $new_toks{$LAST_EXPIRE_MAGIC_TOKEN} = $self->{db_toks}->{$DB_LAST_EXPIRE_MAGIC_TOKEN};
531 0         0 $new_toks{$NEWEST_TOKEN_AGE_MAGIC_TOKEN} = $self->{db_toks}->{$DB_NEWEST_TOKEN_AGE_MAGIC_TOKEN};
532             $new_toks{$LAST_JOURNAL_SYNC_MAGIC_TOKEN} = $self->{db_toks}->{$DB_LAST_JOURNAL_SYNC_MAGIC_TOKEN};
533             $new_toks{$LAST_ATIME_DELTA_MAGIC_TOKEN} = $self->{db_toks}->{$DB_LAST_ATIME_DELTA_MAGIC_TOKEN};
534 0         0 $new_toks{$LAST_EXPIRE_REDUCE_MAGIC_TOKEN} =$self->{db_toks}->{$DB_LAST_EXPIRE_REDUCE_MAGIC_TOKEN};
535 0         0  
536 0         0 # deal with the data tokens
537 0         0 my $count = 0;
538 0         0 while (my ($tok, $packed) = each %{$self->{db_toks}}) {
539 0         0 next if ($tok =~ /^\015\001\007\011\003/); # skip magic tokens
540 0         0 my $tok_hash = substr(sha1($tok), -5);
541 0         0 $new_toks{$tok_hash} = $packed;
542 0         0  
543 0         0 # Refresh the lock every so often...
544             if (($count++ % 1000) == 0) {
545             $self->set_running_expire_tok();
546 0         0 }
547 0         0 }
  0         0  
548 0 0       0  
549 0         0 # now untie so we can do renames
550 0         0 untie %{$self->{db_toks}};
551             untie %new_toks;
552              
553 0 0       0 # This is the critical phase (moving files around), so don't allow
554 0         0 # it to be interrupted.
555             local $SIG{'INT'} = 'IGNORE';
556             local $SIG{'TERM'} = 'IGNORE';
557             local $SIG{'HUP'} = 'IGNORE' if !am_running_on_windows();
558              
559 0         0 # now rename in the new one. Try several extensions
  0         0  
560 0         0 for my $ext ($self->DB_EXTENSIONS) {
561             my $newf = $name.'.new'.$ext;
562             my $oldf = $name.$ext;
563             next unless (-f $newf);
564 0         0 if (!rename($newf, $oldf)) {
565 0         0 warn "bayes: rename $newf to $oldf failed: $!\n";
566 0 0       0 return 0;
567             }
568             }
569 0         0  
570 0         0 # re-tie to the new db in read-write mode ...
571 0         0 $umask = umask 0;
572 0 0       0 # Bug 6901, [rt.cpan.org #83060]
573 0 0       0 untie %{$self->{db_toks}}; # has no effect if the variable is not tied
574 0         0 $res = tie %{$self->{db_toks}}, $self->DBM_MODULE, $name, O_RDWR|O_CREAT,
575 0         0 (oct ($main->{conf}->{bayes_file_mode}) & 0666);
576             umask $umask;
577             return 0 unless $res;
578             undef $res;
579              
580 0         0 dbg("bayes: upgraded database format from v%s to v3 in %d seconds",
581             $self->{db_version}, time - $started);
582 0         0  
  0         0  
583 0         0 $self->{db_version} = 3; # need this for other functions which check
584 0         0 }
585 0         0  
586 0 0       0 # if ($self->{db_version} == 3) {
587 0         0 # ...
588             # $self->{db_version} = 4; # need this for other functions which check
589             # }
590 0         0 # ... and so on.
591              
592 0         0 return 1;
593             }
594              
595             ###########################################################################
596              
597             my $self = shift;
598              
599             return if (!$self->{already_tied});
600              
601 0         0 dbg("bayes: untie-ing");
602              
603             foreach my $dbname (@DBNAMES) {
604             my $db_var = 'db_'.$dbname;
605              
606             if (exists $self->{$db_var}) {
607 69     69 1 174 # dbg("bayes: untie-ing $db_var");
608             untie %{$self->{$db_var}};
609 69 100       326 delete $self->{$db_var};
610             }
611 30         118 }
612              
613 30         104 if ($self->{is_locked}) {
614 60         143 dbg("bayes: files locked, now unlocking lock");
615             $self->{bayes}->{main}->{locker}->safe_unlock ($self->{locked_file});
616 60 100       161 $self->{is_locked} = 0;
617             }
618 56         75  
  56         1065  
619 56         238 $self->{already_tied} = 0;
620             $self->{db_version} = undef;
621             }
622              
623 30 100       104 ###########################################################################
624 26         90  
625 26         260 my ($self, $newest_atime, $start, $max_expire_mult) = @_;
626 26         63  
627             my %delta; # use a hash since an array is going to be very sparse
628              
629 30         60 # do the first pass, figure out atime delta
630 30         91 my ($tok, $packed);
631             while (($tok, $packed) = each %{$self->{db_toks}}) {
632             next if ($tok =~ MAGIC_RE); # skip magic tokens
633            
634             my ($ts, $th, $atime) = $self->tok_unpack ($packed);
635              
636 0     0 1 0 # Go through from $start * 1 to $start * 512, mark how many tokens
637             # we would expire
638 0         0 my $token_age = $newest_atime - $atime;
639             for (my $i = 1; $i <= $max_expire_mult; $i<<=1) {
640             if ($token_age >= $start * $i) {
641 0         0 $delta{$i}++;
642 0         0 }
  0         0  
643 0 0       0 else {
644             # If the token age is less than the expire delta, it'll be
645 0         0 # less for all upcoming checks too, so abort early.
646             last;
647             }
648             }
649 0         0 }
650 0         0 return %delta;
651 0 0       0 }
652 0         0  
653             ###########################################################################
654              
655             my ($self, $opts, $newdelta, @vars) = @_;
656              
657 0         0 my $deleted = 0;
658             my $kept = 0;
659             my $num_hapaxes = 0;
660             my $num_lowfreq = 0;
661 0         0  
662             # since DB_File will not shrink a database (!!), we need to *create*
663             # a new one instead.
664             my $main = $self->{bayes}->{main};
665             my $path = $main->sed_path($main->{conf}->{bayes_path});
666              
667 0     0 1 0 # use a temporary PID-based suffix just in case another one was
668             # created previously by an interrupted expire
669 0         0 my $tmpsuffix = "expire$$";
670 0         0 my $tmpdbname = $path.'_toks.'.$tmpsuffix;
671 0         0  
672 0         0 # clean out any leftover db copies from previous runs
673             for my $ext ($self->DB_EXTENSIONS) { unlink ($tmpdbname.$ext); }
674              
675             # use O_EXCL to avoid races (bonus paranoia, since we should be locked
676 0         0 # anyway)
677 0         0 my %new_toks;
678             my $umask = umask 0;
679             tie %new_toks, $self->DBM_MODULE, $tmpdbname, O_RDWR|O_CREAT|O_EXCL,
680             (oct ($main->{conf}->{bayes_file_mode}) & 0666);
681 0         0 umask $umask;
682 0         0 my $oldest;
683              
684             my $showdots = $opts->{showdots};
685 0         0 if ($showdots) { print STDERR "\n"; }
  0         0  
686              
687             # We've chosen a new atime delta if we've gotten here, so record it
688             # for posterity.
689 0         0 $new_toks{$LAST_ATIME_DELTA_MAGIC_TOKEN} = $newdelta;
690 0         0  
691             # Figure out how old is too old...
692 0         0 my $too_old = $vars[10] - $newdelta; # tooold = newest - delta
693 0         0  
694 0         0 # Go ahead and do the move to new db/expire run now ...
695             my ($tok, $packed);
696 0         0 while (($tok, $packed) = each %{$self->{db_toks}}) {
697 0 0       0 next if ($tok =~ MAGIC_RE); # skip magic tokens
  0         0  
698              
699             my ($ts, $th, $atime) = $self->tok_unpack ($packed);
700              
701 0         0 if ($atime < $too_old) {
702             $deleted++;
703             }
704 0         0 else {
705             # if token atime > newest, reset to newest ...
706             if ($atime > $vars[10]) {
707 0         0 $atime = $vars[10];
708 0         0 }
  0         0  
709 0 0       0  
710             $new_toks{$tok} = $self->tok_pack ($ts, $th, $atime); $kept++;
711 0         0 if (!defined($oldest) || $atime < $oldest) { $oldest = $atime; }
712             if ($ts + $th == 1) {
713 0 0       0 $num_hapaxes++;
714 0         0 } elsif ($ts < 8 && $th < 8) {
715             $num_lowfreq++;
716             }
717             }
718 0 0       0  
719 0         0 if ((($kept + $deleted) % 1000) == 0) {
720             if ($showdots) { print STDERR "."; }
721             $self->set_running_expire_tok();
722 0         0 }
  0         0  
723 0 0 0     0 }
  0         0  
724 0 0 0     0  
    0          
725 0         0 # and add the magic tokens. don't add the expire_running token.
726             $new_toks{$DB_VERSION_MAGIC_TOKEN} = $self->DB_VERSION;
727 0         0  
728             # We haven't changed messages of each type seen, so just copy over.
729             $new_toks{$NSPAM_MAGIC_TOKEN} = $vars[1];
730             $new_toks{$NHAM_MAGIC_TOKEN} = $vars[2];
731 0 0       0  
732 0 0       0 # We magically haven't removed the newest token, so just copy that value over.
  0         0  
733 0         0 $new_toks{$NEWEST_TOKEN_AGE_MAGIC_TOKEN} = $vars[10];
734              
735             # The rest of these have been modified, so replace as necessary.
736             $new_toks{$NTOKENS_MAGIC_TOKEN} = $kept;
737             $new_toks{$LAST_EXPIRE_MAGIC_TOKEN} = time();
738 0         0 $new_toks{$OLDEST_TOKEN_AGE_MAGIC_TOKEN} = $oldest;
739             $new_toks{$LAST_EXPIRE_REDUCE_MAGIC_TOKEN} = $deleted;
740              
741 0         0 # Sanity check: if we expired too many tokens, abort!
742 0         0 if ($kept < 100000) {
743             dbg("bayes: token expiration would expire too many tokens, aborting");
744             # set the magic tokens appropriately
745 0         0 # make sure the next expire run does a first pass
746             $self->{db_toks}->{$LAST_EXPIRE_MAGIC_TOKEN} = time();
747             $self->{db_toks}->{$LAST_EXPIRE_REDUCE_MAGIC_TOKEN} = 0;
748 0         0 $self->{db_toks}->{$LAST_ATIME_DELTA_MAGIC_TOKEN} = 0;
749 0         0  
750 0         0 # remove the new DB
751 0         0 untie %new_toks;
752             for my $ext ($self->DB_EXTENSIONS) { unlink ($tmpdbname.$ext); }
753              
754 0 0       0 # reset the results for the return
755 0         0 $kept = $vars[3];
756             $deleted = 0;
757             $num_hapaxes = 0;
758 0         0 $num_lowfreq = 0;
759 0         0 }
760 0         0 else {
761             # now untie so we can do renames
762             untie %{$self->{db_toks}};
763 0         0 untie %new_toks;
764 0         0  
  0         0  
765             # This is the critical phase (moving files around), so don't allow
766             # it to be interrupted. Scope the signal changes.
767 0         0 {
768 0         0 local $SIG{'INT'} = 'IGNORE';
769 0         0 local $SIG{'TERM'} = 'IGNORE';
770 0         0 local $SIG{'HUP'} = 'IGNORE' if !am_running_on_windows();
771              
772             # now rename in the new one. Try several extensions
773             for my $ext ($self->DB_EXTENSIONS) {
774 0         0 my $newf = $tmpdbname.$ext;
  0         0  
775 0         0 my $oldf = $path.'_toks'.$ext;
776             next unless (-f $newf);
777             if (!rename ($newf, $oldf)) {
778             warn "bayes: rename $newf to $oldf failed: $!\n";
779             }
780 0         0 }
  0         0  
781 0         0 }
782 0 0       0 }
783              
784             # Call untie_db() so we unlock correctly.
785 0         0 $self->untie_db();
786 0         0  
787 0         0 return ($kept, $deleted, $num_hapaxes, $num_lowfreq);
788 0 0       0 }
789 0 0       0  
790 0         0 ###########################################################################
791              
792             # Is a sync due?
793             my ($self) = @_;
794              
795             # don't bother doing old db versions
796             return 0 if ($self->{db_version} < $self->DB_VERSION);
797 0         0  
798             my $conf = $self->{bayes}->{main}->{conf};
799 0         0 return 0 if ($conf->{bayes_journal_max_size} == 0);
800              
801             my @vars = $self->get_storage_variables();
802             dbg("bayes: DB journal sync: last sync: %s", $vars[7]);
803              
804             ## Ok, should we do a sync?
805              
806 10     10 1 32 # Not if the journal file doesn't exist, it's not a file, or it's 0
807             # bytes long.
808             return 0 unless (stat($self->_get_journal_filename()) && -f _);
809 10 50       42  
810             # Yes if the file size is larger than the specified maximum size.
811 10         24 return 1 if (-s _ > $conf->{bayes_journal_max_size});
812 10 50       35  
813             # Yes there has been a sync before, and if it's been at least a day
814 10         37 # since that sync.
815 10         52 return 1 if (($vars[7] > 0) && (time - $vars[7] > 86400));
816              
817             # No, I guess not.
818             return 0;
819             }
820              
821 10 100 66     53 ###########################################################################
822             # db_seen reading APIs
823              
824 4 50       24 my ($self, $msgid) = @_;
825             $self->{db_seen}->{$msgid};
826             }
827              
828 4 50 33     15 my ($self, $msgid, $seen) = @_;
829              
830             if ($self->{bayes}->{main}->{learn_to_journal}) {
831 4         20 $self->defer_update ("m $seen $msgid");
832             }
833             else {
834             $self->_seen_put_direct($msgid, $seen);
835             }
836             }
837             my ($self, $msgid, $seen) = @_;
838 24     24 1 88 $self->{db_seen}->{$msgid} = $seen;
839 24         496 }
840              
841             my ($self, $msgid) = @_;
842              
843 6     6 1 38 if ($self->{bayes}->{main}->{learn_to_journal}) {
844             $self->defer_update ("m f $msgid");
845 6 100       35 }
846 2         34 else {
847             $self->_seen_delete_direct($msgid);
848             }
849 4         25 }
850             my ($self, $msgid) = @_;
851             delete $self->{db_seen}->{$msgid};
852             }
853 6     6   30  
854 6         252 ###########################################################################
855             # db reading APIs
856              
857             my ($self, $tok) = @_;
858 4     4 1 26 $self->tok_unpack ($self->{db_toks}->{$tok});
859             }
860 4 50       30
861 0         0 my ($self, @tokens) = @_;
862              
863             my @tokensdata;
864 4         28 foreach my $token (@tokens) {
865             my ($tok_spam, $tok_ham, $atime) = $self->tok_unpack($self->{db_toks}->{$token});
866             push(@tokensdata, [$token, $tok_spam, $tok_ham, $atime]);
867             }
868 4     4   16 return \@tokensdata;
869 4         126 }
870              
871             # return the magic tokens in a specific order:
872             # 0: scan count base
873             # 1: number of spam
874             # 2: number of ham
875             # 3: number of tokens in db
876 3878     3878 1 6392 # 4: last expire atime
877 3878         10029 # 5: oldest token in db atime
878             # 6: db version value
879             # 7: last journal sync
880             # 8: last atime delta
881 6     6 1 254 # 9: last expire reduction count
882             # 10: newest token in db atime
883 6         24 #
884 6         21 my ($self) = @_;
885 1658         4218 my @values;
886 1658         5607  
887             my $db_ver = $self->{db_toks}->{$DB_VERSION_MAGIC_TOKEN};
888 6         94  
889             if (!$db_ver || $db_ver =~ /\D/) { $db_ver = 0; }
890              
891             if ($db_ver >= 2) {
892             my $DB2_LAST_ATIME_DELTA_MAGIC_TOKEN = "\015\001\007\011\003LASTATIMEDELTA";
893             my $DB2_LAST_EXPIRE_MAGIC_TOKEN = "\015\001\007\011\003LASTEXPIRE";
894             my $DB2_LAST_EXPIRE_REDUCE_MAGIC_TOKEN = "\015\001\007\011\003LASTEXPIREREDUCE";
895             my $DB2_LAST_JOURNAL_SYNC_MAGIC_TOKEN = "\015\001\007\011\003LASTJOURNALSYNC";
896             my $DB2_NEWEST_TOKEN_AGE_MAGIC_TOKEN = "\015\001\007\011\003NEWESTAGE";
897             my $DB2_NHAM_MAGIC_TOKEN = "\015\001\007\011\003NHAM";
898             my $DB2_NSPAM_MAGIC_TOKEN = "\015\001\007\011\003NSPAM";
899             my $DB2_NTOKENS_MAGIC_TOKEN = "\015\001\007\011\003NTOKENS";
900             my $DB2_OLDEST_TOKEN_AGE_MAGIC_TOKEN = "\015\001\007\011\003OLDESTAGE";
901             my $DB2_RUNNING_EXPIRE_MAGIC_TOKEN = "\015\001\007\011\003RUNNINGEXPIRE";
902              
903             @values = (
904             0,
905 64     64 1 138 $self->{db_toks}->{$DB2_NSPAM_MAGIC_TOKEN},
906 64         110 $self->{db_toks}->{$DB2_NHAM_MAGIC_TOKEN},
907             $self->{db_toks}->{$DB2_NTOKENS_MAGIC_TOKEN},
908 64         1374 $self->{db_toks}->{$DB2_LAST_EXPIRE_MAGIC_TOKEN},
909             $self->{db_toks}->{$DB2_OLDEST_TOKEN_AGE_MAGIC_TOKEN},
910 64 100 66     592 $db_ver,
  4         10  
911             $self->{db_toks}->{$DB2_LAST_JOURNAL_SYNC_MAGIC_TOKEN},
912 64 100       216 $self->{db_toks}->{$DB2_LAST_ATIME_DELTA_MAGIC_TOKEN},
    50          
    0          
913 60         142 $self->{db_toks}->{$DB2_LAST_EXPIRE_REDUCE_MAGIC_TOKEN},
914 60         101 $self->{db_toks}->{$DB2_NEWEST_TOKEN_AGE_MAGIC_TOKEN},
915 60         121 );
916 60         102 }
917 60         108 elsif ($db_ver == 0) {
918 60         115 my $DB0_NSPAM_MAGIC_TOKEN = '**NSPAM';
919 60         108 my $DB0_NHAM_MAGIC_TOKEN = '**NHAM';
920 60         116 my $DB0_OLDEST_TOKEN_AGE_MAGIC_TOKEN = '**OLDESTAGE';
921 60         99 my $DB0_LAST_EXPIRE_MAGIC_TOKEN = '**LASTEXPIRE';
922 60         113 my $DB0_NTOKENS_MAGIC_TOKEN = '**NTOKENS';
923             my $DB0_SCANCOUNT_BASE_MAGIC_TOKEN = '**SCANBASE';
924              
925             @values = (
926             $self->{db_toks}->{$DB0_SCANCOUNT_BASE_MAGIC_TOKEN},
927             $self->{db_toks}->{$DB0_NSPAM_MAGIC_TOKEN},
928             $self->{db_toks}->{$DB0_NHAM_MAGIC_TOKEN},
929             $self->{db_toks}->{$DB0_NTOKENS_MAGIC_TOKEN},
930             $self->{db_toks}->{$DB0_LAST_EXPIRE_MAGIC_TOKEN},
931             $self->{db_toks}->{$DB0_OLDEST_TOKEN_AGE_MAGIC_TOKEN},
932             0,
933             0,
934             0,
935 60         4452 0,
936             0,
937             );
938             }
939 4         16 elsif ($db_ver == 1) {
940 4         94 my $DB1_NSPAM_MAGIC_TOKEN = "\015\001\007\011\003NSPAM";
941 4         12 my $DB1_NHAM_MAGIC_TOKEN = "\015\001\007\011\003NHAM";
942 4         9 my $DB1_OLDEST_TOKEN_AGE_MAGIC_TOKEN = "\015\001\007\011\003OLDESTAGE";
943 4         10 my $DB1_LAST_EXPIRE_MAGIC_TOKEN = "\015\001\007\011\003LASTEXPIRE";
944 4         14 my $DB1_NTOKENS_MAGIC_TOKEN = "\015\001\007\011\003NTOKENS";
945             my $DB1_SCANCOUNT_BASE_MAGIC_TOKEN = "\015\001\007\011\003SCANBASE";
946              
947             @values = (
948             $self->{db_toks}->{$DB1_SCANCOUNT_BASE_MAGIC_TOKEN},
949             $self->{db_toks}->{$DB1_NSPAM_MAGIC_TOKEN},
950             $self->{db_toks}->{$DB1_NHAM_MAGIC_TOKEN},
951             $self->{db_toks}->{$DB1_NTOKENS_MAGIC_TOKEN},
952 4         73 $self->{db_toks}->{$DB1_LAST_EXPIRE_MAGIC_TOKEN},
953             $self->{db_toks}->{$DB1_OLDEST_TOKEN_AGE_MAGIC_TOKEN},
954             1,
955             0,
956             0,
957             0,
958             0,
959             );
960             }
961 0         0  
962 0         0 foreach (@values) {
963 0         0 if (!$_ || $_ =~ /\D/) {
964 0         0 $_ = 0;
965 0         0 }
966 0         0 }
967              
968             return @values;
969             }
970              
971             my ($self, $template, $regex, @vars) = @_;
972              
973             while (my ($tok, $tokvalue) = each %{$self->{db_toks}}) {
974 0         0 next if ($tok =~ MAGIC_RE); # skip magic tokens
975             next if (defined $regex && ($tok !~ /$regex/o));
976              
977             # We have the value already, so just unpack it.
978             my ($ts, $th, $atime) = $self->tok_unpack ($tokvalue);
979            
980             my $prob = $self->{bayes}->_compute_prob_for_token($tok, $vars[1], $vars[2], $ts, $th);
981             $prob ||= 0.5;
982            
983 64         353 my $encoded_tok = unpack("H*",$tok);
984 704 100 66     1540 printf $template,$prob,$ts,$th,$atime,$encoded_tok;
985 414         564 }
986             }
987              
988             my ($self, $time) = @_;
989 64         327 $self->{db_toks}->{$LAST_EXPIRE_MAGIC_TOKEN} = time();
990             }
991              
992             ## Don't bother using get_magic_tokens here. This token should only
993 0     0 1 0 ## ever exist when we're running expire, so we don't want to convert it if
994             ## it's there and we're not expiring ...
995 0         0 my ($self) = @_;
  0         0  
996 0 0       0 my $running = $self->{db_toks}->{$RUNNING_EXPIRE_MAGIC_TOKEN};
997 0 0 0     0 if (!$running || $running =~ /\D/) { return; }
998             return $running;
999             }
1000 0         0  
1001             my ($self) = @_;
1002 0         0  
1003 0   0     0 # update the lock and running expire magic token
1004             $self->{bayes}->{main}->{locker}->refresh_lock ($self->{locked_file});
1005 0         0 $self->{db_toks}->{$RUNNING_EXPIRE_MAGIC_TOKEN} = time();
1006 0         0 }
1007              
1008             my ($self) = @_;
1009             delete $self->{db_toks}->{$RUNNING_EXPIRE_MAGIC_TOKEN};
1010             }
1011 0     0 1 0  
1012 0         0 ###########################################################################
1013              
1014             # db abstraction: allow deferred writes, since we will be frequently
1015             # writing while checking.
1016              
1017             my ($self, $ds, $dh, $tok, $atime) = @_;
1018              
1019 10     10 1 19 $atime = 0 unless defined $atime;
1020 10         194  
1021 10 50 33     61 if ($self->{bayes}->{main}->{learn_to_journal}) {
  10         32  
1022 0         0 # we can't store the SHA1 binary value in the journal, so convert it
1023             # to a printable value that can be converted back later
1024             my $encoded_tok = unpack("H*",$tok);
1025             $self->defer_update ("c $ds $dh $atime $encoded_tok");
1026 2     2 1 7 } else {
1027             $self->tok_sync_counters ($ds, $dh, $atime, $tok);
1028             }
1029 2         19 }
1030 2         63  
1031             my ($self, $ds, $dh, $tokens, $atime) = @_;
1032              
1033             $atime = 0 unless defined $atime;
1034 0     0 1 0  
1035 0         0 foreach my $tok (keys %{$tokens}) {
1036             if ($self->{bayes}->{main}->{learn_to_journal}) {
1037             # we can't store the SHA1 binary value in the journal, so convert it
1038             # to a printable value that can be converted back later
1039             my $encoded_tok = unpack("H*",$tok);
1040             $self->defer_update ("c $ds $dh $atime $encoded_tok");
1041             } else {
1042             $self->tok_sync_counters ($ds, $dh, $atime, $tok);
1043             }
1044 0     0 1 0 }
1045             }
1046 0 0       0  
1047             my ($self) = @_;
1048 0 0       0 my @vars = $self->get_storage_variables();
1049             ($vars[1], $vars[2]);
1050             }
1051 0         0  
1052 0         0 my ($self, $ds, $dh) = @_;
1053              
1054 0         0 if ($self->{bayes}->{main}->{learn_to_journal}) {
1055             $self->defer_update ("n $ds $dh");
1056             } else {
1057             $self->tok_sync_nspam_nham ($ds, $dh);
1058             }
1059 10     10 1 55 }
1060              
1061 10 100       137 my ($self, $tok, $atime) = @_;
1062             # we can't store the SHA1 binary value in the journal, so convert it
1063 10         41 # to a printable value that can be converted back later
  10         303  
1064 2770 100       7492 my $encoded_tok = unpack("H*", $tok);
1065             $self->defer_update ("t $atime $encoded_tok");
1066             }
1067 554         825  
1068 554         1471 my ($self, $tokens, $atime) = @_;
1069              
1070 2216         4446 foreach my $token (@{$tokens}) {
1071             # we can't store the SHA1 binary value in the journal, so convert it
1072             # to a printable value that can be converted back later
1073             my $encoded_tok = unpack("H*", $token);
1074             $self->defer_update ("t $atime $encoded_tok");
1075             }
1076 10     10 1 26 }
1077 10         27  
1078 10         55 my ($self, $str) = @_;
1079             $self->{string_to_journal} .= "$str\n";
1080             }
1081              
1082 10     10 1 40 ###########################################################################
1083              
1084 10 100       62 my ($self) = @_;
1085 2         76  
1086             my $nbytes = length ($self->{string_to_journal});
1087 8         41 return if ($nbytes == 0);
1088              
1089             my $path = $self->_get_journal_filename();
1090              
1091             # use append mode, write atomically, then close, so simultaneous updates are
1092 0     0 1 0 # not lost
1093             my $conf = $self->{bayes}->{main}->{conf};
1094              
1095 0         0 # set the umask to the inverse of what we want ...
1096 0         0 my $umask = umask(0777 - (oct ($conf->{bayes_file_mode}) & 0666));
1097              
1098             if (!open (OUT, ">>".$path)) {
1099             warn "bayes: cannot write to $path, bayes db update ignored: $!\n";
1100 4     4 1 20 umask $umask; # reset umask
1101             return;
1102 4         9 }
  4         19  
1103             umask $umask; # reset umask
1104              
1105 282         460 # do not use print() here, it will break up the buffer if it's >8192 bytes,
1106 282         640 # which could result in two sets of tokens getting mixed up and their
1107             # touches missed.
1108             my $write_failure = 0;
1109             my $original_point = tell OUT;
1110             $original_point >= 0 or die "Can't obtain file position: $!";
1111 840     840 0 1488 my $len;
1112 840         2207 do {
1113             $len = syswrite (OUT, $self->{string_to_journal}, $nbytes);
1114              
1115             # argh, write failure, give up
1116             if (!defined $len || $len < 0) {
1117             my $err = '';
1118 14     14 1 48 if (!defined $len) {
1119             $len = 0;
1120 14         64 $err = " ($!)";
1121 14 100       79 }
1122             warn "bayes: write failed to Bayes journal $path ($len of $nbytes)!$err\n";
1123 6         60 last;
1124             }
1125              
1126             # This shouldn't happen, but could if the fs is full...
1127 6         17 if ($len != $nbytes) {
1128             warn "bayes: partial write to bayes journal $path ($len of $nbytes), recovering\n";
1129              
1130 6         65 # we want to be atomic, so revert the journal file back to where
1131             # we know it's "good". if we can't truncate the journal, or we've
1132 6 50       673 # tried 5 times to do the write, abort!
1133 0         0 if (!truncate(OUT, $original_point) || ($write_failure++ > 4)) {
1134 0         0 warn "bayes: cannot write to bayes journal $path, aborting!\n";
1135 0         0 last;
1136             }
1137 6         52  
1138             # if the fs is full, let's give the system a break
1139             sleep 1;
1140             }
1141             } while ($len != $nbytes);
1142 6         30  
1143 6         30 if (!close OUT) {
1144 6 50       34 warn "bayes: cannot write to $path, bayes db update ignored\n";
1145 6         9 }
1146 6         11  
1147 6         366 $self->{string_to_journal} = '';
1148             }
1149              
1150 6 50 33     93 # Return a qr'd RE to match a token with the correct format's magic token
1151 0         0 my ($self) = @_;
1152 0 0       0  
1153 0         0 if (!defined $self->{db_version} || $self->{db_version} >= 1) {
1154 0         0 return MAGIC_RE;
1155             }
1156 0         0  
1157 0         0 # When in doubt, assume v0
1158             return qr/^\*\*[A-Z]+$/;
1159             }
1160              
1161 6 50       51 # provide a more generalized public interface into the journal sync
1162 0         0  
1163             my ($self, $opts) = @_;
1164              
1165             return $self->_sync_journal($opts);
1166             }
1167 0 0 0     0  
1168 0         0 ###########################################################################
1169 0         0 # And this method reads the journal and applies the changes in one
1170             # (locked) transaction.
1171              
1172             my ($self, $opts) = @_;
1173 0         0 my $ret = 0;
1174              
1175             my $path = $self->_get_journal_filename();
1176              
1177 6 50       115 # if $path doesn't exist, or it's not a file, or is 0 bytes in length, return
1178 0         0 if (!stat($path) || !-f _ || -z _) {
1179             return 0;
1180             }
1181 6         48  
1182             my $eval_stat;
1183             eval {
1184             local $SIG{'__DIE__'}; # do not run user die() traps in here
1185             if ($self->tie_db_writable()) {
1186 532     532 1 891 $ret = $self->_sync_journal_trapped($opts, $path);
1187             }
1188 532 50 66     1738 1;
1189 532         1226 } or do {
1190             $eval_stat = $@ ne '' ? $@ : "errno=$!"; chomp $eval_stat;
1191             };
1192              
1193 0         0 # ok, untie from write-mode if we can
1194             if (!$self->{bayes}->{main}->{learn_caller_will_untie}) {
1195             $self->untie_db();
1196             }
1197              
1198             # handle any errors that may have occurred
1199 2     2 1 8 if (defined $eval_stat) {
1200             warn "bayes: $eval_stat\n";
1201 2         38 return 0;
1202             }
1203              
1204             $ret;
1205             }
1206              
1207             my ($self, $opts, $path) = @_;
1208              
1209 2     2   9 # Flag that we're doing work
1210 2         6 $self->set_running_expire_tok();
1211              
1212 2         10 my $started = time();
1213             my $count = 0;
1214             my $total_count = 0;
1215 2 50 33     70 my %tokens;
      33        
1216 0         0 my $showdots = $opts->{showdots};
1217             my $retirepath = $path.".old";
1218              
1219 2         8 # if $path doesn't exist, or it's not a file, or is 0 bytes in length,
1220             # return we have to check again since the file may have been removed
1221 2         10 # by a recent bayes db upgrade ...
1222 2 50       14 if (!stat($path) || !-f _ || -z _) {
1223 2         27 return 0;
1224             }
1225 2         18  
1226 2 50       5 if (!-r $path) { # will we be able to read the file?
1227 0 0       0 warn "bayes: bad permissions on journal, can't read: $path\n";
  0         0  
1228             return 0;
1229             }
1230              
1231 2 50       13 # This is the critical phase (moving files around), so don't allow
1232 2         14 # it to be interrupted.
1233             {
1234             local $SIG{'INT'} = 'IGNORE';
1235             local $SIG{'TERM'} = 'IGNORE';
1236 2 50       12 local $SIG{'HUP'} = 'IGNORE' if !am_running_on_windows();
1237 0         0  
1238 0         0 # retire the journal, so we can update the db files from it in peace.
1239             # TODO: use locking here
1240             if (!rename ($path, $retirepath)) {
1241 2         33 warn "bayes: failed rename $path to $retirepath\n";
1242             return 0;
1243             }
1244              
1245 2     2   13 # now read the retired journal
1246             local *JOURNAL;
1247             if (!open (JOURNAL, "<$retirepath")) {
1248 2         28 warn "bayes: cannot open read $retirepath\n";
1249             return 0;
1250 2         19 }
1251 2         8  
1252 2         4  
1253 2         6 # Read the journal
1254 2         6 for ($!=0; defined($_=<JOURNAL>); $!=0) {
1255 2         12 $total_count++;
1256              
1257             if (/^t (\d+) (.+)$/) { # Token timestamp update, cache resultant entries
1258             my $tok = pack("H*",$2);
1259             $tokens{$tok} = $1+0 if (!exists $tokens{$tok} || $1+0 > $tokens{$tok});
1260 2 50 33     64 } elsif (/^c (-?\d+) (-?\d+) (\d+) (.+)$/) { # Add/full token update
      33        
1261 0         0 my $tok = pack("H*",$4);
1262             $self->tok_sync_counters ($1+0, $2+0, $3+0, $tok);
1263             $count++;
1264 2 50       35 } elsif (/^n (-?\d+) (-?\d+)$/) { # update ham/spam count
1265 0         0 $self->tok_sync_nspam_nham ($1+0, $2+0);
1266 0         0 $count++;
1267             } elsif (/^m ([hsf]) (.+)$/) { # update msgid seen database
1268             if ($1 eq "f") {
1269             $self->_seen_delete_direct($2);
1270             }
1271             else {
1272 2         7 $self->_seen_put_direct($2,$1);
  2         62  
1273 2         43 }
1274 2 50       17 $count++;
1275             } else {
1276             warn "bayes: gibberish entry found in journal: $_";
1277             }
1278 2 50       127 }
1279 0         0 defined $_ || $!==0 or
1280 0         0 $!==EBADF ? dbg("bayes: error reading journal file: $!")
1281             : die "error reading journal file: $!";
1282             close(JOURNAL) or die "Can't close journal file: $!";
1283              
1284 2         12 # Now that we've determined what tokens we need to update and their
1285 2 50       98 # final values, update the DB. Should be much smaller than the full
1286 0         0 # journal entries.
1287 0         0 while (my ($k,$v) = each %tokens) {
1288             $self->tok_touch_token ($v, $k);
1289              
1290             if ((++$count % 1000) == 0) {
1291             if ($showdots) { print STDERR "."; }
1292 2         93 $self->set_running_expire_tok();
1293 558         862 }
1294             }
1295 558 50       3392  
    100          
    100          
    50          
1296 0         0 if ($showdots) { print STDERR "\n"; }
1297 0 0 0     0  
1298             # we're all done, so unlink the old journal file
1299 554         1862 unlink ($retirepath) || warn "bayes: can't unlink $retirepath: $!\n";
1300 554         2708  
1301 554         3681 $self->{db_toks}->{$LAST_JOURNAL_SYNC_MAGIC_TOKEN} = $started;
1302              
1303 2         39 my $done = time();
1304 2         30 my $msg = ("bayes: synced databases from journal in " .
1305             ($done - $started) .
1306 2 50       23 " seconds: $count unique entries ($total_count total entries)");
1307 0         0  
1308             if ($opts->{verbose}) {
1309             print $msg,"\n";
1310 2         17 } else {
1311             dbg($msg);
1312 2         43 }
1313             }
1314 0         0  
1315             # else, that's the lot, we're synced. return
1316             return 1;
1317 2 0 33     66 }
    50          
1318              
1319             my ($self, $atime, $tok) = @_;
1320 2 50       44 my ($ts, $th, $oldatime) = $self->tok_get ($tok);
1321              
1322             # If the new atime is < the old atime, ignore the update
1323             # We figure that we'll never want to lower a token atime, so abort if
1324             # we try. (journal out of sync, etc.)
1325 2         18 return if ($oldatime >= $atime);
1326 0         0  
1327             $self->tok_put ($tok, $ts, $th, $atime);
1328 0 0       0 }
1329 0 0       0  
  0         0  
1330 0         0 my ($self, $ds, $dh, $atime, $tok) = @_;
1331             my ($ts, $th, $oldatime) = $self->tok_get ($tok);
1332             $ts += $ds; if ($ts < 0) { $ts = 0; }
1333             $th += $dh; if ($th < 0) { $th = 0; }
1334 2 50       24  
  0         0  
1335             # Don't roll the atime of tokens backwards ...
1336             $atime = $oldatime if ($oldatime > $atime);
1337 2 50       205  
1338             $self->tok_put ($tok, $ts, $th, $atime);
1339 2         65 }
1340              
1341 2         9 my ($self, $tok, $ts, $th, $atime) = @_;
1342 2         20 $ts ||= 0;
1343             $th ||= 0;
1344              
1345             # Ignore magic tokens, the don't go in this way ...
1346 2 50       41 return if ($tok =~ MAGIC_RE);
1347 0         0  
1348             # use defined() rather than exists(); the latter is not supported
1349 2         24 # by NDBM_File, believe it or not. Using defined() did not
1350             # indicate any noticeable speed hit in my testing. (Mar 31 2003 jm)
1351             my $exists_already = defined $self->{db_toks}->{$tok};
1352              
1353             if ($ts == 0 && $th == 0) {
1354 2         14 return if (!$exists_already); # If the token doesn't exist, just return
1355             $self->{db_toks}->{$NTOKENS_MAGIC_TOKEN}--;
1356             delete $self->{db_toks}->{$tok};
1357             } else {
1358 0     0 0 0 if (!$exists_already) { # If the token doesn't exist, raise the token count
1359 0         0 $self->{db_toks}->{$NTOKENS_MAGIC_TOKEN}++;
1360             }
1361              
1362             $self->{db_toks}->{$tok} = $self->tok_pack ($ts, $th, $atime);
1363              
1364 0 0       0 my $newmagic = $self->{db_toks}->{$NEWEST_TOKEN_AGE_MAGIC_TOKEN};
1365             if (!defined ($newmagic) || $atime > $newmagic) {
1366 0         0 $self->{db_toks}->{$NEWEST_TOKEN_AGE_MAGIC_TOKEN} = $atime;
1367             }
1368              
1369             # Make sure to check for either !defined or "" ... Apparently
1370 2770     2770 0 6974 # sometimes the DB module doesn't return the value correctly. :(
1371 2770         5100 my $oldmagic = $self->{db_toks}->{$OLDEST_TOKEN_AGE_MAGIC_TOKEN};
1372 2770 50       5427 if (!defined ($oldmagic) || $oldmagic eq "" || $atime < $oldmagic) {
  2770         4991  
  0         0  
1373 2770 50       3183 $self->{db_toks}->{$OLDEST_TOKEN_AGE_MAGIC_TOKEN} = $atime;
  2770         4350  
  0         0  
1374             }
1375             }
1376 2770 100       5216 }
1377              
1378 2770         5141 my ($self, $ds, $dh) = @_;
1379             my ($ns, $nh) = ($self->get_storage_variables())[1,2];
1380             if ($ds) { $ns += $ds; } if ($ns < 0) { $ns = 0; }
1381             if ($dh) { $nh += $dh; } if ($nh < 0) { $nh = 0; }
1382 2770     2770 0 6189 $self->{db_toks}->{$NSPAM_MAGIC_TOKEN} = $ns;
1383 2770   100     7474 $self->{db_toks}->{$NHAM_MAGIC_TOKEN} = $nh;
1384 2770   100     7293 }
1385              
1386             ###########################################################################
1387 2770 50       5516  
1388             my ($self) = @_;
1389              
1390             my $main = $self->{bayes}->{main};
1391             return $main->sed_path($main->{conf}->{bayes_path}."_journal");
1392 2770         11079 }
1393              
1394 2770 100 100     8924 ###########################################################################
1395 1108 50       1833  
1396 1108         24105 # this is called directly from sa-learn(1).
1397 1108         23278 my ($self, $opts) = @_;
1398             my $ret = 0;
1399 1662 50       2801  
1400 1662         34179 my $eval_stat;
1401             eval {
1402             local $SIG{'__DIE__'}; # do not run user die() traps in here
1403 1662         6714  
1404             use File::Basename;
1405 1662         16868  
1406 1662 100 66     9351 # bayes directory
1407 4         60 my $main = $self->{bayes}->{main};
1408             my $path = $main->sed_path($main->{conf}->{bayes_path});
1409              
1410             # prevent dirname() from tainting the result, it assumes $1 is not tainted
1411             local($1,$2,$3); # Bug 6310; perl #67962 (fixed in perl 5.12/5.13)
1412 1662         16366 my $dir = dirname($path);
1413 1662 100 66     14907  
      66        
1414 4         69 # make temporary copy since old dbm and new dbm may have same name
1415             opendir(DIR, $dir) or die "bayes: can't opendir $dir: $!";
1416             my @files = grep { /^bayes_(?:seen|toks)(?:\.\w+)?$/ } readdir(DIR);
1417             closedir(DIR) or die "bayes: can't close directory $dir: $!";
1418             if (@files < 2 || !grep(/bayes_seen/,@files) || !grep(/bayes_toks/,@files))
1419             {
1420 10     10 0 43 die "bayes: unable to find bayes_toks and bayes_seen, stopping\n";
1421 10         49 }
1422 10 100       47 # untaint @files (already safe after grep)
  6 50       34  
  10         39  
  0         0  
1423 10 100       53 untaint_var(\@files);
  4 50       13  
  10         34  
  0         0  
1424 10         382
1425 10         227 for (@files) {
1426             my $src = "$dir/$_";
1427             my $dst = "$dir/old_$_";
1428             eval q{
1429             use File::Copy;
1430             copy($src, $dst);
1431 18     18   44 } || die "bayes: can't copy $src to $dst: $!\n";
1432             }
1433 18         44  
1434 18         123 # delete previous to make way for import
1435             for (@files) { unlink("$dir/$_"); }
1436              
1437             # import
1438             if ($self->tie_db_writable()) {
1439             $ret += $self->upgrade_old_dbm_files_trapped("$dir/old_bayes_seen",
1440             $self->{db_seen});
1441 0     0 1 0 $ret += $self->upgrade_old_dbm_files_trapped("$dir/old_bayes_toks",
1442 0         0 $self->{db_toks});
1443             }
1444 0         0  
1445             if ($ret == 2) {
1446 0         0 print "import successful, original files saved with \"old\" prefix\n";
1447             }
1448 21     21   256 else {
  21         785  
  21         46835  
1449             print "import failed, original files saved with \"old\" prefix\n";
1450             }
1451 0         0 1;
1452 0         0 } or do {
1453             $eval_stat = $@ ne '' ? $@ : "errno=$!"; chomp $eval_stat;
1454             };
1455 0         0  
1456 0         0 $self->untie_db();
1457              
1458             # if we died, untie the dbm files
1459 0 0       0 if (defined $eval_stat) {
1460 0         0 warn "bayes: perform_upgrade: $eval_stat\n";
  0         0  
1461 0 0       0 return 0;
1462 0 0 0     0 }
      0        
1463             $ret;
1464 0         0 }
1465              
1466             my ($self, $filename, $output) = @_;
1467 0         0  
1468             my $count;
1469 0         0 my %in;
1470 0         0  
1471 0         0 print "upgrading to DB_File, please be patient: $filename\n";
1472 0 0       0  
1473             # try each type of file until we find one with > 0 entries
1474             for my $dbm ('DB_File', 'GDBM_File', 'NDBM_File', 'SDBM_File') {
1475             $count = 0;
1476             # wrap in eval so it doesn't run in general use. This accesses db
1477             # modules directly.
1478             # Note: (bug 2390), the 'use' needs to be on the same line as the eval
1479 0         0 # for RPM dependency checks to work properly. It's lame, but...
  0         0  
1480             my $eval_stat;
1481             eval 'use ' . $dbm . ';
1482 0 0       0 tie %in, "' . $dbm . '", $filename, O_RDONLY, 0600;
1483             %{ $output } = %in;
1484 0         0 $count = scalar keys %{ $output };
1485             untie %in;
1486 0         0 1;
1487             ' or do {
1488             $eval_stat = $@ ne '' ? $@ : "errno=$!"; chomp $eval_stat;
1489 0 0       0 };
1490 0         0 if (defined $eval_stat) {
1491             print "$dbm: $dbm module not installed(?), nothing copied: $eval_stat\n";
1492             dbg("bayes: error was: $eval_stat");
1493 0         0 }
1494             elsif ($count == 0) {
1495 0         0 print "$dbm: no database of that kind found, nothing copied\n";
1496 0 0       0 }
1497 0 0       0 else {
  0         0  
1498             print "$dbm: copied $count entries\n";
1499             return 1;
1500 0         0 }
1501             }
1502              
1503 0 0       0 return 0;
1504 0         0 }
1505 0         0  
1506             my ($self) = @_;
1507 0         0  
1508             return 0 unless ($self->tie_db_writable());
1509              
1510             dbg("bayes: untie-ing in preparation for removal.");
1511 0     0 0 0  
1512             foreach my $dbname (@DBNAMES) {
1513 0         0 my $db_var = 'db_'.$dbname;
1514              
1515             if (exists $self->{$db_var}) {
1516 0         0 # dbg("bayes: untie-ing $db_var");
1517             untie %{$self->{$db_var}};
1518             delete $self->{$db_var};
1519 0         0 }
1520 0         0 }
1521              
1522             my $path = $self->{bayes}->{main}->sed_path($self->{bayes}->{main}->{conf}->{bayes_path});
1523              
1524             foreach my $dbname (@DBNAMES, 'journal') {
1525 0         0 foreach my $ext ($self->DB_EXTENSIONS) {
1526             my $name = $path.'_'.$dbname.$ext;
1527             my $ret = unlink $name;
1528             dbg("bayes: clear_database: %s %s",
1529             $ret ? 'removed' : 'tried to remove', $name);
1530             }
1531             }
1532 0 0       0  
1533 0 0       0 # the journal file needs to be done separately since it has no extension
  0         0  
1534             foreach my $dbname ('journal') {
1535 0 0       0 my $name = $path.'_'.$dbname;
    0          
1536 0         0 my $ret = unlink $name;
1537 0         0 dbg("bayes: clear_database: %s %s",
1538             $ret ? 'removed' : 'tried to remove', $name);
1539             }
1540 0         0  
1541             $self->untie_db();
1542              
1543 0         0 return 1;
1544 0         0 }
1545              
1546             my ($self) = @_;
1547              
1548 0         0 # we tie writable because we want the upgrade code to kick in if needed
1549             return 0 unless ($self->tie_db_writable());
1550              
1551             my @vars = $self->get_storage_variables();
1552 2     2 1 9  
1553             print "v\t$vars[6]\tdb_version # this must be the first line!!!\n";
1554 2 50       11 print "v\t$vars[1]\tnum_spam\n";
1555             print "v\t$vars[2]\tnum_nonspam\n";
1556 2         8  
1557             while (my ($tok, $packed) = each %{$self->{db_toks}}) {
1558 2         13 next if ($tok =~ MAGIC_RE); # skip magic tokens
1559 4         13  
1560             my ($ts, $th, $atime) = $self->tok_unpack($packed);
1561 4 50       11 my $encoded_token = unpack("H*",$tok);
1562             print "t\t$ts\t$th\t$atime\t$encoded_token\n";
1563 4         7 }
  4         53  
1564 4         20  
1565             while (my ($msgid, $flag) = each %{$self->{db_seen}}) {
1566             print "s\t$flag\t$msgid\n";
1567             }
1568 2         14  
1569             $self->untie_db();
1570 2         9  
1571 6         21 return 1;
1572 12         26 }
1573 12         550  
1574 12 100       80 my ($self, $filename, $showdots) = @_;
1575              
1576             local *DUMPFILE;
1577             if (!open(DUMPFILE, '<', $filename)) {
1578             dbg("bayes: unable to open backup file $filename: $!");
1579             return 0;
1580 2         8 }
1581 2         14
1582 2         67 if (!$self->tie_db_writable()) {
1583 2 50       17 dbg("bayes: failed to tie db writable");
1584             return 0;
1585             }
1586              
1587 2         10 my $main = $self->{bayes}->{main};
1588             my $path = $main->sed_path($main->{conf}->{bayes_path});
1589 2         23  
1590             # use a temporary PID-based suffix just in case another one was
1591             # created previously by an interrupted expire
1592             my $tmpsuffix = "convert$$";
1593 0     0 1 0 my $tmptoksdbname = $path.'_toks.'.$tmpsuffix;
1594             my $tmpseendbname = $path.'_seen.'.$tmpsuffix;
1595             my $toksdbname = $path.'_toks';
1596 0 0       0 my $seendbname = $path.'_seen';
1597              
1598 0         0 my %new_toks;
1599             my %new_seen;
1600 0         0 my $umask = umask 0;
1601 0         0 unless (tie %new_toks, $self->DBM_MODULE, $tmptoksdbname, O_RDWR|O_CREAT|O_EXCL,
1602 0         0 (oct ($main->{conf}->{bayes_file_mode}) & 0666)) {
1603             dbg("bayes: failed to tie temp toks db: $!");
1604 0         0 $self->untie_db();
  0         0  
1605 0 0       0 umask $umask;
1606             return 0;
1607 0         0 }
1608 0         0 unless (tie %new_seen, $self->DBM_MODULE, $tmpseendbname, O_RDWR|O_CREAT|O_EXCL,
1609 0         0 (oct ($main->{conf}->{bayes_file_mode}) & 0666)) {
1610             dbg("bayes: failed to tie temp seen db: $!");
1611             untie %new_toks;
1612 0         0 $self->_unlink_file($tmptoksdbname);
  0         0  
1613 0         0 $self->untie_db();
1614             umask $umask;
1615             return 0;
1616 0         0 }
1617             umask $umask;
1618 0         0  
1619             my $line_count = 0;
1620             my $db_version;
1621             my $token_count = 0;
1622 0     0 1 0 my $num_spam;
1623             my $num_ham;
1624 0         0 my $error_p = 0;
1625 0 0       0 my $newest_token_age = 0;
1626 0         0 # Kinda weird I know, but we need a nice big value and we know there will be
1627 0         0 # no tokens > time() since we reset atime if > time(), so use that with a
1628             # little buffer just in case.
1629             my $oldest_token_age = time() + 100000;
1630 0 0       0  
1631 0         0 my $line = <DUMPFILE>;
1632 0         0 defined $line or die "Error reading dump file: $!";
1633             $line_count++;
1634              
1635 0         0 # We require the database version line to be the first in the file so we can
1636 0         0 # figure out how to properly deal with the file. If it is not the first
1637             # line then fail
1638             if ($line =~ m/^v\s+(\d+)\s+db_version/) {
1639             $db_version = $1;
1640 0         0 }
1641 0         0 else {
1642 0         0 dbg("bayes: database version must be the first line in the backup file, correct and re-run");
1643 0         0 untie %new_toks;
1644 0         0 untie %new_seen;
1645             $self->_unlink_file($tmptoksdbname);
1646 0         0 $self->_unlink_file($tmpseendbname);
1647             $self->untie_db();
1648 0         0 return 0;
1649 0 0       0 }
1650              
1651 0         0 unless ($db_version == 2 || $db_version == 3) {
1652 0         0 warn("bayes: database version $db_version is unsupported, must be version 2 or 3");
1653 0         0 untie %new_toks;
1654 0         0 untie %new_seen;
1655             $self->_unlink_file($tmptoksdbname);
1656 0 0       0 $self->_unlink_file($tmpseendbname);
1657             $self->untie_db();
1658 0         0 return 0;
1659 0         0 }
1660 0         0  
1661 0         0 for ($!=0; defined($line=<DUMPFILE>); $!=0) {
1662 0         0 chomp($line);
1663 0         0 $line_count++;
1664              
1665 0         0 if ($line_count % 1000 == 0) {
1666             print STDERR "." if ($showdots);
1667 0         0 }
1668 0         0  
1669 0         0 if ($line =~ /^v\s+/) { # variable line
1670 0         0 my @parsed_line = split(/\s+/, $line, 3);
1671             my $value = $parsed_line[1] + 0;
1672 0         0 if ($parsed_line[2] eq 'num_spam') {
1673 0         0 $num_spam = $value;
1674             }
1675             elsif ($parsed_line[2] eq 'num_nonspam') {
1676             $num_ham = $value;
1677 0         0 }
1678             else {
1679 0         0 dbg("bayes: restore_database: skipping unknown line: $line");
1680 0 0       0 }
1681 0         0 }
1682             elsif ($line =~ /^t\s+/) { # token line
1683             my @parsed_line = split(/\s+/, $line, 5);
1684             my $spam_count = $parsed_line[1] + 0;
1685             my $ham_count = $parsed_line[2] + 0;
1686 0 0       0 my $atime = $parsed_line[3] + 0;
1687 0         0 my $token = $parsed_line[4];
1688              
1689             my $token_warn_p = 0;
1690 0         0 my @warnings;
1691 0         0  
1692 0         0 if ($spam_count < 0) {
1693 0         0 $spam_count = 0;
1694 0         0 push(@warnings, 'spam count < 0, resetting');
1695 0         0 $token_warn_p = 1;
1696 0         0 }
1697             if ($ham_count < 0) {
1698             $ham_count = 0;
1699 0 0 0     0 push(@warnings, 'ham count < 0, resetting');
1700 0         0 $token_warn_p = 1;
1701 0         0 }
1702 0         0  
1703 0         0 if ($spam_count == 0 && $ham_count == 0) {
1704 0         0 dbg("bayes: token has zero spam and ham count, skipping");
1705 0         0 next;
1706 0         0 }
1707              
1708             if ($atime > time()) {
1709 0         0 $atime = time();
1710 0         0 push(@warnings, 'atime > current time, resetting');
1711 0         0 $token_warn_p = 1;
1712             }
1713 0 0       0  
1714 0 0       0 if ($token_warn_p) {
1715             dbg("bayes: token (%s) has the following warnings:\n%s",
1716             $token, join("\n",@warnings));
1717 0 0       0 }
    0          
    0          
1718 0         0  
1719 0         0 # database versions < 3 did not encode their token values
1720 0 0       0 if ($db_version < 3) {
    0          
1721 0         0 $token = substr(sha1($token), -5);
1722             }
1723             else {
1724 0         0 # turn unpacked binary token back into binary value
1725             $token = pack("H*",$token);
1726             }
1727 0         0  
1728             $new_toks{$token} = $self->tok_pack($spam_count, $ham_count, $atime);
1729             if ($atime < $oldest_token_age) {
1730             $oldest_token_age = $atime;
1731 0         0 }
1732 0         0 if ($atime > $newest_token_age) {
1733 0         0 $newest_token_age = $atime;
1734 0         0 }
1735 0         0 $token_count++;
1736             }
1737 0         0 elsif ($line =~ /^s\s+/) { # seen line
1738 0         0 my @parsed_line = split(/\s+/, $line, 3);
1739             my $flag = $parsed_line[1];
1740 0 0       0 my $msgid = $parsed_line[2];
1741 0         0  
1742 0         0 unless ($flag eq 'h' || $flag eq 's') {
1743 0         0 dbg("bayes: unknown seen flag ($flag) for line: $line, skipping");
1744             next;
1745 0 0       0 }
1746 0         0  
1747 0         0 unless ($msgid) {
1748 0         0 dbg("bayes: blank msgid for line: $line, skipping");
1749             next;
1750             }
1751 0 0 0     0  
1752 0         0 $new_seen{$msgid} = $flag;
1753 0         0 }
1754             else {
1755             dbg("bayes: skipping unknown line: $line");
1756 0 0       0 next;
1757 0         0 }
1758 0         0 }
1759 0         0 defined $line || $!==0 or die "Error reading dump file: $!";
1760             close(DUMPFILE) or die "Can't close dump file: $!";
1761              
1762 0 0       0 print STDERR "\n" if ($showdots);
1763 0         0  
1764             unless (defined($num_spam)) {
1765             dbg("bayes: unable to find num spam, please check file");
1766             $error_p = 1;
1767             }
1768 0 0       0  
1769 0         0 unless (defined($num_ham)) {
1770             dbg("bayes: unable to find num ham, please check file");
1771             $error_p = 1;
1772             }
1773 0         0  
1774             if ($error_p) {
1775             dbg("bayes: error(s) while attempting to load $filename, correct and re-run");
1776 0         0  
1777 0 0       0 untie %new_toks;
1778 0         0 untie %new_seen;
1779             $self->_unlink_file($tmptoksdbname);
1780 0 0       0 $self->_unlink_file($tmpseendbname);
1781 0         0 $self->untie_db();
1782             return 0;
1783 0         0 }
1784              
1785             # set the calculated magic tokens
1786 0         0 $new_toks{$DB_VERSION_MAGIC_TOKEN} = $self->DB_VERSION();
1787 0         0 $new_toks{$NTOKENS_MAGIC_TOKEN} = $token_count;
1788 0         0 $new_toks{$NSPAM_MAGIC_TOKEN} = $num_spam;
1789             $new_toks{$NHAM_MAGIC_TOKEN} = $num_ham;
1790 0 0 0     0 $new_toks{$NEWEST_TOKEN_AGE_MAGIC_TOKEN} = $newest_token_age;
1791 0         0 $new_toks{$OLDEST_TOKEN_AGE_MAGIC_TOKEN} = $oldest_token_age;
1792 0         0  
1793             # go ahead and zero out these, chances are good that they are bogus anyway.
1794             $new_toks{$LAST_EXPIRE_MAGIC_TOKEN} = 0;
1795 0 0       0 $new_toks{$LAST_JOURNAL_SYNC_MAGIC_TOKEN} = 0;
1796 0         0 $new_toks{$LAST_ATIME_DELTA_MAGIC_TOKEN} = 0;
1797 0         0 $new_toks{$LAST_EXPIRE_REDUCE_MAGIC_TOKEN} = 0;
1798              
1799             local $SIG{'INT'} = 'IGNORE';
1800 0         0 local $SIG{'TERM'} = 'IGNORE';
1801             local $SIG{'HUP'} = 'IGNORE' if !am_running_on_windows();
1802              
1803 0         0 untie %new_toks;
1804 0         0 untie %new_seen;
1805             $self->untie_db();
1806              
1807 0 0 0     0 # Here is where something can go horribly wrong and screw up the bayes
1808 0 0       0 # database files. If we are able to copy one and not the other then it
1809             # will leave the database in an inconsistent state. Since this is an
1810 0 0       0 # edge case, and they're trying to replace the DB anyway we should be ok.
1811             unless ($self->_rename_file($tmptoksdbname, $toksdbname)) {
1812 0 0       0 dbg("bayes: error while renaming $tmptoksdbname to $toksdbname: $!");
1813 0         0 return 0;
1814 0         0 }
1815             unless ($self->_rename_file($tmpseendbname, $seendbname)) {
1816             dbg("bayes: error while renaming $tmpseendbname to $seendbname: $!");
1817 0 0       0 dbg("bayes: database now in inconsistent state");
1818 0         0 return 0;
1819 0         0 }
1820              
1821             dbg("bayes: parsed $line_count lines");
1822 0 0       0 dbg("bayes: created database with $token_count tokens based on $num_spam spam messages and $num_ham ham messages");
1823 0         0  
1824             return 1;
1825 0         0 }
1826 0         0  
1827 0         0 ###########################################################################
1828 0         0  
1829 0         0 # token marshalling format for db_toks.
1830 0         0  
1831             # Since we may have many entries with few hits, especially thousands of hapaxes
1832             # (1-occurrence entries), use a flexible entry format, instead of simply "2
1833             # packed ints", to keep the memory and disk space usage down. In my
1834 0         0 # 18k-message test corpus, only 8.9% have >= 8 hits in either counter, so we
1835 0         0 # can use a 1-byte representation for the other 91% of low-hitting entries
1836 0         0 # and save masses of space.
1837 0         0  
1838 0         0 # This looks like: XXSSSHHH (XX = format bits, SSS = 3 spam-count bits, HHH = 3
1839 0         0 # ham-count bits). If XX in the first byte is 11, it's packed as this 1-byte
1840             # representation; otherwise, if XX in the first byte is 00, it's packed as
1841             # "CLL", ie. 1 byte and 2 32-bit "longs" in perl pack format.
1842 0         0  
1843 0         0 # Savings: roughly halves size of toks db, at the cost of a ~10% slowdown.
1844 0         0  
1845 0         0 use constant FORMAT_FLAG => 0xc0; # 11000000
1846             use constant ONE_BYTE_FORMAT => 0xc0; # 11000000
1847 0         0 use constant TWO_LONGS_FORMAT => 0x00; # 00000000
1848 0         0  
1849 0 0       0 use constant ONE_BYTE_SSS_BITS => 0x38; # 00111000
1850             use constant ONE_BYTE_HHH_BITS => 0x07; # 00000111
1851 0         0  
1852 0         0 my ($self, $value) = @_;
1853 0         0 $value ||= 0;
1854              
1855             my ($packed, $atime);
1856             if ($self->{db_version} >= 1) {
1857             ($packed, $atime) = unpack("CV", $value);
1858             }
1859 0 0       0 elsif ($self->{db_version} == 0) {
1860 0         0 ($packed, $atime) = unpack("CS", $value);
1861 0         0 }
1862              
1863 0 0       0 if (($packed & FORMAT_FLAG) == ONE_BYTE_FORMAT) {
1864 0         0 return (($packed & ONE_BYTE_SSS_BITS) >> 3,
1865 0         0 $packed & ONE_BYTE_HHH_BITS,
1866 0         0 $atime || 0);
1867             }
1868             elsif (($packed & FORMAT_FLAG) == TWO_LONGS_FORMAT) {
1869 0         0 my ($packed, $ts, $th, $atime);
1870 0         0 if ($self->{db_version} >= 1) {
1871             ($packed, $ts, $th, $atime) = unpack("CVVV", $value);
1872 0         0 }
1873             elsif ($self->{db_version} == 0) {
1874             ($packed, $ts, $th, $atime) = unpack("CLLS", $value);
1875             }
1876             return ($ts || 0, $th || 0, $atime || 0);
1877             }
1878             # other formats would go here...
1879             else {
1880             warn "bayes: unknown packing format for bayes db, please re-learn: $packed";
1881             return (0, 0, 0);
1882             }
1883             }
1884              
1885             my ($self, $ts, $th, $atime) = @_;
1886             $ts ||= 0; $th ||= 0; $atime ||= 0;
1887             if ($ts < 8 && $th < 8) {
1888             return pack ("CV", ONE_BYTE_FORMAT | ($ts << 3) | $th, $atime);
1889             } else {
1890             return pack ("CVVV", TWO_LONGS_FORMAT, $ts, $th, $atime);
1891             }
1892             }
1893 21     21   217  
  21         332  
  21         1474  
1894 21     21   398 ###########################################################################
  21         71  
  21         1531  
1895 21     21   153  
  21         57  
  21         988  
1896             my ($self) = @_;
1897 21     21   134 return $self->{already_tied};
  21         546  
  21         4951  
1898 21     21   132 }
  21         399  
  21         8869  
1899              
1900             my ($self) = @_;
1901 5536     5536 0 56963 return $self->{already_tied} && $self->{is_locked};
1902 5536   100     17553 }
1903              
1904 5536         7212 ###########################################################################
1905 5536 50       9780  
    0          
1906 5536         12130 my ($self, $filename) = @_;
1907              
1908             unlink $filename;
1909 0         0 }
1910              
1911             my ($self, $sourcefilename, $targetfilename) = @_;
1912 5536 100       11523  
    50          
1913 3134   50     11099 return 0 unless (rename($sourcefilename, $targetfilename));
1914              
1915             return 1;
1916             }
1917              
1918 2402         2862  
1919 2402 50       3354 1;
    0