File Coverage

blib/lib/Mail/SpamAssassin/Bayes/CombineChi.pm
Criterion Covered Total %
statement 50 54 92.5
branch 6 10 60.0
condition n/a
subroutine 8 8 100.0
pod n/a
total 64 72 88.8


line stmt bran cond sub pod time code
1             # Chi-square probability combining and related constants.
2             #
3             # <@LICENSE>
4             # Licensed to the Apache Software Foundation (ASF) under one or more
5             # contributor license agreements. See the NOTICE file distributed with
6             # this work for additional information regarding copyright ownership.
7             # The ASF licenses this file to you under the Apache License, Version 2.0
8             # (the "License"); you may not use this file except in compliance with
9             # the License. You may obtain a copy of the License at:
10             #
11             # http://www.apache.org/licenses/LICENSE-2.0
12             #
13             # Unless required by applicable law or agreed to in writing, software
14             # distributed under the License is distributed on an "AS IS" BASIS,
15             # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16             # See the License for the specific language governing permissions and
17             # limitations under the License.
18             # </@LICENSE>
19              
20 21     21   149 use strict; # make Test::Perl::Critic happy
  21         47  
  21         1359  
21              
22             # this package is a no-op; the real impl code is in another pkg.
23             package Mail::SpamAssassin::Bayes::CombineChi; 1;
24              
25             # Force into another package, so our symbols will appear in that namespace with
26             # no indirection, for speed. Other combiners must do the same, since Bayes.pm
27             # uses this namespace directly. This means only one combiner can be loaded at
28             # any time.
29             package Mail::SpamAssassin::Bayes::Combine;
30              
31 21     21   145 use strict;
  21         51  
  21         490  
32 21     21   108 use warnings;
  21         65  
  21         721  
33             # use bytes;
34 21     21   133 use re 'taint';
  21         52  
  21         908  
35              
36 21     21   141 use POSIX qw(frexp);
  21         42  
  21         462  
37 21     21   2776 use constant LN2 => log(2);
  21         46  
  21         9119  
38              
39             # Value for 'x' in Gary Robinson's f(w) equation.
40             # "Let x = the number used when n [hits] is 0."
41             our $FW_X_CONSTANT = 0.538;
42              
43             # Value for 's' in the f(w) equation. 'We can see s as the "strength" (hence
44             # the use of "s") of an original assumed expectation ... relative to how
45             # strongly we want to consider our actual collected data.' A low 's' means
46             # the collected data is trusted more strongly.
47             our $FW_S_CONSTANT = 0.030;
48              
49             # Precomputed product (s * x) for the f(w) equation.
50             our $FW_S_DOT_X = ($FW_X_CONSTANT * $FW_S_CONSTANT);
51              
52             # Minimum probability strength (MPS), for ignoring tokens with probs very close to the middle ground (.5):
53             # tokens need to be outside the [ .5-MPS, .5+MPS ] range to be used (the check itself is not in this file).
54             our $MIN_PROB_STRENGTH = 0.346;
55              
56             ###########################################################################
57              
58             # Chi-squared combiner: combine($ns, $nn, $sortedref). $sortedref is a ref to an array of token probabilities; $ns and $nn are message counts (presumably spam and non-spam -- confirm against the caller).
59             # Returns (($S - $H) + 1) / 2, a mostly boolean result but with a grey area; returns nothing when the array is empty or when $ns + $nn is 0.
60             sub combine {
61 4     4   15 my ($ns, $nn, $sortedref) = @_;
62              
63             # @$sortedref contains an array of the probabilities
64 4         17 my $wc = scalar @$sortedref;  # number of probabilities supplied
65 4 50       13 return unless $wc;  # nothing to combine
66              
67 4         15 my ($H, $S);
68 4         0 my ($Hexp, $Sexp);
69 4         13 $Hexp = $Sexp = 0;  # binary exponents split off by frexp() in the loop below
70              
71             # see bug 3118; the check below also keeps the two divisions from dividing by zero
72 4         18 my $totmsgs = ($ns + $nn);
73 4 50       12 if ($totmsgs == 0) { return; }
  0         0  
74 4         10 $S = ($ns / $totmsgs);  # starting factor: $ns as a fraction of all messages
75 4         10 $H = ($nn / $totmsgs);  # starting factor: $nn as a fraction of all messages
76              
77 4         20 foreach my $prob (@$sortedref) {
78 282         352 $S *= 1.0 - $prob;  # running product of (1 - prob)
79 282         339 $H *= $prob;  # running product of prob
80 282 50       491 if ($S < 1e-200) {  # product is getting tiny: keep the mantissa, bank the exponent
81 0         0 my $e;
82 0         0 ($S, $e) = frexp($S);  # frexp() returns (mantissa, binary exponent)
83 0         0 $Sexp += $e;
84             }
85 282 100       584 if ($H < 1e-200) {  # same rescaling for the $H product
86 2         14 my $e;
87 2         17 ($H, $e) = frexp($H);
88 2         7 $Hexp += $e;
89             }
90             }
91              
92 4         24 $S = log($S) + $Sexp * LN2;  # natural log of the full product: log(mantissa) + exponent * ln(2). NOTE(review): if a prob is exactly 1 the product is 0 and log() would die -- presumably callers never pass 0 or 1; confirm
93 4         12 $H = log($H) + $Hexp * LN2;  # likewise for $H (a prob of exactly 0 would zero this product)
94              
95             # note: chi2q() takes half the degrees of freedom ($halfv) as its second
96             # argument, so $wc is supplied directly. Previous versions passed (2 * $wc)
97             # as ($v), but chi2q() then just used ($v/2) internally, so the doubling and
98             # halving were redundant. The side-effect is that chi2q() uses a different
99             # API now, but it's only used here anyway.
100              
101 4         22 $S = 1.0 - chi2q(-2.0 * $S, $wc);
102 4         30 $H = 1.0 - chi2q(-2.0 * $H, $wc);
103 4         20 return (($S - $H) + 1.0) / 2.0;  # maps ($S - $H) from [-1, 1] onto [0, 1]
104             }
105              
106             # Chi-squared function: chi2q($x2, $halfv) returns exp(-m) * sum(m**i / i!) for i = 0 .. $halfv-1, where m = $x2 / 2, capped at 1.0. The second argument is HALF the degrees of freedom (an earlier API took $v and halved it internally).
107             sub chi2q {
108 8     8   21 my ($x2, $halfv) = @_;
109              
110 8         19 my $m = $x2 / 2.0;
111 8         17 my ($sum, $term);
112 8         59 $sum = $term = exp(0 - $m);  # i = 0 term: exp(-m). NOTE(review): for very large $m this presumably underflows to 0, making the whole sum 0 -- confirm that is acceptable
113            
114             # replace 'for my $i (1 .. (($v/2)-1))' idiom, which creates a temp
115             # array, with a plain C-style for loop ($halfv plays the role of $v/2)
116 8         26 my $i;
117 8         27 for ($i = 1; $i < $halfv; $i++) {
118 556         729 $term *= $m / $i;  # term is now exp(-m) * m**i / i!
119 556         992 $sum += $term;
120             }
121 8 50       29 return $sum < 1.0 ? $sum : 1.0;  # never return more than 1.0
122             }
123              
124             1;