File Coverage

blib/lib/Mail/SpamAssassin/Bayes/CombineChi.pm
Criterion Covered Total %
statement 50 54 92.5
branch 7 10 70.0
condition n/a
subroutine 8 8 100.0
pod n/a
total 65 72 90.2


line stmt bran cond sub pod time code
1             # Chi-square probability combining and related constants.
2             #
3             # <@LICENSE>
4             # Licensed to the Apache Software Foundation (ASF) under one or more
5             # contributor license agreements. See the NOTICE file distributed with
6             # this work for additional information regarding copyright ownership.
7             # The ASF licenses this file to you under the Apache License, Version 2.0
8             # (the "License"); you may not use this file except in compliance with
9             # the License. You may obtain a copy of the License at:
10             #
11             # http://www.apache.org/licenses/LICENSE-2.0
12             #
13             # Unless required by applicable law or agreed to in writing, software
14             # distributed under the License is distributed on an "AS IS" BASIS,
15             # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16             # See the License for the specific language governing permissions and
17             # limitations under the License.
18             # </@LICENSE>
19              
20 22     22   140 use strict; # make Test::Perl::Critic happy
  22         50  
  22         1158  
21              
22             # this package is a no-op; the real impl code is in another pkg.
23              
24             # Force into another package, so our symbols will appear in that namespace with
25             # no indirection, for speed. Other combiners must do the same, since Bayes.pm
26             # uses this namespace directly. This means only one combiner can be loaded at
27             # any time.
28              
29             use strict;
30             use warnings;
31 22     22   122 # use bytes;
  22         47  
  22         476  
32 22     22   128 use re 'taint';
  22         42  
  22         721  
33              
34 22     22   117 use POSIX qw(frexp);
  22         44  
  22         868  
35             use constant LN2 => log(2);
36 22     22   155  
  22         51  
  22         439  
37 22     22   2641 # Value for 'x' in Gary Robinson's f(w) equation.
  22         41  
  22         8709  
38             # "Let x = the number used when n [hits] is 0."
39             our $FW_X_CONSTANT = 0.538;
40              
41             # Value for 's' in the f(w) equation. "We can see s as the 'strength' (hence
42             # the use of 's') of an original assumed expectation ... relative to how
43             # strongly we want to consider our actual collected data." A low 's' means
44             # the collected data is trusted more strongly.
45             our $FW_S_CONSTANT = 0.030;
46              
47             # (s . x) for the f(w) equation.
48             our $FW_S_DOT_X = ($FW_X_CONSTANT * $FW_S_CONSTANT);
49              
50             # Should we ignore tokens with probabilities very close to the middle ground (.5)?
51             # Tokens must lie outside [ .5-MPS, .5+MPS ] (MPS = $MIN_PROB_STRENGTH) to be used.
52             our $MIN_PROB_STRENGTH = 0.346;
53              
54             ###########################################################################
55              
56             # Chi-squared method. Produces a mostly boolean result (the returned value),
57             # but with a grey area in between.
58             my ($ns, $nn, $sortedref) = @_;
59              
60             # @$sortedref contains an array of the probabilities
61 4     4   16 my $wc = scalar @$sortedref;
62             return unless $wc;
63              
64 4         8 my ($H, $S);
65 4 50       10 my ($Hexp, $Sexp);
66             $Hexp = $Sexp = 0;
67 4         12  
68 4         0 # see bug 3118
69 4         10 my $totmsgs = ($ns + $nn);
70             if ($totmsgs == 0) { return; }
71             $S = ($ns / $totmsgs);
72 4         10 $H = ($nn / $totmsgs);
73 4 50       12  
  0         0  
74 4         9 foreach my $prob (@$sortedref) {
75 4         8 $S *= 1.0 - $prob;
76             $H *= $prob;
77 4         19 if ($S < 1e-200) {
78 282         283 my $e;
79 282         265 ($S, $e) = frexp($S);
80 282 50       362 $Sexp += $e;
81 0         0 }
82 0         0 if ($H < 1e-200) {
83 0         0 my $e;
84             ($H, $e) = frexp($H);
85 282 100       632 $Hexp += $e;
86 2         7 }
87 2         23 }
88 2         7  
89             $S = log($S) + $Sexp * LN2;
90             $H = log($H) + $Hexp * LN2;
91              
92 4         33 # Note: previous versions passed (2 * $wc) as the second arg ($v), but chi2q()
93 4         10 # then just used ($v/2) internally. We now simply supply $wc as ($halfv)
94             # directly, avoiding the redundant doubling and halving. The side-effect
95             # is that chi2q() has a different API now, but it is only used here
96             # anyway.
97              
98             $S = 1.0 - chi2q(-2.0 * $S, $wc);
99             $H = 1.0 - chi2q(-2.0 * $H, $wc);
100             return (($S - $H) + 1.0) / 2.0;
101 4         24 }
102 4         13  
103 4         19 # Chi-squared function (API changed; see comment above)
104             my ($x2, $halfv) = @_;
105              
106             my $m = $x2 / 2.0;
107             my ($sum, $term);
108 8     8   14 $sum = $term = exp(0 - $m);
109            
110 8         16 # replace 'for my $i (1 .. (($v/2)-1))' idiom, which creates a temp
111 8         10 # array, with a plain C-style for loop
112 8         48 my $i;
113             for ($i = 1; $i < $halfv; $i++) {
114             $term *= $m / $i;
115             $sum += $term;
116 8         19 }
117 8         282 return $sum < 1.0 ? $sum : 1.0;
118 556         536 }
119 556         754  
120             1;