| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
# Chi-square probability combining and related constants. |
|
2
|
|
|
|
|
|
|
# |
|
3
|
|
|
|
|
|
|
# <@LICENSE> |
|
4
|
|
|
|
|
|
|
# Licensed to the Apache Software Foundation (ASF) under one or more |
|
5
|
|
|
|
|
|
|
# contributor license agreements. See the NOTICE file distributed with |
|
6
|
|
|
|
|
|
|
# this work for additional information regarding copyright ownership. |
|
7
|
|
|
|
|
|
|
# The ASF licenses this file to you under the Apache License, Version 2.0 |
|
8
|
|
|
|
|
|
|
# (the "License"); you may not use this file except in compliance with |
|
9
|
|
|
|
|
|
|
# the License. You may obtain a copy of the License at: |
|
10
|
|
|
|
|
|
|
# |
|
11
|
|
|
|
|
|
|
# http://www.apache.org/licenses/LICENSE-2.0 |
|
12
|
|
|
|
|
|
|
# |
|
13
|
|
|
|
|
|
|
# Unless required by applicable law or agreed to in writing, software |
|
14
|
|
|
|
|
|
|
# distributed under the License is distributed on an "AS IS" BASIS, |
|
15
|
|
|
|
|
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|
16
|
|
|
|
|
|
|
# See the License for the specific language governing permissions and |
|
17
|
|
|
|
|
|
|
# limitations under the License. |
|
18
|
|
|
|
|
|
|
# </@LICENSE> |
|
19
|
|
|
|
|
|
|
|
|
20
|
22
|
|
|
22
|
|
140
|
use strict; # make Test::Perl::Critic happy |
|
|
22
|
|
|
|
|
50
|
|
|
|
22
|
|
|
|
|
1158
|
|
|
21
|
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
# this package is a no-op; the real impl code is in another pkg. |
|
23
|
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
# Force into another package, so our symbols will appear in that namespace with |
|
25
|
|
|
|
|
|
|
# no indirection, for speed. Other combiners must do the same, since Bayes.pm |
|
26
|
|
|
|
|
|
|
# uses this namespace directly. This means only one combiner can be loaded at |
|
27
|
|
|
|
|
|
|
# any time. |
|
28
|
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
use strict; |
|
30
|
|
|
|
|
|
|
use warnings; |
|
31
|
22
|
|
|
22
|
|
122
|
# use bytes; |
|
|
22
|
|
|
|
|
47
|
|
|
|
22
|
|
|
|
|
476
|
|
|
32
|
22
|
|
|
22
|
|
128
|
use re 'taint'; |
|
|
22
|
|
|
|
|
42
|
|
|
|
22
|
|
|
|
|
721
|
|
|
33
|
|
|
|
|
|
|
|
|
34
|
22
|
|
|
22
|
|
117
|
use POSIX qw(frexp); |
|
|
22
|
|
|
|
|
44
|
|
|
|
22
|
|
|
|
|
868
|
|
|
35
|
|
|
|
|
|
|
use constant LN2 => log(2); |
|
36
|
22
|
|
|
22
|
|
155
|
|
|
|
22
|
|
|
|
|
51
|
|
|
|
22
|
|
|
|
|
439
|
|
|
37
|
22
|
|
|
22
|
|
2641
|
# Value for 'x' in Gary Robinson's f(w) equation. |
|
|
22
|
|
|
|
|
41
|
|
|
|
22
|
|
|
|
|
8709
|
|
|
38
|
|
|
|
|
|
|
# "Let x = the number used when n [hits] is 0." |
|
39
|
|
|
|
|
|
|
our $FW_X_CONSTANT = 0.538; |
|
40
|
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
# Value for 's' in the f(w) equation. "We can see s as the "strength" (hence |
|
42
|
|
|
|
|
|
|
# the use of "s") of an original assumed expectation ... relative to how |
|
43
|
|
|
|
|
|
|
# strongly we want to consider our actual collected data." Low 's' means |
|
44
|
|
|
|
|
|
|
# trust collected data more strongly. |
|
45
|
|
|
|
|
|
|
our $FW_S_CONSTANT = 0.030; |
|
46
|
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
# (s . x) for the f(w) equation. |
|
48
|
|
|
|
|
|
|
our $FW_S_DOT_X = ($FW_X_CONSTANT * $FW_S_CONSTANT); |
|
49
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
# Should we ignore tokens with probs very close to the middle ground (.5)? |
|
51
|
|
|
|
|
|
|
# tokens need to be outside the [ .5-MPS, .5+MPS ] range to be used. |
|
52
|
|
|
|
|
|
|
our $MIN_PROB_STRENGTH = 0.346; |
|
53
|
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
########################################################################### |
|
55
|
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
# Chi-Squared method. Produces mostly boolean $result, |
|
57
|
|
|
|
|
|
|
# but with a grey area. |
|
58
|
|
|
|
|
|
|
my ($ns, $nn, $sortedref) = @_; |
|
59
|
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
# @$sortedref contains an array of the probabilities |
|
61
|
4
|
|
|
4
|
|
16
|
my $wc = scalar @$sortedref; |
|
62
|
|
|
|
|
|
|
return unless $wc; |
|
63
|
|
|
|
|
|
|
|
|
64
|
4
|
|
|
|
|
8
|
my ($H, $S); |
|
65
|
4
|
50
|
|
|
|
10
|
my ($Hexp, $Sexp); |
|
66
|
|
|
|
|
|
|
$Hexp = $Sexp = 0; |
|
67
|
4
|
|
|
|
|
12
|
|
|
68
|
4
|
|
|
|
|
0
|
# see bug 3118 |
|
69
|
4
|
|
|
|
|
10
|
my $totmsgs = ($ns + $nn); |
|
70
|
|
|
|
|
|
|
if ($totmsgs == 0) { return; } |
|
71
|
|
|
|
|
|
|
$S = ($ns / $totmsgs); |
|
72
|
4
|
|
|
|
|
10
|
$H = ($nn / $totmsgs); |
|
73
|
4
|
50
|
|
|
|
12
|
|
|
|
0
|
|
|
|
|
0
|
|
|
74
|
4
|
|
|
|
|
9
|
foreach my $prob (@$sortedref) { |
|
75
|
4
|
|
|
|
|
8
|
$S *= 1.0 - $prob; |
|
76
|
|
|
|
|
|
|
$H *= $prob; |
|
77
|
4
|
|
|
|
|
19
|
if ($S < 1e-200) { |
|
78
|
282
|
|
|
|
|
283
|
my $e; |
|
79
|
282
|
|
|
|
|
265
|
($S, $e) = frexp($S); |
|
80
|
282
|
50
|
|
|
|
362
|
$Sexp += $e; |
|
81
|
0
|
|
|
|
|
0
|
} |
|
82
|
0
|
|
|
|
|
0
|
if ($H < 1e-200) { |
|
83
|
0
|
|
|
|
|
0
|
my $e; |
|
84
|
|
|
|
|
|
|
($H, $e) = frexp($H); |
|
85
|
282
|
100
|
|
|
|
632
|
$Hexp += $e; |
|
86
|
2
|
|
|
|
|
7
|
} |
|
87
|
2
|
|
|
|
|
23
|
} |
|
88
|
2
|
|
|
|
|
7
|
|
|
89
|
|
|
|
|
|
|
$S = log($S) + $Sexp * LN2; |
|
90
|
|
|
|
|
|
|
$H = log($H) + $Hexp * LN2; |
|
91
|
|
|
|
|
|
|
|
|
92
|
4
|
|
|
|
|
33
|
# note: previous versions used (2 * $wc) as second arg ($v), but the chi2q() |
|
93
|
4
|
|
|
|
|
10
|
# fn then just used ($v/2) internally! changed to simply supply $wc as |
|
94
|
|
|
|
|
|
|
# ($halfv) directly instead to avoid redundant doubling and halving. The |
|
95
|
|
|
|
|
|
|
# side-effect is that chi2q() uses a different API now, but it's only used |
|
96
|
|
|
|
|
|
|
# here anyway. |
|
97
|
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
$S = 1.0 - chi2q(-2.0 * $S, $wc); |
|
99
|
|
|
|
|
|
|
$H = 1.0 - chi2q(-2.0 * $H, $wc); |
|
100
|
|
|
|
|
|
|
return (($S - $H) + 1.0) / 2.0; |
|
101
|
4
|
|
|
|
|
24
|
} |
|
102
|
4
|
|
|
|
|
13
|
|
|
103
|
4
|
|
|
|
|
19
|
# Chi-squared function (API changed; see comment above) |
|
104
|
|
|
|
|
|
|
my ($x2, $halfv) = @_; |
|
105
|
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
my $m = $x2 / 2.0; |
|
107
|
|
|
|
|
|
|
my ($sum, $term); |
|
108
|
8
|
|
|
8
|
|
14
|
$sum = $term = exp(0 - $m); |
|
109
|
|
|
|
|
|
|
|
|
110
|
8
|
|
|
|
|
16
|
# replace 'for my $i (1 .. (($v/2)-1))' idiom, which creates a temp |
|
111
|
8
|
|
|
|
|
10
|
# array, with a plain C-style for loop |
|
112
|
8
|
|
|
|
|
48
|
my $i; |
|
113
|
|
|
|
|
|
|
for ($i = 1; $i < $halfv; $i++) { |
|
114
|
|
|
|
|
|
|
$term *= $m / $i; |
|
115
|
|
|
|
|
|
|
$sum += $term; |
|
116
|
8
|
|
|
|
|
19
|
} |
|
117
|
8
|
|
|
|
|
282
|
return $sum < 1.0 ? $sum : 1.0; |
|
118
|
556
|
|
|
|
|
536
|
} |
|
119
|
556
|
|
|
|
|
754
|
|
|
120
|
|
|
|
|
|
|
1; |