| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package Algorithm::Classifier::IsolationForest; |
|
2
|
|
|
|
|
|
|
|
|
3
|
8
|
|
|
8
|
|
703549
|
use strict; |
|
|
8
|
|
|
|
|
16
|
|
|
|
8
|
|
|
|
|
308
|
|
|
4
|
8
|
|
|
8
|
|
32
|
use warnings; |
|
|
8
|
|
|
|
|
27
|
|
|
|
8
|
|
|
|
|
400
|
|
|
5
|
8
|
|
|
8
|
|
40
|
use Carp qw(croak); |
|
|
8
|
|
|
|
|
13
|
|
|
|
8
|
|
|
|
|
378
|
|
|
6
|
8
|
|
|
8
|
|
34
|
use List::Util qw(min); |
|
|
8
|
|
|
|
|
18
|
|
|
|
8
|
|
|
|
|
490
|
|
|
7
|
8
|
|
|
8
|
|
3543
|
use POSIX qw(ceil); |
|
|
8
|
|
|
|
|
59001
|
|
|
|
8
|
|
|
|
|
38
|
|
|
8
|
8
|
|
|
8
|
|
15930
|
use JSON::PP (); |
|
|
8
|
|
|
|
|
128821
|
|
|
|
8
|
|
|
|
|
246
|
|
|
9
|
8
|
|
|
8
|
|
4048
|
use File::Slurp qw(read_file write_file); |
|
|
8
|
|
|
|
|
156552
|
|
|
|
8
|
|
|
|
|
687
|
|
|
10
|
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
our $VERSION = '0.0.1'; |
|
12
|
|
|
|
|
|
|
|
|
13
|
8
|
|
|
8
|
|
59
|
use constant EULER => 0.5772156649015329; |
|
|
8
|
|
|
|
|
17
|
|
|
|
8
|
|
|
|
|
442
|
|
|
14
|
8
|
|
|
8
|
|
36
|
use constant TWO_PI => 6.283185307179586; |
|
|
8
|
|
|
|
|
11
|
|
|
|
8
|
|
|
|
|
18447
|
|
|
15
|
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
=head1 NAME |
|
17
|
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
Algorithm::Classifier::IsolationForest - unsupervised anomaly detection via Isolation Forest or Extended Isolation Forest |
|
19
|
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
=head1 SYNOPSIS |
|
21
|
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
use Algorithm::Classifier::IsolationForest; |
|
23
|
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
my @data = ([0.1, -0.2], [0.0, 0.1], [5.0, 6.0], ...); |
|
25
|
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
# Classic, axis-parallel Isolation Forest |
|
27
|
|
|
|
|
|
|
my $iforest = Algorithm::Classifier::IsolationForest->new( |
|
28
|
|
|
|
|
|
|
n_trees => 100, |
|
29
|
|
|
|
|
|
|
sample_size => 256, |
|
30
|
|
|
|
|
|
|
seed => 42, |
|
31
|
|
|
|
|
|
|
); |
|
32
|
|
|
|
|
|
|
$iforest->fit(\@data); |
|
33
|
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
my $scores = $iforest->score_samples(\@data); # arrayref, each in (0,1] |
|
35
|
|
|
|
|
|
|
my $flags = $iforest->predict(\@data, 0.6); # arrayref of 0/1 |
|
36
|
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
# Save and reload |
|
38
|
|
|
|
|
|
|
$iforest->save('model.json'); |
|
39
|
|
|
|
|
|
|
my $reloaded = Algorithm::Classifier::IsolationForest->load('model.json'); |
|
40
|
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
# Extended Isolation Forest (oblique hyperplane splits) |
|
42
|
|
|
|
|
|
|
my $eif = IsolationForest->new(mode => 'extended', seed => 42); |
|
43
|
|
|
|
|
|
|
$eif->fit(\@data); |
|
44
|
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
=head1 DESCRIPTION |
|
46
|
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
Isolation Forest (Liu, Fei Tony & Ting, Kai & Zhou, Zhi-Hua, 2008) detects anomalies by random |
|
48
|
|
|
|
|
|
|
partitioning rather than by modelling normal points. Each tree repeatedly |
|
49
|
|
|
|
|
|
|
splits the data. Points that get isolated after only a few splits are likely |
|
50
|
|
|
|
|
|
|
anomalies. The score is the average isolation depth across many trees, |
|
51
|
|
|
|
|
|
|
normalised so values approach 1 for anomalies and stay below 0.5 for normal |
|
52
|
|
|
|
|
|
|
points. |
|
53
|
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
In extended mode the module implements the Extended Isolation Forest |
|
55
|
|
|
|
|
|
|
variant. Each split is a random hyperplane instead of an axis-aligned cut, |
|
56
|
|
|
|
|
|
|
which removes the rectangular, axis-aligned bias in the score field and |
|
57
|
|
|
|
|
|
|
tends to help on elongated or multi-modal data. |
|
58
|
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
psi refernced below is ψ or the pitchfork math symbol refrenced in paper, |
|
60
|
|
|
|
|
|
|
Liu, Fei Tony & Ting, Kai & Zhou, Zhi-Hua. (2008). Isolation Forest. 413 - 422. 10.1109/ICDM.2008.17. |
|
61
|
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
... or max samples. |
|
63
|
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
L |
|
65
|
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
=head1 GENERAL METHODS |
|
67
|
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
=head2 new(%args) |
|
69
|
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
Inits the object. |
|
71
|
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
- n_trees :: number of isolation trees in the ensemble |
|
73
|
|
|
|
|
|
|
default :: 100 |
|
74
|
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
- sample_size :: sub-sample size used to build each tree... max samples |
|
76
|
|
|
|
|
|
|
default :: 256 |
|
77
|
|
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
- max_depth :: per-tree height limit... if not defined is set to ceil(log2(psi)) |
|
79
|
|
|
|
|
|
|
default :: undef |
|
80
|
|
|
|
|
|
|
|
|
81
|
|
|
|
|
|
|
- seed :: optional integer to seed srand with for reproducible trees... |
|
82
|
|
|
|
|
|
|
see perldoc -f srand for more info. This number is processed via abs(int()). |
|
83
|
|
|
|
|
|
|
default :: undef |
|
84
|
|
|
|
|
|
|
|
|
85
|
|
|
|
|
|
|
- mode :: if it should be IF or EIF |
|
86
|
|
|
|
|
|
|
axis :: classic axis-parallel splits (IF) |
|
87
|
|
|
|
|
|
|
extended :: oblique hyperplane splits (EIF) |
|
88
|
|
|
|
|
|
|
default :: axis |
|
89
|
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
- extension_level :: extended mode only... how many features take partin each |
|
91
|
|
|
|
|
|
|
split. 0 behaves like a single-feature (axis) cut; the |
|
92
|
|
|
|
|
|
|
maximum (n_features - 1) uses every varying feature. undef |
|
93
|
|
|
|
|
|
|
=> maximum. Clamped to [0, n_features - 1] at fit time. |
|
94
|
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
- contamination :: expected fraction of anomalies, in (0, 0.5]. When given, |
|
96
|
|
|
|
|
|
|
fit() learns a score threshold that flags this fraction of |
|
97
|
|
|
|
|
|
|
the training set, and predict() uses it by default. undef |
|
98
|
|
|
|
|
|
|
=> no learned threshold (predict() falls back to 0.5). |
|
99
|
|
|
|
|
|
|
default :: undef |
|
100
|
|
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
Note: log2 under Perl is as below... |
|
102
|
|
|
|
|
|
|
|
|
103
|
|
|
|
|
|
|
log($psi) / log(2) |
|
104
|
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
=cut |
|
106
|
|
|
|
|
|
|
|
|
107
|
|
|
|
|
|
|
sub new { |
|
108
|
39
|
|
|
39
|
1
|
937750
|
my ( $class, %args ) = @_; |
|
109
|
|
|
|
|
|
|
|
|
110
|
39
|
|
100
|
|
|
193
|
my $mode = $args{mode} // 'axis'; |
|
111
|
39
|
100
|
100
|
|
|
331
|
croak "mode must be 'axis' or 'extended'" |
|
112
|
|
|
|
|
|
|
unless $mode eq 'axis' || $mode eq 'extended'; |
|
113
|
|
|
|
|
|
|
|
|
114
|
38
|
100
|
|
|
|
99
|
if ( defined( $args{seed} ) ) { |
|
115
|
20
|
|
|
|
|
63
|
$args{seed} = abs( int( $args{seed} ) ); |
|
116
|
|
|
|
|
|
|
} |
|
117
|
|
|
|
|
|
|
|
|
118
|
|
|
|
|
|
|
my $self = { |
|
119
|
|
|
|
|
|
|
n_trees => $args{n_trees} // 100, |
|
120
|
|
|
|
|
|
|
sample_size => $args{sample_size} // 256, |
|
121
|
|
|
|
|
|
|
max_depth => $args{max_depth}, # undef => auto |
|
122
|
|
|
|
|
|
|
seed => $args{seed}, # undef => non-deterministic |
|
123
|
|
|
|
|
|
|
mode => $mode, |
|
124
|
|
|
|
|
|
|
extension_level => $args{extension_level}, # undef => max, resolved in fit() |
|
125
|
|
|
|
|
|
|
contamination => $args{contamination}, # undef => no learned threshold |
|
126
|
38
|
|
100
|
|
|
442
|
threshold => undef, # learned in fit() if contamination set |
|
|
|
|
100
|
|
|
|
|
|
127
|
|
|
|
|
|
|
trees => [], |
|
128
|
|
|
|
|
|
|
c_psi => undef, # c(psi), set during fit() |
|
129
|
|
|
|
|
|
|
n_features => undef, |
|
130
|
|
|
|
|
|
|
}; |
|
131
|
|
|
|
|
|
|
|
|
132
|
38
|
100
|
|
|
|
189
|
croak "n_trees must be >= 1" unless $self->{n_trees} >= 1; |
|
133
|
37
|
100
|
|
|
|
161
|
croak "sample_size must be >= 1" unless $self->{sample_size} >= 1; |
|
134
|
|
|
|
|
|
|
croak "extension_level must be >= 0" |
|
135
|
36
|
100
|
100
|
|
|
190
|
if defined $self->{extension_level} && $self->{extension_level} < 0; |
|
136
|
|
|
|
|
|
|
croak "contamination must be a number in (0, 0.5]" |
|
137
|
|
|
|
|
|
|
if defined $self->{contamination} |
|
138
|
35
|
100
|
100
|
|
|
392
|
&& !( $self->{contamination} > 0 && $self->{contamination} <= 0.5 ); |
|
|
|
|
100
|
|
|
|
|
|
139
|
|
|
|
|
|
|
|
|
140
|
32
|
|
|
|
|
100
|
return bless $self, $class; |
|
141
|
|
|
|
|
|
|
} ## end sub new |
|
142
|
|
|
|
|
|
|
|
|
143
|
|
|
|
|
|
|
=head2 decision_threshold |
|
144
|
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
The score cutoff C uses by default; undef unless C was |
|
146
|
|
|
|
|
|
|
set. |
|
147
|
|
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
=cut |
|
149
|
|
|
|
|
|
|
|
|
150
|
6
|
|
|
6
|
1
|
2846
|
sub decision_threshold { return $_[0]->{threshold} } |
|
151
|
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
=head2 fit |
|
153
|
|
|
|
|
|
|
|
|
154
|
|
|
|
|
|
|
Trains the model on the specified data. |
|
155
|
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
The data taken is a array of arrays, with each sub array containing two numbers. |
|
157
|
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
@training_data = ( |
|
159
|
|
|
|
|
|
|
[ 3, 5 ], |
|
160
|
|
|
|
|
|
|
[ 2.3, 1 ], |
|
161
|
|
|
|
|
|
|
[ 5, 9 ], |
|
162
|
|
|
|
|
|
|
... |
|
163
|
|
|
|
|
|
|
); |
|
164
|
|
|
|
|
|
|
|
|
165
|
|
|
|
|
|
|
Below shows a example of building a gausing cluster and using that for training. |
|
166
|
|
|
|
|
|
|
|
|
167
|
|
|
|
|
|
|
# so it is reproducible |
|
168
|
|
|
|
|
|
|
srand(7); |
|
169
|
|
|
|
|
|
|
|
|
170
|
|
|
|
|
|
|
# build a gaussian cluster and add a handful out outliers... |
|
171
|
|
|
|
|
|
|
|
|
172
|
|
|
|
|
|
|
use constant PI => 3.14159265358979; |
|
173
|
|
|
|
|
|
|
sub gaussian { |
|
174
|
|
|
|
|
|
|
my ($mu, $sigma) = @_; |
|
175
|
|
|
|
|
|
|
my $u1 = rand() || 1e-12; |
|
176
|
|
|
|
|
|
|
my $u2 = rand(); |
|
177
|
|
|
|
|
|
|
my $z = sqrt(-2 * log($u1)) * cos(2 * PI * $u2); |
|
178
|
|
|
|
|
|
|
return $mu + $sigma * $z; |
|
179
|
|
|
|
|
|
|
} |
|
180
|
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
# add some normal items |
|
182
|
|
|
|
|
|
|
for (1 .. 500) { |
|
183
|
|
|
|
|
|
|
push @data, [ gaussian(0, 1), gaussian(0, 1) ]; |
|
184
|
|
|
|
|
|
|
push @truth, 0; |
|
185
|
|
|
|
|
|
|
} |
|
186
|
|
|
|
|
|
|
# add some outliers |
|
187
|
|
|
|
|
|
|
for (1 .. 20) { |
|
188
|
|
|
|
|
|
|
my $angle = rand() * 2 * PI; |
|
189
|
|
|
|
|
|
|
my $radius = 5 + rand() * 3; # distance 5..8 from the origin |
|
190
|
|
|
|
|
|
|
push @data, [ $radius * cos($angle), $radius * sin($angle) ]; |
|
191
|
|
|
|
|
|
|
push @truth, 1; |
|
192
|
|
|
|
|
|
|
} |
|
193
|
|
|
|
|
|
|
|
|
194
|
|
|
|
|
|
|
$iforest->fit(\@training_data); |
|
195
|
|
|
|
|
|
|
|
|
196
|
|
|
|
|
|
|
=cut |
|
197
|
|
|
|
|
|
|
|
|
198
|
|
|
|
|
|
|
sub fit { |
|
199
|
24
|
|
|
24
|
1
|
2452
|
my ( $self, $data ) = @_; |
|
200
|
|
|
|
|
|
|
|
|
201
|
24
|
100
|
100
|
|
|
446
|
croak "fit() expects a non-empty arrayref of samples" |
|
202
|
|
|
|
|
|
|
unless ref $data eq 'ARRAY' && @$data; |
|
203
|
|
|
|
|
|
|
croak "each sample must be an arrayref of features" |
|
204
|
21
|
100
|
100
|
|
|
170
|
unless ref $data->[0] eq 'ARRAY' && @{ $data->[0] }; |
|
|
20
|
|
|
|
|
189
|
|
|
205
|
|
|
|
|
|
|
|
|
206
|
19
|
|
|
|
|
37
|
my $n = scalar @$data; |
|
207
|
19
|
|
|
|
|
27
|
my $n_features = scalar @{ $data->[0] }; |
|
|
19
|
|
|
|
|
33
|
|
|
208
|
19
|
|
|
|
|
44
|
$self->{n_features} = $n_features; |
|
209
|
|
|
|
|
|
|
|
|
210
|
|
|
|
|
|
|
# The sub-sample cannot be larger than the data set itself. |
|
211
|
19
|
|
|
|
|
135
|
my $psi = min( $self->{sample_size}, $n ); |
|
212
|
19
|
|
|
|
|
80
|
$self->{c_psi} = _c($psi); |
|
213
|
19
|
|
|
|
|
54
|
$self->{psi_used} = $psi; |
|
214
|
|
|
|
|
|
|
|
|
215
|
|
|
|
|
|
|
# Resolve the extension level against the data's dimensionality. |
|
216
|
19
|
100
|
|
|
|
59
|
if ( $self->{mode} eq 'extended' ) { |
|
217
|
6
|
|
|
|
|
11
|
my $max_ext = $n_features - 1; |
|
218
|
|
|
|
|
|
|
my $ext |
|
219
|
|
|
|
|
|
|
= defined $self->{extension_level} |
|
220
|
|
|
|
|
|
|
? $self->{extension_level} |
|
221
|
6
|
100
|
|
|
|
13
|
: $max_ext; |
|
222
|
6
|
50
|
|
|
|
13
|
$ext = 0 if $ext < 0; |
|
223
|
6
|
100
|
|
|
|
12
|
$ext = $max_ext if $ext > $max_ext; |
|
224
|
6
|
|
|
|
|
19
|
$self->{extension_level_used} = $ext; |
|
225
|
|
|
|
|
|
|
} else { |
|
226
|
13
|
|
|
|
|
28
|
$self->{extension_level_used} = undef; |
|
227
|
|
|
|
|
|
|
} |
|
228
|
|
|
|
|
|
|
|
|
229
|
|
|
|
|
|
|
# Height limit: the average tree height ceil(log2(psi)). Past this depth the |
|
230
|
|
|
|
|
|
|
# remaining points are scored using the c(size) adjustment instead. |
|
231
|
|
|
|
|
|
|
my $limit |
|
232
|
|
|
|
|
|
|
= defined $self->{max_depth} |
|
233
|
|
|
|
|
|
|
? $self->{max_depth} |
|
234
|
19
|
50
|
|
|
|
162
|
: ceil( log($psi) / log(2) ); |
|
235
|
19
|
50
|
|
|
|
45
|
$limit = 1 if $limit < 1; |
|
236
|
19
|
|
|
|
|
37
|
$self->{max_depth_used} = $limit; |
|
237
|
|
|
|
|
|
|
|
|
238
|
19
|
50
|
|
|
|
64
|
srand( $self->{seed} ) if defined $self->{seed}; |
|
239
|
|
|
|
|
|
|
|
|
240
|
19
|
|
|
|
|
26
|
my @trees; |
|
241
|
19
|
|
|
|
|
52
|
for ( 1 .. $self->{n_trees} ) { |
|
242
|
747
|
|
|
|
|
2120
|
my $sample = _subsample( $data, $psi ); |
|
243
|
747
|
|
|
|
|
1985
|
push @trees, $self->_build_tree( $sample, 0, $limit ); |
|
244
|
|
|
|
|
|
|
} |
|
245
|
19
|
|
|
|
|
77
|
$self->{trees} = \@trees; |
|
246
|
|
|
|
|
|
|
|
|
247
|
|
|
|
|
|
|
# If a contamination rate was requested, learn the score cutoff that flags |
|
248
|
|
|
|
|
|
|
# that fraction of the training set. We place the threshold midway between |
|
249
|
|
|
|
|
|
|
# the k-th and (k+1)-th highest training scores, so it sits in the gap |
|
250
|
|
|
|
|
|
|
# between flagged and unflagged points -- unambiguous and robust to the |
|
251
|
|
|
|
|
|
|
# tiny float rounding introduced by JSON serialisation. |
|
252
|
19
|
100
|
|
|
|
79
|
if ( defined $self->{contamination} ) { |
|
253
|
3
|
|
|
|
|
19
|
my $scores = $self->score_samples($data); |
|
254
|
3
|
|
|
|
|
41
|
my @desc = sort { $b <=> $a } @$scores; |
|
|
3979
|
|
|
|
|
4379
|
|
|
255
|
3
|
|
|
|
|
8
|
my $n_pts = scalar @desc; |
|
256
|
3
|
|
|
|
|
35
|
my $k = int( $self->{contamination} * $n_pts + 0.5 ); |
|
257
|
3
|
50
|
|
|
|
12
|
$k = 1 if $k < 1; |
|
258
|
3
|
50
|
|
|
|
9
|
$k = $n_pts if $k > $n_pts; |
|
259
|
3
|
50
|
|
|
|
40
|
$self->{threshold} = $k < $n_pts |
|
260
|
|
|
|
|
|
|
? ( $desc[ $k - 1 ] + $desc[$k] ) / 2.0 # midpoint of the boundary |
|
261
|
|
|
|
|
|
|
: $desc[ $n_pts - 1 ] - 1e-9; # k == n: flag everything |
|
262
|
|
|
|
|
|
|
} ## end if ( defined $self->{contamination} ) |
|
263
|
|
|
|
|
|
|
|
|
264
|
19
|
|
|
|
|
108
|
return $self; |
|
265
|
|
|
|
|
|
|
} ## end sub fit |
|
266
|
|
|
|
|
|
|
|
|
267
|
|
|
|
|
|
|
=head2 path_lengths(\@data) |
|
268
|
|
|
|
|
|
|
|
|
269
|
|
|
|
|
|
|
Returns the mean isolation depth per sample, for inspection. |
|
270
|
|
|
|
|
|
|
|
|
271
|
|
|
|
|
|
|
my @lenghts = $forest->path_lengths(\@data); |
|
272
|
|
|
|
|
|
|
|
|
273
|
|
|
|
|
|
|
print "x, y, length\n"; |
|
274
|
|
|
|
|
|
|
|
|
275
|
|
|
|
|
|
|
my $int=0; |
|
276
|
|
|
|
|
|
|
while (defined($data[$int])) { |
|
277
|
|
|
|
|
|
|
print $data[$int][0].', '.$data[$int][1].', '.$lenghts[$int]."\n"; |
|
278
|
|
|
|
|
|
|
|
|
279
|
|
|
|
|
|
|
$int++; |
|
280
|
|
|
|
|
|
|
} |
|
281
|
|
|
|
|
|
|
|
|
282
|
|
|
|
|
|
|
=cut |
|
283
|
|
|
|
|
|
|
|
|
284
|
|
|
|
|
|
|
sub path_lengths { |
|
285
|
2
|
|
|
2
|
1
|
4711
|
my ( $self, $data ) = @_; |
|
286
|
2
|
|
|
|
|
10
|
$self->_check_fitted; |
|
287
|
1
|
|
|
|
|
2
|
my $t = scalar @{ $self->{trees} }; |
|
|
1
|
|
|
|
|
2
|
|
|
288
|
1
|
|
|
|
|
2
|
my @out; |
|
289
|
1
|
|
|
|
|
3
|
for my $x (@$data) { |
|
290
|
100
|
|
|
|
|
124
|
my $sum = 0; |
|
291
|
100
|
|
|
|
|
109
|
$sum += $self->_path_length( $x, $_, 0 ) for @{ $self->{trees} }; |
|
|
100
|
|
|
|
|
213
|
|
|
292
|
100
|
|
|
|
|
283
|
push @out, $sum / $t; |
|
293
|
|
|
|
|
|
|
} |
|
294
|
1
|
|
|
|
|
11
|
return \@out; |
|
295
|
|
|
|
|
|
|
} ## end sub path_lengths |
|
296
|
|
|
|
|
|
|
|
|
297
|
|
|
|
|
|
|
=head predict(\@data, $threshold) |
|
298
|
|
|
|
|
|
|
|
|
299
|
|
|
|
|
|
|
Returns an arrayref of 0/1 labels for the specified data. |
|
300
|
|
|
|
|
|
|
|
|
301
|
|
|
|
|
|
|
If theshold is not specified it uses whatever the set default. |
|
302
|
|
|
|
|
|
|
|
|
303
|
|
|
|
|
|
|
my $results = $forest->predict(\@data, $threshold); |
|
304
|
|
|
|
|
|
|
|
|
305
|
|
|
|
|
|
|
print "x, y, result\n"; |
|
306
|
|
|
|
|
|
|
|
|
307
|
|
|
|
|
|
|
my $int=0; |
|
308
|
|
|
|
|
|
|
while (defined($data[$int])) { |
|
309
|
|
|
|
|
|
|
print $data[$int][0].', '.$data[$int][1].', '.$results->[$int]."\n"; |
|
310
|
|
|
|
|
|
|
|
|
311
|
|
|
|
|
|
|
$int++; |
|
312
|
|
|
|
|
|
|
} |
|
313
|
|
|
|
|
|
|
|
|
314
|
|
|
|
|
|
|
=cut |
|
315
|
|
|
|
|
|
|
|
|
316
|
|
|
|
|
|
|
sub predict { |
|
317
|
11
|
|
|
11
|
0
|
13460
|
my ( $self, $data, $threshold ) = @_; |
|
318
|
|
|
|
|
|
|
$threshold |
|
319
|
|
|
|
|
|
|
= defined $threshold ? $threshold |
|
320
|
|
|
|
|
|
|
: defined $self->{threshold} ? $self->{threshold} |
|
321
|
11
|
100
|
|
|
|
49
|
: 0.5; |
|
|
|
100
|
|
|
|
|
|
|
322
|
11
|
|
|
|
|
58
|
my $scores = $self->score_samples($data); |
|
323
|
10
|
100
|
|
|
|
79
|
return [ map { $_ >= $threshold ? 1 : 0 } @$scores ]; |
|
|
1107
|
|
|
|
|
1891
|
|
|
324
|
|
|
|
|
|
|
} |
|
325
|
|
|
|
|
|
|
|
|
326
|
|
|
|
|
|
|
=head2 score_samples(\@data) |
|
327
|
|
|
|
|
|
|
|
|
328
|
|
|
|
|
|
|
Returns an arrayref of anomaly scores, between 0 and 1. |
|
329
|
|
|
|
|
|
|
|
|
330
|
|
|
|
|
|
|
Scores near 1 are strong anomalies (isolated quickly). |
|
331
|
|
|
|
|
|
|
|
|
332
|
|
|
|
|
|
|
Scores well below 0.5 are normal. |
|
333
|
|
|
|
|
|
|
|
|
334
|
|
|
|
|
|
|
Scores ~0.5 means the points are hard to tell apart. |
|
335
|
|
|
|
|
|
|
|
|
336
|
|
|
|
|
|
|
my $scores = $forest->path_lengths(\@data); |
|
337
|
|
|
|
|
|
|
|
|
338
|
|
|
|
|
|
|
print "x, y, length\n"; |
|
339
|
|
|
|
|
|
|
|
|
340
|
|
|
|
|
|
|
my $int=0; |
|
341
|
|
|
|
|
|
|
while (defined($data[$int])) { |
|
342
|
|
|
|
|
|
|
print $data[$int][0].', '.$data[$int][1].', '.$scores->[$int]."\n"; |
|
343
|
|
|
|
|
|
|
|
|
344
|
|
|
|
|
|
|
$int++; |
|
345
|
|
|
|
|
|
|
} |
|
346
|
|
|
|
|
|
|
|
|
347
|
|
|
|
|
|
|
=cut |
|
348
|
|
|
|
|
|
|
|
|
349
|
|
|
|
|
|
|
sub score_samples { |
|
350
|
31
|
|
|
31
|
1
|
14616
|
my ( $self, $data ) = @_; |
|
351
|
31
|
|
|
|
|
125
|
$self->_check_fitted; |
|
352
|
29
|
|
|
|
|
66
|
my $c = $self->{c_psi}; |
|
353
|
29
|
|
|
|
|
45
|
my $t = scalar @{ $self->{trees} }; |
|
|
29
|
|
|
|
|
54
|
|
|
354
|
|
|
|
|
|
|
|
|
355
|
29
|
|
|
|
|
47
|
my @scores; |
|
356
|
29
|
|
|
|
|
61
|
for my $x (@$data) { |
|
357
|
3042
|
|
|
|
|
3818
|
my $sum = 0; |
|
358
|
3042
|
|
|
|
|
3520
|
$sum += $self->_path_length( $x, $_, 0 ) for @{ $self->{trees} }; |
|
|
3042
|
|
|
|
|
6648
|
|
|
359
|
3042
|
|
|
|
|
4436
|
my $avg = $sum / $t; |
|
360
|
3042
|
50
|
|
|
|
16148
|
push @scores, $c > 0 ? 2**( -$avg / $c ) : 0.5; |
|
361
|
|
|
|
|
|
|
} |
|
362
|
29
|
|
|
|
|
178
|
return \@scores; |
|
363
|
|
|
|
|
|
|
} ## end sub score_samples |
|
364
|
|
|
|
|
|
|
|
|
365
|
|
|
|
|
|
|
=head2 score_predict_samples |
|
366
|
|
|
|
|
|
|
|
|
367
|
|
|
|
|
|
|
Returns a array ref of arrays. First value of each sub array is the score with the second being |
|
368
|
|
|
|
|
|
|
0/1 for if it is a anomaly or not. |
|
369
|
|
|
|
|
|
|
|
|
370
|
|
|
|
|
|
|
my $results = $forest->predict(\@data, $threshold); |
|
371
|
|
|
|
|
|
|
|
|
372
|
|
|
|
|
|
|
print "x, y, score, result\n"; |
|
373
|
|
|
|
|
|
|
|
|
374
|
|
|
|
|
|
|
my $int=0; |
|
375
|
|
|
|
|
|
|
while (defined($data[$int])) { |
|
376
|
|
|
|
|
|
|
print $data[$int][0].', '.$data[$int][1].', '.$results->[$int][0].', '.$results->[$int][1]."\n"; |
|
377
|
|
|
|
|
|
|
|
|
378
|
|
|
|
|
|
|
$int++; |
|
379
|
|
|
|
|
|
|
} |
|
380
|
|
|
|
|
|
|
|
|
381
|
|
|
|
|
|
|
=cut |
|
382
|
|
|
|
|
|
|
|
|
383
|
|
|
|
|
|
|
sub score_predict_samples { |
|
384
|
0
|
|
|
0
|
1
|
0
|
my ( $self, $data, $threshold ) = @_; |
|
385
|
|
|
|
|
|
|
$threshold |
|
386
|
|
|
|
|
|
|
= defined $threshold ? $threshold |
|
387
|
|
|
|
|
|
|
: defined $self->{threshold} ? $self->{threshold} |
|
388
|
0
|
0
|
|
|
|
0
|
: 0.5; |
|
|
|
0
|
|
|
|
|
|
|
389
|
0
|
|
|
|
|
0
|
my $scores = $self->score_samples($data); |
|
390
|
|
|
|
|
|
|
|
|
391
|
0
|
|
|
|
|
0
|
my @to_return; |
|
392
|
0
|
|
|
|
|
0
|
foreach my $score ( @{$scores} ) { |
|
|
0
|
|
|
|
|
0
|
|
|
393
|
0
|
0
|
|
|
|
0
|
if ( $score >= $threshold ) { |
|
394
|
0
|
|
|
|
|
0
|
push @to_return, [ $score, 1 ]; |
|
395
|
|
|
|
|
|
|
} else { |
|
396
|
0
|
|
|
|
|
0
|
push @to_return, [ $score, 0 ]; |
|
397
|
|
|
|
|
|
|
} |
|
398
|
|
|
|
|
|
|
} |
|
399
|
|
|
|
|
|
|
|
|
400
|
0
|
|
|
|
|
0
|
return \@to_return; |
|
401
|
|
|
|
|
|
|
} ## end sub score_predict_samples |
|
402
|
|
|
|
|
|
|
|
|
403
|
|
|
|
|
|
|
=head1 MODEL SAVE/LOAD METHODS |
|
404
|
|
|
|
|
|
|
|
|
405
|
|
|
|
|
|
|
=head2 to_json |
|
406
|
|
|
|
|
|
|
|
|
407
|
|
|
|
|
|
|
Returns a JSON representation of the module. |
|
408
|
|
|
|
|
|
|
|
|
409
|
|
|
|
|
|
|
Required being fit having to be called. |
|
410
|
|
|
|
|
|
|
|
|
411
|
|
|
|
|
|
|
my $json = $iforest->to_json; |
|
412
|
|
|
|
|
|
|
|
|
413
|
|
|
|
|
|
|
=cut |
|
414
|
|
|
|
|
|
|
|
|
415
|
|
|
|
|
|
|
sub to_json { |
|
416
|
4
|
|
|
4
|
1
|
242396
|
my ($self) = @_; |
|
417
|
4
|
|
|
|
|
15
|
$self->_check_fitted; |
|
418
|
|
|
|
|
|
|
my $payload = { |
|
419
|
|
|
|
|
|
|
format => 'Algorithm::Classifier::IsolationForest', |
|
420
|
|
|
|
|
|
|
version => 0, |
|
421
|
|
|
|
|
|
|
params => { |
|
422
|
|
|
|
|
|
|
n_trees => $self->{n_trees}, |
|
423
|
|
|
|
|
|
|
sample_size => $self->{sample_size}, |
|
424
|
|
|
|
|
|
|
mode => $self->{mode}, |
|
425
|
|
|
|
|
|
|
extension_level => $self->{extension_level_used}, |
|
426
|
|
|
|
|
|
|
contamination => $self->{contamination}, |
|
427
|
|
|
|
|
|
|
threshold => $self->{threshold}, |
|
428
|
|
|
|
|
|
|
n_features => $self->{n_features}, |
|
429
|
|
|
|
|
|
|
psi_used => $self->{psi_used}, |
|
430
|
|
|
|
|
|
|
c_psi => $self->{c_psi}, |
|
431
|
|
|
|
|
|
|
max_depth_used => $self->{max_depth_used}, |
|
432
|
|
|
|
|
|
|
}, |
|
433
|
|
|
|
|
|
|
trees => $self->{trees}, |
|
434
|
3
|
|
|
|
|
32
|
}; |
|
435
|
3
|
|
|
|
|
24
|
return JSON::PP->new->canonical(1)->encode($payload); |
|
436
|
|
|
|
|
|
|
} ## end sub to_json |
|
437
|
|
|
|
|
|
|
|
|
438
|
|
|
|
|
|
|
=head2 from_json($json) |
|
439
|
|
|
|
|
|
|
|
|
440
|
|
|
|
|
|
|
Init the object from the model in the specified JSON string. |
|
441
|
|
|
|
|
|
|
|
|
442
|
|
|
|
|
|
|
my $iforest = Algorithm::Classifier::IsolationForest->from_json($json); |
|
443
|
|
|
|
|
|
|
|
|
444
|
|
|
|
|
|
|
=cut |
|
445
|
|
|
|
|
|
|
|
|
446
|
|
|
|
|
|
|
sub from_json { |
|
447
|
5
|
|
|
5
|
1
|
250913
|
my ( $class, $text ) = @_; |
|
448
|
5
|
|
|
|
|
33
|
my $payload = JSON::PP->new->decode($text); |
|
449
|
|
|
|
|
|
|
croak "not an IsolationForest model" |
|
450
|
|
|
|
|
|
|
unless ref $payload eq 'HASH' |
|
451
|
|
|
|
|
|
|
&& defined $payload->{format} |
|
452
|
5
|
100
|
100
|
|
|
2078138
|
&& $payload->{format} eq 'Algorithm::Classifier::IsolationForest'; |
|
|
|
|
66
|
|
|
|
|
|
453
|
|
|
|
|
|
|
|
|
454
|
3
|
|
100
|
|
|
14
|
my $p = $payload->{params} || {}; |
|
455
|
|
|
|
|
|
|
my $self = { |
|
456
|
|
|
|
|
|
|
n_trees => $p->{n_trees}, |
|
457
|
|
|
|
|
|
|
sample_size => $p->{sample_size}, |
|
458
|
|
|
|
|
|
|
max_depth => undef, |
|
459
|
|
|
|
|
|
|
seed => undef, |
|
460
|
|
|
|
|
|
|
mode => $p->{mode} // 'axis', |
|
461
|
|
|
|
|
|
|
extension_level => $p->{extension_level}, |
|
462
|
|
|
|
|
|
|
extension_level_used => $p->{extension_level}, |
|
463
|
|
|
|
|
|
|
contamination => $p->{contamination}, |
|
464
|
|
|
|
|
|
|
threshold => $p->{threshold}, |
|
465
|
|
|
|
|
|
|
n_features => $p->{n_features}, |
|
466
|
|
|
|
|
|
|
psi_used => $p->{psi_used}, |
|
467
|
|
|
|
|
|
|
c_psi => $p->{c_psi}, |
|
468
|
|
|
|
|
|
|
max_depth_used => $p->{max_depth_used}, |
|
469
|
3
|
|
100
|
|
|
49
|
trees => $payload->{trees} || [], |
|
|
|
|
50
|
|
|
|
|
|
470
|
|
|
|
|
|
|
}; |
|
471
|
3
|
100
|
|
|
|
13
|
croak "model contains no trees" unless @{ $self->{trees} }; |
|
|
3
|
|
|
|
|
122
|
|
|
472
|
|
|
|
|
|
|
|
|
473
|
|
|
|
|
|
|
# Recompute the normalising constant from the (integer, exact) sub-sample |
|
474
|
|
|
|
|
|
|
# size rather than trusting the stored float, so a reloaded model's scores |
|
475
|
|
|
|
|
|
|
# are bit-for-bit identical to the original's. |
|
476
|
2
|
50
|
|
|
|
19
|
$self->{c_psi} = _c( $self->{psi_used} ) if defined $self->{psi_used}; |
|
477
|
|
|
|
|
|
|
|
|
478
|
2
|
|
|
|
|
21
|
return bless $self, $class; |
|
479
|
|
|
|
|
|
|
} ## end sub from_json |
|
480
|
|
|
|
|
|
|
|
|
481
|
|
|
|
|
|
|
=head2 save($path) |
|
482
|
|
|
|
|
|
|
|
|
483
|
|
|
|
|
|
|
Saves the model to the specified path. |
|
484
|
|
|
|
|
|
|
|
|
485
|
|
|
|
|
|
|
$iforest->save($path); |
|
486
|
|
|
|
|
|
|
|
|
487
|
|
|
|
|
|
|
=cut |
|
488
|
|
|
|
|
|
|
|
|
489
|
|
|
|
|
|
|
sub save { |
|
490
|
1
|
|
|
1
|
1
|
5046
|
my ( $self, $path ) = @_; |
|
491
|
1
|
|
|
|
|
6
|
write_file( $path, { 'atomic' => 1 }, $self->to_json ); |
|
492
|
|
|
|
|
|
|
} |
|
493
|
|
|
|
|
|
|
|
|
494
|
|
|
|
|
|
|
=head2 load($path); |
|
495
|
|
|
|
|
|
|
|
|
496
|
|
|
|
|
|
|
Init the object from the model in the specified file. |
|
497
|
|
|
|
|
|
|
|
|
498
|
|
|
|
|
|
|
my $iforest = Algorithm::Classifier::IsolationForest->load($path); |
|
499
|
|
|
|
|
|
|
|
|
500
|
|
|
|
|
|
|
=cut |
|
501
|
|
|
|
|
|
|
|
|
502
|
|
|
|
|
|
|
sub load { |
|
503
|
2
|
|
|
2
|
1
|
233575
|
my ( $class, $path ) = @_; |
|
504
|
2
|
|
|
|
|
12
|
my $raw_model = read_file($path); |
|
505
|
1
|
|
|
|
|
294
|
return $class->from_json($raw_model); |
|
506
|
|
|
|
|
|
|
} |
|
507
|
|
|
|
|
|
|
|
|
508
|
|
|
|
|
|
|
=head1 REFERENCES |
|
509
|
|
|
|
|
|
|
|
|
510
|
|
|
|
|
|
|
Liu, Fei Tony & Ting, Kai & Zhou, Zhi-Hua. (2008). Isolation Forest. 413 - 422. 10.1109/ICDM.2008.17. |
|
511
|
|
|
|
|
|
|
|
|
512
|
|
|
|
|
|
|
L |
|
513
|
|
|
|
|
|
|
|
|
514
|
|
|
|
|
|
|
L |
|
515
|
|
|
|
|
|
|
|
|
516
|
|
|
|
|
|
|
Sahand Hariri, Matias Carrasco Kind, Robert J. Brunner (2020). Extended Isolation Forest. 1479 - 1489. 10.1109/TKDE.2019.2947676 |
|
517
|
|
|
|
|
|
|
|
|
518
|
|
|
|
|
|
|
L |
|
519
|
|
|
|
|
|
|
|
|
520
|
|
|
|
|
|
|
=cut |
|
521
|
|
|
|
|
|
|
|
|
522
|
|
|
|
|
|
|
### |
|
523
|
|
|
|
|
|
|
### |
|
524
|
|
|
|
|
|
|
### internal stuff below |
|
525
|
|
|
|
|
|
|
### |
|
526
|
|
|
|
|
|
|
### |
|
527
|
|
|
|
|
|
|
|
|
528
|
|
|
|
|
|
|
#------------------------------------------------------------------------------- |
|
529
|
|
|
|
|
|
|
# c(n): the expected path length of an unsuccessful search in a binary search |
|
530
|
|
|
|
|
|
|
# tree of n nodes. Isolation Forest uses it (a) to adjust the path length when a |
|
531
|
|
|
|
|
|
|
# leaf still holds more than one point (depth limit reached), and (b) to |
|
532
|
|
|
|
|
|
|
# normalise the average path length into a 0..1 anomaly score. |
|
533
|
|
|
|
|
|
|
#------------------------------------------------------------------------------- |
|
534
|
|
|
|
|
|
|
sub _c { |
|
535
|
237911
|
|
|
237911
|
|
289622
|
my ($n) = @_; |
|
536
|
237911
|
100
|
|
|
|
354205
|
return 0.0 if $n <= 1; |
|
537
|
200978
|
100
|
|
|
|
286565
|
return 1.0 if $n == 2; |
|
538
|
187800
|
|
|
|
|
235835
|
my $harmonic = log( $n - 1 ) + EULER; # H(n-1) ~= ln(n-1) + gamma |
|
539
|
187800
|
|
|
|
|
396780
|
return 2.0 * $harmonic - ( 2.0 * ( $n - 1 ) / $n ); |
|
540
|
|
|
|
|
|
|
} |
|
541
|
|
|
|
|
|
|
|
|
542
|
|
|
|
|
|
|
# One draw from the standard normal N(0,1) via Box-Muller. Used to pick the |
|
543
|
|
|
|
|
|
|
# random hyperplane orientations in Extended Isolation Forest mode. |
|
544
|
|
|
|
|
|
|
sub _randn { |
|
545
|
8667
|
|
50
|
8667
|
|
12882
|
my $u1 = rand() || 1e-12; |
|
546
|
8667
|
|
|
|
|
9668
|
my $u2 = rand(); |
|
547
|
8667
|
|
|
|
|
14701
|
return sqrt( -2.0 * log($u1) ) * cos( TWO_PI * $u2 ); |
|
548
|
|
|
|
|
|
|
} |
|
549
|
|
|
|
|
|
|
|
|
550
|
|
|
|
|
|
|
#------------------------------------------------------------------------------- |
|
551
|
|
|
|
|
|
|
# Draw $k samples without replacement via a partial Fisher-Yates shuffle of the |
|
552
|
|
|
|
|
|
|
# index array. Returns an arrayref of (shared, read-only) sample refs. |
|
553
|
|
|
|
|
|
|
#------------------------------------------------------------------------------- |
|
554
|
|
|
|
|
|
|
sub _subsample { |
|
555
|
747
|
|
|
747
|
|
1619
|
my ( $data, $k ) = @_; |
|
556
|
747
|
|
|
|
|
1180
|
my $n = scalar @$data; |
|
557
|
747
|
|
|
|
|
5207
|
my @idx = ( 0 .. $n - 1 ); |
|
558
|
747
|
|
|
|
|
1950
|
for my $i ( 0 .. $k - 1 ) { |
|
559
|
130400
|
|
|
|
|
165398
|
my $j = $i + int( rand( $n - $i ) ); |
|
560
|
130400
|
|
|
|
|
186194
|
@idx[ $i, $j ] = @idx[ $j, $i ]; |
|
561
|
|
|
|
|
|
|
} |
|
562
|
747
|
|
|
|
|
7335
|
my @chosen = @idx[ 0 .. $k - 1 ]; |
|
563
|
747
|
|
|
|
|
3307
|
return [ @{$data}[@chosen] ]; |
|
|
747
|
|
|
|
|
13455
|
|
|
564
|
|
|
|
|
|
|
} ## end sub _subsample |
|
565
|
|
|
|
|
|
|
|
|
566
|
|
|
|
|
|
|
#------------------------------------------------------------------------------- |
|
567
|
|
|
|
|
|
|
# Recursively build one isolation tree. |
|
568
|
|
|
|
|
|
|
# |
|
569
|
|
|
|
|
|
|
# A node is one of: |
|
570
|
|
|
|
|
|
|
# leaf { leaf => 1, size => N } |
|
571
|
|
|
|
|
|
|
# axis { attr => A, split => S, left => ..., right => ... } |
|
572
|
|
|
|
|
|
|
# oblique { idx => [..], coef => [..], b => B, left => ..., right => ... } |
|
573
|
|
|
|
|
|
|
# |
|
574
|
|
|
|
|
|
|
# In both split styles the choice is restricted to features that actually vary |
|
575
|
|
|
|
|
|
|
# across the points reaching the node: this avoids wasted levels on constant |
|
576
|
|
|
|
|
|
|
# columns and lets a node leaf out exactly when its points are indistinguishable. |
|
577
|
|
|
|
|
|
|
#------------------------------------------------------------------------------- |
|
578
|
|
|
|
|
|
|
sub _build_tree { |
|
579
|
49529
|
|
|
49529
|
|
71161
|
my ( $self, $X, $depth, $limit ) = @_; |
|
580
|
|
|
|
|
|
|
|
|
581
|
49529
|
|
|
|
|
57558
|
my $size = scalar @$X; |
|
582
|
49529
|
100
|
100
|
|
|
136123
|
return { leaf => 1, size => $size } |
|
583
|
|
|
|
|
|
|
if $depth >= $limit || $size <= 1; |
|
584
|
|
|
|
|
|
|
|
|
585
|
24391
|
|
|
|
|
31809
|
my $nf = $self->{n_features}; |
|
586
|
|
|
|
|
|
|
|
|
587
|
|
|
|
|
|
|
# Per-feature min and max within this node, in a single pass. |
|
588
|
24391
|
|
|
|
|
29598
|
my ( @lo, @hi ); |
|
589
|
24391
|
|
|
|
|
32677
|
for my $row (@$X) { |
|
590
|
985236
|
|
|
|
|
1273133
|
for my $f ( 0 .. $nf - 1 ) { |
|
591
|
1973283
|
|
|
|
|
2297821
|
my $v = $row->[$f]; |
|
592
|
1973283
|
100
|
100
|
|
|
4098513
|
$lo[$f] = $v if !defined $lo[$f] || $v < $lo[$f]; |
|
593
|
1973283
|
100
|
100
|
|
|
4428628
|
$hi[$f] = $v if !defined $hi[$f] || $v > $hi[$f]; |
|
594
|
|
|
|
|
|
|
} |
|
595
|
|
|
|
|
|
|
} |
|
596
|
|
|
|
|
|
|
|
|
597
|
|
|
|
|
|
|
# Features with spread are the only ones that can split the data. |
|
598
|
24391
|
|
|
|
|
43187
|
my @varying = grep { $lo[$_] < $hi[$_] } 0 .. $nf - 1; |
|
|
48854
|
|
|
|
|
88594
|
|
|
599
|
|
|
|
|
|
|
|
|
600
|
|
|
|
|
|
|
# No spread on any feature => all points identical => cannot isolate. |
|
601
|
24391
|
50
|
|
|
|
37978
|
return { leaf => 1, size => $size } unless @varying; |
|
602
|
|
|
|
|
|
|
|
|
603
|
|
|
|
|
|
|
my $node |
|
604
|
24391
|
100
|
|
|
|
56844
|
= $self->{mode} eq 'extended' |
|
605
|
|
|
|
|
|
|
? $self->_oblique_split( $X, \@varying, \@lo, \@hi ) |
|
606
|
|
|
|
|
|
|
: _axis_split( $X, \@varying, \@lo, \@hi ); |
|
607
|
|
|
|
|
|
|
|
|
608
|
24391
|
|
|
|
|
52924
|
$node->{left} = $self->_build_tree( $node->{_left}, $depth + 1, $limit ); |
|
609
|
24391
|
|
|
|
|
39963
|
$node->{right} = $self->_build_tree( $node->{_right}, $depth + 1, $limit ); |
|
610
|
24391
|
|
|
|
|
29794
|
delete @{$node}{qw(_left _right)}; |
|
|
24391
|
|
|
|
|
60187
|
|
|
611
|
|
|
|
|
|
|
|
|
612
|
24391
|
|
|
|
|
56205
|
return $node; |
|
613
|
|
|
|
|
|
|
} ## end sub _build_tree |
|
614
|
|
|
|
|
|
|
|
|
615
|
|
|
|
|
|
|
# Axis-parallel cut: random varying feature, random threshold in its range. |
|
616
|
|
|
|
|
|
|
sub _axis_split { |
|
617
|
19950
|
|
|
19950
|
|
30718
|
my ( $X, $varying, $lo, $hi ) = @_; |
|
618
|
|
|
|
|
|
|
|
|
619
|
19950
|
|
|
|
|
35784
|
my $attr = $varying->[ int( rand( scalar @$varying ) ) ]; |
|
620
|
19950
|
|
|
|
|
32608
|
my $split = $lo->[$attr] + rand() * ( $hi->[$attr] - $lo->[$attr] ); |
|
621
|
|
|
|
|
|
|
|
|
622
|
19950
|
|
|
|
|
24911
|
my ( @left, @right ); |
|
623
|
19950
|
|
|
|
|
26520
|
for my $row (@$X) { |
|
624
|
784613
|
100
|
|
|
|
1038634
|
if ( $row->[$attr] < $split ) { push @left, $row } |
|
|
393279
|
|
|
|
|
532595
|
|
|
625
|
391334
|
|
|
|
|
551352
|
else { push @right, $row } |
|
626
|
|
|
|
|
|
|
} |
|
627
|
19950
|
|
|
|
|
75137
|
return { attr => $attr, split => $split, _left => \@left, _right => \@right }; |
|
628
|
|
|
|
|
|
|
} ## end sub _axis_split |
|
629
|
|
|
|
|
|
|
|
|
630
|
|
|
|
|
|
|
# Oblique cut (Extended Isolation Forest): a random hyperplane. We activate |
|
631
|
|
|
|
|
|
|
# (extension_level + 1) of the varying features, give each a Gaussian |
|
632
|
|
|
|
|
|
|
# coefficient, and place the plane through a random point in the bounding box. |
|
633
|
|
|
|
|
|
|
# A point goes left when coef . x <= b, where b = coef . p. |
|
634
|
|
|
|
|
|
|
sub _oblique_split { |
|
635
|
4441
|
|
|
4441
|
|
6800
|
my ( $self, $X, $varying, $lo, $hi ) = @_; |
|
636
|
|
|
|
|
|
|
|
|
637
|
4441
|
|
|
|
|
5974
|
my $active = $self->{extension_level_used} + 1; |
|
638
|
4441
|
100
|
|
|
|
7245
|
$active = scalar @$varying if $active > scalar @$varying; |
|
639
|
|
|
|
|
|
|
|
|
640
|
|
|
|
|
|
|
# Pick which varying features take part (partial shuffle of their indices). |
|
641
|
4441
|
|
|
|
|
6162
|
my @pool = @$varying; |
|
642
|
4441
|
|
|
|
|
6379
|
for my $i ( 0 .. $active - 1 ) { |
|
643
|
8667
|
|
|
|
|
12385
|
my $j = $i + int( rand( scalar(@pool) - $i ) ); |
|
644
|
8667
|
|
|
|
|
13424
|
@pool[ $i, $j ] = @pool[ $j, $i ]; |
|
645
|
|
|
|
|
|
|
} |
|
646
|
4441
|
|
|
|
|
8279
|
my @idx = @pool[ 0 .. $active - 1 ]; |
|
647
|
|
|
|
|
|
|
|
|
648
|
4441
|
|
|
|
|
5740
|
my ( @coef, $b ); |
|
649
|
4441
|
|
|
|
|
5211
|
$b = 0.0; |
|
650
|
4441
|
|
|
|
|
5390
|
for my $f (@idx) { |
|
651
|
8667
|
|
|
|
|
11037
|
my $c = _randn(); |
|
652
|
8667
|
|
|
|
|
12672
|
my $p = $lo->[$f] + rand() * ( $hi->[$f] - $lo->[$f] ); # point in the box |
|
653
|
8667
|
|
|
|
|
11128
|
push @coef, $c; |
|
654
|
8667
|
|
|
|
|
10720
|
$b += $c * $p; |
|
655
|
|
|
|
|
|
|
} |
|
656
|
|
|
|
|
|
|
|
|
657
|
4441
|
|
|
|
|
5345
|
my ( @left, @right ); |
|
658
|
4441
|
|
|
|
|
5785
|
for my $row (@$X) { |
|
659
|
200623
|
|
|
|
|
210273
|
my $dot = 0.0; |
|
660
|
200623
|
|
|
|
|
362543
|
$dot += $coef[$_] * $row->[ $idx[$_] ] for 0 .. $#idx; |
|
661
|
200623
|
100
|
|
|
|
250013
|
if ( $dot <= $b ) { push @left, $row } |
|
|
97910
|
|
|
|
|
123730
|
|
|
662
|
102713
|
|
|
|
|
134813
|
else { push @right, $row } |
|
663
|
|
|
|
|
|
|
} |
|
664
|
|
|
|
|
|
|
return { |
|
665
|
4441
|
|
|
|
|
18624
|
idx => \@idx, |
|
666
|
|
|
|
|
|
|
coef => \@coef, |
|
667
|
|
|
|
|
|
|
b => $b, |
|
668
|
|
|
|
|
|
|
_left => \@left, |
|
669
|
|
|
|
|
|
|
_right => \@right |
|
670
|
|
|
|
|
|
|
}; |
|
671
|
|
|
|
|
|
|
} ## end sub _oblique_split |
|
672
|
|
|
|
|
|
|
|
|
673
|
|
|
|
|
|
|
#------------------------------------------------------------------------------- |
|
674
|
|
|
|
|
|
|
# Path length of a single point in a single tree: edges traversed until a leaf, |
|
675
|
|
|
|
|
|
|
# plus c(leaf size) when the leaf still holds several points. Handles both axis |
|
676
|
|
|
|
|
|
|
# and oblique internal nodes, so a model of either mode scores correctly. |
|
677
|
|
|
|
|
|
|
#------------------------------------------------------------------------------- |
|
678
|
|
|
|
|
|
|
sub _path_length { |
|
679
|
237890
|
|
|
237890
|
|
334678
|
my ( $self, $x, $node, $depth ) = @_; |
|
680
|
237890
|
|
|
|
|
359427
|
while ( !$node->{leaf} ) { |
|
681
|
1734351
|
|
|
|
|
1776345
|
my $left; |
|
682
|
1734351
|
100
|
|
|
|
2125120
|
if ( exists $node->{attr} ) { # axis-parallel split |
|
683
|
1544332
|
|
|
|
|
2061302
|
$left = $x->[ $node->{attr} ] < $node->{split}; |
|
684
|
|
|
|
|
|
|
} else { # oblique (hyperplane) split |
|
685
|
190019
|
|
|
|
|
268315
|
my ( $idx, $coef ) = ( $node->{idx}, $node->{coef} ); |
|
686
|
190019
|
|
|
|
|
207604
|
my $dot = 0.0; |
|
687
|
190019
|
|
|
|
|
387757
|
$dot += $coef->[$_] * $x->[ $idx->[$_] ] for 0 .. $#$idx; |
|
688
|
190019
|
|
|
|
|
252778
|
$left = $dot <= $node->{b}; |
|
689
|
|
|
|
|
|
|
} |
|
690
|
1734351
|
100
|
|
|
|
2460752
|
$node = $left ? $node->{left} : $node->{right}; |
|
691
|
1734351
|
|
|
|
|
2552557
|
$depth++; |
|
692
|
|
|
|
|
|
|
} ## end while ( !$node->{leaf} ) |
|
693
|
237890
|
|
|
|
|
311781
|
return $depth + _c( $node->{size} ); |
|
694
|
|
|
|
|
|
|
} ## end sub _path_length |
|
695
|
|
|
|
|
|
|
|
|
696
|
|
|
|
|
|
|
sub _check_fitted { |
|
697
|
37
|
|
|
37
|
|
78
|
my ($self) = @_; |
|
698
|
|
|
|
|
|
|
croak "model is not fitted yet; call fit() first" |
|
699
|
37
|
100
|
66
|
|
|
172
|
unless ref $self->{trees} eq 'ARRAY' && @{ $self->{trees} }; |
|
|
37
|
|
|
|
|
618
|
|
|
700
|
|
|
|
|
|
|
} |
|
701
|
|
|
|
|
|
|
|
|
702
|
|
|
|
|
|
|
1; |