File Coverage

blib/lib/Algorithm/Classifier/IsolationForest.pm
Criterion Covered Total %
statement 206 216 95.3
branch 68 84 80.9
condition 46 50 92.0
subroutine 27 28 96.4
pod 10 11 90.9
total 357 389 91.7


line stmt bran cond sub pod time code
1             package Algorithm::Classifier::IsolationForest;
2              
3 8     8   665666 use strict;
  8         12  
  8         235  
4 8     8   29 use warnings;
  8         20  
  8         379  
5 8     8   38 use Carp qw(croak);
  8         13  
  8         391  
6 8     8   35 use List::Util qw(min);
  8         18  
  8         515  
7 8     8   3448 use POSIX qw(ceil);
  8         53939  
  8         41  
8 8     8   15031 use JSON::PP ();
  8         117900  
  8         271  
9 8     8   3714 use File::Slurp qw(read_file write_file);
  8         148811  
  8         640  
10              
11             our $VERSION = '0.1.0';
12              
13 8     8   56 use constant EULER => 0.5772156649015329;
  8         19  
  8         461  
14 8     8   37 use constant TWO_PI => 6.283185307179586;
  8         15  
  8         20294  
15              
16             =head1 NAME
17              
18             Algorithm::Classifier::IsolationForest - unsupervised anomaly detection via Isolation Forest or Extended Isolation Forest
19              
20             =head1 SYNOPSIS
21              
22             use Algorithm::Classifier::IsolationForest;
23              
24             my @data = ([0.1, -0.2], [0.0, 0.1], [5.0, 6.0], ...);
25              
26             # Classic, axis-parallel Isolation Forest
27             my $iforest = Algorithm::Classifier::IsolationForest->new(
28             n_trees => 100,
29             sample_size => 256,
30             seed => 42,
31             );
32             $iforest->fit(\@data);
33              
34             my $scores = $iforest->score_samples(\@data); # arrayref, each in (0,1]
35             my $flags = $iforest->predict(\@data, 0.6); # arrayref of 0/1
36              
37             # Save and reload
38             $iforest->save('model.json');
39             my $reloaded = Algorithm::Classifier::IsolationForest->load('model.json');
40              
41             # Extended Isolation Forest (oblique hyperplane splits)
42             my $eif = IsolationForest->new(mode => 'extended', seed => 42);
43             $eif->fit(\@data);
44              
45             =head1 DESCRIPTION
46              
47             Isolation Forest (Liu, Fei Tony & Ting, Kai & Zhou, Zhi-Hua, 2008) detects anomalies by random
48             partitioning rather than by modelling normal points. Each tree repeatedly
49             splits the data. Points that get isolated after only a few splits are likely
50             anomalies. The score is the average isolation depth across many trees,
51             normalised so values approach 1 for anomalies and stay below 0.5 for normal
52             points.
53              
54             In extended mode the module implements the Extended Isolation Forest
55             variant. Each split is a random hyperplane instead of an axis-aligned cut,
56             which removes the rectangular, axis-aligned bias in the score field and
57             tends to help on elongated or multi-modal data.
58              
59             psi refernced below is ψ or the pitchfork math symbol refrenced in paper,
60             Liu, Fei Tony & Ting, Kai & Zhou, Zhi-Hua. (2008). Isolation Forest. 413 - 422. 10.1109/ICDM.2008.17.
61              
62             ... or max samples.
63              
64             L
65              
66             =head1 GENERAL METHODS
67              
68             =head2 new(%args)
69              
70             Inits the object.
71              
72             - n_trees :: number of isolation trees in the ensemble
73             default :: 100
74              
75             - sample_size :: sub-sample size used to build each tree... max samples
76             default :: 256
77              
78             - max_depth :: per-tree height limit... if not defined is set to ceil(log2(psi))
79             default :: undef
80              
81             - seed :: optional integer to seed srand with for reproducible trees...
82             see perldoc -f srand for more info. This number is processed via abs(int()).
83             default :: undef
84              
85             - mode :: if it should be IF or EIF
86             axis :: classic axis-parallel splits (IF)
87             extended :: oblique hyperplane splits (EIF)
88             default :: axis
89              
90             - extension_level :: extended mode only... how many features take partin each
91             split. 0 behaves like a single-feature (axis) cut; the
92             maximum (n_features - 1) uses every varying feature. undef
93             => maximum. Clamped to [0, n_features - 1] at fit time.
94              
95             - contamination :: expected fraction of anomalies, in (0, 0.5]. When given,
96             fit() learns a score threshold that flags this fraction of
97             the training set, and predict() uses it by default. undef
98             => no learned threshold (predict() falls back to 0.5).
99             default :: undef
100              
101             Note: log2 under Perl is as below...
102              
103             log($psi) / log(2)
104              
105             =cut
106              
107             sub new {
108 39     39 1 947975 my ( $class, %args ) = @_;
109              
110 39   100     192 my $mode = $args{mode} // 'axis';
111 39 100 100     284 croak "mode must be 'axis' or 'extended'"
112             unless $mode eq 'axis' || $mode eq 'extended';
113              
114 38 100       99 if ( defined( $args{seed} ) ) {
115 20         64 $args{seed} = abs( int( $args{seed} ) );
116             }
117              
118             my $self = {
119             n_trees => $args{n_trees} // 100,
120             sample_size => $args{sample_size} // 256,
121             max_depth => $args{max_depth}, # undef => auto
122             seed => $args{seed}, # undef => non-deterministic
123             mode => $mode,
124             extension_level => $args{extension_level}, # undef => max, resolved in fit()
125             contamination => $args{contamination}, # undef => no learned threshold
126 38   100     444 threshold => undef, # learned in fit() if contamination set
      100        
127             trees => [],
128             c_psi => undef, # c(psi), set during fit()
129             n_features => undef,
130             };
131              
132 38 100       188 croak "n_trees must be >= 1" unless $self->{n_trees} >= 1;
133 37 100       189 croak "sample_size must be >= 1" unless $self->{sample_size} >= 1;
134             croak "extension_level must be >= 0"
135 36 100 100     200 if defined $self->{extension_level} && $self->{extension_level} < 0;
136             croak "contamination must be a number in (0, 0.5]"
137             if defined $self->{contamination}
138 35 100 100     390 && !( $self->{contamination} > 0 && $self->{contamination} <= 0.5 );
      100        
139              
140 32         106 return bless $self, $class;
141             } ## end sub new
142              
143             =head2 decision_threshold
144              
145             The score cutoff C uses by default; undef unless C was
146             set.
147              
148             =cut
149              
150 6     6 1 2780 sub decision_threshold { return $_[0]->{threshold} }
151              
152             =head2 fit
153              
154             Trains the model on the specified data.
155              
156             The data taken is a array of arrays, with each sub array containing two numbers.
157              
158             @training_data = (
159             [ 3, 5 ],
160             [ 2.3, 1 ],
161             [ 5, 9 ],
162             ...
163             );
164              
165             Below shows a example of building a gausing cluster and using that for training.
166              
167             # so it is reproducible
168             srand(7);
169              
170             # build a gaussian cluster and add a handful out outliers...
171              
172             use constant PI => 3.14159265358979;
173             sub gaussian {
174             my ($mu, $sigma) = @_;
175             my $u1 = rand() || 1e-12;
176             my $u2 = rand();
177             my $z = sqrt(-2 * log($u1)) * cos(2 * PI * $u2);
178             return $mu + $sigma * $z;
179             }
180              
181             # add some normal items
182             for (1 .. 500) {
183             push @data, [ gaussian(0, 1), gaussian(0, 1) ];
184             push @truth, 0;
185             }
186             # add some outliers
187             for (1 .. 20) {
188             my $angle = rand() * 2 * PI;
189             my $radius = 5 + rand() * 3; # distance 5..8 from the origin
190             push @data, [ $radius * cos($angle), $radius * sin($angle) ];
191             push @truth, 1;
192             }
193              
194             $iforest->fit(\@training_data);
195              
196             =cut
197              
198             sub fit {
199 24     24 1 2610 my ( $self, $data ) = @_;
200              
201 24 100 100     450 croak "fit() expects a non-empty arrayref of samples"
202             unless ref $data eq 'ARRAY' && @$data;
203             croak "each sample must be an arrayref of features"
204 21 100 100     168 unless ref $data->[0] eq 'ARRAY' && @{ $data->[0] };
  20         183  
205              
206 19         37 my $n = scalar @$data;
207 19         29 my $n_features = scalar @{ $data->[0] };
  19         34  
208 19         50 $self->{n_features} = $n_features;
209              
210             # The sub-sample cannot be larger than the data set itself.
211 19         107 my $psi = min( $self->{sample_size}, $n );
212 19         76 $self->{c_psi} = _c($psi);
213 19         49 $self->{psi_used} = $psi;
214              
215             # Resolve the extension level against the data's dimensionality.
216 19 100       67 if ( $self->{mode} eq 'extended' ) {
217 6         12 my $max_ext = $n_features - 1;
218             my $ext
219             = defined $self->{extension_level}
220             ? $self->{extension_level}
221 6 100       18 : $max_ext;
222 6 50       15 $ext = 0 if $ext < 0;
223 6 100       15 $ext = $max_ext if $ext > $max_ext;
224 6         13 $self->{extension_level_used} = $ext;
225             } else {
226 13         30 $self->{extension_level_used} = undef;
227             }
228              
229             # Height limit: the average tree height ceil(log2(psi)). Past this depth the
230             # remaining points are scored using the c(size) adjustment instead.
231             my $limit
232             = defined $self->{max_depth}
233             ? $self->{max_depth}
234 19 50       119 : ceil( log($psi) / log(2) );
235 19 50       53 $limit = 1 if $limit < 1;
236 19         62 $self->{max_depth_used} = $limit;
237              
238 19 50       75 srand( $self->{seed} ) if defined $self->{seed};
239              
240 19         64 my @trees;
241 19         64 for ( 1 .. $self->{n_trees} ) {
242 747         1756 my $sample = _subsample( $data, $psi );
243 747         1845 push @trees, $self->_build_tree( $sample, 0, $limit );
244             }
245 19         69 $self->{trees} = \@trees;
246              
247             # If a contamination rate was requested, learn the score cutoff that flags
248             # that fraction of the training set. We place the threshold midway between
249             # the k-th and (k+1)-th highest training scores, so it sits in the gap
250             # between flagged and unflagged points -- unambiguous and robust to the
251             # tiny float rounding introduced by JSON serialisation.
252 19 100       86 if ( defined $self->{contamination} ) {
253 3         20 my $scores = $self->score_samples($data);
254 3         60 my @desc = sort { $b <=> $a } @$scores;
  3979         4200  
255 3         6 my $n_pts = scalar @desc;
256 3         15 my $k = int( $self->{contamination} * $n_pts + 0.5 );
257 3 50       11 $k = 1 if $k < 1;
258 3 50       54 $k = $n_pts if $k > $n_pts;
259 3 50       41 $self->{threshold} = $k < $n_pts
260             ? ( $desc[ $k - 1 ] + $desc[$k] ) / 2.0 # midpoint of the boundary
261             : $desc[ $n_pts - 1 ] - 1e-9; # k == n: flag everything
262             } ## end if ( defined $self->{contamination} )
263              
264 19         111 return $self;
265             } ## end sub fit
266              
267             =head2 path_lengths(\@data)
268              
269             Returns the mean isolation depth per sample, for inspection.
270              
271             my @lenghts = $forest->path_lengths(\@data);
272              
273             print "x, y, length\n";
274              
275             my $int=0;
276             while (defined($data[$int])) {
277             print $data[$int][0].', '.$data[$int][1].', '.$lenghts[$int]."\n";
278              
279             $int++;
280             }
281              
282             =cut
283              
284             sub path_lengths {
285 2     2 1 4610 my ( $self, $data ) = @_;
286 2         8 $self->_check_fitted;
287 1         2 my $t = scalar @{ $self->{trees} };
  1         2  
288 1         2 my @out;
289 1         41 for my $x (@$data) {
290 100         125 my $sum = 0;
291 100         120 $sum += $self->_path_length( $x, $_, 0 ) for @{ $self->{trees} };
  100         184  
292 100         208 push @out, $sum / $t;
293             }
294 1         10 return \@out;
295             } ## end sub path_lengths
296              
297             =head predict(\@data, $threshold)
298              
299             Returns an arrayref of 0/1 labels for the specified data.
300              
301             If theshold is not specified it uses whatever the set default.
302              
303             my $results = $forest->predict(\@data, $threshold);
304              
305             print "x, y, result\n";
306              
307             my $int=0;
308             while (defined($data[$int])) {
309             print $data[$int][0].', '.$data[$int][1].', '.$results->[$int]."\n";
310              
311             $int++;
312             }
313              
314             =cut
315              
316             sub predict {
317 11     11 0 15236 my ( $self, $data, $threshold ) = @_;
318             $threshold
319             = defined $threshold ? $threshold
320             : defined $self->{threshold} ? $self->{threshold}
321 11 100       49 : 0.5;
    100          
322 11         33 my $scores = $self->score_samples($data);
323 10 100       86 return [ map { $_ >= $threshold ? 1 : 0 } @$scores ];
  1107         1683  
324             }
325              
326             =head2 score_samples(\@data)
327              
328             Returns an arrayref of anomaly scores, between 0 and 1.
329              
330             Scores near 1 are strong anomalies (isolated quickly).
331              
332             Scores well below 0.5 are normal.
333              
334             Scores ~0.5 means the points are hard to tell apart.
335              
336             my $scores = $forest->path_lengths(\@data);
337              
338             print "x, y, length\n";
339              
340             my $int=0;
341             while (defined($data[$int])) {
342             print $data[$int][0].', '.$data[$int][1].', '.$scores->[$int]."\n";
343              
344             $int++;
345             }
346              
347             =cut
348              
349             sub score_samples {
350 31     31 1 15892 my ( $self, $data ) = @_;
351 31         118 $self->_check_fitted;
352 29         74 my $c = $self->{c_psi};
353 29         41 my $t = scalar @{ $self->{trees} };
  29         48  
354              
355 29         48 my @scores;
356 29         57 for my $x (@$data) {
357 3042         3934 my $sum = 0;
358 3042         3640 $sum += $self->_path_length( $x, $_, 0 ) for @{ $self->{trees} };
  3042         7247  
359 3042         4862 my $avg = $sum / $t;
360 3042 50       11953 push @scores, $c > 0 ? 2**( -$avg / $c ) : 0.5;
361             }
362 29         162 return \@scores;
363             } ## end sub score_samples
364              
365             =head2 score_predict_samples
366              
367             Returns a array ref of arrays. First value of each sub array is the score with the second being
368             0/1 for if it is a anomaly or not.
369              
370             my $results = $forest->predict(\@data, $threshold);
371              
372             print "x, y, score, result\n";
373              
374             my $int=0;
375             while (defined($data[$int])) {
376             print $data[$int][0].', '.$data[$int][1].', '.$results->[$int][0].', '.$results->[$int][1]."\n";
377              
378             $int++;
379             }
380              
381             =cut
382              
383             sub score_predict_samples {
384 0     0 1 0 my ( $self, $data, $threshold ) = @_;
385             $threshold
386             = defined $threshold ? $threshold
387             : defined $self->{threshold} ? $self->{threshold}
388 0 0       0 : 0.5;
    0          
389 0         0 my $scores = $self->score_samples($data);
390              
391 0         0 my @to_return;
392 0         0 foreach my $score ( @{$scores} ) {
  0         0  
393 0 0       0 if ( $score >= $threshold ) {
394 0         0 push @to_return, [ $score, 1 ];
395             } else {
396 0         0 push @to_return, [ $score, 0 ];
397             }
398             }
399              
400 0         0 return \@to_return;
401             } ## end sub score_predict_samples
402              
403             =head1 MODEL SAVE/LOAD METHODS
404              
405             =head2 to_json
406              
407             Returns a JSON representation of the module.
408              
409             Required being fit having to be called.
410              
411             my $json = $iforest->to_json;
412              
413             =cut
414              
415             sub to_json {
416 4     4 1 239076 my ($self) = @_;
417 4         15 $self->_check_fitted;
418             my $payload = {
419             format => 'Algorithm::Classifier::IsolationForest',
420             version => 0,
421             params => {
422             n_trees => $self->{n_trees},
423             sample_size => $self->{sample_size},
424             mode => $self->{mode},
425             extension_level => $self->{extension_level_used},
426             contamination => $self->{contamination},
427             threshold => $self->{threshold},
428             n_features => $self->{n_features},
429             psi_used => $self->{psi_used},
430             c_psi => $self->{c_psi},
431             max_depth_used => $self->{max_depth_used},
432             },
433             trees => $self->{trees},
434 3         34 };
435 3         24 return JSON::PP->new->canonical(1)->encode($payload);
436             } ## end sub to_json
437              
438             =head2 from_json($json)
439              
440             Init the object from the model in the specified JSON string.
441              
442             my $iforest = Algorithm::Classifier::IsolationForest->from_json($json);
443              
444             =cut
445              
446             sub from_json {
447 5     5 1 235889 my ( $class, $text ) = @_;
448 5         28 my $payload = JSON::PP->new->decode($text);
449             croak "not an IsolationForest model"
450             unless ref $payload eq 'HASH'
451             && defined $payload->{format}
452 5 100 100     2040093 && $payload->{format} eq 'Algorithm::Classifier::IsolationForest';
      66        
453              
454 3   100     15 my $p = $payload->{params} || {};
455             my $self = {
456             n_trees => $p->{n_trees},
457             sample_size => $p->{sample_size},
458             max_depth => undef,
459             seed => undef,
460             mode => $p->{mode} // 'axis',
461             extension_level => $p->{extension_level},
462             extension_level_used => $p->{extension_level},
463             contamination => $p->{contamination},
464             threshold => $p->{threshold},
465             n_features => $p->{n_features},
466             psi_used => $p->{psi_used},
467             c_psi => $p->{c_psi},
468             max_depth_used => $p->{max_depth_used},
469 3   100     45 trees => $payload->{trees} || [],
      50        
470             };
471 3 100       12 croak "model contains no trees" unless @{ $self->{trees} };
  3         111  
472              
473             # Recompute the normalising constant from the (integer, exact) sub-sample
474             # size rather than trusting the stored float, so a reloaded model's scores
475             # are bit-for-bit identical to the original's.
476 2 50       17 $self->{c_psi} = _c( $self->{psi_used} ) if defined $self->{psi_used};
477              
478 2         19 return bless $self, $class;
479             } ## end sub from_json
480              
481             =head2 save($path)
482              
483             Saves the model to the specified path.
484              
485             $iforest->save($path);
486              
487             =cut
488              
489             sub save {
490 1     1 1 7016 my ( $self, $path ) = @_;
491 1         6 write_file( $path, { 'atomic' => 1 }, $self->to_json );
492             }
493              
494             =head2 load($path);
495              
496             Init the object from the model in the specified file.
497              
498             my $iforest = Algorithm::Classifier::IsolationForest->load($path);
499              
500             =cut
501              
502             sub load {
503 2     2 1 251850 my ( $class, $path ) = @_;
504 2         10 my $raw_model = read_file($path);
505 1         201 return $class->from_json($raw_model);
506             }
507              
508             =head1 REFERENCES
509              
510             Liu, Fei Tony & Ting, Kai & Zhou, Zhi-Hua. (2008). Isolation Forest. 413 - 422. 10.1109/ICDM.2008.17.
511              
512             L
513              
514             L
515              
516             Sahand Hariri, Matias Carrasco Kind, Robert J. Brunner (2020). Extended Isolation Forest. 1479 - 1489. 10.1109/TKDE.2019.2947676
517              
518             L
519              
520             =cut
521              
522             ###
523             ###
524             ### internal stuff below
525             ###
526             ###
527              
528             #-------------------------------------------------------------------------------
529             # c(n): the expected path length of an unsuccessful search in a binary search
530             # tree of n nodes. Isolation Forest uses it (a) to adjust the path length when a
531             # leaf still holds more than one point (depth limit reached), and (b) to
532             # normalise the average path length into a 0..1 anomaly score.
533             #-------------------------------------------------------------------------------
534             sub _c {
535 237911     237911   295870 my ($n) = @_;
536 237911 100       361868 return 0.0 if $n <= 1;
537 200978 100       290040 return 1.0 if $n == 2;
538 187800         249109 my $harmonic = log( $n - 1 ) + EULER; # H(n-1) ~= ln(n-1) + gamma
539 187800         434716 return 2.0 * $harmonic - ( 2.0 * ( $n - 1 ) / $n );
540             }
541              
542             # One draw from the standard normal N(0,1) via Box-Muller. Used to pick the
543             # random hyperplane orientations in Extended Isolation Forest mode.
544             sub _randn {
545 8667   50 8667   12677 my $u1 = rand() || 1e-12;
546 8667         9776 my $u2 = rand();
547 8667         15160 return sqrt( -2.0 * log($u1) ) * cos( TWO_PI * $u2 );
548             }
549              
550             #-------------------------------------------------------------------------------
551             # Draw $k samples without replacement via a partial Fisher-Yates shuffle of the
552             # index array. Returns an arrayref of (shared, read-only) sample refs.
553             #-------------------------------------------------------------------------------
554             sub _subsample {
555 747     747   1499 my ( $data, $k ) = @_;
556 747         1154 my $n = scalar @$data;
557 747         4648 my @idx = ( 0 .. $n - 1 );
558 747         1773 for my $i ( 0 .. $k - 1 ) {
559 130400         154660 my $j = $i + int( rand( $n - $i ) );
560 130400         175097 @idx[ $i, $j ] = @idx[ $j, $i ];
561             }
562 747         6879 my @chosen = @idx[ 0 .. $k - 1 ];
563 747         3322 return [ @{$data}[@chosen] ];
  747         12146  
564             } ## end sub _subsample
565              
566             #-------------------------------------------------------------------------------
567             # Recursively build one isolation tree.
568             #
569             # A node is one of:
570             # leaf { leaf => 1, size => N }
571             # axis { attr => A, split => S, left => ..., right => ... }
572             # oblique { idx => [..], coef => [..], b => B, left => ..., right => ... }
573             #
574             # In both split styles the choice is restricted to features that actually vary
575             # across the points reaching the node: this avoids wasted levels on constant
576             # columns and lets a node leaf out exactly when its points are indistinguishable.
577             #-------------------------------------------------------------------------------
578             sub _build_tree {
579 49529     49529   69359 my ( $self, $X, $depth, $limit ) = @_;
580              
581 49529         56940 my $size = scalar @$X;
582 49529 100 100     126280 return { leaf => 1, size => $size }
583             if $depth >= $limit || $size <= 1;
584              
585 24391         31822 my $nf = $self->{n_features};
586              
587             # Per-feature min and max within this node, in a single pass.
588 24391         28143 my ( @lo, @hi );
589 24391         30662 for my $row (@$X) {
590 985236         1213078 for my $f ( 0 .. $nf - 1 ) {
591 1973283         2200984 my $v = $row->[$f];
592 1973283 100 100     3849619 $lo[$f] = $v if !defined $lo[$f] || $v < $lo[$f];
593 1973283 100 100     4145345 $hi[$f] = $v if !defined $hi[$f] || $v > $hi[$f];
594             }
595             }
596              
597             # Features with spread are the only ones that can split the data.
598 24391         40361 my @varying = grep { $lo[$_] < $hi[$_] } 0 .. $nf - 1;
  48854         82284  
599              
600             # No spread on any feature => all points identical => cannot isolate.
601 24391 50       35816 return { leaf => 1, size => $size } unless @varying;
602              
603             my $node
604 24391 100       53504 = $self->{mode} eq 'extended'
605             ? $self->_oblique_split( $X, \@varying, \@lo, \@hi )
606             : _axis_split( $X, \@varying, \@lo, \@hi );
607              
608 24391         48984 $node->{left} = $self->_build_tree( $node->{_left}, $depth + 1, $limit );
609 24391         38121 $node->{right} = $self->_build_tree( $node->{_right}, $depth + 1, $limit );
610 24391         28402 delete @{$node}{qw(_left _right)};
  24391         57218  
611              
612 24391         50214 return $node;
613             } ## end sub _build_tree
614              
615             # Axis-parallel cut: random varying feature, random threshold in its range.
616             sub _axis_split {
617 19950     19950   28619 my ( $X, $varying, $lo, $hi ) = @_;
618              
619 19950         33289 my $attr = $varying->[ int( rand( scalar @$varying ) ) ];
620 19950         30601 my $split = $lo->[$attr] + rand() * ( $hi->[$attr] - $lo->[$attr] );
621              
622 19950         23145 my ( @left, @right );
623 19950         24627 for my $row (@$X) {
624 784613 100       969518 if ( $row->[$attr] < $split ) { push @left, $row }
  393279         497925  
625 391334         519059 else { push @right, $row }
626             }
627 19950         67672 return { attr => $attr, split => $split, _left => \@left, _right => \@right };
628             } ## end sub _axis_split
629              
630             # Oblique cut (Extended Isolation Forest): a random hyperplane. We activate
631             # (extension_level + 1) of the varying features, give each a Gaussian
632             # coefficient, and place the plane through a random point in the bounding box.
633             # A point goes left when coef . x <= b, where b = coef . p.
634             sub _oblique_split {
635 4441     4441   7089 my ( $self, $X, $varying, $lo, $hi ) = @_;
636              
637 4441         5543 my $active = $self->{extension_level_used} + 1;
638 4441 100       7127 $active = scalar @$varying if $active > scalar @$varying;
639              
640             # Pick which varying features take part (partial shuffle of their indices).
641 4441         6141 my @pool = @$varying;
642 4441         6277 for my $i ( 0 .. $active - 1 ) {
643 8667         12740 my $j = $i + int( rand( scalar(@pool) - $i ) );
644 8667         13827 @pool[ $i, $j ] = @pool[ $j, $i ];
645             }
646 4441         8468 my @idx = @pool[ 0 .. $active - 1 ];
647              
648 4441         6028 my ( @coef, $b );
649 4441         5037 $b = 0.0;
650 4441         5725 for my $f (@idx) {
651 8667         11193 my $c = _randn();
652 8667         12961 my $p = $lo->[$f] + rand() * ( $hi->[$f] - $lo->[$f] ); # point in the box
653 8667         11527 push @coef, $c;
654 8667         11876 $b += $c * $p;
655             }
656              
657 4441         4985 my ( @left, @right );
658 4441         5414 for my $row (@$X) {
659 200623         208196 my $dot = 0.0;
660 200623         364031 $dot += $coef[$_] * $row->[ $idx[$_] ] for 0 .. $#idx;
661 200623 100       250593 if ( $dot <= $b ) { push @left, $row }
  97910         121753  
662 102713         134410 else { push @right, $row }
663             }
664             return {
665 4441         20214 idx => \@idx,
666             coef => \@coef,
667             b => $b,
668             _left => \@left,
669             _right => \@right
670             };
671             } ## end sub _oblique_split
672              
673             #-------------------------------------------------------------------------------
674             # Path length of a single point in a single tree: edges traversed until a leaf,
675             # plus c(leaf size) when the leaf still holds several points. Handles both axis
676             # and oblique internal nodes, so a model of either mode scores correctly.
677             #-------------------------------------------------------------------------------
678             sub _path_length {
679 237890     237890   343816 my ( $self, $x, $node, $depth ) = @_;
680 237890         374312 while ( !$node->{leaf} ) {
681 1734351         1835530 my $left;
682 1734351 100       2203699 if ( exists $node->{attr} ) { # axis-parallel split
683 1544332         2142557 $left = $x->[ $node->{attr} ] < $node->{split};
684             } else { # oblique (hyperplane) split
685 190019         264082 my ( $idx, $coef ) = ( $node->{idx}, $node->{coef} );
686 190019         204258 my $dot = 0.0;
687 190019         402155 $dot += $coef->[$_] * $x->[ $idx->[$_] ] for 0 .. $#$idx;
688 190019         259095 $left = $dot <= $node->{b};
689             }
690 1734351 100       2477950 $node = $left ? $node->{left} : $node->{right};
691 1734351         2710504 $depth++;
692             } ## end while ( !$node->{leaf} )
693 237890         327100 return $depth + _c( $node->{size} );
694             } ## end sub _path_length
695              
696             sub _check_fitted {
697 37     37   87 my ($self) = @_;
698             croak "model is not fitted yet; call fit() first"
699 37 100 66     167 unless ref $self->{trees} eq 'ARRAY' && @{ $self->{trees} };
  37         538  
700             }
701              
702             1;