File Coverage

blib/lib/Algorithm/Classifier/IsolationForest.pm
Criterion Covered Total %
statement 206 216 95.3
branch 68 84 80.9
condition 46 50 92.0
subroutine 27 28 96.4
pod 10 11 90.9
total 357 389 91.7


line stmt bran cond sub pod time code
1             package Algorithm::Classifier::IsolationForest;
2              
3 8     8   703549 use strict;
  8         16  
  8         308  
4 8     8   32 use warnings;
  8         27  
  8         400  
5 8     8   40 use Carp qw(croak);
  8         13  
  8         378  
6 8     8   34 use List::Util qw(min);
  8         18  
  8         490  
7 8     8   3543 use POSIX qw(ceil);
  8         59001  
  8         38  
8 8     8   15930 use JSON::PP ();
  8         128821  
  8         246  
9 8     8   4048 use File::Slurp qw(read_file write_file);
  8         156552  
  8         687  
10              
11             our $VERSION = '0.0.1';
12              
13 8     8   59 use constant EULER => 0.5772156649015329;
  8         17  
  8         442  
14 8     8   36 use constant TWO_PI => 6.283185307179586;
  8         11  
  8         18447  
15              
16             =head1 NAME
17              
18             Algorithm::Classifier::IsolationForest - unsupervised anomaly detection via Isolation Forest or Extended Isolation Forest
19              
20             =head1 SYNOPSIS
21              
22             use Algorithm::Classifier::IsolationForest;
23              
24             my @data = ([0.1, -0.2], [0.0, 0.1], [5.0, 6.0], ...);
25              
26             # Classic, axis-parallel Isolation Forest
27             my $iforest = Algorithm::Classifier::IsolationForest->new(
28             n_trees => 100,
29             sample_size => 256,
30             seed => 42,
31             );
32             $iforest->fit(\@data);
33              
34             my $scores = $iforest->score_samples(\@data); # arrayref, each in (0,1]
35             my $flags = $iforest->predict(\@data, 0.6); # arrayref of 0/1
36              
37             # Save and reload
38             $iforest->save('model.json');
39             my $reloaded = Algorithm::Classifier::IsolationForest->load('model.json');
40              
41             # Extended Isolation Forest (oblique hyperplane splits)
42             my $eif = IsolationForest->new(mode => 'extended', seed => 42);
43             $eif->fit(\@data);
44              
45             =head1 DESCRIPTION
46              
47             Isolation Forest (Liu, Fei Tony & Ting, Kai & Zhou, Zhi-Hua, 2008) detects anomalies by random
48             partitioning rather than by modelling normal points. Each tree repeatedly
49             splits the data. Points that get isolated after only a few splits are likely
50             anomalies. The score is the average isolation depth across many trees,
51             normalised so values approach 1 for anomalies and stay below 0.5 for normal
52             points.
53              
54             In extended mode the module implements the Extended Isolation Forest
55             variant. Each split is a random hyperplane instead of an axis-aligned cut,
56             which removes the rectangular, axis-aligned bias in the score field and
57             tends to help on elongated or multi-modal data.
58              
59             psi refernced below is ψ or the pitchfork math symbol refrenced in paper,
60             Liu, Fei Tony & Ting, Kai & Zhou, Zhi-Hua. (2008). Isolation Forest. 413 - 422. 10.1109/ICDM.2008.17.
61              
62             ... or max samples.
63              
64             L
65              
66             =head1 GENERAL METHODS
67              
68             =head2 new(%args)
69              
70             Inits the object.
71              
72             - n_trees :: number of isolation trees in the ensemble
73             default :: 100
74              
75             - sample_size :: sub-sample size used to build each tree... max samples
76             default :: 256
77              
78             - max_depth :: per-tree height limit... if not defined is set to ceil(log2(psi))
79             default :: undef
80              
81             - seed :: optional integer to seed srand with for reproducible trees...
82             see perldoc -f srand for more info. This number is processed via abs(int()).
83             default :: undef
84              
85             - mode :: if it should be IF or EIF
86             axis :: classic axis-parallel splits (IF)
87             extended :: oblique hyperplane splits (EIF)
88             default :: axis
89              
90             - extension_level :: extended mode only... how many features take partin each
91             split. 0 behaves like a single-feature (axis) cut; the
92             maximum (n_features - 1) uses every varying feature. undef
93             => maximum. Clamped to [0, n_features - 1] at fit time.
94              
95             - contamination :: expected fraction of anomalies, in (0, 0.5]. When given,
96             fit() learns a score threshold that flags this fraction of
97             the training set, and predict() uses it by default. undef
98             => no learned threshold (predict() falls back to 0.5).
99             default :: undef
100              
101             Note: log2 under Perl is as below...
102              
103             log($psi) / log(2)
104              
105             =cut
106              
107             sub new {
108 39     39 1 937750 my ( $class, %args ) = @_;
109              
110 39   100     193 my $mode = $args{mode} // 'axis';
111 39 100 100     331 croak "mode must be 'axis' or 'extended'"
112             unless $mode eq 'axis' || $mode eq 'extended';
113              
114 38 100       99 if ( defined( $args{seed} ) ) {
115 20         63 $args{seed} = abs( int( $args{seed} ) );
116             }
117              
118             my $self = {
119             n_trees => $args{n_trees} // 100,
120             sample_size => $args{sample_size} // 256,
121             max_depth => $args{max_depth}, # undef => auto
122             seed => $args{seed}, # undef => non-deterministic
123             mode => $mode,
124             extension_level => $args{extension_level}, # undef => max, resolved in fit()
125             contamination => $args{contamination}, # undef => no learned threshold
126 38   100     442 threshold => undef, # learned in fit() if contamination set
      100        
127             trees => [],
128             c_psi => undef, # c(psi), set during fit()
129             n_features => undef,
130             };
131              
132 38 100       189 croak "n_trees must be >= 1" unless $self->{n_trees} >= 1;
133 37 100       161 croak "sample_size must be >= 1" unless $self->{sample_size} >= 1;
134             croak "extension_level must be >= 0"
135 36 100 100     190 if defined $self->{extension_level} && $self->{extension_level} < 0;
136             croak "contamination must be a number in (0, 0.5]"
137             if defined $self->{contamination}
138 35 100 100     392 && !( $self->{contamination} > 0 && $self->{contamination} <= 0.5 );
      100        
139              
140 32         100 return bless $self, $class;
141             } ## end sub new
142              
143             =head2 decision_threshold
144              
145             The score cutoff C uses by default; undef unless C was
146             set.
147              
148             =cut
149              
150 6     6 1 2846 sub decision_threshold { return $_[0]->{threshold} }
151              
152             =head2 fit
153              
154             Trains the model on the specified data.
155              
156             The data taken is a array of arrays, with each sub array containing two numbers.
157              
158             @training_data = (
159             [ 3, 5 ],
160             [ 2.3, 1 ],
161             [ 5, 9 ],
162             ...
163             );
164              
165             Below shows a example of building a gausing cluster and using that for training.
166              
167             # so it is reproducible
168             srand(7);
169              
170             # build a gaussian cluster and add a handful out outliers...
171              
172             use constant PI => 3.14159265358979;
173             sub gaussian {
174             my ($mu, $sigma) = @_;
175             my $u1 = rand() || 1e-12;
176             my $u2 = rand();
177             my $z = sqrt(-2 * log($u1)) * cos(2 * PI * $u2);
178             return $mu + $sigma * $z;
179             }
180              
181             # add some normal items
182             for (1 .. 500) {
183             push @data, [ gaussian(0, 1), gaussian(0, 1) ];
184             push @truth, 0;
185             }
186             # add some outliers
187             for (1 .. 20) {
188             my $angle = rand() * 2 * PI;
189             my $radius = 5 + rand() * 3; # distance 5..8 from the origin
190             push @data, [ $radius * cos($angle), $radius * sin($angle) ];
191             push @truth, 1;
192             }
193              
194             $iforest->fit(\@training_data);
195              
196             =cut
197              
198             sub fit {
199 24     24 1 2452 my ( $self, $data ) = @_;
200              
201 24 100 100     446 croak "fit() expects a non-empty arrayref of samples"
202             unless ref $data eq 'ARRAY' && @$data;
203             croak "each sample must be an arrayref of features"
204 21 100 100     170 unless ref $data->[0] eq 'ARRAY' && @{ $data->[0] };
  20         189  
205              
206 19         37 my $n = scalar @$data;
207 19         27 my $n_features = scalar @{ $data->[0] };
  19         33  
208 19         44 $self->{n_features} = $n_features;
209              
210             # The sub-sample cannot be larger than the data set itself.
211 19         135 my $psi = min( $self->{sample_size}, $n );
212 19         80 $self->{c_psi} = _c($psi);
213 19         54 $self->{psi_used} = $psi;
214              
215             # Resolve the extension level against the data's dimensionality.
216 19 100       59 if ( $self->{mode} eq 'extended' ) {
217 6         11 my $max_ext = $n_features - 1;
218             my $ext
219             = defined $self->{extension_level}
220             ? $self->{extension_level}
221 6 100       13 : $max_ext;
222 6 50       13 $ext = 0 if $ext < 0;
223 6 100       12 $ext = $max_ext if $ext > $max_ext;
224 6         19 $self->{extension_level_used} = $ext;
225             } else {
226 13         28 $self->{extension_level_used} = undef;
227             }
228              
229             # Height limit: the average tree height ceil(log2(psi)). Past this depth the
230             # remaining points are scored using the c(size) adjustment instead.
231             my $limit
232             = defined $self->{max_depth}
233             ? $self->{max_depth}
234 19 50       162 : ceil( log($psi) / log(2) );
235 19 50       45 $limit = 1 if $limit < 1;
236 19         37 $self->{max_depth_used} = $limit;
237              
238 19 50       64 srand( $self->{seed} ) if defined $self->{seed};
239              
240 19         26 my @trees;
241 19         52 for ( 1 .. $self->{n_trees} ) {
242 747         2120 my $sample = _subsample( $data, $psi );
243 747         1985 push @trees, $self->_build_tree( $sample, 0, $limit );
244             }
245 19         77 $self->{trees} = \@trees;
246              
247             # If a contamination rate was requested, learn the score cutoff that flags
248             # that fraction of the training set. We place the threshold midway between
249             # the k-th and (k+1)-th highest training scores, so it sits in the gap
250             # between flagged and unflagged points -- unambiguous and robust to the
251             # tiny float rounding introduced by JSON serialisation.
252 19 100       79 if ( defined $self->{contamination} ) {
253 3         19 my $scores = $self->score_samples($data);
254 3         41 my @desc = sort { $b <=> $a } @$scores;
  3979         4379  
255 3         8 my $n_pts = scalar @desc;
256 3         35 my $k = int( $self->{contamination} * $n_pts + 0.5 );
257 3 50       12 $k = 1 if $k < 1;
258 3 50       9 $k = $n_pts if $k > $n_pts;
259 3 50       40 $self->{threshold} = $k < $n_pts
260             ? ( $desc[ $k - 1 ] + $desc[$k] ) / 2.0 # midpoint of the boundary
261             : $desc[ $n_pts - 1 ] - 1e-9; # k == n: flag everything
262             } ## end if ( defined $self->{contamination} )
263              
264 19         108 return $self;
265             } ## end sub fit
266              
267             =head2 path_lengths(\@data)
268              
269             Returns the mean isolation depth per sample, for inspection.
270              
271             my @lenghts = $forest->path_lengths(\@data);
272              
273             print "x, y, length\n";
274              
275             my $int=0;
276             while (defined($data[$int])) {
277             print $data[$int][0].', '.$data[$int][1].', '.$lenghts[$int]."\n";
278              
279             $int++;
280             }
281              
282             =cut
283              
284             sub path_lengths {
285 2     2 1 4711 my ( $self, $data ) = @_;
286 2         10 $self->_check_fitted;
287 1         2 my $t = scalar @{ $self->{trees} };
  1         2  
288 1         2 my @out;
289 1         3 for my $x (@$data) {
290 100         124 my $sum = 0;
291 100         109 $sum += $self->_path_length( $x, $_, 0 ) for @{ $self->{trees} };
  100         213  
292 100         283 push @out, $sum / $t;
293             }
294 1         11 return \@out;
295             } ## end sub path_lengths
296              
297             =head predict(\@data, $threshold)
298              
299             Returns an arrayref of 0/1 labels for the specified data.
300              
301             If theshold is not specified it uses whatever the set default.
302              
303             my $results = $forest->predict(\@data, $threshold);
304              
305             print "x, y, result\n";
306              
307             my $int=0;
308             while (defined($data[$int])) {
309             print $data[$int][0].', '.$data[$int][1].', '.$results->[$int]."\n";
310              
311             $int++;
312             }
313              
314             =cut
315              
316             sub predict {
317 11     11 0 13460 my ( $self, $data, $threshold ) = @_;
318             $threshold
319             = defined $threshold ? $threshold
320             : defined $self->{threshold} ? $self->{threshold}
321 11 100       49 : 0.5;
    100          
322 11         58 my $scores = $self->score_samples($data);
323 10 100       79 return [ map { $_ >= $threshold ? 1 : 0 } @$scores ];
  1107         1891  
324             }
325              
326             =head2 score_samples(\@data)
327              
328             Returns an arrayref of anomaly scores, between 0 and 1.
329              
330             Scores near 1 are strong anomalies (isolated quickly).
331              
332             Scores well below 0.5 are normal.
333              
334             Scores ~0.5 means the points are hard to tell apart.
335              
336             my $scores = $forest->path_lengths(\@data);
337              
338             print "x, y, length\n";
339              
340             my $int=0;
341             while (defined($data[$int])) {
342             print $data[$int][0].', '.$data[$int][1].', '.$scores->[$int]."\n";
343              
344             $int++;
345             }
346              
347             =cut
348              
349             sub score_samples {
350 31     31 1 14616 my ( $self, $data ) = @_;
351 31         125 $self->_check_fitted;
352 29         66 my $c = $self->{c_psi};
353 29         45 my $t = scalar @{ $self->{trees} };
  29         54  
354              
355 29         47 my @scores;
356 29         61 for my $x (@$data) {
357 3042         3818 my $sum = 0;
358 3042         3520 $sum += $self->_path_length( $x, $_, 0 ) for @{ $self->{trees} };
  3042         6648  
359 3042         4436 my $avg = $sum / $t;
360 3042 50       16148 push @scores, $c > 0 ? 2**( -$avg / $c ) : 0.5;
361             }
362 29         178 return \@scores;
363             } ## end sub score_samples
364              
365             =head2 score_predict_samples
366              
367             Returns a array ref of arrays. First value of each sub array is the score with the second being
368             0/1 for if it is a anomaly or not.
369              
370             my $results = $forest->predict(\@data, $threshold);
371              
372             print "x, y, score, result\n";
373              
374             my $int=0;
375             while (defined($data[$int])) {
376             print $data[$int][0].', '.$data[$int][1].', '.$results->[$int][0].', '.$results->[$int][1]."\n";
377              
378             $int++;
379             }
380              
381             =cut
382              
383             sub score_predict_samples {
384 0     0 1 0 my ( $self, $data, $threshold ) = @_;
385             $threshold
386             = defined $threshold ? $threshold
387             : defined $self->{threshold} ? $self->{threshold}
388 0 0       0 : 0.5;
    0          
389 0         0 my $scores = $self->score_samples($data);
390              
391 0         0 my @to_return;
392 0         0 foreach my $score ( @{$scores} ) {
  0         0  
393 0 0       0 if ( $score >= $threshold ) {
394 0         0 push @to_return, [ $score, 1 ];
395             } else {
396 0         0 push @to_return, [ $score, 0 ];
397             }
398             }
399              
400 0         0 return \@to_return;
401             } ## end sub score_predict_samples
402              
403             =head1 MODEL SAVE/LOAD METHODS
404              
405             =head2 to_json
406              
407             Returns a JSON representation of the module.
408              
409             Required being fit having to be called.
410              
411             my $json = $iforest->to_json;
412              
413             =cut
414              
415             sub to_json {
416 4     4 1 242396 my ($self) = @_;
417 4         15 $self->_check_fitted;
418             my $payload = {
419             format => 'Algorithm::Classifier::IsolationForest',
420             version => 0,
421             params => {
422             n_trees => $self->{n_trees},
423             sample_size => $self->{sample_size},
424             mode => $self->{mode},
425             extension_level => $self->{extension_level_used},
426             contamination => $self->{contamination},
427             threshold => $self->{threshold},
428             n_features => $self->{n_features},
429             psi_used => $self->{psi_used},
430             c_psi => $self->{c_psi},
431             max_depth_used => $self->{max_depth_used},
432             },
433             trees => $self->{trees},
434 3         32 };
435 3         24 return JSON::PP->new->canonical(1)->encode($payload);
436             } ## end sub to_json
437              
438             =head2 from_json($json)
439              
440             Init the object from the model in the specified JSON string.
441              
442             my $iforest = Algorithm::Classifier::IsolationForest->from_json($json);
443              
444             =cut
445              
446             sub from_json {
447 5     5 1 250913 my ( $class, $text ) = @_;
448 5         33 my $payload = JSON::PP->new->decode($text);
449             croak "not an IsolationForest model"
450             unless ref $payload eq 'HASH'
451             && defined $payload->{format}
452 5 100 100     2078138 && $payload->{format} eq 'Algorithm::Classifier::IsolationForest';
      66        
453              
454 3   100     14 my $p = $payload->{params} || {};
455             my $self = {
456             n_trees => $p->{n_trees},
457             sample_size => $p->{sample_size},
458             max_depth => undef,
459             seed => undef,
460             mode => $p->{mode} // 'axis',
461             extension_level => $p->{extension_level},
462             extension_level_used => $p->{extension_level},
463             contamination => $p->{contamination},
464             threshold => $p->{threshold},
465             n_features => $p->{n_features},
466             psi_used => $p->{psi_used},
467             c_psi => $p->{c_psi},
468             max_depth_used => $p->{max_depth_used},
469 3   100     49 trees => $payload->{trees} || [],
      50        
470             };
471 3 100       13 croak "model contains no trees" unless @{ $self->{trees} };
  3         122  
472              
473             # Recompute the normalising constant from the (integer, exact) sub-sample
474             # size rather than trusting the stored float, so a reloaded model's scores
475             # are bit-for-bit identical to the original's.
476 2 50       19 $self->{c_psi} = _c( $self->{psi_used} ) if defined $self->{psi_used};
477              
478 2         21 return bless $self, $class;
479             } ## end sub from_json
480              
481             =head2 save($path)
482              
483             Saves the model to the specified path.
484              
485             $iforest->save($path);
486              
487             =cut
488              
489             sub save {
490 1     1 1 5046 my ( $self, $path ) = @_;
491 1         6 write_file( $path, { 'atomic' => 1 }, $self->to_json );
492             }
493              
494             =head2 load($path);
495              
496             Init the object from the model in the specified file.
497              
498             my $iforest = Algorithm::Classifier::IsolationForest->load($path);
499              
500             =cut
501              
502             sub load {
503 2     2 1 233575 my ( $class, $path ) = @_;
504 2         12 my $raw_model = read_file($path);
505 1         294 return $class->from_json($raw_model);
506             }
507              
508             =head1 REFERENCES
509              
510             Liu, Fei Tony & Ting, Kai & Zhou, Zhi-Hua. (2008). Isolation Forest. 413 - 422. 10.1109/ICDM.2008.17.
511              
512             L
513              
514             L
515              
516             Sahand Hariri, Matias Carrasco Kind, Robert J. Brunner (2020). Extended Isolation Forest. 1479 - 1489. 10.1109/TKDE.2019.2947676
517              
518             L
519              
520             =cut
521              
522             ###
523             ###
524             ### internal stuff below
525             ###
526             ###
527              
528             #-------------------------------------------------------------------------------
529             # c(n): the expected path length of an unsuccessful search in a binary search
530             # tree of n nodes. Isolation Forest uses it (a) to adjust the path length when a
531             # leaf still holds more than one point (depth limit reached), and (b) to
532             # normalise the average path length into a 0..1 anomaly score.
533             #-------------------------------------------------------------------------------
534             sub _c {
535 237911     237911   289622 my ($n) = @_;
536 237911 100       354205 return 0.0 if $n <= 1;
537 200978 100       286565 return 1.0 if $n == 2;
538 187800         235835 my $harmonic = log( $n - 1 ) + EULER; # H(n-1) ~= ln(n-1) + gamma
539 187800         396780 return 2.0 * $harmonic - ( 2.0 * ( $n - 1 ) / $n );
540             }
541              
542             # One draw from the standard normal N(0,1) via Box-Muller. Used to pick the
543             # random hyperplane orientations in Extended Isolation Forest mode.
544             sub _randn {
545 8667   50 8667   12882 my $u1 = rand() || 1e-12;
546 8667         9668 my $u2 = rand();
547 8667         14701 return sqrt( -2.0 * log($u1) ) * cos( TWO_PI * $u2 );
548             }
549              
550             #-------------------------------------------------------------------------------
551             # Draw $k samples without replacement via a partial Fisher-Yates shuffle of the
552             # index array. Returns an arrayref of (shared, read-only) sample refs.
553             #-------------------------------------------------------------------------------
554             sub _subsample {
555 747     747   1619 my ( $data, $k ) = @_;
556 747         1180 my $n = scalar @$data;
557 747         5207 my @idx = ( 0 .. $n - 1 );
558 747         1950 for my $i ( 0 .. $k - 1 ) {
559 130400         165398 my $j = $i + int( rand( $n - $i ) );
560 130400         186194 @idx[ $i, $j ] = @idx[ $j, $i ];
561             }
562 747         7335 my @chosen = @idx[ 0 .. $k - 1 ];
563 747         3307 return [ @{$data}[@chosen] ];
  747         13455  
564             } ## end sub _subsample
565              
566             #-------------------------------------------------------------------------------
567             # Recursively build one isolation tree.
568             #
569             # A node is one of:
570             # leaf { leaf => 1, size => N }
571             # axis { attr => A, split => S, left => ..., right => ... }
572             # oblique { idx => [..], coef => [..], b => B, left => ..., right => ... }
573             #
574             # In both split styles the choice is restricted to features that actually vary
575             # across the points reaching the node: this avoids wasted levels on constant
576             # columns and lets a node leaf out exactly when its points are indistinguishable.
577             #-------------------------------------------------------------------------------
578             sub _build_tree {
579 49529     49529   71161 my ( $self, $X, $depth, $limit ) = @_;
580              
581 49529         57558 my $size = scalar @$X;
582 49529 100 100     136123 return { leaf => 1, size => $size }
583             if $depth >= $limit || $size <= 1;
584              
585 24391         31809 my $nf = $self->{n_features};
586              
587             # Per-feature min and max within this node, in a single pass.
588 24391         29598 my ( @lo, @hi );
589 24391         32677 for my $row (@$X) {
590 985236         1273133 for my $f ( 0 .. $nf - 1 ) {
591 1973283         2297821 my $v = $row->[$f];
592 1973283 100 100     4098513 $lo[$f] = $v if !defined $lo[$f] || $v < $lo[$f];
593 1973283 100 100     4428628 $hi[$f] = $v if !defined $hi[$f] || $v > $hi[$f];
594             }
595             }
596              
597             # Features with spread are the only ones that can split the data.
598 24391         43187 my @varying = grep { $lo[$_] < $hi[$_] } 0 .. $nf - 1;
  48854         88594  
599              
600             # No spread on any feature => all points identical => cannot isolate.
601 24391 50       37978 return { leaf => 1, size => $size } unless @varying;
602              
603             my $node
604 24391 100       56844 = $self->{mode} eq 'extended'
605             ? $self->_oblique_split( $X, \@varying, \@lo, \@hi )
606             : _axis_split( $X, \@varying, \@lo, \@hi );
607              
608 24391         52924 $node->{left} = $self->_build_tree( $node->{_left}, $depth + 1, $limit );
609 24391         39963 $node->{right} = $self->_build_tree( $node->{_right}, $depth + 1, $limit );
610 24391         29794 delete @{$node}{qw(_left _right)};
  24391         60187  
611              
612 24391         56205 return $node;
613             } ## end sub _build_tree
614              
615             # Axis-parallel cut: random varying feature, random threshold in its range.
616             sub _axis_split {
617 19950     19950   30718 my ( $X, $varying, $lo, $hi ) = @_;
618              
619 19950         35784 my $attr = $varying->[ int( rand( scalar @$varying ) ) ];
620 19950         32608 my $split = $lo->[$attr] + rand() * ( $hi->[$attr] - $lo->[$attr] );
621              
622 19950         24911 my ( @left, @right );
623 19950         26520 for my $row (@$X) {
624 784613 100       1038634 if ( $row->[$attr] < $split ) { push @left, $row }
  393279         532595  
625 391334         551352 else { push @right, $row }
626             }
627 19950         75137 return { attr => $attr, split => $split, _left => \@left, _right => \@right };
628             } ## end sub _axis_split
629              
630             # Oblique cut (Extended Isolation Forest): a random hyperplane. We activate
631             # (extension_level + 1) of the varying features, give each a Gaussian
632             # coefficient, and place the plane through a random point in the bounding box.
633             # A point goes left when coef . x <= b, where b = coef . p.
634             sub _oblique_split {
635 4441     4441   6800 my ( $self, $X, $varying, $lo, $hi ) = @_;
636              
637 4441         5974 my $active = $self->{extension_level_used} + 1;
638 4441 100       7245 $active = scalar @$varying if $active > scalar @$varying;
639              
640             # Pick which varying features take part (partial shuffle of their indices).
641 4441         6162 my @pool = @$varying;
642 4441         6379 for my $i ( 0 .. $active - 1 ) {
643 8667         12385 my $j = $i + int( rand( scalar(@pool) - $i ) );
644 8667         13424 @pool[ $i, $j ] = @pool[ $j, $i ];
645             }
646 4441         8279 my @idx = @pool[ 0 .. $active - 1 ];
647              
648 4441         5740 my ( @coef, $b );
649 4441         5211 $b = 0.0;
650 4441         5390 for my $f (@idx) {
651 8667         11037 my $c = _randn();
652 8667         12672 my $p = $lo->[$f] + rand() * ( $hi->[$f] - $lo->[$f] ); # point in the box
653 8667         11128 push @coef, $c;
654 8667         10720 $b += $c * $p;
655             }
656              
657 4441         5345 my ( @left, @right );
658 4441         5785 for my $row (@$X) {
659 200623         210273 my $dot = 0.0;
660 200623         362543 $dot += $coef[$_] * $row->[ $idx[$_] ] for 0 .. $#idx;
661 200623 100       250013 if ( $dot <= $b ) { push @left, $row }
  97910         123730  
662 102713         134813 else { push @right, $row }
663             }
664             return {
665 4441         18624 idx => \@idx,
666             coef => \@coef,
667             b => $b,
668             _left => \@left,
669             _right => \@right
670             };
671             } ## end sub _oblique_split
672              
673             #-------------------------------------------------------------------------------
674             # Path length of a single point in a single tree: edges traversed until a leaf,
675             # plus c(leaf size) when the leaf still holds several points. Handles both axis
676             # and oblique internal nodes, so a model of either mode scores correctly.
677             #-------------------------------------------------------------------------------
678             sub _path_length {
679 237890     237890   334678 my ( $self, $x, $node, $depth ) = @_;
680 237890         359427 while ( !$node->{leaf} ) {
681 1734351         1776345 my $left;
682 1734351 100       2125120 if ( exists $node->{attr} ) { # axis-parallel split
683 1544332         2061302 $left = $x->[ $node->{attr} ] < $node->{split};
684             } else { # oblique (hyperplane) split
685 190019         268315 my ( $idx, $coef ) = ( $node->{idx}, $node->{coef} );
686 190019         207604 my $dot = 0.0;
687 190019         387757 $dot += $coef->[$_] * $x->[ $idx->[$_] ] for 0 .. $#$idx;
688 190019         252778 $left = $dot <= $node->{b};
689             }
690 1734351 100       2460752 $node = $left ? $node->{left} : $node->{right};
691 1734351         2552557 $depth++;
692             } ## end while ( !$node->{leaf} )
693 237890         311781 return $depth + _c( $node->{size} );
694             } ## end sub _path_length
695              
696             sub _check_fitted {
697 37     37   78 my ($self) = @_;
698             croak "model is not fitted yet; call fit() first"
699 37 100 66     172 unless ref $self->{trees} eq 'ARRAY' && @{ $self->{trees} };
  37         618  
700             }
701              
702             1;