File Coverage

blib/lib/Algorithm/Classifier/IsolationForest.pm

Criterion	Covered	Total	%
statement	206	216	95.3
branch	68	84	80.9
condition	46	50	92.0
subroutine	27	28	96.4
pod	10	11	90.9
total	357	389	91.7

line	stmt	bran	cond	sub	pod	time	code
1							package Algorithm::Classifier::IsolationForest;
2
3	8			8		665666	use strict;
	8					12
	8					235
4	8			8		29	use warnings;
	8					20
	8					379
5	8			8		38	use Carp qw(croak);
	8					13
	8					391
6	8			8		35	use List::Util qw(min);
	8					18
	8					515
7	8			8		3448	use POSIX qw(ceil);
	8					53939
	8					41
8	8			8		15031	use JSON::PP ();
	8					117900
	8					271
9	8			8		3714	use File::Slurp qw(read_file write_file);
	8					148811
	8					640
10
11							our $VERSION = '0.1.0';
12
13	8			8		56	use constant EULER => 0.5772156649015329;
	8					19
	8					461
14	8			8		37	use constant TWO_PI => 6.283185307179586;
	8					15
	8					20294
15
16							=head1 NAME
17
18							Algorithm::Classifier::IsolationForest - unsupervised anomaly detection via Isolation Forest or Extended Isolation Forest
19
20							=head1 SYNOPSIS
21
22							use Algorithm::Classifier::IsolationForest;
23
24							my @data = ([0.1, -0.2], [0.0, 0.1], [5.0, 6.0], ...);
25
26							# Classic, axis-parallel Isolation Forest
27							my $iforest = Algorithm::Classifier::IsolationForest->new(
28							n_trees => 100,
29							sample_size => 256,
30							seed => 42,
31							);
32							$iforest->fit(\@data);
33
34							my $scores = $iforest->score_samples(\@data); # arrayref, each in (0,1]
35							my $flags = $iforest->predict(\@data, 0.6); # arrayref of 0/1
36
37							# Save and reload
38							$iforest->save('model.json');
39							my $reloaded = Algorithm::Classifier::IsolationForest->load('model.json');
40
41							# Extended Isolation Forest (oblique hyperplane splits)
42							my $eif = IsolationForest->new(mode => 'extended', seed => 42);
43							$eif->fit(\@data);
44
45							=head1 DESCRIPTION
46
47							Isolation Forest (Liu, Fei Tony & Ting, Kai & Zhou, Zhi-Hua, 2008) detects anomalies by random
48							partitioning rather than by modelling normal points. Each tree repeatedly
49							splits the data. Points that get isolated after only a few splits are likely
50							anomalies. The score is the average isolation depth across many trees,
51							normalised so values approach 1 for anomalies and stay below 0.5 for normal
52							points.
53
54							In extended mode the module implements the Extended Isolation Forest
55							variant. Each split is a random hyperplane instead of an axis-aligned cut,
56							which removes the rectangular, axis-aligned bias in the score field and
57							tends to help on elongated or multi-modal data.
58
59							psi refernced below is ψ or the pitchfork math symbol refrenced in paper,
60							Liu, Fei Tony & Ting, Kai & Zhou, Zhi-Hua. (2008). Isolation Forest. 413 - 422. 10.1109/ICDM.2008.17.
61
62							... or max samples.
63
64							L
65
66							=head1 GENERAL METHODS
67
68							=head2 new(%args)
69
70							Inits the object.
71
72							- n_trees :: number of isolation trees in the ensemble
73							default :: 100
74
75							- sample_size :: sub-sample size used to build each tree... max samples
76							default :: 256
77
78							- max_depth :: per-tree height limit... if not defined is set to ceil(log2(psi))
79							default :: undef
80
81							- seed :: optional integer to seed srand with for reproducible trees...
82							see perldoc -f srand for more info. This number is processed via abs(int()).
83							default :: undef
84
85							- mode :: if it should be IF or EIF
86							axis :: classic axis-parallel splits (IF)
87							extended :: oblique hyperplane splits (EIF)
88							default :: axis
89
90							- extension_level :: extended mode only... how many features take partin each
91							split. 0 behaves like a single-feature (axis) cut; the
92							maximum (n_features - 1) uses every varying feature. undef
93							=> maximum. Clamped to [0, n_features - 1] at fit time.
94
95							- contamination :: expected fraction of anomalies, in (0, 0.5]. When given,
96							fit() learns a score threshold that flags this fraction of
97							the training set, and predict() uses it by default. undef
98							=> no learned threshold (predict() falls back to 0.5).
99							default :: undef
100
101							Note: log2 under Perl is as below...
102
103							log($psi) / log(2)
104
105							=cut
106
107							sub new {
108	39			39	1	947975	my ( $class, %args ) = @_;
109
110	39		100			192	my $mode = $args{mode} // 'axis';
111	39	100	100			284	croak "mode must be 'axis' or 'extended'"
112							unless $mode eq 'axis' \|\| $mode eq 'extended';
113
114	38	100				99	if ( defined( $args{seed} ) ) {
115	20					64	$args{seed} = abs( int( $args{seed} ) );
116							}
117
118							my $self = {
119							n_trees => $args{n_trees} // 100,
120							sample_size => $args{sample_size} // 256,
121							max_depth => $args{max_depth}, # undef => auto
122							seed => $args{seed}, # undef => non-deterministic
123							mode => $mode,
124							extension_level => $args{extension_level}, # undef => max, resolved in fit()
125							contamination => $args{contamination}, # undef => no learned threshold
126	38		100			444	threshold => undef, # learned in fit() if contamination set
			100
127							trees => [],
128							c_psi => undef, # c(psi), set during fit()
129							n_features => undef,
130							};
131
132	38	100				188	croak "n_trees must be >= 1" unless $self->{n_trees} >= 1;
133	37	100				189	croak "sample_size must be >= 1" unless $self->{sample_size} >= 1;
134							croak "extension_level must be >= 0"
135	36	100	100			200	if defined $self->{extension_level} && $self->{extension_level} < 0;
136							croak "contamination must be a number in (0, 0.5]"
137							if defined $self->{contamination}
138	35	100	100			390	&& !( $self->{contamination} > 0 && $self->{contamination} <= 0.5 );
			100
139
140	32					106	return bless $self, $class;
141							} ## end sub new
142
143							=head2 decision_threshold
144
145							The score cutoff C uses by default; undef unless C was
146							set.
147
148							=cut
149
150	6			6	1	2780	sub decision_threshold { return $_[0]->{threshold} }
151
152							=head2 fit
153
154							Trains the model on the specified data.
155
156							The data taken is a array of arrays, with each sub array containing two numbers.
157
158							@training_data = (
159							[ 3, 5 ],
160							[ 2.3, 1 ],
161							[ 5, 9 ],
162							...
163							);
164
165							Below shows a example of building a gausing cluster and using that for training.
166
167							# so it is reproducible
168							srand(7);
169
170							# build a gaussian cluster and add a handful out outliers...
171
172							use constant PI => 3.14159265358979;
173							sub gaussian {
174							my ($mu, $sigma) = @_;
175							my $u1 = rand() \|\| 1e-12;
176							my $u2 = rand();
177							my $z = sqrt(-2 * log($u1)) * cos(2 * PI * $u2);
178							return $mu + $sigma * $z;
179							}
180
181							# add some normal items
182							for (1 .. 500) {
183							push @data, [ gaussian(0, 1), gaussian(0, 1) ];
184							push @truth, 0;
185							}
186							# add some outliers
187							for (1 .. 20) {
188							my $angle = rand() * 2 * PI;
189							my $radius = 5 + rand() * 3; # distance 5..8 from the origin
190							push @data, [ $radius * cos($angle), $radius * sin($angle) ];
191							push @truth, 1;
192							}
193
194							$iforest->fit(\@training_data);
195
196							=cut
197
198							sub fit {
199	24			24	1	2610	my ( $self, $data ) = @_;
200
201	24	100	100			450	croak "fit() expects a non-empty arrayref of samples"
202							unless ref $data eq 'ARRAY' && @$data;
203							croak "each sample must be an arrayref of features"
204	21	100	100			168	unless ref $data->[0] eq 'ARRAY' && @{ $data->[0] };
	20					183
205
206	19					37	my $n = scalar @$data;
207	19					29	my $n_features = scalar @{ $data->[0] };
	19					34
208	19					50	$self->{n_features} = $n_features;
209
210							# The sub-sample cannot be larger than the data set itself.
211	19					107	my $psi = min( $self->{sample_size}, $n );
212	19					76	$self->{c_psi} = _c($psi);
213	19					49	$self->{psi_used} = $psi;
214
215							# Resolve the extension level against the data's dimensionality.
216	19	100				67	if ( $self->{mode} eq 'extended' ) {
217	6					12	my $max_ext = $n_features - 1;
218							my $ext
219							= defined $self->{extension_level}
220							? $self->{extension_level}
221	6	100				18	: $max_ext;
222	6	50				15	$ext = 0 if $ext < 0;
223	6	100				15	$ext = $max_ext if $ext > $max_ext;
224	6					13	$self->{extension_level_used} = $ext;
225							} else {
226	13					30	$self->{extension_level_used} = undef;
227							}
228
229							# Height limit: the average tree height ceil(log2(psi)). Past this depth the
230							# remaining points are scored using the c(size) adjustment instead.
231							my $limit
232							= defined $self->{max_depth}
233							? $self->{max_depth}
234	19	50				119	: ceil( log($psi) / log(2) );
235	19	50				53	$limit = 1 if $limit < 1;
236	19					62	$self->{max_depth_used} = $limit;
237
238	19	50				75	srand( $self->{seed} ) if defined $self->{seed};
239
240	19					64	my @trees;
241	19					64	for ( 1 .. $self->{n_trees} ) {
242	747					1756	my $sample = _subsample( $data, $psi );
243	747					1845	push @trees, $self->_build_tree( $sample, 0, $limit );
244							}
245	19					69	$self->{trees} = \@trees;
246
247							# If a contamination rate was requested, learn the score cutoff that flags
248							# that fraction of the training set. We place the threshold midway between
249							# the k-th and (k+1)-th highest training scores, so it sits in the gap
250							# between flagged and unflagged points -- unambiguous and robust to the
251							# tiny float rounding introduced by JSON serialisation.
252	19	100				86	if ( defined $self->{contamination} ) {
253	3					20	my $scores = $self->score_samples($data);
254	3					60	my @desc = sort { $b <=> $a } @$scores;
	3979					4200
255	3					6	my $n_pts = scalar @desc;
256	3					15	my $k = int( $self->{contamination} * $n_pts + 0.5 );
257	3	50				11	$k = 1 if $k < 1;
258	3	50				54	$k = $n_pts if $k > $n_pts;
259	3	50				41	$self->{threshold} = $k < $n_pts
260							? ( $desc[ $k - 1 ] + $desc[$k] ) / 2.0 # midpoint of the boundary
261							: $desc[ $n_pts - 1 ] - 1e-9; # k == n: flag everything
262							} ## end if ( defined $self->{contamination} )
263
264	19					111	return $self;
265							} ## end sub fit
266
267							=head2 path_lengths(\@data)
268
269							Returns the mean isolation depth per sample, for inspection.
270
271							my @lenghts = $forest->path_lengths(\@data);
272
273							print "x, y, length\n";
274
275							my $int=0;
276							while (defined($data[$int])) {
277							print $data[$int][0].', '.$data[$int][1].', '.$lenghts[$int]."\n";
278
279							$int++;
280							}
281
282							=cut
283
284							sub path_lengths {
285	2			2	1	4610	my ( $self, $data ) = @_;
286	2					8	$self->_check_fitted;
287	1					2	my $t = scalar @{ $self->{trees} };
	1					2
288	1					2	my @out;
289	1					41	for my $x (@$data) {
290	100					125	my $sum = 0;
291	100					120	$sum += $self->_path_length( $x, $_, 0 ) for @{ $self->{trees} };
	100					184
292	100					208	push @out, $sum / $t;
293							}
294	1					10	return \@out;
295							} ## end sub path_lengths
296
297							=head predict(\@data, $threshold)
298
299							Returns an arrayref of 0/1 labels for the specified data.
300
301							If theshold is not specified it uses whatever the set default.
302
303							my $results = $forest->predict(\@data, $threshold);
304
305							print "x, y, result\n";
306
307							my $int=0;
308							while (defined($data[$int])) {
309							print $data[$int][0].', '.$data[$int][1].', '.$results->[$int]."\n";
310
311							$int++;
312							}
313
314							=cut
315
316							sub predict {
317	11			11	0	15236	my ( $self, $data, $threshold ) = @_;
318							$threshold
319							= defined $threshold ? $threshold
320							: defined $self->{threshold} ? $self->{threshold}
321	11	100				49	: 0.5;
		100
322	11					33	my $scores = $self->score_samples($data);
323	10	100				86	return [ map { $_ >= $threshold ? 1 : 0 } @$scores ];
	1107					1683
324							}
325
326							=head2 score_samples(\@data)
327
328							Returns an arrayref of anomaly scores, between 0 and 1.
329
330							Scores near 1 are strong anomalies (isolated quickly).
331
332							Scores well below 0.5 are normal.
333
334							Scores ~0.5 means the points are hard to tell apart.
335
336							my $scores = $forest->path_lengths(\@data);
337
338							print "x, y, length\n";
339
340							my $int=0;
341							while (defined($data[$int])) {
342							print $data[$int][0].', '.$data[$int][1].', '.$scores->[$int]."\n";
343
344							$int++;
345							}
346
347							=cut
348
349							sub score_samples {
350	31			31	1	15892	my ( $self, $data ) = @_;
351	31					118	$self->_check_fitted;
352	29					74	my $c = $self->{c_psi};
353	29					41	my $t = scalar @{ $self->{trees} };
	29					48
354
355	29					48	my @scores;
356	29					57	for my $x (@$data) {
357	3042					3934	my $sum = 0;
358	3042					3640	$sum += $self->_path_length( $x, $_, 0 ) for @{ $self->{trees} };
	3042					7247
359	3042					4862	my $avg = $sum / $t;
360	3042	50				11953	push @scores, $c > 0 ? 2**( -$avg / $c ) : 0.5;
361							}
362	29					162	return \@scores;
363							} ## end sub score_samples
364
365							=head2 score_predict_samples
366
367							Returns a array ref of arrays. First value of each sub array is the score with the second being
368							0/1 for if it is a anomaly or not.
369
370							my $results = $forest->predict(\@data, $threshold);
371
372							print "x, y, score, result\n";
373
374							my $int=0;
375							while (defined($data[$int])) {
376							print $data[$int][0].', '.$data[$int][1].', '.$results->[$int][0].', '.$results->[$int][1]."\n";
377
378							$int++;
379							}
380
381							=cut
382
383							sub score_predict_samples {
384	0			0	1	0	my ( $self, $data, $threshold ) = @_;
385							$threshold
386							= defined $threshold ? $threshold
387							: defined $self->{threshold} ? $self->{threshold}
388	0	0				0	: 0.5;
		0
389	0					0	my $scores = $self->score_samples($data);
390
391	0					0	my @to_return;
392	0					0	foreach my $score ( @{$scores} ) {
	0					0
393	0	0				0	if ( $score >= $threshold ) {
394	0					0	push @to_return, [ $score, 1 ];
395							} else {
396	0					0	push @to_return, [ $score, 0 ];
397							}
398							}
399
400	0					0	return \@to_return;
401							} ## end sub score_predict_samples
402
403							=head1 MODEL SAVE/LOAD METHODS
404
405							=head2 to_json
406
407							Returns a JSON representation of the module.
408
409							Required being fit having to be called.
410
411							my $json = $iforest->to_json;
412
413							=cut
414
415							sub to_json {
416	4			4	1	239076	my ($self) = @_;
417	4					15	$self->_check_fitted;
418							my $payload = {
419							format => 'Algorithm::Classifier::IsolationForest',
420							version => 0,
421							params => {
422							n_trees => $self->{n_trees},
423							sample_size => $self->{sample_size},
424							mode => $self->{mode},
425							extension_level => $self->{extension_level_used},
426							contamination => $self->{contamination},
427							threshold => $self->{threshold},
428							n_features => $self->{n_features},
429							psi_used => $self->{psi_used},
430							c_psi => $self->{c_psi},
431							max_depth_used => $self->{max_depth_used},
432							},
433							trees => $self->{trees},
434	3					34	};
435	3					24	return JSON::PP->new->canonical(1)->encode($payload);
436							} ## end sub to_json
437
438							=head2 from_json($json)
439
440							Init the object from the model in the specified JSON string.
441
442							my $iforest = Algorithm::Classifier::IsolationForest->from_json($json);
443
444							=cut
445
446							sub from_json {
447	5			5	1	235889	my ( $class, $text ) = @_;
448	5					28	my $payload = JSON::PP->new->decode($text);
449							croak "not an IsolationForest model"
450							unless ref $payload eq 'HASH'
451							&& defined $payload->{format}
452	5	100	100			2040093	&& $payload->{format} eq 'Algorithm::Classifier::IsolationForest';
			66
453
454	3		100			15	my $p = $payload->{params} \|\| {};
455							my $self = {
456							n_trees => $p->{n_trees},
457							sample_size => $p->{sample_size},
458							max_depth => undef,
459							seed => undef,
460							mode => $p->{mode} // 'axis',
461							extension_level => $p->{extension_level},
462							extension_level_used => $p->{extension_level},
463							contamination => $p->{contamination},
464							threshold => $p->{threshold},
465							n_features => $p->{n_features},
466							psi_used => $p->{psi_used},
467							c_psi => $p->{c_psi},
468							max_depth_used => $p->{max_depth_used},
469	3		100			45	trees => $payload->{trees} \|\| [],
			50
470							};
471	3	100				12	croak "model contains no trees" unless @{ $self->{trees} };
	3					111
472
473							# Recompute the normalising constant from the (integer, exact) sub-sample
474							# size rather than trusting the stored float, so a reloaded model's scores
475							# are bit-for-bit identical to the original's.
476	2	50				17	$self->{c_psi} = _c( $self->{psi_used} ) if defined $self->{psi_used};
477
478	2					19	return bless $self, $class;
479							} ## end sub from_json
480
481							=head2 save($path)
482
483							Saves the model to the specified path.
484
485							$iforest->save($path);
486
487							=cut
488
489							sub save {
490	1			1	1	7016	my ( $self, $path ) = @_;
491	1					6	write_file( $path, { 'atomic' => 1 }, $self->to_json );
492							}
493
494							=head2 load($path);
495
496							Init the object from the model in the specified file.
497
498							my $iforest = Algorithm::Classifier::IsolationForest->load($path);
499
500							=cut
501
502							sub load {
503	2			2	1	251850	my ( $class, $path ) = @_;
504	2					10	my $raw_model = read_file($path);
505	1					201	return $class->from_json($raw_model);
506							}
507
508							=head1 REFERENCES
509
510							Liu, Fei Tony & Ting, Kai & Zhou, Zhi-Hua. (2008). Isolation Forest. 413 - 422. 10.1109/ICDM.2008.17.
511
512							L
513
514							L
515
516							Sahand Hariri, Matias Carrasco Kind, Robert J. Brunner (2020). Extended Isolation Forest. 1479 - 1489. 10.1109/TKDE.2019.2947676
517
518							L
519
520							=cut
521
522							###
523							###
524							### internal stuff below
525							###
526							###
527
528							#-------------------------------------------------------------------------------
529							# c(n): the expected path length of an unsuccessful search in a binary search
530							# tree of n nodes. Isolation Forest uses it (a) to adjust the path length when a
531							# leaf still holds more than one point (depth limit reached), and (b) to
532							# normalise the average path length into a 0..1 anomaly score.
533							#-------------------------------------------------------------------------------
534							sub _c {
535	237911			237911		295870	my ($n) = @_;
536	237911	100				361868	return 0.0 if $n <= 1;
537	200978	100				290040	return 1.0 if $n == 2;
538	187800					249109	my $harmonic = log( $n - 1 ) + EULER; # H(n-1) ~= ln(n-1) + gamma
539	187800					434716	return 2.0 * $harmonic - ( 2.0 * ( $n - 1 ) / $n );
540							}
541
542							# One draw from the standard normal N(0,1) via Box-Muller. Used to pick the
543							# random hyperplane orientations in Extended Isolation Forest mode.
544							sub _randn {
545	8667		50	8667		12677	my $u1 = rand() \|\| 1e-12;
546	8667					9776	my $u2 = rand();
547	8667					15160	return sqrt( -2.0 * log($u1) ) * cos( TWO_PI * $u2 );
548							}
549
550							#-------------------------------------------------------------------------------
551							# Draw $k samples without replacement via a partial Fisher-Yates shuffle of the
552							# index array. Returns an arrayref of (shared, read-only) sample refs.
553							#-------------------------------------------------------------------------------
554							sub _subsample {
555	747			747		1499	my ( $data, $k ) = @_;
556	747					1154	my $n = scalar @$data;
557	747					4648	my @idx = ( 0 .. $n - 1 );
558	747					1773	for my $i ( 0 .. $k - 1 ) {
559	130400					154660	my $j = $i + int( rand( $n - $i ) );
560	130400					175097	@idx[ $i, $j ] = @idx[ $j, $i ];
561							}
562	747					6879	my @chosen = @idx[ 0 .. $k - 1 ];
563	747					3322	return [ @{$data}[@chosen] ];
	747					12146
564							} ## end sub _subsample
565
566							#-------------------------------------------------------------------------------
567							# Recursively build one isolation tree.
568							#
569							# A node is one of:
570							# leaf { leaf => 1, size => N }
571							# axis { attr => A, split => S, left => ..., right => ... }
572							# oblique { idx => [..], coef => [..], b => B, left => ..., right => ... }
573							#
574							# In both split styles the choice is restricted to features that actually vary
575							# across the points reaching the node: this avoids wasted levels on constant
576							# columns and lets a node leaf out exactly when its points are indistinguishable.
577							#-------------------------------------------------------------------------------
578							sub _build_tree {
579	49529			49529		69359	my ( $self, $X, $depth, $limit ) = @_;
580
581	49529					56940	my $size = scalar @$X;
582	49529	100	100			126280	return { leaf => 1, size => $size }
583							if $depth >= $limit \|\| $size <= 1;
584
585	24391					31822	my $nf = $self->{n_features};
586
587							# Per-feature min and max within this node, in a single pass.
588	24391					28143	my ( @lo, @hi );
589	24391					30662	for my $row (@$X) {
590	985236					1213078	for my $f ( 0 .. $nf - 1 ) {
591	1973283					2200984	my $v = $row->[$f];
592	1973283	100	100			3849619	$lo[$f] = $v if !defined $lo[$f] \|\| $v < $lo[$f];
593	1973283	100	100			4145345	$hi[$f] = $v if !defined $hi[$f] \|\| $v > $hi[$f];
594							}
595							}
596
597							# Features with spread are the only ones that can split the data.
598	24391					40361	my @varying = grep { $lo[$_] < $hi[$_] } 0 .. $nf - 1;
	48854					82284
599
600							# No spread on any feature => all points identical => cannot isolate.
601	24391	50				35816	return { leaf => 1, size => $size } unless @varying;
602
603							my $node
604	24391	100				53504	= $self->{mode} eq 'extended'
605							? $self->_oblique_split( $X, \@varying, \@lo, \@hi )
606							: _axis_split( $X, \@varying, \@lo, \@hi );
607
608	24391					48984	$node->{left} = $self->_build_tree( $node->{_left}, $depth + 1, $limit );
609	24391					38121	$node->{right} = $self->_build_tree( $node->{_right}, $depth + 1, $limit );
610	24391					28402	delete @{$node}{qw(_left _right)};
	24391					57218
611
612	24391					50214	return $node;
613							} ## end sub _build_tree
614
615							# Axis-parallel cut: random varying feature, random threshold in its range.
616							sub _axis_split {
617	19950			19950		28619	my ( $X, $varying, $lo, $hi ) = @_;
618
619	19950					33289	my $attr = $varying->[ int( rand( scalar @$varying ) ) ];
620	19950					30601	my $split = $lo->[$attr] + rand() * ( $hi->[$attr] - $lo->[$attr] );
621
622	19950					23145	my ( @left, @right );
623	19950					24627	for my $row (@$X) {
624	784613	100				969518	if ( $row->[$attr] < $split ) { push @left, $row }
	393279					497925
625	391334					519059	else { push @right, $row }
626							}
627	19950					67672	return { attr => $attr, split => $split, _left => \@left, _right => \@right };
628							} ## end sub _axis_split
629
630							# Oblique cut (Extended Isolation Forest): a random hyperplane. We activate
631							# (extension_level + 1) of the varying features, give each a Gaussian
632							# coefficient, and place the plane through a random point in the bounding box.
633							# A point goes left when coef . x <= b, where b = coef . p.
634							sub _oblique_split {
635	4441			4441		7089	my ( $self, $X, $varying, $lo, $hi ) = @_;
636
637	4441					5543	my $active = $self->{extension_level_used} + 1;
638	4441	100				7127	$active = scalar @$varying if $active > scalar @$varying;
639
640							# Pick which varying features take part (partial shuffle of their indices).
641	4441					6141	my @pool = @$varying;
642	4441					6277	for my $i ( 0 .. $active - 1 ) {
643	8667					12740	my $j = $i + int( rand( scalar(@pool) - $i ) );
644	8667					13827	@pool[ $i, $j ] = @pool[ $j, $i ];
645							}
646	4441					8468	my @idx = @pool[ 0 .. $active - 1 ];
647
648	4441					6028	my ( @coef, $b );
649	4441					5037	$b = 0.0;
650	4441					5725	for my $f (@idx) {
651	8667					11193	my $c = _randn();
652	8667					12961	my $p = $lo->[$f] + rand() * ( $hi->[$f] - $lo->[$f] ); # point in the box
653	8667					11527	push @coef, $c;
654	8667					11876	$b += $c * $p;
655							}
656
657	4441					4985	my ( @left, @right );
658	4441					5414	for my $row (@$X) {
659	200623					208196	my $dot = 0.0;
660	200623					364031	$dot += $coef[$_] * $row->[ $idx[$_] ] for 0 .. $#idx;
661	200623	100				250593	if ( $dot <= $b ) { push @left, $row }
	97910					121753
662	102713					134410	else { push @right, $row }
663							}
664							return {
665	4441					20214	idx => \@idx,
666							coef => \@coef,
667							b => $b,
668							_left => \@left,
669							_right => \@right
670							};
671							} ## end sub _oblique_split
672
673							#-------------------------------------------------------------------------------
674							# Path length of a single point in a single tree: edges traversed until a leaf,
675							# plus c(leaf size) when the leaf still holds several points. Handles both axis
676							# and oblique internal nodes, so a model of either mode scores correctly.
677							#-------------------------------------------------------------------------------
678							sub _path_length {
679	237890			237890		343816	my ( $self, $x, $node, $depth ) = @_;
680	237890					374312	while ( !$node->{leaf} ) {
681	1734351					1835530	my $left;
682	1734351	100				2203699	if ( exists $node->{attr} ) { # axis-parallel split
683	1544332					2142557	$left = $x->[ $node->{attr} ] < $node->{split};
684							} else { # oblique (hyperplane) split
685	190019					264082	my ( $idx, $coef ) = ( $node->{idx}, $node->{coef} );
686	190019					204258	my $dot = 0.0;
687	190019					402155	$dot += $coef->[$_] * $x->[ $idx->[$_] ] for 0 .. $#$idx;
688	190019					259095	$left = $dot <= $node->{b};
689							}
690	1734351	100				2477950	$node = $left ? $node->{left} : $node->{right};
691	1734351					2710504	$depth++;
692							} ## end while ( !$node->{leaf} )
693	237890					327100	return $depth + _c( $node->{size} );
694							} ## end sub _path_length
695
696							sub _check_fitted {
697	37			37		87	my ($self) = @_;
698							croak "model is not fitted yet; call fit() first"
699	37	100	66			167	unless ref $self->{trees} eq 'ARRAY' && @{ $self->{trees} };
	37					538
700							}
701
702							1;