File Coverage

blib/lib/Algorithm/Classifier/IsolationForest.pm

Criterion	Covered	Total	%
statement	206	216	95.3
branch	68	84	80.9
condition	46	50	92.0
subroutine	27	28	96.4
pod	10	11	90.9
total	357	389	91.7

line	stmt	bran	cond	sub	pod	time	code
1							package Algorithm::Classifier::IsolationForest;
2
3	8			8		703549	use strict;
	8					16
	8					308
4	8			8		32	use warnings;
	8					27
	8					400
5	8			8		40	use Carp qw(croak);
	8					13
	8					378
6	8			8		34	use List::Util qw(min);
	8					18
	8					490
7	8			8		3543	use POSIX qw(ceil);
	8					59001
	8					38
8	8			8		15930	use JSON::PP ();
	8					128821
	8					246
9	8			8		4048	use File::Slurp qw(read_file write_file);
	8					156552
	8					687
10
11							our $VERSION = '0.0.1';
12
13	8			8		59	use constant EULER => 0.5772156649015329;
	8					17
	8					442
14	8			8		36	use constant TWO_PI => 6.283185307179586;
	8					11
	8					18447
15
16							=head1 NAME
17
18							Algorithm::Classifier::IsolationForest - unsupervised anomaly detection via Isolation Forest or Extended Isolation Forest
19
20							=head1 SYNOPSIS
21
22							use Algorithm::Classifier::IsolationForest;
23
24							my @data = ([0.1, -0.2], [0.0, 0.1], [5.0, 6.0], ...);
25
26							# Classic, axis-parallel Isolation Forest
27							my $iforest = Algorithm::Classifier::IsolationForest->new(
28							n_trees => 100,
29							sample_size => 256,
30							seed => 42,
31							);
32							$iforest->fit(\@data);
33
34							my $scores = $iforest->score_samples(\@data); # arrayref, each in (0,1]
35							my $flags = $iforest->predict(\@data, 0.6); # arrayref of 0/1
36
37							# Save and reload
38							$iforest->save('model.json');
39							my $reloaded = Algorithm::Classifier::IsolationForest->load('model.json');
40
41							# Extended Isolation Forest (oblique hyperplane splits)
42							my $eif = IsolationForest->new(mode => 'extended', seed => 42);
43							$eif->fit(\@data);
44
45							=head1 DESCRIPTION
46
47							Isolation Forest (Liu, Fei Tony & Ting, Kai & Zhou, Zhi-Hua, 2008) detects anomalies by random
48							partitioning rather than by modelling normal points. Each tree repeatedly
49							splits the data. Points that get isolated after only a few splits are likely
50							anomalies. The score is the average isolation depth across many trees,
51							normalised so values approach 1 for anomalies and stay below 0.5 for normal
52							points.
53
54							In extended mode the module implements the Extended Isolation Forest
55							variant. Each split is a random hyperplane instead of an axis-aligned cut,
56							which removes the rectangular, axis-aligned bias in the score field and
57							tends to help on elongated or multi-modal data.
58
59							psi refernced below is ψ or the pitchfork math symbol refrenced in paper,
60							Liu, Fei Tony & Ting, Kai & Zhou, Zhi-Hua. (2008). Isolation Forest. 413 - 422. 10.1109/ICDM.2008.17.
61
62							... or max samples.
63
64							L
65
66							=head1 GENERAL METHODS
67
68							=head2 new(%args)
69
70							Inits the object.
71
72							- n_trees :: number of isolation trees in the ensemble
73							default :: 100
74
75							- sample_size :: sub-sample size used to build each tree... max samples
76							default :: 256
77
78							- max_depth :: per-tree height limit... if not defined is set to ceil(log2(psi))
79							default :: undef
80
81							- seed :: optional integer to seed srand with for reproducible trees...
82							see perldoc -f srand for more info. This number is processed via abs(int()).
83							default :: undef
84
85							- mode :: if it should be IF or EIF
86							axis :: classic axis-parallel splits (IF)
87							extended :: oblique hyperplane splits (EIF)
88							default :: axis
89
90							- extension_level :: extended mode only... how many features take partin each
91							split. 0 behaves like a single-feature (axis) cut; the
92							maximum (n_features - 1) uses every varying feature. undef
93							=> maximum. Clamped to [0, n_features - 1] at fit time.
94
95							- contamination :: expected fraction of anomalies, in (0, 0.5]. When given,
96							fit() learns a score threshold that flags this fraction of
97							the training set, and predict() uses it by default. undef
98							=> no learned threshold (predict() falls back to 0.5).
99							default :: undef
100
101							Note: log2 under Perl is as below...
102
103							log($psi) / log(2)
104
105							=cut
106
107							sub new {
108	39			39	1	937750	my ( $class, %args ) = @_;
109
110	39		100			193	my $mode = $args{mode} // 'axis';
111	39	100	100			331	croak "mode must be 'axis' or 'extended'"
112							unless $mode eq 'axis' \|\| $mode eq 'extended';
113
114	38	100				99	if ( defined( $args{seed} ) ) {
115	20					63	$args{seed} = abs( int( $args{seed} ) );
116							}
117
118							my $self = {
119							n_trees => $args{n_trees} // 100,
120							sample_size => $args{sample_size} // 256,
121							max_depth => $args{max_depth}, # undef => auto
122							seed => $args{seed}, # undef => non-deterministic
123							mode => $mode,
124							extension_level => $args{extension_level}, # undef => max, resolved in fit()
125							contamination => $args{contamination}, # undef => no learned threshold
126	38		100			442	threshold => undef, # learned in fit() if contamination set
			100
127							trees => [],
128							c_psi => undef, # c(psi), set during fit()
129							n_features => undef,
130							};
131
132	38	100				189	croak "n_trees must be >= 1" unless $self->{n_trees} >= 1;
133	37	100				161	croak "sample_size must be >= 1" unless $self->{sample_size} >= 1;
134							croak "extension_level must be >= 0"
135	36	100	100			190	if defined $self->{extension_level} && $self->{extension_level} < 0;
136							croak "contamination must be a number in (0, 0.5]"
137							if defined $self->{contamination}
138	35	100	100			392	&& !( $self->{contamination} > 0 && $self->{contamination} <= 0.5 );
			100
139
140	32					100	return bless $self, $class;
141							} ## end sub new
142
143							=head2 decision_threshold
144
145							The score cutoff C uses by default; undef unless C was
146							set.
147
148							=cut
149
150	6			6	1	2846	sub decision_threshold { return $_[0]->{threshold} }
151
152							=head2 fit
153
154							Trains the model on the specified data.
155
156							The data taken is a array of arrays, with each sub array containing two numbers.
157
158							@training_data = (
159							[ 3, 5 ],
160							[ 2.3, 1 ],
161							[ 5, 9 ],
162							...
163							);
164
165							Below shows a example of building a gausing cluster and using that for training.
166
167							# so it is reproducible
168							srand(7);
169
170							# build a gaussian cluster and add a handful out outliers...
171
172							use constant PI => 3.14159265358979;
173							sub gaussian {
174							my ($mu, $sigma) = @_;
175							my $u1 = rand() \|\| 1e-12;
176							my $u2 = rand();
177							my $z = sqrt(-2 * log($u1)) * cos(2 * PI * $u2);
178							return $mu + $sigma * $z;
179							}
180
181							# add some normal items
182							for (1 .. 500) {
183							push @data, [ gaussian(0, 1), gaussian(0, 1) ];
184							push @truth, 0;
185							}
186							# add some outliers
187							for (1 .. 20) {
188							my $angle = rand() * 2 * PI;
189							my $radius = 5 + rand() * 3; # distance 5..8 from the origin
190							push @data, [ $radius * cos($angle), $radius * sin($angle) ];
191							push @truth, 1;
192							}
193
194							$iforest->fit(\@training_data);
195
196							=cut
197
198							sub fit {
199	24			24	1	2452	my ( $self, $data ) = @_;
200
201	24	100	100			446	croak "fit() expects a non-empty arrayref of samples"
202							unless ref $data eq 'ARRAY' && @$data;
203							croak "each sample must be an arrayref of features"
204	21	100	100			170	unless ref $data->[0] eq 'ARRAY' && @{ $data->[0] };
	20					189
205
206	19					37	my $n = scalar @$data;
207	19					27	my $n_features = scalar @{ $data->[0] };
	19					33
208	19					44	$self->{n_features} = $n_features;
209
210							# The sub-sample cannot be larger than the data set itself.
211	19					135	my $psi = min( $self->{sample_size}, $n );
212	19					80	$self->{c_psi} = _c($psi);
213	19					54	$self->{psi_used} = $psi;
214
215							# Resolve the extension level against the data's dimensionality.
216	19	100				59	if ( $self->{mode} eq 'extended' ) {
217	6					11	my $max_ext = $n_features - 1;
218							my $ext
219							= defined $self->{extension_level}
220							? $self->{extension_level}
221	6	100				13	: $max_ext;
222	6	50				13	$ext = 0 if $ext < 0;
223	6	100				12	$ext = $max_ext if $ext > $max_ext;
224	6					19	$self->{extension_level_used} = $ext;
225							} else {
226	13					28	$self->{extension_level_used} = undef;
227							}
228
229							# Height limit: the average tree height ceil(log2(psi)). Past this depth the
230							# remaining points are scored using the c(size) adjustment instead.
231							my $limit
232							= defined $self->{max_depth}
233							? $self->{max_depth}
234	19	50				162	: ceil( log($psi) / log(2) );
235	19	50				45	$limit = 1 if $limit < 1;
236	19					37	$self->{max_depth_used} = $limit;
237
238	19	50				64	srand( $self->{seed} ) if defined $self->{seed};
239
240	19					26	my @trees;
241	19					52	for ( 1 .. $self->{n_trees} ) {
242	747					2120	my $sample = _subsample( $data, $psi );
243	747					1985	push @trees, $self->_build_tree( $sample, 0, $limit );
244							}
245	19					77	$self->{trees} = \@trees;
246
247							# If a contamination rate was requested, learn the score cutoff that flags
248							# that fraction of the training set. We place the threshold midway between
249							# the k-th and (k+1)-th highest training scores, so it sits in the gap
250							# between flagged and unflagged points -- unambiguous and robust to the
251							# tiny float rounding introduced by JSON serialisation.
252	19	100				79	if ( defined $self->{contamination} ) {
253	3					19	my $scores = $self->score_samples($data);
254	3					41	my @desc = sort { $b <=> $a } @$scores;
	3979					4379
255	3					8	my $n_pts = scalar @desc;
256	3					35	my $k = int( $self->{contamination} * $n_pts + 0.5 );
257	3	50				12	$k = 1 if $k < 1;
258	3	50				9	$k = $n_pts if $k > $n_pts;
259	3	50				40	$self->{threshold} = $k < $n_pts
260							? ( $desc[ $k - 1 ] + $desc[$k] ) / 2.0 # midpoint of the boundary
261							: $desc[ $n_pts - 1 ] - 1e-9; # k == n: flag everything
262							} ## end if ( defined $self->{contamination} )
263
264	19					108	return $self;
265							} ## end sub fit
266
267							=head2 path_lengths(\@data)
268
269							Returns the mean isolation depth per sample, for inspection.
270
271							my @lenghts = $forest->path_lengths(\@data);
272
273							print "x, y, length\n";
274
275							my $int=0;
276							while (defined($data[$int])) {
277							print $data[$int][0].', '.$data[$int][1].', '.$lenghts[$int]."\n";
278
279							$int++;
280							}
281
282							=cut
283
284							sub path_lengths {
285	2			2	1	4711	my ( $self, $data ) = @_;
286	2					10	$self->_check_fitted;
287	1					2	my $t = scalar @{ $self->{trees} };
	1					2
288	1					2	my @out;
289	1					3	for my $x (@$data) {
290	100					124	my $sum = 0;
291	100					109	$sum += $self->_path_length( $x, $_, 0 ) for @{ $self->{trees} };
	100					213
292	100					283	push @out, $sum / $t;
293							}
294	1					11	return \@out;
295							} ## end sub path_lengths
296
297							=head predict(\@data, $threshold)
298
299							Returns an arrayref of 0/1 labels for the specified data.
300
301							If theshold is not specified it uses whatever the set default.
302
303							my $results = $forest->predict(\@data, $threshold);
304
305							print "x, y, result\n";
306
307							my $int=0;
308							while (defined($data[$int])) {
309							print $data[$int][0].', '.$data[$int][1].', '.$results->[$int]."\n";
310
311							$int++;
312							}
313
314							=cut
315
316							sub predict {
317	11			11	0	13460	my ( $self, $data, $threshold ) = @_;
318							$threshold
319							= defined $threshold ? $threshold
320							: defined $self->{threshold} ? $self->{threshold}
321	11	100				49	: 0.5;
		100
322	11					58	my $scores = $self->score_samples($data);
323	10	100				79	return [ map { $_ >= $threshold ? 1 : 0 } @$scores ];
	1107					1891
324							}
325
326							=head2 score_samples(\@data)
327
328							Returns an arrayref of anomaly scores, between 0 and 1.
329
330							Scores near 1 are strong anomalies (isolated quickly).
331
332							Scores well below 0.5 are normal.
333
334							Scores ~0.5 means the points are hard to tell apart.
335
336							my $scores = $forest->path_lengths(\@data);
337
338							print "x, y, length\n";
339
340							my $int=0;
341							while (defined($data[$int])) {
342							print $data[$int][0].', '.$data[$int][1].', '.$scores->[$int]."\n";
343
344							$int++;
345							}
346
347							=cut
348
349							sub score_samples {
350	31			31	1	14616	my ( $self, $data ) = @_;
351	31					125	$self->_check_fitted;
352	29					66	my $c = $self->{c_psi};
353	29					45	my $t = scalar @{ $self->{trees} };
	29					54
354
355	29					47	my @scores;
356	29					61	for my $x (@$data) {
357	3042					3818	my $sum = 0;
358	3042					3520	$sum += $self->_path_length( $x, $_, 0 ) for @{ $self->{trees} };
	3042					6648
359	3042					4436	my $avg = $sum / $t;
360	3042	50				16148	push @scores, $c > 0 ? 2**( -$avg / $c ) : 0.5;
361							}
362	29					178	return \@scores;
363							} ## end sub score_samples
364
365							=head2 score_predict_samples
366
367							Returns a array ref of arrays. First value of each sub array is the score with the second being
368							0/1 for if it is a anomaly or not.
369
370							my $results = $forest->predict(\@data, $threshold);
371
372							print "x, y, score, result\n";
373
374							my $int=0;
375							while (defined($data[$int])) {
376							print $data[$int][0].', '.$data[$int][1].', '.$results->[$int][0].', '.$results->[$int][1]."\n";
377
378							$int++;
379							}
380
381							=cut
382
383							sub score_predict_samples {
384	0			0	1	0	my ( $self, $data, $threshold ) = @_;
385							$threshold
386							= defined $threshold ? $threshold
387							: defined $self->{threshold} ? $self->{threshold}
388	0	0				0	: 0.5;
		0
389	0					0	my $scores = $self->score_samples($data);
390
391	0					0	my @to_return;
392	0					0	foreach my $score ( @{$scores} ) {
	0					0
393	0	0				0	if ( $score >= $threshold ) {
394	0					0	push @to_return, [ $score, 1 ];
395							} else {
396	0					0	push @to_return, [ $score, 0 ];
397							}
398							}
399
400	0					0	return \@to_return;
401							} ## end sub score_predict_samples
402
403							=head1 MODEL SAVE/LOAD METHODS
404
405							=head2 to_json
406
407							Returns a JSON representation of the module.
408
409							Required being fit having to be called.
410
411							my $json = $iforest->to_json;
412
413							=cut
414
415							sub to_json {
416	4			4	1	242396	my ($self) = @_;
417	4					15	$self->_check_fitted;
418							my $payload = {
419							format => 'Algorithm::Classifier::IsolationForest',
420							version => 0,
421							params => {
422							n_trees => $self->{n_trees},
423							sample_size => $self->{sample_size},
424							mode => $self->{mode},
425							extension_level => $self->{extension_level_used},
426							contamination => $self->{contamination},
427							threshold => $self->{threshold},
428							n_features => $self->{n_features},
429							psi_used => $self->{psi_used},
430							c_psi => $self->{c_psi},
431							max_depth_used => $self->{max_depth_used},
432							},
433							trees => $self->{trees},
434	3					32	};
435	3					24	return JSON::PP->new->canonical(1)->encode($payload);
436							} ## end sub to_json
437
438							=head2 from_json($json)
439
440							Init the object from the model in the specified JSON string.
441
442							my $iforest = Algorithm::Classifier::IsolationForest->from_json($json);
443
444							=cut
445
446							sub from_json {
447	5			5	1	250913	my ( $class, $text ) = @_;
448	5					33	my $payload = JSON::PP->new->decode($text);
449							croak "not an IsolationForest model"
450							unless ref $payload eq 'HASH'
451							&& defined $payload->{format}
452	5	100	100			2078138	&& $payload->{format} eq 'Algorithm::Classifier::IsolationForest';
			66
453
454	3		100			14	my $p = $payload->{params} \|\| {};
455							my $self = {
456							n_trees => $p->{n_trees},
457							sample_size => $p->{sample_size},
458							max_depth => undef,
459							seed => undef,
460							mode => $p->{mode} // 'axis',
461							extension_level => $p->{extension_level},
462							extension_level_used => $p->{extension_level},
463							contamination => $p->{contamination},
464							threshold => $p->{threshold},
465							n_features => $p->{n_features},
466							psi_used => $p->{psi_used},
467							c_psi => $p->{c_psi},
468							max_depth_used => $p->{max_depth_used},
469	3		100			49	trees => $payload->{trees} \|\| [],
			50
470							};
471	3	100				13	croak "model contains no trees" unless @{ $self->{trees} };
	3					122
472
473							# Recompute the normalising constant from the (integer, exact) sub-sample
474							# size rather than trusting the stored float, so a reloaded model's scores
475							# are bit-for-bit identical to the original's.
476	2	50				19	$self->{c_psi} = _c( $self->{psi_used} ) if defined $self->{psi_used};
477
478	2					21	return bless $self, $class;
479							} ## end sub from_json
480
481							=head2 save($path)
482
483							Saves the model to the specified path.
484
485							$iforest->save($path);
486
487							=cut
488
489							sub save {
490	1			1	1	5046	my ( $self, $path ) = @_;
491	1					6	write_file( $path, { 'atomic' => 1 }, $self->to_json );
492							}
493
494							=head2 load($path);
495
496							Init the object from the model in the specified file.
497
498							my $iforest = Algorithm::Classifier::IsolationForest->load($path);
499
500							=cut
501
502							sub load {
503	2			2	1	233575	my ( $class, $path ) = @_;
504	2					12	my $raw_model = read_file($path);
505	1					294	return $class->from_json($raw_model);
506							}
507
508							=head1 REFERENCES
509
510							Liu, Fei Tony & Ting, Kai & Zhou, Zhi-Hua. (2008). Isolation Forest. 413 - 422. 10.1109/ICDM.2008.17.
511
512							L
513
514							L
515
516							Sahand Hariri, Matias Carrasco Kind, Robert J. Brunner (2020). Extended Isolation Forest. 1479 - 1489. 10.1109/TKDE.2019.2947676
517
518							L
519
520							=cut
521
522							###
523							###
524							### internal stuff below
525							###
526							###
527
528							#-------------------------------------------------------------------------------
529							# c(n): the expected path length of an unsuccessful search in a binary search
530							# tree of n nodes. Isolation Forest uses it (a) to adjust the path length when a
531							# leaf still holds more than one point (depth limit reached), and (b) to
532							# normalise the average path length into a 0..1 anomaly score.
533							#-------------------------------------------------------------------------------
534							sub _c {
535	237911			237911		289622	my ($n) = @_;
536	237911	100				354205	return 0.0 if $n <= 1;
537	200978	100				286565	return 1.0 if $n == 2;
538	187800					235835	my $harmonic = log( $n - 1 ) + EULER; # H(n-1) ~= ln(n-1) + gamma
539	187800					396780	return 2.0 * $harmonic - ( 2.0 * ( $n - 1 ) / $n );
540							}
541
542							# One draw from the standard normal N(0,1) via Box-Muller. Used to pick the
543							# random hyperplane orientations in Extended Isolation Forest mode.
544							sub _randn {
545	8667		50	8667		12882	my $u1 = rand() \|\| 1e-12;
546	8667					9668	my $u2 = rand();
547	8667					14701	return sqrt( -2.0 * log($u1) ) * cos( TWO_PI * $u2 );
548							}
549
550							#-------------------------------------------------------------------------------
551							# Draw $k samples without replacement via a partial Fisher-Yates shuffle of the
552							# index array. Returns an arrayref of (shared, read-only) sample refs.
553							#-------------------------------------------------------------------------------
554							sub _subsample {
555	747			747		1619	my ( $data, $k ) = @_;
556	747					1180	my $n = scalar @$data;
557	747					5207	my @idx = ( 0 .. $n - 1 );
558	747					1950	for my $i ( 0 .. $k - 1 ) {
559	130400					165398	my $j = $i + int( rand( $n - $i ) );
560	130400					186194	@idx[ $i, $j ] = @idx[ $j, $i ];
561							}
562	747					7335	my @chosen = @idx[ 0 .. $k - 1 ];
563	747					3307	return [ @{$data}[@chosen] ];
	747					13455
564							} ## end sub _subsample
565
566							#-------------------------------------------------------------------------------
567							# Recursively build one isolation tree.
568							#
569							# A node is one of:
570							# leaf { leaf => 1, size => N }
571							# axis { attr => A, split => S, left => ..., right => ... }
572							# oblique { idx => [..], coef => [..], b => B, left => ..., right => ... }
573							#
574							# In both split styles the choice is restricted to features that actually vary
575							# across the points reaching the node: this avoids wasted levels on constant
576							# columns and lets a node leaf out exactly when its points are indistinguishable.
577							#-------------------------------------------------------------------------------
578							sub _build_tree {
579	49529			49529		71161	my ( $self, $X, $depth, $limit ) = @_;
580
581	49529					57558	my $size = scalar @$X;
582	49529	100	100			136123	return { leaf => 1, size => $size }
583							if $depth >= $limit \|\| $size <= 1;
584
585	24391					31809	my $nf = $self->{n_features};
586
587							# Per-feature min and max within this node, in a single pass.
588	24391					29598	my ( @lo, @hi );
589	24391					32677	for my $row (@$X) {
590	985236					1273133	for my $f ( 0 .. $nf - 1 ) {
591	1973283					2297821	my $v = $row->[$f];
592	1973283	100	100			4098513	$lo[$f] = $v if !defined $lo[$f] \|\| $v < $lo[$f];
593	1973283	100	100			4428628	$hi[$f] = $v if !defined $hi[$f] \|\| $v > $hi[$f];
594							}
595							}
596
597							# Features with spread are the only ones that can split the data.
598	24391					43187	my @varying = grep { $lo[$_] < $hi[$_] } 0 .. $nf - 1;
	48854					88594
599
600							# No spread on any feature => all points identical => cannot isolate.
601	24391	50				37978	return { leaf => 1, size => $size } unless @varying;
602
603							my $node
604	24391	100				56844	= $self->{mode} eq 'extended'
605							? $self->_oblique_split( $X, \@varying, \@lo, \@hi )
606							: _axis_split( $X, \@varying, \@lo, \@hi );
607
608	24391					52924	$node->{left} = $self->_build_tree( $node->{_left}, $depth + 1, $limit );
609	24391					39963	$node->{right} = $self->_build_tree( $node->{_right}, $depth + 1, $limit );
610	24391					29794	delete @{$node}{qw(_left _right)};
	24391					60187
611
612	24391					56205	return $node;
613							} ## end sub _build_tree
614
615							# Axis-parallel cut: random varying feature, random threshold in its range.
616							sub _axis_split {
617	19950			19950		30718	my ( $X, $varying, $lo, $hi ) = @_;
618
619	19950					35784	my $attr = $varying->[ int( rand( scalar @$varying ) ) ];
620	19950					32608	my $split = $lo->[$attr] + rand() * ( $hi->[$attr] - $lo->[$attr] );
621
622	19950					24911	my ( @left, @right );
623	19950					26520	for my $row (@$X) {
624	784613	100				1038634	if ( $row->[$attr] < $split ) { push @left, $row }
	393279					532595
625	391334					551352	else { push @right, $row }
626							}
627	19950					75137	return { attr => $attr, split => $split, _left => \@left, _right => \@right };
628							} ## end sub _axis_split
629
630							# Oblique cut (Extended Isolation Forest): a random hyperplane. We activate
631							# (extension_level + 1) of the varying features, give each a Gaussian
632							# coefficient, and place the plane through a random point in the bounding box.
633							# A point goes left when coef . x <= b, where b = coef . p.
634							sub _oblique_split {
635	4441			4441		6800	my ( $self, $X, $varying, $lo, $hi ) = @_;
636
637	4441					5974	my $active = $self->{extension_level_used} + 1;
638	4441	100				7245	$active = scalar @$varying if $active > scalar @$varying;
639
640							# Pick which varying features take part (partial shuffle of their indices).
641	4441					6162	my @pool = @$varying;
642	4441					6379	for my $i ( 0 .. $active - 1 ) {
643	8667					12385	my $j = $i + int( rand( scalar(@pool) - $i ) );
644	8667					13424	@pool[ $i, $j ] = @pool[ $j, $i ];
645							}
646	4441					8279	my @idx = @pool[ 0 .. $active - 1 ];
647
648	4441					5740	my ( @coef, $b );
649	4441					5211	$b = 0.0;
650	4441					5390	for my $f (@idx) {
651	8667					11037	my $c = _randn();
652	8667					12672	my $p = $lo->[$f] + rand() * ( $hi->[$f] - $lo->[$f] ); # point in the box
653	8667					11128	push @coef, $c;
654	8667					10720	$b += $c * $p;
655							}
656
657	4441					5345	my ( @left, @right );
658	4441					5785	for my $row (@$X) {
659	200623					210273	my $dot = 0.0;
660	200623					362543	$dot += $coef[$_] * $row->[ $idx[$_] ] for 0 .. $#idx;
661	200623	100				250013	if ( $dot <= $b ) { push @left, $row }
	97910					123730
662	102713					134813	else { push @right, $row }
663							}
664							return {
665	4441					18624	idx => \@idx,
666							coef => \@coef,
667							b => $b,
668							_left => \@left,
669							_right => \@right
670							};
671							} ## end sub _oblique_split
672
673							#-------------------------------------------------------------------------------
674							# Path length of a single point in a single tree: edges traversed until a leaf,
675							# plus c(leaf size) when the leaf still holds several points. Handles both axis
676							# and oblique internal nodes, so a model of either mode scores correctly.
677							#-------------------------------------------------------------------------------
678							sub _path_length {
679	237890			237890		334678	my ( $self, $x, $node, $depth ) = @_;
680	237890					359427	while ( !$node->{leaf} ) {
681	1734351					1776345	my $left;
682	1734351	100				2125120	if ( exists $node->{attr} ) { # axis-parallel split
683	1544332					2061302	$left = $x->[ $node->{attr} ] < $node->{split};
684							} else { # oblique (hyperplane) split
685	190019					268315	my ( $idx, $coef ) = ( $node->{idx}, $node->{coef} );
686	190019					207604	my $dot = 0.0;
687	190019					387757	$dot += $coef->[$_] * $x->[ $idx->[$_] ] for 0 .. $#$idx;
688	190019					252778	$left = $dot <= $node->{b};
689							}
690	1734351	100				2460752	$node = $left ? $node->{left} : $node->{right};
691	1734351					2552557	$depth++;
692							} ## end while ( !$node->{leaf} )
693	237890					311781	return $depth + _c( $node->{size} );
694							} ## end sub _path_length
695
696							sub _check_fitted {
697	37			37		78	my ($self) = @_;
698							croak "model is not fitted yet; call fit() first"
699	37	100	66			172	unless ref $self->{trees} eq 'ARRAY' && @{ $self->{trees} };
	37					618
700							}
701
702							1;