File Coverage

blib/lib/PDLA/Stats/Kmeans.pm
Criterion Covered Total %
statement 98 114 85.9
branch 28 44 63.6
condition 7 15 46.6
subroutine 12 13 92.3
pod 1 4 25.0
total 146 190 76.8


line stmt bran cond sub pod time code
1              
2             #
3             # GENERATED WITH PDLA::PP! Don't modify!
4             #
5             package PDLA::Stats::Kmeans;
6              
7             @EXPORT_OK = qw( random_cluster iv_cluster PDLA::PP _random_cluster PDLA::PP which_cluster PDLA::PP assign PDLA::PP centroid PDLA::PP _d_p2l );
8             %EXPORT_TAGS = (Func=>[@EXPORT_OK]);
9              
10 4     4   1511 use PDLA::Core;
  4         9  
  4         30  
11 4     4   932 use PDLA::Exporter;
  4         7  
  4         21  
12 4     4   98 use DynaLoader;
  4         7  
  4         200  
13              
14              
15              
16            
17             @ISA = ( 'PDLA::Exporter','DynaLoader' );
18             push @PDLA::Core::PP, __PACKAGE__;
19             bootstrap PDLA::Stats::Kmeans ;
20              
21              
22              
23              
24              
25 4     4   20 use Carp;
  4         7  
  4         227  
26 4     4   21 use PDLA::LiteF;
  4         7  
  4         44  
27 4     4   5906 use PDLA::NiceSlice;
  4         9  
  4         30  
28 4     4   68595 use PDLA::Stats::Basic;
  4         11  
  4         46  
29              
30             $PDLA::onlinedoc->scan(__FILE__) if $PDLA::onlinedoc;
31              
32             =head1 NAME
33              
34             PDLA::Stats::Kmeans -- classic k-means cluster analysis
35              
36             =head1 DESCRIPTION
37              
38             Assumes that we have data pdl dim [observation, variable] and the goal is to put observations into clusters based on their values on the variables. The terms "observation" and "variable" are quite arbitrary but serve as a reminder for "that which is being clustered" and "that which is used to cluster".
39              
40             The terms FUNCTIONS and METHODS are arbitrarily used to refer to methods that are threadable and methods that are non-threadable, respectively.
41              
42             =head1 SYNOPSIS
43              
44             Implement a basic k-means procedure,
45            
46             use PDLA::LiteF;
47             use PDLA::NiceSlice;
48             use PDLA::Stats;
49              
50             my ($data, $idv, $ido) = rtable( $file );
51              
52             my ($cluster, $centroid, $ss_centroid, $cluster_last);
53              
54             # start out with 8 random clusters
55             $cluster = random_cluster( $data->dim(0), 8 );
56             # iterate to minimize total ss
57             # stop when no more changes in cluster membership
58             do {
59             $cluster_last = $cluster;
60             ($centroid, $ss_centroid) = $data->centroid( $cluster );
61             $cluster = $data->assign( $centroid );
62             }
63             while ( sum(abs($cluster - $cluster_last)) > 0 );
64              
65             or, use the B<kmeans> function provided here,
66              
67             my %k = $data->kmeans( \%opt );
68             print "$_\t$k{$_}\n" for (sort keys %k);
69              
70             plot the clusters if there are only 2 vars in $data,
71              
72             use PDLA::Graphics::PGPLOT::Window;
73              
74             my ($win, $c);
75             $win = pgwin 'xs';
76             $win->env($data( ,0)->minmax, $data( ,1)->minmax);
77              
78             $win->points( $data->dice_axis(0,which($k{cluster}->(,$_)))->dog,
79             {COLOR=>++$c} )
80             for (0 .. $k{cluster}->dim(1)-1);
81              
82             =cut
83              
84              
85              
86              
87              
88              
89              
90             =head1 FUNCTIONS
91              
92              
93              
94             =cut
95              
96              
97              
98              
99              
100             # my tmp var for PDLA 2.007 slice update
101             my $_tmp;
102              
103             =head2 random_cluster
104              
105             =for sig
106              
107             Signature: (short [o]cluster(o,c); int obs=>o; int clu=>c)
108              
109             =for ref
110              
111             Creates masks for random mutually exclusive clusters. Accepts two parameters, num_obs and num_cluster. Extra parameter turns into extra dim in mask. May loop a long time if num_cluster approaches num_obs because empty cluster is not allowed.
112              
113             =for usage
114              
115             my $cluster = random_cluster( $num_obs, $num_cluster );
116              
117             =cut
118              
119             # can't be called on pdl
120             sub random_cluster {
121 4     4 1 8 my ($obs, $clu) = @_;
122             # extra param in @_ made into extra dim
123 4         13 my $cluster = zeroes @_;
124 4         258 do {
125 4         13 $cluster->inplace->_random_cluster();
126             } while (PDLA::any $cluster->sumover == 0 );
127 4         467 return $cluster;
128             }
129              
130              
131              
132              
133              
134             *_random_cluster = \&PDLA::_random_cluster;
135              
136              
137              
138              
139              
140             =head2 which_cluster
141              
142             =for sig
143              
144             Signature: (short a(o,c); indx [o]b(o))
145              
146             Given cluster mask dim [obs x clu], returns the cluster index to which an obs belong.
147              
148             Does not support overlapping clusters. If an obs has TRUE value for multiple clusters, the returned index is the first cluster the obs belongs to. If an obs has no TRUE value for any cluster, the return val is set to -1 or BAD if the input mask has badflag set.
149              
150             Usage:
151              
152             # create a cluster mask dim [obs x clu]
153             perldl> p $c_mask = iv_cluster [qw(a a b b c c)]
154             [
155             [1 1 0 0 0 0]
156             [0 0 1 1 0 0]
157             [0 0 0 0 1 1]
158             ]
159             # get cluster membership list dim [obs]
160             perldl> p $ic = $c_mask->which_cluster
161             [0 0 1 1 2 2]
162              
163              
164              
165             =for bad
166              
167             which_cluster processes bad values.
168             It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles.
169              
170              
171             =cut
172              
173              
174              
175              
176              
177              
178             *which_cluster = \&PDLA::which_cluster;
179              
180              
181              
182              
183              
184             =head2 assign
185              
186             =for sig
187              
188             Signature: (data(o,v); centroid(c,v); short [o]cluster(o,c))
189              
190              
191              
192             =for ref
193              
194             Takes data pdl dim [obs x var] and centroid pdl dim [cluster x var] and returns mask dim [obs x cluster] to cluster membership. An obs is assigned to the first cluster with the smallest distance (ie sum squared error) to cluster centroid. With bad value, obs is assigned by smallest mean squared error across variables.
195              
196             =for usage
197              
198             perldl> $centroid = ones 2, 3
199             perldl> $centroid(0,) .= 0
200             perldl> p $centroid
201             [
202             [0 1]
203             [0 1]
204             [0 1]
205             ]
206              
207             perldl> $b = qsort( random 4, 3 )
208             perldl> p $b
209             [
210             [0.022774068 0.032513883 0.13890034 0.30942479]
211             [ 0.16943853 0.50262636 0.56251531 0.7152271]
212             [ 0.23964483 0.59932745 0.60967495 0.78452117]
213             ]
214             # notice that 1st 3 obs in $b are on average closer to 0
215             # and last obs closer to 1
216             perldl> p $b->assign( $centroid )
217             [
218             [1 1 1 0] # cluster 0 membership
219             [0 0 0 1] # cluster 1 membership
220             ]
221              
222              
223            
224              
225             =for bad
226              
227             assign processes bad values.
228             It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles.
229              
230              
231             =cut
232              
233              
234              
235              
236              
237              
238             *assign = \&PDLA::assign;
239              
240              
241              
242              
243              
244             =head2 centroid
245              
246             =for sig
247              
248             Signature: (data(o,v); cluster(o,c); float+ [o]m(c,v); float+ [o]ss(c,v))
249              
250              
251             =for ref
252              
253             Takes data dim [obs x var] and mask dim [obs x cluster], returns mean and ss (ms when data contains bad values) dim [cluster x var], using data where mask == 1. Multiple cluster membership for an obs is okay. If a cluster is empty all means and ss are set to zero for that cluster.
254              
255             =for usage
256              
257             # data is 10 obs x 3 var
258             perldl> p $d = sequence 10, 3
259             [
260             [ 0 1 2 3 4 5 6 7 8 9]
261             [10 11 12 13 14 15 16 17 18 19]
262             [20 21 22 23 24 25 26 27 28 29]
263             ]
264             # create two clusters by value on 1st var
265             perldl> p $a = $d( ,(0)) <= 5
266             [1 1 1 1 1 1 0 0 0 0]
267            
268             perldl> p $b = $d( ,(0)) > 5
269             [0 0 0 0 0 0 1 1 1 1]
270              
271             perldl> p $c = cat $a, $b
272             [
273             [1 1 1 1 1 1 0 0 0 0]
274             [0 0 0 0 0 0 1 1 1 1]
275             ]
276              
277             perldl> p $d->centroid($c)
278             # mean for 2 cluster x 3 var
279             [
280             [ 2.5 7.5]
281             [12.5 17.5]
282             [22.5 27.5]
283             ]
284             # ss for 2 cluster x 3 var
285             [
286             [17.5 5]
287             [17.5 5]
288             [17.5 5]
289             ]
290              
291              
292            
293              
294             =for bad
295              
296             centroid processes bad values.
297             It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles.
298              
299              
300             =cut
301              
302              
303              
304              
305              
306              
307             *centroid = \&PDLA::centroid;
308              
309              
310              
311              
312             sub _scree_ind {
313             # use as scree cutoff the point with max distance to the line formed
314             # by the 1st and last points in $self
315             # it's a heuristic--whether we can get "good" results depends on
316             # the number of components in $self.
317              
318 0     0   0 my ($self) = @_;
319              
320 0         0 $self = $self->squeeze;
321 0 0       0 $self->ndims > 1 and
322             croak "1D pdl only please";
323              
324 0         0 my $a = zeroes 2, $self->nelem;
325 0         0 ($_tmp = $a((0), )) .= sequence $self->nelem;
326 0         0 ($_tmp = $a((1), )) .= $self;
327              
328 0         0 my $d = _d_point2line( $a, $a( ,(0)), $a( ,(-1)) );
329              
330 0         0 return $d->maximum_ind;
331             }
332              
333             sub _d_point2line {
334 1     1   587 my ($self, $p1, $p2) = @_;
335              
336 1         3 for ($self, $p1, $p2) {
337 3 50       10 $_->dim(0) != 2 and
338             carp "point pdl dim(0) != 2";
339             }
340              
341 1         10 return _d_p2l( $self->mv(0,-1)->dog, $p1->mv(0,-1)->dog, $p2->mv(0,-1)->dog );
342             }
343              
344              
345              
346              
347              
348             *_d_p2l = \&PDLA::_d_p2l;
349              
350              
351              
352              
353             =head2 kmeans
354              
355             =for ref
356              
357             Implements classic k-means cluster analysis. Given a number of observations with values on a set of variables, kmeans puts the observations into clusters that maximizes within-cluster similarity with respect to the variables. Tries several different random seeding and clustering in parallel. Stops when cluster assignment of the observations no longer changes. Returns the best result in terms of R2 from the random-seeding trials.
358              
359             Instead of random seeding, kmeans also accepts manual seeding. This is done by providing a centroid to the function, in which case clustering will proceed from the centroid and there is no multiple tries.
360              
361             There are two distinct advantages from seeding with a centroid compared to seeding with predefined cluster membership of a subset of the observations ie "seeds",
362              
363             (1) a centroid could come from a previous study with a different set of observations;
364              
365             (2) a centroid could even be "fictional", or in more proper parlance, an idealized prototype with respect to the actual data. For example, if there are 10 person's ratings of 1 to 5 on 4 movies, ie a ratings pdl of dim [10 obs x 4 var], providing a centroid like
366              
367             [
368             [5 0 0 0]
369             [0 5 0 0]
370             [0 0 5 0]
371             [0 0 0 5]
372             ]
373              
374             will produce 4 clusters of people with each cluster favoring a different one of the 4 movies. Clusters from an idealized centroid may not give the best result in terms of R2, but they sure are a lot more interpretable.
375              
376             If clustering has to be done from predefined clusters of seeds, simply calculate the centroid using the B<centroid> function and feed it to kmeans,
377              
378             my ($centroid, $ss) = $rating($iseeds, )->centroid( $seeds_cluster );
379              
380             my %k = $rating->kmeans( { CNTRD=>$centroid } );
381              
382             kmeans supports bad value*.
383              
384             =for options
385              
386             Default options (case insensitive):
387              
388             V => 1, # prints simple status
389             FULL => 0, # returns results for all seeding trials
390              
391             CNTRD => PDLA->null, # optional. pdl [clu x var]. disables next 3 opts
392              
393             NTRY => 5, # num of random seeding trials
394             NSEED => 1000, # num of initial seeds, use NSEED up to max obs
395             NCLUS => 3, # num of clusters
396              
397             =for usage
398              
399             Usage:
400              
401             # suppose we have 4 person's ratings on 5 movies
402              
403             perldl> p $rating = ceil( random(4, 5) * 5 )
404             [
405             [3 2 2 3]
406             [2 4 5 4]
407             [5 3 2 3]
408             [3 3 1 5]
409             [4 3 3 2]
410             ]
411              
412             # we want to put the 4 persons into 2 groups
413              
414             perldl> %k = $rating->kmeans( {NCLUS=>2} )
415              
416             # by default prints back options used
417             # as well as info for all tries and iterations
418              
419             CNTRD => Null
420             FULL => 0
421             NCLUS => 2
422             NSEED => 4
423             NTRY => 5
424             V => 1
425             ss total: 20.5
426             iter 0 R2 [0.024390244 0.024390244 0.26829268 0.4796748 0.4796748]
427             iter 1 R2 [0.46341463 0.46341463 0.4796748 0.4796748 0.4796748]
428              
429             perldl> p "$_\t$k{$_}\n" for (sort keys %k)
430              
431             R2 0.479674796747968
432             centroid # mean ratings for 2 group x 5 movies
433             [
434             [ 3 2.3333333]
435             [ 2 4.3333333]
436             [ 5 2.6666667]
437             [ 3 3]
438             [ 4 2.6666667]
439             ]
440            
441             cluster # 4 persons' membership in two groups
442             [
443             [1 0 0 0]
444             [0 1 1 1]
445             ]
446            
447             n [1 3] # cluster size
448             ss
449             [
450             [ 0 0.66666667]
451             [ 0 0.66666667]
452             [ 0 0.66666667]
453             [ 0 8]
454             [ 0 0.66666667]
455             ]
456              
457             Now, for the valiant, kmeans is threadable. Say you gathered 10 persons' ratings on 5 movies from 2 countries, so the data is dim [10,5,2], and you want to put the 10 persons from each country into 3 clusters, just specify NCLUS => [3,1], and there you have it. The key is for NCLUS to include $data->ndims - 1 numbers. The 1 in [3,1] turns into a dummy dim, so the 3-cluster operation is repeated on both countries. Similarly, when seeding, CNTRD needs to have ndims that at least match the data ndims. Extra dims in CNTRD will lead to threading (convenient if you want to try out different centroid locations, for example, but you will have to hand pick the best result). See stats_kmeans.t for examples w 3D and 4D data.
458              
459             *With bad value, R2 is based on average of variances instead of sum squared error.
460              
461             =cut
462              
463             *kmeans = \&PDLA::kmeans;
464             sub PDLA::kmeans {
465 5     5 0 11877 my ($self, $opt) = @_;
466 5         14 my %opt = (
467             V => 1, # prints simple status
468             FULL => 0, # returns results for all seeding trials
469            
470             CNTRD => PDLA->null, # optional. pdl [clu x var]. disables next 3 opts
471            
472             NTRY => 5, # num of random seeding trials
473             NSEED => 1000, # num of initial seeds, use NSEED up to max obs
474             NCLUS => 3, # num of clusters
475             );
476 5   33     86 $opt and $opt{uc $_} = $opt->{$_} for (keys %$opt);
477 5 100 66     32 if (defined($opt{CNTRD}) and $opt{CNTRD}->nelem) {
478 1         4 $opt{NTRY} = 1;
479 1         3 $opt{NSEED} = $self->dim(0);
480 1         4 $opt{NCLUS} = $opt{CNTRD}->dim(0);
481             }
482             else {
483 4         17 $opt{NSEED} = pdl($self->dim(0), $opt{NSEED})->min;
484             }
485 5   33     282 $opt{V} and print STDERR "$_\t=> $opt{$_}\n" for (sort keys %opt);
486            
487 5 100       19 my $ss_ms = $self->badflag? 'ms' : 'ss';
488 5 100       183 my $ss_total
489             = $self->badflag? $self->var->average : $self->ss->sumover;
490 5 50       26 $opt{V} and print STDERR "overall $ss_ms:\t$ss_total\n";
491              
492 5         10 my ($centroid, $ss_cv, $R2, $clus_this, $clus_last);
493              
494             # NTRY made into extra dim in $cluster for threading
495 5 100       17 my @nclus = (ref $opt{NCLUS} eq 'ARRAY')? @{$opt{NCLUS}} : ($opt{NCLUS});
  2         5  
496             $clus_this
497             = (defined($opt{CNTRD}) and $opt{CNTRD}->nelem) ?
498             $self->assign( $opt{CNTRD}->dummy(-1) ) # put dummy(-1) to match NTRY
499             : random_cluster($opt{NSEED}, @nclus, $opt{NTRY} )
500 5 100 66     37 ;
501              
502 5         64 ($centroid, $ss_cv) = $self(0:$opt{NSEED} - 1, )->centroid( $clus_this );
503             # now obs in $clus_this matches $self
504 5         494 $clus_this = $self->assign( $centroid );
505 5         206 ($centroid, $ss_cv) = $self->centroid( $clus_this );
506              
507 5         22 my $iter = 0;
508 5         6 do {
509 8 100       690 $R2 = $self->badflag? 1 - $ss_cv->average->average / $ss_total
510             : 1 - $ss_cv->sumover->sumover / $ss_total
511             ;
512 8 50       53 $opt{V} and print STDERR join(' ',('iter', $iter++, 'R2', $R2)) . "\n";
513              
514 8         12 $clus_last = $clus_this;
515            
516 8         375 $clus_this = $self->assign( $centroid );
517 8         448 ($centroid, $ss_cv) = $self->centroid( $clus_this );
518             }
519             while ( any long(abs($clus_this - $clus_last))->sumover->sumover > 0 );
520              
521             $opt{FULL} and
522             return (
523 5 50       634 centroid => PDLA::squeeze( $centroid ),
524             cluster => PDLA::squeeze( $clus_this ),
525             n => PDLA::squeeze( $clus_this )->sumover,
526             R2 => PDLA::squeeze( $R2 ),
527             $ss_ms => PDLA::squeeze( $ss_cv ),
528             );
529              
530             # xchg/mv(-1,0) leaves it as was if single dim--unlike transpose
531 5         57 my $i_best = $R2->mv(-1,0)->maximum_ind;
532              
533 5 100       33 $R2->getndims == 1 and
534             return (
535             centroid => $centroid->dice_axis(-1,$i_best)->sever->squeeze,
536             cluster => $clus_this->dice_axis(-1,$i_best)->sever->squeeze,
537             n => $clus_this->dice_axis(-1,$i_best)->sever->squeeze->sumover,
538             R2 => $R2->dice_axis(-1,$i_best)->sever->squeeze,
539             $ss_ms => $ss_cv->dice_axis(-1,$i_best)->sever->squeeze,
540             );
541              
542             # now for threading beyond 2D data
543              
544             # can't believe i'm using a perl loop :P
545              
546 3         10 $i_best = $i_best->flat->sever;
547 3         37 my @i_best = map { $opt{NTRY} * $_ + $i_best(($_)) }
  10         159  
548             0 .. $i_best->nelem - 1;
549              
550 3         55 my @shapes;
551 3         8 for ($centroid, $clus_this, $R2) {
552 9         20 my @dims = $_->dims;
553 9         165 pop @dims;
554 9         17 push @shapes, \@dims;
555             }
556              
557 3         319 $clus_this = $clus_this->mv(-1,2)->clump(2..$clus_this->ndims-1)->dice_axis(2,\@i_best)->sever->reshape( @{ $shapes[1] } )->sever,
558              
559             return (
560             centroid =>
561 3         364 $centroid->mv(-1,2)->clump(2..$centroid->ndims-1)->dice_axis(2,\@i_best)->sever->reshape( @{ $shapes[0] } )->sever,
562              
563             cluster => $clus_this,
564             n => $clus_this->sumover,
565              
566             R2 =>
567 3         345 $R2->mv(-1,0)->clump(0..$R2->ndims-1)->dice_axis(0,\@i_best)->sever->reshape( @{ $shapes[2] } )->sever,
568              
569             $ss_ms =>
570 3         21 $ss_cv->mv(-1,2)->clump(2..$ss_cv->ndims-1)->dice_axis(2,\@i_best)->sever->reshape( @{ $shapes[0] } )->sever,
  3         452  
571             );
572             }
573              
574             =head1 METHODS
575              
576             =head2 iv_cluster
577              
578             =for ref
579              
580             Turns an independent variable into a cluster pdl. Returns cluster pdl and level-to-pdl_index mapping in list context and cluster pdl only in scalar context.
581              
582             This is the method used for mean and var in anova. The difference between iv_cluster and dummy_code is that iv_cluster returns pdl dim [obs x level] whereas dummy_code returns pdl dim [obs x (level - 1)].
583              
584             =for usage
585              
586             Usage:
587              
588             perldl> @bake = qw( y y y n n n )
589            
590             # accepts @ ref or 1d pdl
591              
592             perldl> p $bake = iv_cluster( \@bake )
593             [
594             [1 1 1 0 0 0]
595             [0 0 0 1 1 1]
596             ]
597            
598             perldl> p $rating = sequence 6
599             [0 1 2 3 4 5]
600              
601             perldl> p $rating->centroid( $bake )
602             # mean for each iv level
603             [
604             [1 4]
605             ]
606             # ss
607             [
608             [2 2]
609             ]
610              
611             =cut
612              
613             *iv_cluster = \&PDLA::iv_cluster;
614             sub PDLA::iv_cluster {
615 13     13 0 1024 my ($var_ref) = @_;
616              
617             # pdl->uniq puts elem in order. so instead list it to maintain old order
618 13 100       50 if (ref $var_ref eq 'PDLA') {
619 3         10 $var_ref = $var_ref->squeeze;
620 3 50       122 $var_ref->getndims > 1 and
621             croak "multidim pdl passed for single var!";
622 3         12 $var_ref = [ list $var_ref ];
623             }
624              
625 13         110 my ($var, $map_ref) = PDLA::Stats::Basic::_array_to_pdl( $var_ref );
626 13         48 my $var_a = zeroes( short, $var->nelem, $var->max + 1 );
627              
628 13         1513 for my $l (0 .. $var->max) {
629 29         1493 my $v = $var_a( ,$l);
630 29         636 ($_tmp = $v->index( which $var == $l )) .= 1;
631             }
632              
633 13 100       727 if ($var->badflag) {
634 1         13 my $ibad = which $var->isbad;
635 1         28 ($_tmp = $var_a($ibad, )) .= -1;
636 1         62 $var_a->inplace->setvaltobad(-1);
637             }
638              
639 13 50       109 return wantarray? ($var_a, $map_ref) : $var_a;
640             }
641              
642             =head2 pca_cluster
643              
644             Assign variables to components ie clusters based on pca loadings or scores. One way to seed kmeans (see Ding & He, 2004, and Su & Dy, 2004 for other ways of using pca with kmeans). Variables are assigned to their most associated component. Note that some components may not have any variable that is most associated with them, so the returned number of clusters may be smaller than NCOMP.
645              
646             Default options (case insensitive):
647              
648             V => 1,
649             ABS => 1, # high pos and neg loadings on a comp in same cluster
650             NCOMP => undef, # max number of components to consider. determined by
651             # scree plot black magic if not specified
652             PLOT => 1, # pca scree plot with cutoff at NCOMP
653              
654             Usage:
655              
656             # say we need to cluster a group of documents
657             # $data is pdl dim [word x doc]
658             ($data, $idd, $idw) = get_data 'doc_word_info.txt';
659              
660             perldl> %p = $data->pca;
661             # $cluster is pdl mask dim [doc x ncomp]
662             perldl> $cluster = $p{loading}->pca_cluster;
663              
664             # pca clusters var while kmeans clusters obs. hence transpose
665             perldl> ($m, $ss) = $data->transpose->centroid( $cluster );
666             perldl> %k = $data->transpose->kmeans( { cntrd=>$m } );
667              
668             # take a look at cluster 0 doc ids
669             perldl> p join("\n", @$idd[ list which $k{cluster}->( ,0) ]);
670              
671             =cut
672              
673             *pca_cluster = \&PDLA::pca_cluster;
674             sub PDLA::pca_cluster {
675 1     1 0 1185 my ($self, $opt) = @_;
676              
677 1         6 my %opt = (
678             V => 1,
679             ABS => 1, # high pos and neg loadings on a comp in same cluster
680             NCOMP => undef, # max number of components to consider. determined by
681             # scree plot black magic if not specified
682             PLOT => 1, # pca scree plot with cutoff at NCOMP
683             );
684 1   33     9 $opt and $opt{uc $_} = $opt->{$_} for (keys %$opt);
685              
686 1         31 my $var = sumover($self ** 2) / $self->dim(0);
687 1 50       13 if (!$opt{NCOMP}) {
688             # here's the black magic part
689 0 0       0 my $comps = ($self->dim(1) > 300)? int($self->dim(1) * .1)
690             : pdl($self->dim(1), 30)->min
691             ;
692 0         0 $var = $var(0:$comps-1)->sever;
693 0         0 $opt{NCOMP} = _scree_ind( $var );
694             }
695 1 50       3 $opt{PLOT} and do {
696 0         0 require PDLA::Stats::GLM;
697 0         0 $var->plot_scree( {NCOMP=>$var->dim(0), CUT=>$opt{NCOMP}} );
698             };
699              
700 1         24 my $c = $self->( ,0:$opt{NCOMP}-1)->transpose->abs->maximum_ind;
701              
702 1 50       48 if ($opt{ABS}) {
703 1         13 $c = $c->iv_cluster;
704             }
705             else {
706 0 0       0 my @c = map { ($self->($_,$c($_)) >= 0)? $c($_)*2 : $c($_)*2 + 1 }
  0         0  
707             ( 0 .. $c->dim(0)-1 );
708 0         0 $c = iv_cluster( \@c );
709             }
710 1 50       5 $opt{V} and print STDERR "cluster membership mask as " . $c->info . "\n";
711              
712 1         5 return $c;
713             }
714              
715             =head1 REFERENCES
716              
717             Ding, C., & He, X. (2004). K-means clustering via principal component analysis. Proceedings of the 21st International Conference on Machine Learning, 69, 29.
718              
719             Su, T., & Dy, J. (2004). A deterministic method for initializing K-means clustering. 16th IEEE International Conference on Tools with Artificial Intelligence, 784-786.
720              
721             Romesburg, H.C. (1984). Cluster Analysis for Researchers. NC: Lulu Press.
722              
723             Wikipedia (retrieved June, 2009). K-means clustering. http://en.wikipedia.org/wiki/K-means_algorithm
724              
725             =head1 AUTHOR
726              
727             Copyright (C) 2009 Maggie J. Xiong
728              
729             All rights reserved. There is no warranty. You are allowed to redistribute this software / documentation as described in the file COPYING in the PDLA distribution.
730              
731             =cut
732              
733              
734              
735             ;
736              
737              
738              
739             # Exit with OK status
740              
741             1;
742              
743