File Coverage

blib/lib/PDL/Cluster.pm

Criterion	Covered	Total	%
statement	9	52	17.3
branch	0	20	0.0
condition			n/a
subroutine	3	11	27.2
pod	8	8	100.0
total	20	91	21.9

line	stmt	bran	sub	pod	time	code
1
2						#
3						# GENERATED WITH PDL::PP! Don't modify!
4						#
5						package PDL::Cluster;
6
7						@EXPORT_OK = qw( PDL::PP cmean PDL::PP cmedian PDL::PP calculate_weights PDL::PP clusterdistance PDL::PP distancematrix PDL::PP getclustercentroids getclustermean getclustermedian PDL::PP getclustermedoids PDL::PP kcluster PDL::PP kmedoids PDL::PP treecluster PDL::PP treeclusterd PDL::PP cuttree PDL::PP somcluster PDL::PP pca PDL::PP rowdistances PDL::PP clusterdistances PDL::PP clustersizes PDL::PP clusterelements PDL::PP clusterelementmask PDL::PP clusterdistancematrix clusterenc clusterdec clusteroffsets PDL::PP clusterdistancematrixenc PDL::PP clusterdistancesenc PDL::PP getclusterwsum PDL::PP attachtonearest attachtonearestd PDL::PP checkprototypes PDL::PP checkpartitions randomprototypes randompartition );
8						%EXPORT_TAGS = (Func=>[@EXPORT_OK]);
9
10	9		9		2212547	use PDL::Core;
	9				36
	9				51
11	9		9		2003	use PDL::Exporter;
	9				17
	9				46
12	9		9		204	use DynaLoader;
	9				15
	9				9251
13
14
15
16						$PDL::Cluster::VERSION = '1.54.001';
17						@ISA = ( 'PDL::Exporter','DynaLoader' );
18						push @PDL::Core::PP, __PACKAGE__;
19						bootstrap PDL::Cluster $VERSION;
20
21
22
23
24
25						#---------------------------------------------------------------------------
26						# File: PDL::Cluster.pm
27						# Author: Bryan Jurish
28						# Description: PDL wrappers for the C Clustering library.
29						#
30						# Copyright (c) 2005-2018 Bryan Jurish. All rights reserved.
31						# This program is free software. You may modify and/or
32						# distribute it under the same terms as Perl itself.
33						#
34						#---------------------------------------------------------------------------
35						# Based on the C clustering library for cDNA microarray data,
36						# Copyright (C) 2002-2005 Michiel Jan Laurens de Hoon.
37						#
38						# The C clustering library was written at the Laboratory of DNA Information
39						# Analysis, Human Genome Center, Institute of Medical Science, University of
40						# Tokyo, 4-6-1 Shirokanedai, Minato-ku, Tokyo 108-8639, Japan.
41						# Contact: michiel.dehoon 'AT' riken.jp
42						#
43						# See the files "cluster.c" and "cluster.h" in the PDL::Cluster distribution
44						# for details.
45						#---------------------------------------------------------------------------
46
47						=pod
48
49						=head1 NAME
50
51						PDL::Cluster - PDL interface to the C Clustering Library
52
53						=head1 SYNOPSIS
54
55						use PDL::Cluster;
56
57						##-----------------------------------------------------
58						## Data Format
59						$d = 42; ##-- number of features
60						$n = 1024; ##-- number of data elements
61
62						$data = random($d,$n); ##-- data matrix
63						$elt = $data->slice(",($i)"); ##-- element data vector
64						$ftr = $data->slice("($j),"); ##-- feature vector over all elements
65
66						$wts = ones($d)/$d; ##-- feature weights
67						$msk = ones($d,$n); ##-- missing-datum mask (1=ok)
68
69						##-----------------------------------------------------
70						## Library Utilties
71
72						$mean = $ftr->cmean();
73						$median = $ftr->cmedian();
74
75						calculate_weights($data,$msk,$wts, $cutoff,$expnt,
76						$weights);
77
78						##-----------------------------------------------------
79						## Distance Functions
80
81						clusterdistance($data,$msk,$wts, $n1,$n2,$idx1,$idx2,
82						$dist,
83						$distFlag, $methodFlag2);
84
85						distancematrix($data,$msk,$wts, $distmat, $distFlag);
86
87						##-----------------------------------------------------
88						## Partitioning Algorithms
89
90						getclustermean($data,$msk,$clusterids,
91						$ctrdata, $ctrmask);
92
93						getclustermedian($data,$msk,$clusterids,
94						$ctrdata, $ctrmask);
95
96						getclustermedoid($distmat,$clusterids,$centroids,
97						$errorsums);
98
99						kcluster($k, $data,$msk,$wts, $npass,
100						$clusterids, $error, $nfound,
101						$distFlag, $methodFlag);
102
103						kmedoids($k, $distmat,$npass,
104						$clusterids, $error, $nfound);
105
106						##-----------------------------------------------------
107						## Hierarchical Algorithms
108
109						treecluster($data,$msk,$wts,
110						$tree, $lnkdist,
111						$distFlag, $methodFlag);
112
113						treeclusterd($data,$msk,$wts, $distmat,
114						$tree, $lnkdist,
115						$distFlag, $methodFlag);
116
117						cuttree($tree, $nclusters,
118						$clusterids);
119
120						##-----------------------------------------------------
121						## Self-Organizing Maps
122
123						somcluster($data,$msk,$wts, $nx,$ny,$tau,$niter,
124						$clusterids,
125						$distFlag);
126
127						##-----------------------------------------------------
128						## Principal Component Analysis
129
130						pca($U, $S, $V);
131
132						##-----------------------------------------------------
133						## Extensions
134
135						rowdistances($data,$msk,$wts, $rowids1,$rowids2, $distvec, $distFlag);
136						clusterdistances($data,$msk,$wts, $rowids, $index2,
137						$dist,
138						$distFlag, $methodFlag);
139
140						clustersizes($clusterids, $clustersizes);
141						clusterelements($clustierids, $clustersizes, $eltids);
142						clusterelementmask($clusterids, $eltmask);
143
144						clusterdistancematrix($data,$msk,$wts,
145						$rowids, $clustersizes, $eltids,
146						$dist,
147						$distFlag, $methodFlag);
148
149						clusterenc($clusterids, $clens,$cvals,$crowids, $k);
150						clusterdec($clens,$cvals,$crowids, $clusterids, $k);
151						clusteroffsets($clusterids, $coffsets,$cvals,$crowids, $k);
152						clusterdistancematrixenc($data,$msk,$wts,
153						$clens1,$crowids1, $clens2,$crowids2,
154						$dist,
155						$distFlag, $methodFlag);
156
157						=cut
158
159
160
161
162
163
164
165						=head1 FUNCTIONS
166
167
168
169						=cut
170
171
172
173
174
175
176						=head2 cmean
177
178						=for sig
179
180						Signature: (double a(n); double [o]b())
181
182						=for ref
183
184						Computes arithmetic mean of the vector $a(). See also PDL::Primitive::avg().
185
186						=for bad
187
188						cmean does not process bad values.
189						It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles.
190
191
192						=cut
193
194
195
196
197
198
199						*cmean = \&PDL::cmean;
200
201
202
203
204
205						=head2 cmedian
206
207						=for sig
208
209						Signature: (double a(n); double [o]b())
210
211						=for ref
212
213						Computes median of the vector $a(). See also PDL::Primitive::median().
214
215						=for bad
216
217						cmedian does not process bad values.
218						It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles.
219
220
221						=cut
222
223
224
225
226
227
228						*cmedian = \&PDL::cmedian;
229
230
231
232
233
234						=head2 calculate_weights
235
236						=for sig
237
238						Signature: (
239						double data(d,n);
240						int mask(d,n);
241						double weight(d);
242						double cutoff();
243						double exponent();
244						double [o]oweights(d);
245						; char *distFlag;
246						)
247
248
249						This function calculates weights for the features using the weighting scheme
250						proposed by Michael Eisen:
251
252						w[i] = 1.0 / sum_{j where dist(i,j)
253
254						where the cutoff and the exponent are specified by the user.
255
256
257						=for bad
258
259						calculate_weights does not process bad values.
260						It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles.
261
262
263						=cut
264
265
266
267
268
269
270						*calculate_weights = \&PDL::calculate_weights;
271
272
273
274
275
276						=head2 clusterdistance
277
278						=for sig
279
280						Signature: (
281						double data(d,n);
282						int mask(d,n);
283						double weight(d);
284						int n1();
285						int n2();
286						int index1(n1);
287						int index2(n2);
288						double [o]dist();
289						;
290						char *distFlag;
291						char *methodFlag;
292						)
293
294
295						Computes distance between two clusters $index1() and $index2().
296						Each of the $index() vectors represents a single cluster whose values
297						are the row-indices in the $data() matrix of the elements assigned
298						to the respective cluster. $n1() and $n2() are the number of elements
299						in $index1() and $index2(), respectively. Each $index$i() must have
300						at least $n$i() elements allocated.
301
302						B the $methodFlag argument is interpreted differently than
303						by the treecluster() method, namely:
304
305						=over 4
306
307						=item a
308
309						Distance between the arithmetic means of the two clusters,
310						as for treecluster() "f".
311
312						=item m
313
314						Distance between the medians of the two clusters,
315						as for treecluster() "c".
316
317						=item s
318
319						Minimum pairwise distance between members of the two clusters,
320						as for treecluster() "s".
321
322						=item x
323
324						Maximum pairwise distance between members of the two clusters
325						as for treecluster() "m".
326
327						=item v
328
329						Average of the pairwise distances between members of the two clusters,
330						as for treecluster() "a".
331
332						=back
333
334
335
336						=for bad
337
338						clusterdistance does not process bad values.
339						It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles.
340
341
342						=cut
343
344
345
346
347
348
349						*clusterdistance = \&PDL::clusterdistance;
350
351
352
353
354
355						=head2 distancematrix
356
357						=for sig
358
359						Signature: (
360						double data(d,n);
361						int mask(d,n);
362						double weight(d);
363						double [o]dists(n,n);
364						; char *distFlag;
365						)
366
367						=for ref
368
369						Compute triangular distance matrix over all data points.
370
371						=for bad
372
373						distancematrix does not process bad values.
374						It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles.
375
376
377						=cut
378
379
380
381
382
383
384						*distancematrix = \&PDL::distancematrix;
385
386
387
388
389
390						=head2 getclustercentroids
391
392						=for sig
393
394						Signature: (
395						double data(d,n);
396						int mask(d,n);
397						int clusterids(n);
398						double [o]cdata(d,k);
399						int [o]cmask(d,k);
400						; char *ctrMethodFlag;
401						)
402
403						=for ref
404
405						Find cluster centroids by arithmetic mean (C) or median over each dimension (C).
406
407						=for bad
408
409						getclustercentroids does not process bad values.
410						It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles.
411
412
413						=cut
414
415
416
417
418
419
420						*getclustercentroids = \&PDL::getclustercentroids;
421
422
423
424
425						=pod
426
427						=head2 getclustermean
428
429						=for sig
430
431						Signature: (
432						double data(d,n);
433						int mask(d,n);
434						int clusterids(n);
435						double [o]cdata(d,k);
436						int [o]cmask(d,k);
437						)
438
439						Really just a wrapper for getclustercentroids(...,"a").
440
441						=cut
442
443						sub getclustermean {
444	0		0	1		my ($data,$mask,$cids,$cdata,$cmask) = @_;
445	0					return getclustercentroids($dat,$mask,$cids,$cdata,$cmask,'a');
446						}
447
448
449
450
451						=pod
452
453						=head2 getclustermedian
454
455						=for sig
456
457						Signature: (
458						double data(d,n);
459						int mask(d,n);
460						int clusterids(n);
461						double [o]cdata(d,k);
462						int [o]cmask(d,k);
463						)
464
465						Really just a wrapper for getclustercentroids(...,"m").
466
467						=cut
468
469						sub getclustermedian {
470	0		0	1		my ($data,$mask,$cids,$cdata,$cmask) = @_;
471	0					return getclustercentroids($dat,$mask,$cids,$cdata,$cmask,'m');
472						}
473
474
475
476
477
478						=head2 getclustermedoids
479
480						=for sig
481
482						Signature: (
483						double distance(n,n);
484						int clusterids(n);
485						int [o]centroids(k);
486						double [o]errors(k);
487						)
488
489						The getclustermedoid routine calculates the cluster centroids, given to which
490						cluster each element belongs. The centroid is defined as the element with the
491						smallest sum of distances to the other elements.
492
493
494						=for bad
495
496						getclustermedoids does not process bad values.
497						It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles.
498
499
500						=cut
501
502
503
504
505
506
507						*getclustermedoids = \&PDL::getclustermedoids;
508
509
510
511
512
513						=head2 kcluster
514
515						=for sig
516
517						Signature: (
518						int nclusters();
519						double data(d,n);
520						int mask(d,n);
521						double weight(d);
522						int npass();
523						int [o]clusterids(n);
524						double [o]error();
525						int [o]nfound();
526						;
527						char *distFlag;
528						char *ctrMethodFlag;
529						)
530
531						K-Means clustering algorithm. The "ctrMethodFlag" determines how
532						clusters centroids are to be computed; see getclustercentroids() for details.
533
534						See also: kmedoids().
535
536
537						=for bad
538
539						kcluster does not process bad values.
540						It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles.
541
542
543						=cut
544
545
546
547
548
549
550						*kcluster = \&PDL::kcluster;
551
552
553
554
555
556						=head2 kmedoids
557
558						=for sig
559
560						Signature: (
561						int nclusters();
562						double distance(n,n);
563						int npass();
564						int [o]clusterids(n);
565						double [o]error();
566						int [o]nfound();
567						)
568
569						K-Medoids clustering algorithm (uses distance matrix).
570
571						See also: kcluster().
572
573
574						=for bad
575
576						kmedoids does not process bad values.
577						It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles.
578
579
580						=cut
581
582
583
584
585
586
587						*kmedoids = \&PDL::kmedoids;
588
589
590
591
592
593						=head2 treecluster
594
595						=for sig
596
597						Signature: (
598						double data(d,n);
599						int mask(d,n);
600						double weight(d);
601						int [o]tree(2,n);
602						double [o]lnkdist(n);
603						;
604						char *distFlag;
605						char *methodFlag;
606						)
607
608
609						Hierachical agglomerative clustering.
610
611						$tree(2,n) represents the clustering solution.
612						Each row in the matrix describes one linking event,
613						with the two columns containing the name of the nodes that were joined.
614						The original genes are numbered 0..(n-1), nodes are numbered
615						-1..-(n-1).
616						$tree(2,n) thus actually uses only (2,n-1) cells.
617
618						$lnkdist(n) represents the distance between the two subnodes that were joined.
619						As for $tree(), $lnkdist() uses only (n-1) cells.
620
621
622						=for bad
623
624						treecluster does not process bad values.
625						It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles.
626
627
628						=cut
629
630
631
632
633
634
635						*treecluster = \&PDL::treecluster;
636
637
638
639
640
641						=head2 treeclusterd
642
643						=for sig
644
645						Signature: (
646						double data(d,n);
647						int mask(d,n);
648						double weight(d);
649						double distances(n,n);
650						int [o]tree(2,n);
651						double [o]lnkdist(n);
652						;
653						char *distFlag;
654						char *methodFlag;
655						)
656
657
658						Hierachical agglomerative clustering using given distance matrix.
659
660						See distancematrix() and treecluster(), above.
661
662
663						=for bad
664
665						treeclusterd does not process bad values.
666						It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles.
667
668
669						=cut
670
671
672
673
674
675
676						*treeclusterd = \&PDL::treeclusterd;
677
678
679
680
681
682						=head2 cuttree
683
684						=for sig
685
686						Signature: (
687						int tree(2,n);
688						int nclusters();
689						int [o]clusterids(n);
690						)
691
692
693						Cluster selection for hierarchical clustering trees.
694
695
696						=for bad
697
698						cuttree does not process bad values.
699						It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles.
700
701
702						=cut
703
704
705
706
707
708
709						*cuttree = \&PDL::cuttree;
710
711
712
713
714
715						=head2 somcluster
716
717						=for sig
718
719						Signature: (
720						double data(d,n);
721						int mask(d,n);
722						double weight(d);
723						int nxnodes();
724						int nynodes();
725						double inittau();
726						int niter();
727						int [o]clusterids(2,n);
728						; char *distFlag;
729						)
730
731						=for ref
732
733						Self-Organizing Map clustering, does not return centroid data.
734
735						=for bad
736
737						somcluster does not process bad values.
738						It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles.
739
740
741						=cut
742
743
744
745
746
747
748						*somcluster = \&PDL::somcluster;
749
750
751
752
753
754						=head2 pca
755
756						=for sig
757
758						Signature: (
759						double [o]U(d,n);
760						double [o]S(d);
761						double [o]V(d,d);
762						)
763
764
765						Principal Component Analysis (SVD), operates in-place on $U() and requires ($SIZE(n) E= $SIZE(d)).
766
767
768						=for bad
769
770						pca does not process bad values.
771						It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles.
772
773
774						=cut
775
776
777
778
779
780
781						*pca = \&PDL::pca;
782
783
784
785
786
787						=head2 rowdistances
788
789						=for sig
790
791						Signature: (
792						double data(d,n);
793						int mask(d,n);
794						double weight(d);
795						int rowids1(ncmps);
796						int rowids2(ncmps);
797						double [o]dist(ncmps);
798						; char *distFlag;
799						)
800
801
802						Computes pairwise distances between rows of $data().
803						$rowids1() contains the row-indices of the left (first) comparison operand,
804						and $rowids2() the row-indices of the right (second) comparison operand. Since each
805						of these are assumed to be indices into the first dimension $data(), it should be the case that:
806
807						0 <= $rowids1(i),rowids2(i) < $SIZE(n) for 0 <= i < $SIZE(ncmps)
808
809						See also clusterdistance(), clusterdistances(), clusterdistancematrixenc(), distancematrix().
810
811
812						=for bad
813
814						rowdistances does not process bad values.
815						It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles.
816
817
818						=cut
819
820
821
822
823
824
825						*rowdistances = \&PDL::rowdistances;
826
827
828
829
830
831						=head2 clusterdistances
832
833						=for sig
834
835						Signature: (
836						double data(d,n);
837						int mask(d,n);
838						double weight(d);
839						int rowids(nr);
840						int index2(n2);
841						double [o]dist(nr);
842						;
843						char *distFlag;
844						char *methodFlag;
845						)
846
847
848						Computes pairwise distance(s) from each of $rowids() as a singleton cluster
849						with the cluster represented by $index2(), which should be an index
850						vector as for clusterdistance(). See also clusterdistancematrixenc().
851
852
853						=for bad
854
855						clusterdistances does not process bad values.
856						It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles.
857
858
859						=cut
860
861
862
863
864
865
866						*clusterdistances = \&PDL::clusterdistances;
867
868
869
870
871
872						=head2 clustersizes
873
874						=for sig
875
876						Signature: (int clusterids(n); int [o]clustersizes(k))
877
878
879						Computes the size (number of elements) of each cluster in $clusterids().
880						Useful for allocating less than maximmal space for $clusterelements().
881
882
883						=for bad
884
885						The output piddle should never be marked BAD.
886
887						=cut
888
889
890
891
892
893
894						*clustersizes = \&PDL::clustersizes;
895
896
897
898
899
900						=head2 clusterelements
901
902						=for sig
903
904						Signature: (int clusterids(n); int [o]clustersizes(k); int [o]eltids(mcsize,k))
905
906
907						Converts the vector $clusterids() to a matrix $eltids() of element (row) indices
908						indexed by cluster-id. $mcsize() is the maximum number of elements per cluster,
909						at most $n. The output PDLs $clustersizes() and $eltids() can be passed to
910						clusterdistancematrix().
911
912
913						=for bad
914
915						clusterelements does not process bad values.
916						It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles.
917
918
919						=cut
920
921
922
923
924
925
926						*clusterelements = \&PDL::clusterelements;
927
928
929
930
931
932						=head2 clusterelementmask
933
934						=for sig
935
936						Signature: (int clusterids(n); byte [o]eltmask(k,n))
937
938
939						Get boolean membership mask $eltmask() based on cluster assignment in $clusterids().
940						No value in $clusterids() may be greater than or equal to $k.
941						On completion, $eltmask(k,n) is a true value iff $clusterids(n)=$k.
942
943
944						=for bad
945
946						clusterelementmask does not process bad values.
947						It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles.
948
949
950						=cut
951
952
953
954
955
956
957						*clusterelementmask = \&PDL::clusterelementmask;
958
959
960
961
962
963						=head2 clusterdistancematrix
964
965						=for sig
966
967						Signature: (
968						double data(d,n);
969						int mask(d,n);
970						double weight(d);
971						int rowids(nr);
972						int clustersizes(k);
973						int eltids(mcsize,k);
974						double [o]dist(k,nr);
975						;
976						char *distFlag;
977						char *methodFlag;
978						)
979
980
981						B in favor of clusterdistancematrixenc().
982						In the future, this method is expected to become a wrapper for clusterdistancematrixenc().
983
984						Computes distance between each row index in $rowids()
985						considered as a singleton cluster
986						and each of the $k clusters whose elements are given by a single row of $eltids().
987						$clustersizes() and $eltids() are as output by the clusterelements() method.
988
989						See also clusterdistance(), clusterdistances(), clustersizes(), clusterelements(), clusterdistancematrixenc().
990
991
992						=for bad
993
994						clusterdistancematrix does not process bad values.
995						It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles.
996
997
998						=cut
999
1000
1001
1002
1003
1004
1005						*clusterdistancematrix = \&PDL::clusterdistancematrix;
1006
1007
1008
1009
1010						=pod
1011
1012						=head2 clusterenc
1013
1014						=for sig
1015
1016						Signature: (
1017						int clusterids(n);
1018						int [o]clusterlens(k1);
1019						int [o]clustervals(k1);
1020						int [o]clusterrows(n);
1021						;
1022						int k1;
1023						)
1024
1025						Encodes datum-to-cluster vector $clusterids() for efficiently mapping
1026						clusters-to-data. Returned PDL $clusterlens() holds the lengths of each
1027						cluster containing at least one element. $clustervals() holds the IDs
1028						of such clusters as they appear as values in $clusterids(). $clusterrows()
1029						is such that:
1030
1031						all( rld($clusterlens, $clustervals) == $clusterids )
1032
1033						... if all available cluster-ids are in use.
1034
1035						If specified, $k1 is a perl scalar
1036						holding the number of clusters (maximum cluster index + 1); an
1037						appropriate value will guessed from $clusterids() otherwise.
1038
1039						Really just a wrapper for some lower-level PDL and PDL::Cluster calls.
1040
1041						=cut
1042
1043						sub clusterenc {
1044	0		0	1		my ($cids, $clens,$cvals,$crows, $kmax) = @_;
1045	0	0				$kmax = $cids->max+1 if (!defined($kmax));
1046
1047						##-- cluster sizes
1048	0	0				$clens = zeroes(long, $kmax) if (!defined($clens));
1049	0					clustersizes($cids,$clens);
1050
1051						##-- cluster-id values
1052	0	0				if (!defined($cvals)) { $cvals = PDL->sequence(long,$kmax); }
	0
1053	0					else { $cvals .= PDL->sequence(long,$kmax); }
1054
1055						##-- cluster-row values: handle BAD and negative values
1056						#if (!defined($crows)) { $crows = $cids->qsorti->where($cids->isgood & $cids>=0); }
1057						#else { $crows .= $cids->qsorti->where($cids->isgood & $cids>=0); }
1058
1059						##-- cluster-row values: treat BAD and negative values like anything else
1060	0	0				if (!defined($crows)) { $crows = $cids->qsorti; }
	0
1061	0					else { $crows .= $cids->qsorti; }
1062
1063	0					return ($clens,$cvals,$crows);
1064						}
1065
1066
1067
1068
1069						=pod
1070
1071						=head2 clusterdec
1072
1073						=for sig
1074
1075						Signature: (
1076						int clusterlens(k1);
1077						int clustervals(k1);
1078						int clusterrows(n);
1079						int [o]clusterids(n);
1080						)
1081
1082						Decodes cluster-to-datum vectors ($clusterlens,$clustervals,$clusterrows)
1083						into a single datum-to-cluster vector $clusterids().
1084						$(clusterlens,$clustervals,$clusterrows) are as returned by the clusterenc() method.
1085
1086						Un-addressed row-index values in $clusterrows() will be assigned the pseudo-cluster (-1)
1087						in $clusterids().
1088
1089						Really just a wrapper for some lower-level PDL calls.
1090
1091						=cut
1092
1093						sub clusterdec {
1094	0		0	1		my ($clens,$cvals,$crows, $cids2) = @_;
1095
1096						##-- get $cids
1097	0	0				$cids2 = zeroes($cvals->type, $crows->dims) if (!defined($cids2));
1098	0					$cids2 .= -1;
1099
1100						##-- trim $crows
1101						#my $crows_good = $crows->slice("0:".($clens->sum-1)); ##-- assume bad indices are at END of $crows (BAD,inf,...)
1102	0					my $crows_good = $crows->slice(-$clens->sum.":-1"); ##-- assume bad indices are at BEGINNING of $crows (-1, ...)
1103
1104						##-- decode
1105	0					$clens->rld($cvals, $cids2->index($crows_good));
1106
1107	0					return $cids2;
1108						}
1109
1110
1111
1112
1113						=pod
1114
1115						=head2 clusteroffsets
1116
1117						=for sig
1118
1119						Signature: (
1120						int clusterids(n);
1121						int [o]clusteroffsets(k1+1);
1122						int [o]clustervals(k1);
1123						int [o]clusterrows(n);
1124						;
1125						int k1;
1126						)
1127
1128						Encodes datum-to-cluster vector $clusterids() for efficiently mapping
1129						clusters-to-data. Like clusterenc(), but returns cumulative offsets
1130						instead of lengths.
1131
1132						Really just a wrapper for clusterenc(), cumusumover(), and append().
1133
1134						=cut
1135
1136						sub clusteroffsets {
1137	0		0	1		my ($cids, $coffsets,$cvals,$crows, $kmax) = @_;
1138	0					my ($clens);
1139	0					($clens,$cvals,$crows) = clusterenc($cids,undef,$cvals,$crows,$kmax);
1140	0					$coffsets = $clens->append(0)->rotate(1)->cumusumover;
1141
1142	0					return ($coffsets,$cvals,$crows);
1143						}
1144
1145
1146
1147
1148
1149						=head2 clusterdistancematrixenc
1150
1151						=for sig
1152
1153						Signature: (
1154						double data(d,n);
1155						int mask(d,n);
1156						double weight(d);
1157						int clens1(k1);
1158						int crowids1(nc1);
1159						int clens2(k2);
1160						int crowids2(nc2);
1161						double [o]dist(k1,k2);
1162						;
1163						char *distFlag;
1164						char *methodFlag;
1165						)
1166
1167
1168						Computes cluster-distance between each pair of clusters in (sequence($k1) x sequence($k2)), where 'x'
1169						is the Cartesian product. Cluster contents are passed as pairs ($clens(),$crowids()) as returned
1170						by the clusterenc() function (assuming that the $cvals() vector returned by clusterenc() is a flat sequence).
1171
1172						The deprecated method clusterdistancematrix() can be simulated by this function in the following
1173						manner: if a clusterdistancematrix() call was:
1174
1175						clustersizes ($cids, $csizes=zeroes(long,$k));
1176						clusterelements($cids, $celts =zeroes(long,$csizes->max)-1);
1177						clusterdistancematrix($data,$msk,$wt, $rowids, $csizes,$celts,
1178						$cdmat=zeroes(double,$k,$rowids->dim(0)),
1179						$distFlag, $methodFlag
1180						);
1181
1182						Then the corresponding use of clusterdistancematrixenc() would be:
1183
1184						($clens,$cvals,$crows) = clusterenc($cids);
1185						clusterdistancematrixenc($data,$msk,$wt,
1186						$clens, $crows, ##-- "real" clusters in output dim 0
1187						$rowids->ones, $rowids, ##-- $rowids as singleton clusters in output dim 1
1188						$cdmat=zeroes(double,$clens->dim(0),$rowids->dim(0)),
1189						$distFlag, $methodFlag);
1190
1191						If your $cvals() are not a flat sequence, you will probably need to do some index-twiddling
1192						to get things into the proper shape:
1193
1194						if ( !all($cvals==$cvals->sequence) \|\| $cvals->dim(0) != $k )
1195						{
1196						my $cdmat0 = $cdmat;
1197						my $nr = $rowids->dim(0);
1198						$cdmat = pdl(double,"inf")->slice("$k,$nr")->make_physical(); ##-- "missing" distances are infinite
1199						$cdmat->dice_axis(0,$cvals) .= $cdmat0;
1200						}
1201
1202						$distFlag and $methodFlag are interpreted as for clusterdistance().
1203
1204						See also clusterenc(), clusterdistancematrix().
1205
1206
1207						=for bad
1208
1209						clusterdistancematrixenc does not process bad values.
1210						It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles.
1211
1212
1213						=cut
1214
1215
1216
1217
1218
1219
1220						*clusterdistancematrixenc = \&PDL::clusterdistancematrixenc;
1221
1222
1223
1224
1225
1226						=head2 clusterdistancesenc
1227
1228						=for sig
1229
1230						Signature: (
1231						double data(d,n);
1232						int mask(d,n);
1233						double weight(d);
1234						int coffsets1(k1);
1235						int crowids1(nc1);
1236						int cwhich1(ncmps);
1237						int coffsets2(k2);
1238						int crowids2(nc2);
1239						int cwhich2(ncmps);
1240						double [o]dists(ncmps);
1241						;
1242						char *distFlag;
1243						char *methodFlag;
1244						)
1245
1246
1247						Computes cluster-distance between selected pairs of co-indexed clusters in ($cwhich1,$cwhich2).
1248						Cluster contents are passed as pairs ($coffsetsX(),$crowidsX()) as returned
1249						by the clusteroffsets() function.
1250
1251						$distFlag and $methodFlag are interpreted as for clusterdistance().
1252
1253						See also clusterenc(), clusterdistancematrixenc().
1254
1255
1256						=for bad
1257
1258						clusterdistancesenc does not process bad values.
1259						It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles.
1260
1261
1262						=cut
1263
1264
1265
1266
1267
1268
1269						*clusterdistancesenc = \&PDL::clusterdistancesenc;
1270
1271
1272
1273
1274
1275						=head2 getclusterwsum
1276
1277						=for sig
1278
1279						Signature: (
1280						double data(d,n);
1281						int mask(d,n);
1282						double clusterwts(k,n);
1283						double [o]cdata(d,k);
1284						int [o]cmask(d,k);
1285						)
1286
1287
1288						Find cluster centroids by weighted sum. This can be considered an
1289						expensive generalization of the getclustermean() and getclustermedian()
1290						functions. Here, the input PDLs $data() and $mask(), as well as the
1291						output PDL $cdata() are as for getclustermean(). The matrix $clusterwts()
1292						determines the relative weight of each data row in determining the
1293						centroid of each cluster, potentially useful for "fuzzy" clustering.
1294						The equation used to compute cluster means is:
1295
1296						$cdata(d,k) = sum_{n} $clusterwts(k,n) * $data(d,n) * $mask(d,n)
1297
1298						For centroids in the same range as data elements, $clusterwts()
1299						should sum to 1 over each column (k):
1300
1301						all($clusterwts->xchg(0,1)->sumover == 1)
1302
1303						getclustermean() can be simulated by instantiating $clusterwts() with
1304						a uniform distribution over cluster elements:
1305
1306						$clusterwts = zeroes($k,$n);
1307						$clusterwts->indexND(cat($clusterids, xvals($clusterids))->xchg(0,1)) .= 1;
1308						$clusterwts /= $clusterwts->xchg(0,1)->sumover;
1309						getclusterwsum($data,$mask, $clusterwts, $cdata=zeroes($d,$k));
1310
1311						Similarly, getclustermedian() can be simulated by setting $clusterwts() to
1312						1 for cluster medians and otherwise to 0. More sophisticated centroid
1313						discovery methods can be computed by this function by setting
1314						$clusterwts(k,n) to some estimate of the conditional probability
1315						of the datum at row $n given the cluster with index $k:
1316						p(Elt==n\|Cluster==k). One
1317						way to achieve such an estimate is to use (normalized inverses of) the
1318						singleton-row-to-cluster distances as output by clusterdistancematrix().
1319
1320
1321
1322						=for bad
1323
1324						getclusterwsum does not process bad values.
1325						It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles.
1326
1327
1328						=cut
1329
1330
1331
1332
1333
1334
1335						*getclusterwsum = \&PDL::getclusterwsum;
1336
1337
1338
1339
1340
1341						=head2 attachtonearest
1342
1343						=for sig
1344
1345						Signature: (
1346						double data(d,n);
1347						int mask(d,n);
1348						double weight(d);
1349						int rowids(nr);
1350						double cdata(d,k);
1351						int cmask(d,k);
1352						int [o]clusterids(nr);
1353						double [o]cdist(nr);
1354						;
1355						char *distFlag;
1356						char *methodFlag;
1357						)
1358
1359
1360						Assigns each specified data row to the nearest cluster centroid.
1361						Data elements are given by $data() and $mask(), feature weights are
1362						given by $weight(), as usual. Cluster centroids are defined by
1363						by $cdata() and $cmask(), and the indices of rows to be attached
1364						are given in the vector $rowids(). The output vector $clusterids()
1365						contains for each specified row index the identifier of the nearest
1366						cluster centroid. The vector $cdist() contains the distance to
1367						the best clusters.
1368
1369						See also: clusterdistancematrix(), attachtonearestd().
1370
1371
1372						=for bad
1373
1374						attachtonearest does not process bad values.
1375						It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles.
1376
1377
1378						=cut
1379
1380
1381
1382
1383
1384
1385						*attachtonearest = \&PDL::attachtonearest;
1386
1387
1388
1389
1390						=pod
1391
1392						=head2 attachtonearestd
1393
1394						=for sig
1395
1396						Signature: (
1397						double cdistmat(k,n);
1398						int rowids(nr);
1399						int [o]clusterids(nr);
1400						double [o]dists(nr);
1401						)
1402
1403						Assigns each specified data row to the nearest cluster centroid,
1404						as for attachtonearest(), given the datum-to-cluster distance
1405						matrix $cdistmat(). Currently just a wrapper for a few PDL calls.
1406						In scalar context returns $clusterids(), in list context returns
1407						the list ($clusterids(),$dists()).
1408
1409						=cut
1410
1411						sub attachtonearestd {
1412	0		0	1		my ($cdm,$rowids,$cids,$dists)=@_;
1413	0	0				$cids = zeroes(long, $rowids->dim(0)) if (!defined($cids));
1414	0	0				$dists = zeroes(double, $rowids->dim(0)) if (!defined($dists));
1415
1416						##-- dice matrix
1417	0					my $cdmr = $cdm->dice_axis(1,$rowids);
1418
1419						##-- get best
1420	0					$cdmr->minimum_ind($cids);
1421	0					$dists .= $cdmr->index($cids);
1422
1423	0	0				return wantarray ? ($cids,$dists) : $cids;
1424						}
1425
1426
1427
1428
1429
1430						=head2 checkprototypes
1431
1432						=for sig
1433
1434						Signature: (
1435						protos(k);
1436						[o]cprotos(k);
1437						byte [t]otmp(n);
1438						; int nsize => n)
1439
1440						(Deterministic)
1441
1442						Ensure that the assignment $protos() from $k objects to
1443						integer "prototype" indices in the range [0,$n( contains no repetitions of any
1444						of the $n possible prototype values. One use for this function is
1445						the restriction of (randomly generated) potential clustering solutions
1446						for $k clusters in which each cluster is represented by a
1447						"prototypical" element from a data sample of size $n.
1448
1449						Requires: $n >= $k.
1450
1451
1452						=for bad
1453
1454						checkprototypes does not process bad values.
1455						It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles.
1456
1457
1458						=cut
1459
1460
1461
1462
1463
1464
1465						*checkprototypes = \&PDL::checkprototypes;
1466
1467
1468
1469
1470
1471						=head2 checkpartitions
1472
1473						=for sig
1474
1475						Signature: (
1476						part(n);
1477						[o]cpart(n);
1478						[t]ptmp(k);
1479						; int ksize => k)
1480
1481						(Deterministic)
1482
1483						Ensure that the partitioning $part() of $n objects into $k bins
1484						(identified by integer values in the range [0,$k-1])
1485						contains at least one instance of each of the
1486						$k possible values. One use for this function is
1487						the restriction of (randomly generated) potential clustering solutions
1488						for $n elements into $k clusters to those which assign at least one
1489						element to each cluster.
1490
1491						Requires: $n >= $k.
1492
1493
1494						=for bad
1495
1496						checkpartitions does not process bad values.
1497						It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles.
1498
1499
1500						=cut
1501
1502
1503
1504
1505
1506
1507						*checkpartitions = \&PDL::checkpartitions;
1508
1509
1510
1511
1512						=pod
1513
1514						=head2 randomprototypes
1515
1516						=for sig
1517
1518						Signature: (int k; int n; [o]prototypes(k))
1519
1520						Generate a random set of $k prototype indices drawn from $n objects,
1521						ensuring that no object is used more than once. Calls checkprototypes().
1522
1523						See also: checkprototypes(), randomassign(), checkpartitions(), randompartition().
1524
1525						=cut
1526
1527						sub randomprototypes {
1528	0		0	1		my ($k,$n,$protos) = @_;
1529	0	0				$protos = zeroes(long, $k) if (!defined($protos));
1530	0					$protos .= PDL->random($k)*$n;
1531	0					checkprototypes($protos->inplace, $n);
1532	0					return $protos;
1533						}
1534
1535
1536
1537
1538						=pod
1539
1540						=head2 randompartition
1541
1542						=for sig
1543
1544						Signature: (int k; int n; [o]partition(n))
1545
1546						Generate a partitioning of $n objects into $k clusters,
1547						ensuring that every cluster contains at least one object.
1548						Calls checkpartitions().
1549						This method is identical in functionality to randomassign(),
1550						but may be faster if $k is significantly smaller than $n.
1551
1552						See also: randomassign(), checkpartitions(), checkprototypes(), randomprototypes().
1553
1554						=cut
1555
1556						sub randompartition {
1557	0		0	1		my ($k,$n,$part) = @_;
1558	0	0				$part = zeroes(long, $n) if (!defined($part));
1559	0					$part .= PDL->random($n)*$k;
1560	0					checkpartitions($part->inplace, $k);
1561	0					return $part;
1562						}
1563
1564
1565
1566
1567
1568						##---------------------------------------------------------------------
1569						=pod
1570
1571						=head1 COMMON ARGUMENTS
1572
1573						Many of the functions described above require one or
1574						more of the following parameters:
1575
1576						=over 4
1577
1578						=item d
1579
1580						The number of features defined for each data element.
1581
1582						=item n
1583
1584						The number of data elements to be clustered.
1585
1586						=item k
1587
1588						=item nclusters
1589
1590						The number of desired clusters.
1591
1592						=item data(d,n)
1593
1594						A matrix representing the data to be clustered, double-valued.
1595
1596						=item mask(d,n)
1597
1598						A matrix indicating which data values are missing. If
1599						mask(i,j) == 0, then data(i,j) is treated as missing.
1600
1601						=item weights(d)
1602
1603						The (feature-) weights that are used to calculate the distance.
1604
1605						B Not all distance metrics make use of weights;
1606						you must provide some nonetheless.
1607
1608						=item clusterids(n)
1609
1610						A clustering solution. $clusterids() maps data elements
1611						(row indices in $data()) to values in the range [0,$k-1].
1612
1613						=back
1614
1615						=cut
1616
1617						##---------------------------------------------------------------------
1618						=pod
1619
1620						=head2 Distance Metrics
1621
1622						Distances between data elements (and cluster centroids, where applicable)
1623						are computed using one of a number of built-in metrics. Which metric
1624						is to be used for a given computation is indicated by a character
1625						flag denoted above with $distFlag(). In the following, w[i] represents
1626						a weighting factor in the $weights() matrix, and $W represents the total
1627						of all weights.
1628
1629						Currently implemented distance
1630						metrics and the corresponding flags are:
1631
1632						=over 4
1633
1634						=item e
1635
1636						Pseudo-Euclidean distance:
1637
1638						dist_e(x,y) = 1/W * sum_{i=1..d} w[i] * (x[i] - y[i])^2
1639
1640						Note that this is not the "true" Euclidean distance, which is defined as:
1641
1642						dist_E(x,y) = sqrt( sum_{i=1..d} (x[i] - y[i])^2 )
1643
1644
1645						=item b
1646
1647						City-block ("Manhattan") distance:
1648
1649						dist_b(x,y) = 1/W * sum_{i=1..d} w[i] * \|x[i] - y[i]\|
1650
1651
1652
1653						=item c
1654
1655						Pearson correlation distance:
1656
1657						dist_c(x,y) = 1-r(x,y)
1658
1659						where r is the Pearson correlation coefficient:
1660
1661						r(x,y) = 1/d * sum_{i=1..d} (x[i]-mean(x))/stddev(x) * (y[i]-mean(y))/stddev(y)
1662
1663						=item a
1664
1665						Absolute value of the correlation,
1666
1667						dist_a(x,y) = 1-\|r(x,y)\|
1668
1669						where r(x,y) is the Pearson correlation coefficient.
1670
1671						=item u
1672
1673						Uncentered correlation (cosine of the angle):
1674
1675						dist_u(x,y) = 1-r_u(x,y)
1676
1677						where:
1678
1679						r_u(x,y) = 1/d * sum_{i=1..d} (x[i]/sigma0(x)) * (y[i]/sigma0(y))
1680
1681						and:
1682
1683						sigma0(w) = sqrt( 1/d * sum_{i=1..d} w[i]^2 )
1684
1685						=item x
1686
1687						Absolute uncentered correlation,
1688
1689						dist_x(x,y) = 1-\|r_u(x,y)\|
1690
1691						=item s
1692
1693						Spearman's rank correlation.
1694
1695						dist_s(x,y) = 1-r_s(x,y) ~= dist_c(ranks(x),ranks(y))
1696
1697						where r_s(x,y) is the Spearman rank correlation. Weights are ignored.
1698
1699						=item k
1700
1701						Kendall's tau (does not use weights).
1702
1703						dist_k(x,y) = 1 - tau(x,y)
1704
1705						=item (other values)
1706
1707						For other values of dist, the default (Euclidean distance) is used.
1708
1709						=back
1710
1711						=cut
1712
1713
1714						##---------------------------------------------------------------------
1715						=pod
1716
1717						=head2 Link Methods
1718
1719						For hierarchical clustering, the 'link method' must be specified
1720						by a character flag, denoted above as $methodFlag.
1721						Known link methods are:
1722
1723						=over 4
1724
1725						=item s
1726
1727						Pairwise minimum-linkage ("single") clustering.
1728
1729						Defines the distance between two clusters as the
1730						least distance between any two of their respective elements.
1731
1732						=item m
1733
1734						Pairwise maximum-linkage ("complete") clustering.
1735
1736						Defines the distance between two clusters as the
1737						greatest distance between any two of their respective elements.
1738
1739						=item a
1740
1741						Pairwise average-linkage clustering (centroid distance using arithmetic mean).
1742
1743						Defines the distance between two clusters as the
1744						distance between their respective centroids, where each
1745						cluster centroid is defined as the arithmetic mean of
1746						that cluster's elements.
1747
1748						=item c
1749
1750						Pairwise centroid-linkage clustering (centroid distance using median).
1751
1752						Identifies the distance between two clusters as the
1753						distance between their respective centroids, where each
1754						cluster centroid is computed as the median of
1755						that cluster's elements.
1756
1757						=item (other values)
1758
1759						Behavior for other values is currently undefined.
1760
1761						=back
1762
1763						For the first three, either the distance matrix or the gene expression data is
1764						sufficient to perform the clustering algorithm. For pairwise centroid-linkage
1765						clustering, however, the gene expression data are always needed, even if the
1766						distance matrix itself is available.
1767
1768						=cut
1769
1770						##---------------------------------------------------------------------
1771						=pod
1772
1773						=head1 ACKNOWLEDGEMENTS
1774
1775						Perl by Larry Wall.
1776
1777						PDL by Karl Glazebrook, Tuomas J. Lukka, Christian Soeller, and others.
1778
1779						C Clustering Library by
1780						Michiel de Hoon,
1781						Seiya Imoto,
1782						and Satoru Miyano.
1783
1784						Orignal Algorithm::Cluster module by John Nolan and Michiel de Hoon.
1785
1786						=cut
1787
1788						##----------------------------------------------------------------------
1789						=pod
1790
1791						=head1 KNOWN BUGS
1792
1793						Dimensional requirements are sometimes too strict.
1794
1795						Passing weights to Spearman and Kendall link methods wastes space.
1796
1797						=cut
1798
1799
1800						##---------------------------------------------------------------------
1801						=pod
1802
1803						=head1 AUTHOR
1804
1805						Bryan Jurish Emoocow@cpan.orgE wrote and maintains the PDL::Cluster distribution.
1806
1807						Michiel de Hoon wrote the underlying C clustering library for cDNA microarray data.
1808
1809						=head1 COPYRIGHT
1810
1811						PDL::Cluster is a set of wrappers around the C Clustering library for cDNA microarray data.
1812
1813						=over 4
1814
1815						=item *
1816
1817						The C clustering library for cDNA microarray data.
1818						Copyright (C) 2002-2005 Michiel Jan Laurens de Hoon.
1819
1820						This library was written at the Laboratory of DNA Information Analysis,
1821						Human Genome Center, Institute of Medical Science, University of Tokyo,
1822						4-6-1 Shirokanedai, Minato-ku, Tokyo 108-8639, Japan.
1823						Contact: michiel.dehoon 'AT' riken.jp
1824
1825						See the files F, F and F in the PDL::Cluster distribution
1826						for details.
1827
1828						=item *
1829
1830						PDL::Cluster wrappers copyright (C) Bryan Jurish 2005-2018. All rights reserved.
1831						This package is free software, and entirely without warranty.
1832						You may redistribute it and/or modify it under the same terms
1833						as Perl itself.
1834
1835						=back
1836
1837						=head1 SEE ALSO
1838
1839						perl(1), PDL(3perl), Algorithm::Cluster(3perl), cluster(1),
1840						L
1841
1842						=cut
1843
1844
1845
1846						;
1847
1848
1849
1850						# Exit with OK status
1851
1852						1;
1853
1854