line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
|
2
|
|
|
|
|
|
|
# |
3
|
|
|
|
|
|
|
# GENERATED WITH PDL::PP! Don't modify! |
4
|
|
|
|
|
|
|
# |
5
|
|
|
|
|
|
|
package PDL::Cluster; |
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
@EXPORT_OK = qw( PDL::PP cmean PDL::PP cmedian PDL::PP calculate_weights PDL::PP clusterdistance PDL::PP distancematrix PDL::PP getclustercentroids getclustermean getclustermedian PDL::PP getclustermedoids PDL::PP kcluster PDL::PP kmedoids PDL::PP treecluster PDL::PP treeclusterd PDL::PP cuttree PDL::PP somcluster PDL::PP pca PDL::PP rowdistances PDL::PP clusterdistances PDL::PP clustersizes PDL::PP clusterelements PDL::PP clusterelementmask PDL::PP clusterdistancematrix clusterenc clusterdec clusteroffsets PDL::PP clusterdistancematrixenc PDL::PP clusterdistancesenc PDL::PP getclusterwsum PDL::PP attachtonearest attachtonearestd PDL::PP checkprototypes PDL::PP checkpartitions randomprototypes randompartition ); |
8
|
|
|
|
|
|
|
%EXPORT_TAGS = (Func=>[@EXPORT_OK]); |
9
|
|
|
|
|
|
|
|
10
|
9
|
|
|
9
|
|
2212547
|
use PDL::Core; |
|
9
|
|
|
|
|
36
|
|
|
9
|
|
|
|
|
51
|
|
11
|
9
|
|
|
9
|
|
2003
|
use PDL::Exporter; |
|
9
|
|
|
|
|
17
|
|
|
9
|
|
|
|
|
46
|
|
12
|
9
|
|
|
9
|
|
204
|
use DynaLoader; |
|
9
|
|
|
|
|
15
|
|
|
9
|
|
|
|
|
9251
|
|
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
$PDL::Cluster::VERSION = '1.54.001'; |
17
|
|
|
|
|
|
|
@ISA = ( 'PDL::Exporter','DynaLoader' ); |
18
|
|
|
|
|
|
|
push @PDL::Core::PP, __PACKAGE__; |
19
|
|
|
|
|
|
|
bootstrap PDL::Cluster $VERSION; |
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
#--------------------------------------------------------------------------- |
26
|
|
|
|
|
|
|
# File: PDL::Cluster.pm |
27
|
|
|
|
|
|
|
# Author: Bryan Jurish |
28
|
|
|
|
|
|
|
# Description: PDL wrappers for the C Clustering library. |
29
|
|
|
|
|
|
|
# |
30
|
|
|
|
|
|
|
# Copyright (c) 2005-2018 Bryan Jurish. All rights reserved. |
31
|
|
|
|
|
|
|
# This program is free software. You may modify and/or |
32
|
|
|
|
|
|
|
# distribute it under the same terms as Perl itself. |
33
|
|
|
|
|
|
|
# |
34
|
|
|
|
|
|
|
#--------------------------------------------------------------------------- |
35
|
|
|
|
|
|
|
# Based on the C clustering library for cDNA microarray data, |
36
|
|
|
|
|
|
|
# Copyright (C) 2002-2005 Michiel Jan Laurens de Hoon. |
37
|
|
|
|
|
|
|
# |
38
|
|
|
|
|
|
|
# The C clustering library was written at the Laboratory of DNA Information |
39
|
|
|
|
|
|
|
# Analysis, Human Genome Center, Institute of Medical Science, University of |
40
|
|
|
|
|
|
|
# Tokyo, 4-6-1 Shirokanedai, Minato-ku, Tokyo 108-8639, Japan. |
41
|
|
|
|
|
|
|
# Contact: michiel.dehoon 'AT' riken.jp |
42
|
|
|
|
|
|
|
# |
43
|
|
|
|
|
|
|
# See the files "cluster.c" and "cluster.h" in the PDL::Cluster distribution |
44
|
|
|
|
|
|
|
# for details. |
45
|
|
|
|
|
|
|
#--------------------------------------------------------------------------- |
46
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
=pod |
48
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
=head1 NAME |
50
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
PDL::Cluster - PDL interface to the C Clustering Library |
52
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
=head1 SYNOPSIS |
54
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
use PDL::Cluster; |
56
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
##----------------------------------------------------- |
58
|
|
|
|
|
|
|
## Data Format |
59
|
|
|
|
|
|
|
$d = 42; ##-- number of features |
60
|
|
|
|
|
|
|
$n = 1024; ##-- number of data elements |
61
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
$data = random($d,$n); ##-- data matrix |
63
|
|
|
|
|
|
|
$elt = $data->slice(",($i)"); ##-- element data vector |
64
|
|
|
|
|
|
|
$ftr = $data->slice("($j),"); ##-- feature vector over all elements |
65
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
$wts = ones($d)/$d; ##-- feature weights |
67
|
|
|
|
|
|
|
$msk = ones($d,$n); ##-- missing-datum mask (1=ok) |
68
|
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
##----------------------------------------------------- |
70
|
|
|
|
|
|
|
## Library Utilties |
71
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
$mean = $ftr->cmean(); |
73
|
|
|
|
|
|
|
$median = $ftr->cmedian(); |
74
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
calculate_weights($data,$msk,$wts, $cutoff,$expnt, |
76
|
|
|
|
|
|
|
$weights); |
77
|
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
##----------------------------------------------------- |
79
|
|
|
|
|
|
|
## Distance Functions |
80
|
|
|
|
|
|
|
|
81
|
|
|
|
|
|
|
clusterdistance($data,$msk,$wts, $n1,$n2,$idx1,$idx2, |
82
|
|
|
|
|
|
|
$dist, |
83
|
|
|
|
|
|
|
$distFlag, $methodFlag2); |
84
|
|
|
|
|
|
|
|
85
|
|
|
|
|
|
|
distancematrix($data,$msk,$wts, $distmat, $distFlag); |
86
|
|
|
|
|
|
|
|
87
|
|
|
|
|
|
|
##----------------------------------------------------- |
88
|
|
|
|
|
|
|
## Partitioning Algorithms |
89
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
getclustermean($data,$msk,$clusterids, |
91
|
|
|
|
|
|
|
$ctrdata, $ctrmask); |
92
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
getclustermedian($data,$msk,$clusterids, |
94
|
|
|
|
|
|
|
$ctrdata, $ctrmask); |
95
|
|
|
|
|
|
|
|
96
|
|
|
|
|
|
|
getclustermedoid($distmat,$clusterids,$centroids, |
97
|
|
|
|
|
|
|
$errorsums); |
98
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
kcluster($k, $data,$msk,$wts, $npass, |
100
|
|
|
|
|
|
|
$clusterids, $error, $nfound, |
101
|
|
|
|
|
|
|
$distFlag, $methodFlag); |
102
|
|
|
|
|
|
|
|
103
|
|
|
|
|
|
|
kmedoids($k, $distmat,$npass, |
104
|
|
|
|
|
|
|
$clusterids, $error, $nfound); |
105
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
##----------------------------------------------------- |
107
|
|
|
|
|
|
|
## Hierarchical Algorithms |
108
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
treecluster($data,$msk,$wts, |
110
|
|
|
|
|
|
|
$tree, $lnkdist, |
111
|
|
|
|
|
|
|
$distFlag, $methodFlag); |
112
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
treeclusterd($data,$msk,$wts, $distmat, |
114
|
|
|
|
|
|
|
$tree, $lnkdist, |
115
|
|
|
|
|
|
|
$distFlag, $methodFlag); |
116
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
cuttree($tree, $nclusters, |
118
|
|
|
|
|
|
|
$clusterids); |
119
|
|
|
|
|
|
|
|
120
|
|
|
|
|
|
|
##----------------------------------------------------- |
121
|
|
|
|
|
|
|
## Self-Organizing Maps |
122
|
|
|
|
|
|
|
|
123
|
|
|
|
|
|
|
somcluster($data,$msk,$wts, $nx,$ny,$tau,$niter, |
124
|
|
|
|
|
|
|
$clusterids, |
125
|
|
|
|
|
|
|
$distFlag); |
126
|
|
|
|
|
|
|
|
127
|
|
|
|
|
|
|
##----------------------------------------------------- |
128
|
|
|
|
|
|
|
## Principal Component Analysis |
129
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
pca($U, $S, $V); |
131
|
|
|
|
|
|
|
|
132
|
|
|
|
|
|
|
##----------------------------------------------------- |
133
|
|
|
|
|
|
|
## Extensions |
134
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
rowdistances($data,$msk,$wts, $rowids1,$rowids2, $distvec, $distFlag); |
136
|
|
|
|
|
|
|
clusterdistances($data,$msk,$wts, $rowids, $index2, |
137
|
|
|
|
|
|
|
$dist, |
138
|
|
|
|
|
|
|
$distFlag, $methodFlag); |
139
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
clustersizes($clusterids, $clustersizes); |
141
|
|
|
|
|
|
|
clusterelements($clustierids, $clustersizes, $eltids); |
142
|
|
|
|
|
|
|
clusterelementmask($clusterids, $eltmask); |
143
|
|
|
|
|
|
|
|
144
|
|
|
|
|
|
|
clusterdistancematrix($data,$msk,$wts, |
145
|
|
|
|
|
|
|
$rowids, $clustersizes, $eltids, |
146
|
|
|
|
|
|
|
$dist, |
147
|
|
|
|
|
|
|
$distFlag, $methodFlag); |
148
|
|
|
|
|
|
|
|
149
|
|
|
|
|
|
|
clusterenc($clusterids, $clens,$cvals,$crowids, $k); |
150
|
|
|
|
|
|
|
clusterdec($clens,$cvals,$crowids, $clusterids, $k); |
151
|
|
|
|
|
|
|
clusteroffsets($clusterids, $coffsets,$cvals,$crowids, $k); |
152
|
|
|
|
|
|
|
clusterdistancematrixenc($data,$msk,$wts, |
153
|
|
|
|
|
|
|
$clens1,$crowids1, $clens2,$crowids2, |
154
|
|
|
|
|
|
|
$dist, |
155
|
|
|
|
|
|
|
$distFlag, $methodFlag); |
156
|
|
|
|
|
|
|
|
157
|
|
|
|
|
|
|
=cut |
158
|
|
|
|
|
|
|
|
159
|
|
|
|
|
|
|
|
160
|
|
|
|
|
|
|
|
161
|
|
|
|
|
|
|
|
162
|
|
|
|
|
|
|
|
163
|
|
|
|
|
|
|
|
164
|
|
|
|
|
|
|
|
165
|
|
|
|
|
|
|
=head1 FUNCTIONS |
166
|
|
|
|
|
|
|
|
167
|
|
|
|
|
|
|
|
168
|
|
|
|
|
|
|
|
169
|
|
|
|
|
|
|
=cut |
170
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
|
172
|
|
|
|
|
|
|
|
173
|
|
|
|
|
|
|
|
174
|
|
|
|
|
|
|
|
175
|
|
|
|
|
|
|
|
176
|
|
|
|
|
|
|
=head2 cmean |
177
|
|
|
|
|
|
|
|
178
|
|
|
|
|
|
|
=for sig |
179
|
|
|
|
|
|
|
|
180
|
|
|
|
|
|
|
Signature: (double a(n); double [o]b()) |
181
|
|
|
|
|
|
|
|
182
|
|
|
|
|
|
|
=for ref |
183
|
|
|
|
|
|
|
|
184
|
|
|
|
|
|
|
Computes arithmetic mean of the vector $a(). See also PDL::Primitive::avg(). |
185
|
|
|
|
|
|
|
|
186
|
|
|
|
|
|
|
=for bad |
187
|
|
|
|
|
|
|
|
188
|
|
|
|
|
|
|
cmean does not process bad values. |
189
|
|
|
|
|
|
|
It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles. |
190
|
|
|
|
|
|
|
|
191
|
|
|
|
|
|
|
|
192
|
|
|
|
|
|
|
=cut |
193
|
|
|
|
|
|
|
|
194
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
|
196
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
|
198
|
|
|
|
|
|
|
|
199
|
|
|
|
|
|
|
*cmean = \&PDL::cmean; |
200
|
|
|
|
|
|
|
|
201
|
|
|
|
|
|
|
|
202
|
|
|
|
|
|
|
|
203
|
|
|
|
|
|
|
|
204
|
|
|
|
|
|
|
|
205
|
|
|
|
|
|
|
=head2 cmedian |
206
|
|
|
|
|
|
|
|
207
|
|
|
|
|
|
|
=for sig |
208
|
|
|
|
|
|
|
|
209
|
|
|
|
|
|
|
Signature: (double a(n); double [o]b()) |
210
|
|
|
|
|
|
|
|
211
|
|
|
|
|
|
|
=for ref |
212
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
Computes median of the vector $a(). See also PDL::Primitive::median(). |
214
|
|
|
|
|
|
|
|
215
|
|
|
|
|
|
|
=for bad |
216
|
|
|
|
|
|
|
|
217
|
|
|
|
|
|
|
cmedian does not process bad values. |
218
|
|
|
|
|
|
|
It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles. |
219
|
|
|
|
|
|
|
|
220
|
|
|
|
|
|
|
|
221
|
|
|
|
|
|
|
=cut |
222
|
|
|
|
|
|
|
|
223
|
|
|
|
|
|
|
|
224
|
|
|
|
|
|
|
|
225
|
|
|
|
|
|
|
|
226
|
|
|
|
|
|
|
|
227
|
|
|
|
|
|
|
|
228
|
|
|
|
|
|
|
*cmedian = \&PDL::cmedian; |
229
|
|
|
|
|
|
|
|
230
|
|
|
|
|
|
|
|
231
|
|
|
|
|
|
|
|
232
|
|
|
|
|
|
|
|
233
|
|
|
|
|
|
|
|
234
|
|
|
|
|
|
|
=head2 calculate_weights |
235
|
|
|
|
|
|
|
|
236
|
|
|
|
|
|
|
=for sig |
237
|
|
|
|
|
|
|
|
238
|
|
|
|
|
|
|
Signature: ( |
239
|
|
|
|
|
|
|
double data(d,n); |
240
|
|
|
|
|
|
|
int mask(d,n); |
241
|
|
|
|
|
|
|
double weight(d); |
242
|
|
|
|
|
|
|
double cutoff(); |
243
|
|
|
|
|
|
|
double exponent(); |
244
|
|
|
|
|
|
|
double [o]oweights(d); |
245
|
|
|
|
|
|
|
; char *distFlag; |
246
|
|
|
|
|
|
|
) |
247
|
|
|
|
|
|
|
|
248
|
|
|
|
|
|
|
|
249
|
|
|
|
|
|
|
This function calculates weights for the features using the weighting scheme |
250
|
|
|
|
|
|
|
proposed by Michael Eisen: |
251
|
|
|
|
|
|
|
|
252
|
|
|
|
|
|
|
w[i] = 1.0 / sum_{j where dist(i,j)
|
253
|
|
|
|
|
|
|
|
254
|
|
|
|
|
|
|
where the cutoff and the exponent are specified by the user. |
255
|
|
|
|
|
|
|
|
256
|
|
|
|
|
|
|
|
257
|
|
|
|
|
|
|
=for bad |
258
|
|
|
|
|
|
|
|
259
|
|
|
|
|
|
|
calculate_weights does not process bad values. |
260
|
|
|
|
|
|
|
It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles. |
261
|
|
|
|
|
|
|
|
262
|
|
|
|
|
|
|
|
263
|
|
|
|
|
|
|
=cut |
264
|
|
|
|
|
|
|
|
265
|
|
|
|
|
|
|
|
266
|
|
|
|
|
|
|
|
267
|
|
|
|
|
|
|
|
268
|
|
|
|
|
|
|
|
269
|
|
|
|
|
|
|
|
270
|
|
|
|
|
|
|
*calculate_weights = \&PDL::calculate_weights; |
271
|
|
|
|
|
|
|
|
272
|
|
|
|
|
|
|
|
273
|
|
|
|
|
|
|
|
274
|
|
|
|
|
|
|
|
275
|
|
|
|
|
|
|
|
276
|
|
|
|
|
|
|
=head2 clusterdistance |
277
|
|
|
|
|
|
|
|
278
|
|
|
|
|
|
|
=for sig |
279
|
|
|
|
|
|
|
|
280
|
|
|
|
|
|
|
Signature: ( |
281
|
|
|
|
|
|
|
double data(d,n); |
282
|
|
|
|
|
|
|
int mask(d,n); |
283
|
|
|
|
|
|
|
double weight(d); |
284
|
|
|
|
|
|
|
int n1(); |
285
|
|
|
|
|
|
|
int n2(); |
286
|
|
|
|
|
|
|
int index1(n1); |
287
|
|
|
|
|
|
|
int index2(n2); |
288
|
|
|
|
|
|
|
double [o]dist(); |
289
|
|
|
|
|
|
|
; |
290
|
|
|
|
|
|
|
char *distFlag; |
291
|
|
|
|
|
|
|
char *methodFlag; |
292
|
|
|
|
|
|
|
) |
293
|
|
|
|
|
|
|
|
294
|
|
|
|
|
|
|
|
295
|
|
|
|
|
|
|
Computes distance between two clusters $index1() and $index2(). |
296
|
|
|
|
|
|
|
Each of the $index() vectors represents a single cluster whose values |
297
|
|
|
|
|
|
|
are the row-indices in the $data() matrix of the elements assigned |
298
|
|
|
|
|
|
|
to the respective cluster. $n1() and $n2() are the number of elements |
299
|
|
|
|
|
|
|
in $index1() and $index2(), respectively. Each $index$i() must have |
300
|
|
|
|
|
|
|
at least $n$i() elements allocated. |
301
|
|
|
|
|
|
|
|
302
|
|
|
|
|
|
|
B the $methodFlag argument is interpreted differently than |
303
|
|
|
|
|
|
|
by the treecluster() method, namely: |
304
|
|
|
|
|
|
|
|
305
|
|
|
|
|
|
|
=over 4 |
306
|
|
|
|
|
|
|
|
307
|
|
|
|
|
|
|
=item a |
308
|
|
|
|
|
|
|
|
309
|
|
|
|
|
|
|
Distance between the arithmetic means of the two clusters, |
310
|
|
|
|
|
|
|
as for treecluster() "f". |
311
|
|
|
|
|
|
|
|
312
|
|
|
|
|
|
|
=item m |
313
|
|
|
|
|
|
|
|
314
|
|
|
|
|
|
|
Distance between the medians of the two clusters, |
315
|
|
|
|
|
|
|
as for treecluster() "c". |
316
|
|
|
|
|
|
|
|
317
|
|
|
|
|
|
|
=item s |
318
|
|
|
|
|
|
|
|
319
|
|
|
|
|
|
|
Minimum pairwise distance between members of the two clusters, |
320
|
|
|
|
|
|
|
as for treecluster() "s". |
321
|
|
|
|
|
|
|
|
322
|
|
|
|
|
|
|
=item x |
323
|
|
|
|
|
|
|
|
324
|
|
|
|
|
|
|
Maximum pairwise distance between members of the two clusters |
325
|
|
|
|
|
|
|
as for treecluster() "m". |
326
|
|
|
|
|
|
|
|
327
|
|
|
|
|
|
|
=item v |
328
|
|
|
|
|
|
|
|
329
|
|
|
|
|
|
|
Average of the pairwise distances between members of the two clusters, |
330
|
|
|
|
|
|
|
as for treecluster() "a". |
331
|
|
|
|
|
|
|
|
332
|
|
|
|
|
|
|
=back |
333
|
|
|
|
|
|
|
|
334
|
|
|
|
|
|
|
|
335
|
|
|
|
|
|
|
|
336
|
|
|
|
|
|
|
=for bad |
337
|
|
|
|
|
|
|
|
338
|
|
|
|
|
|
|
clusterdistance does not process bad values. |
339
|
|
|
|
|
|
|
It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles. |
340
|
|
|
|
|
|
|
|
341
|
|
|
|
|
|
|
|
342
|
|
|
|
|
|
|
=cut |
343
|
|
|
|
|
|
|
|
344
|
|
|
|
|
|
|
|
345
|
|
|
|
|
|
|
|
346
|
|
|
|
|
|
|
|
347
|
|
|
|
|
|
|
|
348
|
|
|
|
|
|
|
|
349
|
|
|
|
|
|
|
*clusterdistance = \&PDL::clusterdistance; |
350
|
|
|
|
|
|
|
|
351
|
|
|
|
|
|
|
|
352
|
|
|
|
|
|
|
|
353
|
|
|
|
|
|
|
|
354
|
|
|
|
|
|
|
|
355
|
|
|
|
|
|
|
=head2 distancematrix |
356
|
|
|
|
|
|
|
|
357
|
|
|
|
|
|
|
=for sig |
358
|
|
|
|
|
|
|
|
359
|
|
|
|
|
|
|
Signature: ( |
360
|
|
|
|
|
|
|
double data(d,n); |
361
|
|
|
|
|
|
|
int mask(d,n); |
362
|
|
|
|
|
|
|
double weight(d); |
363
|
|
|
|
|
|
|
double [o]dists(n,n); |
364
|
|
|
|
|
|
|
; char *distFlag; |
365
|
|
|
|
|
|
|
) |
366
|
|
|
|
|
|
|
|
367
|
|
|
|
|
|
|
=for ref |
368
|
|
|
|
|
|
|
|
369
|
|
|
|
|
|
|
Compute triangular distance matrix over all data points. |
370
|
|
|
|
|
|
|
|
371
|
|
|
|
|
|
|
=for bad |
372
|
|
|
|
|
|
|
|
373
|
|
|
|
|
|
|
distancematrix does not process bad values. |
374
|
|
|
|
|
|
|
It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles. |
375
|
|
|
|
|
|
|
|
376
|
|
|
|
|
|
|
|
377
|
|
|
|
|
|
|
=cut |
378
|
|
|
|
|
|
|
|
379
|
|
|
|
|
|
|
|
380
|
|
|
|
|
|
|
|
381
|
|
|
|
|
|
|
|
382
|
|
|
|
|
|
|
|
383
|
|
|
|
|
|
|
|
384
|
|
|
|
|
|
|
*distancematrix = \&PDL::distancematrix; |
385
|
|
|
|
|
|
|
|
386
|
|
|
|
|
|
|
|
387
|
|
|
|
|
|
|
|
388
|
|
|
|
|
|
|
|
389
|
|
|
|
|
|
|
|
390
|
|
|
|
|
|
|
=head2 getclustercentroids |
391
|
|
|
|
|
|
|
|
392
|
|
|
|
|
|
|
=for sig |
393
|
|
|
|
|
|
|
|
394
|
|
|
|
|
|
|
Signature: ( |
395
|
|
|
|
|
|
|
double data(d,n); |
396
|
|
|
|
|
|
|
int mask(d,n); |
397
|
|
|
|
|
|
|
int clusterids(n); |
398
|
|
|
|
|
|
|
double [o]cdata(d,k); |
399
|
|
|
|
|
|
|
int [o]cmask(d,k); |
400
|
|
|
|
|
|
|
; char *ctrMethodFlag; |
401
|
|
|
|
|
|
|
) |
402
|
|
|
|
|
|
|
|
403
|
|
|
|
|
|
|
=for ref |
404
|
|
|
|
|
|
|
|
405
|
|
|
|
|
|
|
Find cluster centroids by arithmetic mean (C) or median over each dimension (C). |
406
|
|
|
|
|
|
|
|
407
|
|
|
|
|
|
|
=for bad |
408
|
|
|
|
|
|
|
|
409
|
|
|
|
|
|
|
getclustercentroids does not process bad values. |
410
|
|
|
|
|
|
|
It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles. |
411
|
|
|
|
|
|
|
|
412
|
|
|
|
|
|
|
|
413
|
|
|
|
|
|
|
=cut |
414
|
|
|
|
|
|
|
|
415
|
|
|
|
|
|
|
|
416
|
|
|
|
|
|
|
|
417
|
|
|
|
|
|
|
|
418
|
|
|
|
|
|
|
|
419
|
|
|
|
|
|
|
|
420
|
|
|
|
|
|
|
*getclustercentroids = \&PDL::getclustercentroids; |
421
|
|
|
|
|
|
|
|
422
|
|
|
|
|
|
|
|
423
|
|
|
|
|
|
|
|
424
|
|
|
|
|
|
|
|
425
|
|
|
|
|
|
|
=pod |
426
|
|
|
|
|
|
|
|
427
|
|
|
|
|
|
|
=head2 getclustermean |
428
|
|
|
|
|
|
|
|
429
|
|
|
|
|
|
|
=for sig |
430
|
|
|
|
|
|
|
|
431
|
|
|
|
|
|
|
Signature: ( |
432
|
|
|
|
|
|
|
double data(d,n); |
433
|
|
|
|
|
|
|
int mask(d,n); |
434
|
|
|
|
|
|
|
int clusterids(n); |
435
|
|
|
|
|
|
|
double [o]cdata(d,k); |
436
|
|
|
|
|
|
|
int [o]cmask(d,k); |
437
|
|
|
|
|
|
|
) |
438
|
|
|
|
|
|
|
|
439
|
|
|
|
|
|
|
Really just a wrapper for getclustercentroids(...,"a"). |
440
|
|
|
|
|
|
|
|
441
|
|
|
|
|
|
|
=cut |
442
|
|
|
|
|
|
|
|
443
|
|
|
|
|
|
|
sub getclustermean { |
444
|
0
|
|
|
0
|
1
|
|
my ($data,$mask,$cids,$cdata,$cmask) = @_; |
445
|
0
|
|
|
|
|
|
return getclustercentroids($dat,$mask,$cids,$cdata,$cmask,'a'); |
446
|
|
|
|
|
|
|
} |
447
|
|
|
|
|
|
|
|
448
|
|
|
|
|
|
|
|
449
|
|
|
|
|
|
|
|
450
|
|
|
|
|
|
|
|
451
|
|
|
|
|
|
|
=pod |
452
|
|
|
|
|
|
|
|
453
|
|
|
|
|
|
|
=head2 getclustermedian |
454
|
|
|
|
|
|
|
|
455
|
|
|
|
|
|
|
=for sig |
456
|
|
|
|
|
|
|
|
457
|
|
|
|
|
|
|
Signature: ( |
458
|
|
|
|
|
|
|
double data(d,n); |
459
|
|
|
|
|
|
|
int mask(d,n); |
460
|
|
|
|
|
|
|
int clusterids(n); |
461
|
|
|
|
|
|
|
double [o]cdata(d,k); |
462
|
|
|
|
|
|
|
int [o]cmask(d,k); |
463
|
|
|
|
|
|
|
) |
464
|
|
|
|
|
|
|
|
465
|
|
|
|
|
|
|
Really just a wrapper for getclustercentroids(...,"m"). |
466
|
|
|
|
|
|
|
|
467
|
|
|
|
|
|
|
=cut |
468
|
|
|
|
|
|
|
|
469
|
|
|
|
|
|
|
sub getclustermedian { |
470
|
0
|
|
|
0
|
1
|
|
my ($data,$mask,$cids,$cdata,$cmask) = @_; |
471
|
0
|
|
|
|
|
|
return getclustercentroids($dat,$mask,$cids,$cdata,$cmask,'m'); |
472
|
|
|
|
|
|
|
} |
473
|
|
|
|
|
|
|
|
474
|
|
|
|
|
|
|
|
475
|
|
|
|
|
|
|
|
476
|
|
|
|
|
|
|
|
477
|
|
|
|
|
|
|
|
478
|
|
|
|
|
|
|
=head2 getclustermedoids |
479
|
|
|
|
|
|
|
|
480
|
|
|
|
|
|
|
=for sig |
481
|
|
|
|
|
|
|
|
482
|
|
|
|
|
|
|
Signature: ( |
483
|
|
|
|
|
|
|
double distance(n,n); |
484
|
|
|
|
|
|
|
int clusterids(n); |
485
|
|
|
|
|
|
|
int [o]centroids(k); |
486
|
|
|
|
|
|
|
double [o]errors(k); |
487
|
|
|
|
|
|
|
) |
488
|
|
|
|
|
|
|
|
489
|
|
|
|
|
|
|
The getclustermedoid routine calculates the cluster centroids, given to which |
490
|
|
|
|
|
|
|
cluster each element belongs. The centroid is defined as the element with the |
491
|
|
|
|
|
|
|
smallest sum of distances to the other elements. |
492
|
|
|
|
|
|
|
|
493
|
|
|
|
|
|
|
|
494
|
|
|
|
|
|
|
=for bad |
495
|
|
|
|
|
|
|
|
496
|
|
|
|
|
|
|
getclustermedoids does not process bad values. |
497
|
|
|
|
|
|
|
It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles. |
498
|
|
|
|
|
|
|
|
499
|
|
|
|
|
|
|
|
500
|
|
|
|
|
|
|
=cut |
501
|
|
|
|
|
|
|
|
502
|
|
|
|
|
|
|
|
503
|
|
|
|
|
|
|
|
504
|
|
|
|
|
|
|
|
505
|
|
|
|
|
|
|
|
506
|
|
|
|
|
|
|
|
507
|
|
|
|
|
|
|
*getclustermedoids = \&PDL::getclustermedoids; |
508
|
|
|
|
|
|
|
|
509
|
|
|
|
|
|
|
|
510
|
|
|
|
|
|
|
|
511
|
|
|
|
|
|
|
|
512
|
|
|
|
|
|
|
|
513
|
|
|
|
|
|
|
=head2 kcluster |
514
|
|
|
|
|
|
|
|
515
|
|
|
|
|
|
|
=for sig |
516
|
|
|
|
|
|
|
|
517
|
|
|
|
|
|
|
Signature: ( |
518
|
|
|
|
|
|
|
int nclusters(); |
519
|
|
|
|
|
|
|
double data(d,n); |
520
|
|
|
|
|
|
|
int mask(d,n); |
521
|
|
|
|
|
|
|
double weight(d); |
522
|
|
|
|
|
|
|
int npass(); |
523
|
|
|
|
|
|
|
int [o]clusterids(n); |
524
|
|
|
|
|
|
|
double [o]error(); |
525
|
|
|
|
|
|
|
int [o]nfound(); |
526
|
|
|
|
|
|
|
; |
527
|
|
|
|
|
|
|
char *distFlag; |
528
|
|
|
|
|
|
|
char *ctrMethodFlag; |
529
|
|
|
|
|
|
|
) |
530
|
|
|
|
|
|
|
|
531
|
|
|
|
|
|
|
K-Means clustering algorithm. The "ctrMethodFlag" determines how |
532
|
|
|
|
|
|
|
clusters centroids are to be computed; see getclustercentroids() for details. |
533
|
|
|
|
|
|
|
|
534
|
|
|
|
|
|
|
See also: kmedoids(). |
535
|
|
|
|
|
|
|
|
536
|
|
|
|
|
|
|
|
537
|
|
|
|
|
|
|
=for bad |
538
|
|
|
|
|
|
|
|
539
|
|
|
|
|
|
|
kcluster does not process bad values. |
540
|
|
|
|
|
|
|
It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles. |
541
|
|
|
|
|
|
|
|
542
|
|
|
|
|
|
|
|
543
|
|
|
|
|
|
|
=cut |
544
|
|
|
|
|
|
|
|
545
|
|
|
|
|
|
|
|
546
|
|
|
|
|
|
|
|
547
|
|
|
|
|
|
|
|
548
|
|
|
|
|
|
|
|
549
|
|
|
|
|
|
|
|
550
|
|
|
|
|
|
|
*kcluster = \&PDL::kcluster; |
551
|
|
|
|
|
|
|
|
552
|
|
|
|
|
|
|
|
553
|
|
|
|
|
|
|
|
554
|
|
|
|
|
|
|
|
555
|
|
|
|
|
|
|
|
556
|
|
|
|
|
|
|
=head2 kmedoids |
557
|
|
|
|
|
|
|
|
558
|
|
|
|
|
|
|
=for sig |
559
|
|
|
|
|
|
|
|
560
|
|
|
|
|
|
|
Signature: ( |
561
|
|
|
|
|
|
|
int nclusters(); |
562
|
|
|
|
|
|
|
double distance(n,n); |
563
|
|
|
|
|
|
|
int npass(); |
564
|
|
|
|
|
|
|
int [o]clusterids(n); |
565
|
|
|
|
|
|
|
double [o]error(); |
566
|
|
|
|
|
|
|
int [o]nfound(); |
567
|
|
|
|
|
|
|
) |
568
|
|
|
|
|
|
|
|
569
|
|
|
|
|
|
|
K-Medoids clustering algorithm (uses distance matrix). |
570
|
|
|
|
|
|
|
|
571
|
|
|
|
|
|
|
See also: kcluster(). |
572
|
|
|
|
|
|
|
|
573
|
|
|
|
|
|
|
|
574
|
|
|
|
|
|
|
=for bad |
575
|
|
|
|
|
|
|
|
576
|
|
|
|
|
|
|
kmedoids does not process bad values. |
577
|
|
|
|
|
|
|
It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles. |
578
|
|
|
|
|
|
|
|
579
|
|
|
|
|
|
|
|
580
|
|
|
|
|
|
|
=cut |
581
|
|
|
|
|
|
|
|
582
|
|
|
|
|
|
|
|
583
|
|
|
|
|
|
|
|
584
|
|
|
|
|
|
|
|
585
|
|
|
|
|
|
|
|
586
|
|
|
|
|
|
|
|
587
|
|
|
|
|
|
|
*kmedoids = \&PDL::kmedoids; |
588
|
|
|
|
|
|
|
|
589
|
|
|
|
|
|
|
|
590
|
|
|
|
|
|
|
|
591
|
|
|
|
|
|
|
|
592
|
|
|
|
|
|
|
|
593
|
|
|
|
|
|
|
=head2 treecluster |
594
|
|
|
|
|
|
|
|
595
|
|
|
|
|
|
|
=for sig |
596
|
|
|
|
|
|
|
|
597
|
|
|
|
|
|
|
Signature: ( |
598
|
|
|
|
|
|
|
double data(d,n); |
599
|
|
|
|
|
|
|
int mask(d,n); |
600
|
|
|
|
|
|
|
double weight(d); |
601
|
|
|
|
|
|
|
int [o]tree(2,n); |
602
|
|
|
|
|
|
|
double [o]lnkdist(n); |
603
|
|
|
|
|
|
|
; |
604
|
|
|
|
|
|
|
char *distFlag; |
605
|
|
|
|
|
|
|
char *methodFlag; |
606
|
|
|
|
|
|
|
) |
607
|
|
|
|
|
|
|
|
608
|
|
|
|
|
|
|
|
609
|
|
|
|
|
|
|
Hierachical agglomerative clustering. |
610
|
|
|
|
|
|
|
|
611
|
|
|
|
|
|
|
$tree(2,n) represents the clustering solution. |
612
|
|
|
|
|
|
|
Each row in the matrix describes one linking event, |
613
|
|
|
|
|
|
|
with the two columns containing the name of the nodes that were joined. |
614
|
|
|
|
|
|
|
The original genes are numbered 0..(n-1), nodes are numbered |
615
|
|
|
|
|
|
|
-1..-(n-1). |
616
|
|
|
|
|
|
|
$tree(2,n) thus actually uses only (2,n-1) cells. |
617
|
|
|
|
|
|
|
|
618
|
|
|
|
|
|
|
$lnkdist(n) represents the distance between the two subnodes that were joined. |
619
|
|
|
|
|
|
|
As for $tree(), $lnkdist() uses only (n-1) cells. |
620
|
|
|
|
|
|
|
|
621
|
|
|
|
|
|
|
|
622
|
|
|
|
|
|
|
=for bad |
623
|
|
|
|
|
|
|
|
624
|
|
|
|
|
|
|
treecluster does not process bad values. |
625
|
|
|
|
|
|
|
It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles. |
626
|
|
|
|
|
|
|
|
627
|
|
|
|
|
|
|
|
628
|
|
|
|
|
|
|
=cut |
629
|
|
|
|
|
|
|
|
630
|
|
|
|
|
|
|
|
631
|
|
|
|
|
|
|
|
632
|
|
|
|
|
|
|
|
633
|
|
|
|
|
|
|
|
634
|
|
|
|
|
|
|
|
635
|
|
|
|
|
|
|
*treecluster = \&PDL::treecluster; |
636
|
|
|
|
|
|
|
|
637
|
|
|
|
|
|
|
|
638
|
|
|
|
|
|
|
|
639
|
|
|
|
|
|
|
|
640
|
|
|
|
|
|
|
|
641
|
|
|
|
|
|
|
=head2 treeclusterd |
642
|
|
|
|
|
|
|
|
643
|
|
|
|
|
|
|
=for sig |
644
|
|
|
|
|
|
|
|
645
|
|
|
|
|
|
|
Signature: ( |
646
|
|
|
|
|
|
|
double data(d,n); |
647
|
|
|
|
|
|
|
int mask(d,n); |
648
|
|
|
|
|
|
|
double weight(d); |
649
|
|
|
|
|
|
|
double distances(n,n); |
650
|
|
|
|
|
|
|
int [o]tree(2,n); |
651
|
|
|
|
|
|
|
double [o]lnkdist(n); |
652
|
|
|
|
|
|
|
; |
653
|
|
|
|
|
|
|
char *distFlag; |
654
|
|
|
|
|
|
|
char *methodFlag; |
655
|
|
|
|
|
|
|
) |
656
|
|
|
|
|
|
|
|
657
|
|
|
|
|
|
|
|
658
|
|
|
|
|
|
|
Hierachical agglomerative clustering using given distance matrix. |
659
|
|
|
|
|
|
|
|
660
|
|
|
|
|
|
|
See distancematrix() and treecluster(), above. |
661
|
|
|
|
|
|
|
|
662
|
|
|
|
|
|
|
|
663
|
|
|
|
|
|
|
=for bad |
664
|
|
|
|
|
|
|
|
665
|
|
|
|
|
|
|
treeclusterd does not process bad values. |
666
|
|
|
|
|
|
|
It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles. |
667
|
|
|
|
|
|
|
|
668
|
|
|
|
|
|
|
|
669
|
|
|
|
|
|
|
=cut |
670
|
|
|
|
|
|
|
|
671
|
|
|
|
|
|
|
|
672
|
|
|
|
|
|
|
|
673
|
|
|
|
|
|
|
|
674
|
|
|
|
|
|
|
|
675
|
|
|
|
|
|
|
|
676
|
|
|
|
|
|
|
*treeclusterd = \&PDL::treeclusterd; |
677
|
|
|
|
|
|
|
|
678
|
|
|
|
|
|
|
|
679
|
|
|
|
|
|
|
|
680
|
|
|
|
|
|
|
|
681
|
|
|
|
|
|
|
|
682
|
|
|
|
|
|
|
=head2 cuttree |
683
|
|
|
|
|
|
|
|
684
|
|
|
|
|
|
|
=for sig |
685
|
|
|
|
|
|
|
|
686
|
|
|
|
|
|
|
Signature: ( |
687
|
|
|
|
|
|
|
int tree(2,n); |
688
|
|
|
|
|
|
|
int nclusters(); |
689
|
|
|
|
|
|
|
int [o]clusterids(n); |
690
|
|
|
|
|
|
|
) |
691
|
|
|
|
|
|
|
|
692
|
|
|
|
|
|
|
|
693
|
|
|
|
|
|
|
Cluster selection for hierarchical clustering trees. |
694
|
|
|
|
|
|
|
|
695
|
|
|
|
|
|
|
|
696
|
|
|
|
|
|
|
=for bad |
697
|
|
|
|
|
|
|
|
698
|
|
|
|
|
|
|
cuttree does not process bad values. |
699
|
|
|
|
|
|
|
It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles. |
700
|
|
|
|
|
|
|
|
701
|
|
|
|
|
|
|
|
702
|
|
|
|
|
|
|
=cut |
703
|
|
|
|
|
|
|
|
704
|
|
|
|
|
|
|
|
705
|
|
|
|
|
|
|
|
706
|
|
|
|
|
|
|
|
707
|
|
|
|
|
|
|
|
708
|
|
|
|
|
|
|
|
709
|
|
|
|
|
|
|
*cuttree = \&PDL::cuttree; |
710
|
|
|
|
|
|
|
|
711
|
|
|
|
|
|
|
|
712
|
|
|
|
|
|
|
|
713
|
|
|
|
|
|
|
|
714
|
|
|
|
|
|
|
|
715
|
|
|
|
|
|
|
=head2 somcluster |
716
|
|
|
|
|
|
|
|
717
|
|
|
|
|
|
|
=for sig |
718
|
|
|
|
|
|
|
|
719
|
|
|
|
|
|
|
Signature: ( |
720
|
|
|
|
|
|
|
double data(d,n); |
721
|
|
|
|
|
|
|
int mask(d,n); |
722
|
|
|
|
|
|
|
double weight(d); |
723
|
|
|
|
|
|
|
int nxnodes(); |
724
|
|
|
|
|
|
|
int nynodes(); |
725
|
|
|
|
|
|
|
double inittau(); |
726
|
|
|
|
|
|
|
int niter(); |
727
|
|
|
|
|
|
|
int [o]clusterids(2,n); |
728
|
|
|
|
|
|
|
; char *distFlag; |
729
|
|
|
|
|
|
|
) |
730
|
|
|
|
|
|
|
|
731
|
|
|
|
|
|
|
=for ref |
732
|
|
|
|
|
|
|
|
733
|
|
|
|
|
|
|
Self-Organizing Map clustering, does not return centroid data. |
734
|
|
|
|
|
|
|
|
735
|
|
|
|
|
|
|
=for bad |
736
|
|
|
|
|
|
|
|
737
|
|
|
|
|
|
|
somcluster does not process bad values. |
738
|
|
|
|
|
|
|
It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles. |
739
|
|
|
|
|
|
|
|
740
|
|
|
|
|
|
|
|
741
|
|
|
|
|
|
|
=cut |
742
|
|
|
|
|
|
|
|
743
|
|
|
|
|
|
|
|
744
|
|
|
|
|
|
|
|
745
|
|
|
|
|
|
|
|
746
|
|
|
|
|
|
|
|
747
|
|
|
|
|
|
|
|
748
|
|
|
|
|
|
|
*somcluster = \&PDL::somcluster; |
749
|
|
|
|
|
|
|
|
750
|
|
|
|
|
|
|
|
751
|
|
|
|
|
|
|
|
752
|
|
|
|
|
|
|
|
753
|
|
|
|
|
|
|
|
754
|
|
|
|
|
|
|
=head2 pca |
755
|
|
|
|
|
|
|
|
756
|
|
|
|
|
|
|
=for sig |
757
|
|
|
|
|
|
|
|
758
|
|
|
|
|
|
|
Signature: ( |
759
|
|
|
|
|
|
|
double [o]U(d,n); |
760
|
|
|
|
|
|
|
double [o]S(d); |
761
|
|
|
|
|
|
|
double [o]V(d,d); |
762
|
|
|
|
|
|
|
) |
763
|
|
|
|
|
|
|
|
764
|
|
|
|
|
|
|
|
765
|
|
|
|
|
|
|
Principal Component Analysis (SVD), operates in-place on $U() and requires ($SIZE(n) E= $SIZE(d)). |
766
|
|
|
|
|
|
|
|
767
|
|
|
|
|
|
|
|
768
|
|
|
|
|
|
|
=for bad |
769
|
|
|
|
|
|
|
|
770
|
|
|
|
|
|
|
pca does not process bad values. |
771
|
|
|
|
|
|
|
It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles. |
772
|
|
|
|
|
|
|
|
773
|
|
|
|
|
|
|
|
774
|
|
|
|
|
|
|
=cut |
775
|
|
|
|
|
|
|
|
776
|
|
|
|
|
|
|
|
777
|
|
|
|
|
|
|
|
778
|
|
|
|
|
|
|
|
779
|
|
|
|
|
|
|
|
780
|
|
|
|
|
|
|
|
781
|
|
|
|
|
|
|
*pca = \&PDL::pca; |
782
|
|
|
|
|
|
|
|
783
|
|
|
|
|
|
|
|
784
|
|
|
|
|
|
|
|
785
|
|
|
|
|
|
|
|
786
|
|
|
|
|
|
|
|
787
|
|
|
|
|
|
|
=head2 rowdistances |
788
|
|
|
|
|
|
|
|
789
|
|
|
|
|
|
|
=for sig |
790
|
|
|
|
|
|
|
|
791
|
|
|
|
|
|
|
Signature: ( |
792
|
|
|
|
|
|
|
double data(d,n); |
793
|
|
|
|
|
|
|
int mask(d,n); |
794
|
|
|
|
|
|
|
double weight(d); |
795
|
|
|
|
|
|
|
int rowids1(ncmps); |
796
|
|
|
|
|
|
|
int rowids2(ncmps); |
797
|
|
|
|
|
|
|
double [o]dist(ncmps); |
798
|
|
|
|
|
|
|
; char *distFlag; |
799
|
|
|
|
|
|
|
) |
800
|
|
|
|
|
|
|
|
801
|
|
|
|
|
|
|
|
802
|
|
|
|
|
|
|
Computes pairwise distances between rows of $data(). |
803
|
|
|
|
|
|
|
$rowids1() contains the row-indices of the left (first) comparison operand, |
804
|
|
|
|
|
|
|
and $rowids2() the row-indices of the right (second) comparison operand. Since each |
805
|
|
|
|
|
|
|
of these are assumed to be indices into the first dimension $data(), it should be the case that: |
806
|
|
|
|
|
|
|
|
807
|
|
|
|
|
|
|
0 <= $rowids1(i),rowids2(i) < $SIZE(n) for 0 <= i < $SIZE(ncmps) |
808
|
|
|
|
|
|
|
|
809
|
|
|
|
|
|
|
See also clusterdistance(), clusterdistances(), clusterdistancematrixenc(), distancematrix(). |
810
|
|
|
|
|
|
|
|
811
|
|
|
|
|
|
|
|
812
|
|
|
|
|
|
|
=for bad |
813
|
|
|
|
|
|
|
|
814
|
|
|
|
|
|
|
rowdistances does not process bad values. |
815
|
|
|
|
|
|
|
It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles. |
816
|
|
|
|
|
|
|
|
817
|
|
|
|
|
|
|
|
818
|
|
|
|
|
|
|
=cut |
819
|
|
|
|
|
|
|
|
820
|
|
|
|
|
|
|
|
821
|
|
|
|
|
|
|
|
822
|
|
|
|
|
|
|
|
823
|
|
|
|
|
|
|
|
824
|
|
|
|
|
|
|
|
825
|
|
|
|
|
|
|
*rowdistances = \&PDL::rowdistances; |
826
|
|
|
|
|
|
|
|
827
|
|
|
|
|
|
|
|
828
|
|
|
|
|
|
|
|
829
|
|
|
|
|
|
|
|
830
|
|
|
|
|
|
|
|
831
|
|
|
|
|
|
|
=head2 clusterdistances |
832
|
|
|
|
|
|
|
|
833
|
|
|
|
|
|
|
=for sig |
834
|
|
|
|
|
|
|
|
835
|
|
|
|
|
|
|
Signature: ( |
836
|
|
|
|
|
|
|
double data(d,n); |
837
|
|
|
|
|
|
|
int mask(d,n); |
838
|
|
|
|
|
|
|
double weight(d); |
839
|
|
|
|
|
|
|
int rowids(nr); |
840
|
|
|
|
|
|
|
int index2(n2); |
841
|
|
|
|
|
|
|
double [o]dist(nr); |
842
|
|
|
|
|
|
|
; |
843
|
|
|
|
|
|
|
char *distFlag; |
844
|
|
|
|
|
|
|
char *methodFlag; |
845
|
|
|
|
|
|
|
) |
846
|
|
|
|
|
|
|
|
847
|
|
|
|
|
|
|
|
848
|
|
|
|
|
|
|
Computes pairwise distance(s) from each of $rowids() as a singleton cluster |
849
|
|
|
|
|
|
|
with the cluster represented by $index2(), which should be an index |
850
|
|
|
|
|
|
|
vector as for clusterdistance(). See also clusterdistancematrixenc(). |
851
|
|
|
|
|
|
|
|
852
|
|
|
|
|
|
|
|
853
|
|
|
|
|
|
|
=for bad |
854
|
|
|
|
|
|
|
|
855
|
|
|
|
|
|
|
clusterdistances does not process bad values. |
856
|
|
|
|
|
|
|
It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles. |
857
|
|
|
|
|
|
|
|
858
|
|
|
|
|
|
|
|
859
|
|
|
|
|
|
|
=cut |
860
|
|
|
|
|
|
|
|
861
|
|
|
|
|
|
|
|
862
|
|
|
|
|
|
|
|
863
|
|
|
|
|
|
|
|
864
|
|
|
|
|
|
|
|
865
|
|
|
|
|
|
|
|
866
|
|
|
|
|
|
|
*clusterdistances = \&PDL::clusterdistances; |
867
|
|
|
|
|
|
|
|
868
|
|
|
|
|
|
|
|
869
|
|
|
|
|
|
|
|
870
|
|
|
|
|
|
|
|
871
|
|
|
|
|
|
|
|
872
|
|
|
|
|
|
|
=head2 clustersizes |
873
|
|
|
|
|
|
|
|
874
|
|
|
|
|
|
|
=for sig |
875
|
|
|
|
|
|
|
|
876
|
|
|
|
|
|
|
Signature: (int clusterids(n); int [o]clustersizes(k)) |
877
|
|
|
|
|
|
|
|
878
|
|
|
|
|
|
|
|
879
|
|
|
|
|
|
|
Computes the size (number of elements) of each cluster in $clusterids(). |
880
|
|
|
|
|
|
|
Useful for allocating less than maximmal space for $clusterelements(). |
881
|
|
|
|
|
|
|
|
882
|
|
|
|
|
|
|
|
883
|
|
|
|
|
|
|
=for bad |
884
|
|
|
|
|
|
|
|
885
|
|
|
|
|
|
|
The output piddle should never be marked BAD. |
886
|
|
|
|
|
|
|
|
887
|
|
|
|
|
|
|
=cut |
888
|
|
|
|
|
|
|
|
889
|
|
|
|
|
|
|
|
890
|
|
|
|
|
|
|
|
891
|
|
|
|
|
|
|
|
892
|
|
|
|
|
|
|
|
893
|
|
|
|
|
|
|
|
894
|
|
|
|
|
|
|
*clustersizes = \&PDL::clustersizes; |
895
|
|
|
|
|
|
|
|
896
|
|
|
|
|
|
|
|
897
|
|
|
|
|
|
|
|
898
|
|
|
|
|
|
|
|
899
|
|
|
|
|
|
|
|
900
|
|
|
|
|
|
|
=head2 clusterelements |
901
|
|
|
|
|
|
|
|
902
|
|
|
|
|
|
|
=for sig |
903
|
|
|
|
|
|
|
|
904
|
|
|
|
|
|
|
Signature: (int clusterids(n); int [o]clustersizes(k); int [o]eltids(mcsize,k)) |
905
|
|
|
|
|
|
|
|
906
|
|
|
|
|
|
|
|
907
|
|
|
|
|
|
|
Converts the vector $clusterids() to a matrix $eltids() of element (row) indices |
908
|
|
|
|
|
|
|
indexed by cluster-id. $mcsize() is the maximum number of elements per cluster, |
909
|
|
|
|
|
|
|
at most $n. The output PDLs $clustersizes() and $eltids() can be passed to |
910
|
|
|
|
|
|
|
clusterdistancematrix(). |
911
|
|
|
|
|
|
|
|
912
|
|
|
|
|
|
|
|
913
|
|
|
|
|
|
|
=for bad |
914
|
|
|
|
|
|
|
|
915
|
|
|
|
|
|
|
clusterelements does not process bad values. |
916
|
|
|
|
|
|
|
It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles. |
917
|
|
|
|
|
|
|
|
918
|
|
|
|
|
|
|
|
919
|
|
|
|
|
|
|
=cut |
920
|
|
|
|
|
|
|
|
921
|
|
|
|
|
|
|
|
922
|
|
|
|
|
|
|
|
923
|
|
|
|
|
|
|
|
924
|
|
|
|
|
|
|
|
925
|
|
|
|
|
|
|
|
926
|
|
|
|
|
|
|
*clusterelements = \&PDL::clusterelements; |
927
|
|
|
|
|
|
|
|
928
|
|
|
|
|
|
|
|
929
|
|
|
|
|
|
|
|
930
|
|
|
|
|
|
|
|
931
|
|
|
|
|
|
|
|
932
|
|
|
|
|
|
|
=head2 clusterelementmask |
933
|
|
|
|
|
|
|
|
934
|
|
|
|
|
|
|
=for sig |
935
|
|
|
|
|
|
|
|
936
|
|
|
|
|
|
|
Signature: (int clusterids(n); byte [o]eltmask(k,n)) |
937
|
|
|
|
|
|
|
|
938
|
|
|
|
|
|
|
|
939
|
|
|
|
|
|
|
Get boolean membership mask $eltmask() based on cluster assignment in $clusterids(). |
940
|
|
|
|
|
|
|
No value in $clusterids() may be greater than or equal to $k. |
941
|
|
|
|
|
|
|
On completion, $eltmask(k,n) is a true value iff $clusterids(n)=$k. |
942
|
|
|
|
|
|
|
|
943
|
|
|
|
|
|
|
|
944
|
|
|
|
|
|
|
=for bad |
945
|
|
|
|
|
|
|
|
946
|
|
|
|
|
|
|
clusterelementmask does not process bad values. |
947
|
|
|
|
|
|
|
It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles. |
948
|
|
|
|
|
|
|
|
949
|
|
|
|
|
|
|
|
950
|
|
|
|
|
|
|
=cut |
951
|
|
|
|
|
|
|
|
952
|
|
|
|
|
|
|
|
953
|
|
|
|
|
|
|
|
954
|
|
|
|
|
|
|
|
955
|
|
|
|
|
|
|
|
956
|
|
|
|
|
|
|
|
957
|
|
|
|
|
|
|
*clusterelementmask = \&PDL::clusterelementmask; |
958
|
|
|
|
|
|
|
|
959
|
|
|
|
|
|
|
|
960
|
|
|
|
|
|
|
|
961
|
|
|
|
|
|
|
|
962
|
|
|
|
|
|
|
|
963
|
|
|
|
|
|
|
=head2 clusterdistancematrix |
964
|
|
|
|
|
|
|
|
965
|
|
|
|
|
|
|
=for sig |
966
|
|
|
|
|
|
|
|
967
|
|
|
|
|
|
|
Signature: ( |
968
|
|
|
|
|
|
|
double data(d,n); |
969
|
|
|
|
|
|
|
int mask(d,n); |
970
|
|
|
|
|
|
|
double weight(d); |
971
|
|
|
|
|
|
|
int rowids(nr); |
972
|
|
|
|
|
|
|
int clustersizes(k); |
973
|
|
|
|
|
|
|
int eltids(mcsize,k); |
974
|
|
|
|
|
|
|
double [o]dist(k,nr); |
975
|
|
|
|
|
|
|
; |
976
|
|
|
|
|
|
|
char *distFlag; |
977
|
|
|
|
|
|
|
char *methodFlag; |
978
|
|
|
|
|
|
|
) |
979
|
|
|
|
|
|
|
|
980
|
|
|
|
|
|
|
|
981
|
|
|
|
|
|
|
B in favor of clusterdistancematrixenc(). |
982
|
|
|
|
|
|
|
In the future, this method is expected to become a wrapper for clusterdistancematrixenc(). |
983
|
|
|
|
|
|
|
|
984
|
|
|
|
|
|
|
Computes distance between each row index in $rowids() |
985
|
|
|
|
|
|
|
considered as a singleton cluster |
986
|
|
|
|
|
|
|
and each of the $k clusters whose elements are given by a single row of $eltids(). |
987
|
|
|
|
|
|
|
$clustersizes() and $eltids() are as output by the clusterelements() method. |
988
|
|
|
|
|
|
|
|
989
|
|
|
|
|
|
|
See also clusterdistance(), clusterdistances(), clustersizes(), clusterelements(), clusterdistancematrixenc(). |
990
|
|
|
|
|
|
|
|
991
|
|
|
|
|
|
|
|
992
|
|
|
|
|
|
|
=for bad |
993
|
|
|
|
|
|
|
|
994
|
|
|
|
|
|
|
clusterdistancematrix does not process bad values. |
995
|
|
|
|
|
|
|
It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles. |
996
|
|
|
|
|
|
|
|
997
|
|
|
|
|
|
|
|
998
|
|
|
|
|
|
|
=cut |
999
|
|
|
|
|
|
|
|
1000
|
|
|
|
|
|
|
|
1001
|
|
|
|
|
|
|
|
1002
|
|
|
|
|
|
|
|
1003
|
|
|
|
|
|
|
|
1004
|
|
|
|
|
|
|
|
1005
|
|
|
|
|
|
|
*clusterdistancematrix = \&PDL::clusterdistancematrix; |
1006
|
|
|
|
|
|
|
|
1007
|
|
|
|
|
|
|
|
1008
|
|
|
|
|
|
|
|
1009
|
|
|
|
|
|
|
|
1010
|
|
|
|
|
|
|
=pod |
1011
|
|
|
|
|
|
|
|
1012
|
|
|
|
|
|
|
=head2 clusterenc |
1013
|
|
|
|
|
|
|
|
1014
|
|
|
|
|
|
|
=for sig |
1015
|
|
|
|
|
|
|
|
1016
|
|
|
|
|
|
|
Signature: ( |
1017
|
|
|
|
|
|
|
int clusterids(n); |
1018
|
|
|
|
|
|
|
int [o]clusterlens(k1); |
1019
|
|
|
|
|
|
|
int [o]clustervals(k1); |
1020
|
|
|
|
|
|
|
int [o]clusterrows(n); |
1021
|
|
|
|
|
|
|
; |
1022
|
|
|
|
|
|
|
int k1; |
1023
|
|
|
|
|
|
|
) |
1024
|
|
|
|
|
|
|
|
1025
|
|
|
|
|
|
|
Encodes datum-to-cluster vector $clusterids() for efficiently mapping |
1026
|
|
|
|
|
|
|
clusters-to-data. Returned PDL $clusterlens() holds the lengths of each |
1027
|
|
|
|
|
|
|
cluster containing at least one element. $clustervals() holds the IDs |
1028
|
|
|
|
|
|
|
of such clusters as they appear as values in $clusterids(). $clusterrows() |
1029
|
|
|
|
|
|
|
is such that: |
1030
|
|
|
|
|
|
|
|
1031
|
|
|
|
|
|
|
all( rld($clusterlens, $clustervals) == $clusterids ) |
1032
|
|
|
|
|
|
|
|
1033
|
|
|
|
|
|
|
... if all available cluster-ids are in use. |
1034
|
|
|
|
|
|
|
|
1035
|
|
|
|
|
|
|
If specified, $k1 is a perl scalar |
1036
|
|
|
|
|
|
|
holding the number of clusters (maximum cluster index + 1); an |
1037
|
|
|
|
|
|
|
appropriate value will guessed from $clusterids() otherwise. |
1038
|
|
|
|
|
|
|
|
1039
|
|
|
|
|
|
|
Really just a wrapper for some lower-level PDL and PDL::Cluster calls. |
1040
|
|
|
|
|
|
|
|
1041
|
|
|
|
|
|
|
=cut |
1042
|
|
|
|
|
|
|
|
1043
|
|
|
|
|
|
|
sub clusterenc { |
1044
|
0
|
|
|
0
|
1
|
|
my ($cids, $clens,$cvals,$crows, $kmax) = @_; |
1045
|
0
|
0
|
|
|
|
|
$kmax = $cids->max+1 if (!defined($kmax)); |
1046
|
|
|
|
|
|
|
|
1047
|
|
|
|
|
|
|
##-- cluster sizes |
1048
|
0
|
0
|
|
|
|
|
$clens = zeroes(long, $kmax) if (!defined($clens)); |
1049
|
0
|
|
|
|
|
|
clustersizes($cids,$clens); |
1050
|
|
|
|
|
|
|
|
1051
|
|
|
|
|
|
|
##-- cluster-id values |
1052
|
0
|
0
|
|
|
|
|
if (!defined($cvals)) { $cvals = PDL->sequence(long,$kmax); } |
|
0
|
|
|
|
|
|
|
1053
|
0
|
|
|
|
|
|
else { $cvals .= PDL->sequence(long,$kmax); } |
1054
|
|
|
|
|
|
|
|
1055
|
|
|
|
|
|
|
##-- cluster-row values: handle BAD and negative values |
1056
|
|
|
|
|
|
|
#if (!defined($crows)) { $crows = $cids->qsorti->where($cids->isgood & $cids>=0); } |
1057
|
|
|
|
|
|
|
#else { $crows .= $cids->qsorti->where($cids->isgood & $cids>=0); } |
1058
|
|
|
|
|
|
|
|
1059
|
|
|
|
|
|
|
##-- cluster-row values: treat BAD and negative values like anything else |
1060
|
0
|
0
|
|
|
|
|
if (!defined($crows)) { $crows = $cids->qsorti; } |
|
0
|
|
|
|
|
|
|
1061
|
0
|
|
|
|
|
|
else { $crows .= $cids->qsorti; } |
1062
|
|
|
|
|
|
|
|
1063
|
0
|
|
|
|
|
|
return ($clens,$cvals,$crows); |
1064
|
|
|
|
|
|
|
} |
1065
|
|
|
|
|
|
|
|
1066
|
|
|
|
|
|
|
|
1067
|
|
|
|
|
|
|
|
1068
|
|
|
|
|
|
|
|
1069
|
|
|
|
|
|
|
=pod |
1070
|
|
|
|
|
|
|
|
1071
|
|
|
|
|
|
|
=head2 clusterdec |
1072
|
|
|
|
|
|
|
|
1073
|
|
|
|
|
|
|
=for sig |
1074
|
|
|
|
|
|
|
|
1075
|
|
|
|
|
|
|
Signature: ( |
1076
|
|
|
|
|
|
|
int clusterlens(k1); |
1077
|
|
|
|
|
|
|
int clustervals(k1); |
1078
|
|
|
|
|
|
|
int clusterrows(n); |
1079
|
|
|
|
|
|
|
int [o]clusterids(n); |
1080
|
|
|
|
|
|
|
) |
1081
|
|
|
|
|
|
|
|
1082
|
|
|
|
|
|
|
Decodes cluster-to-datum vectors ($clusterlens,$clustervals,$clusterrows) |
1083
|
|
|
|
|
|
|
into a single datum-to-cluster vector $clusterids(). |
1084
|
|
|
|
|
|
|
$(clusterlens,$clustervals,$clusterrows) are as returned by the clusterenc() method. |
1085
|
|
|
|
|
|
|
|
1086
|
|
|
|
|
|
|
Un-addressed row-index values in $clusterrows() will be assigned the pseudo-cluster (-1) |
1087
|
|
|
|
|
|
|
in $clusterids(). |
1088
|
|
|
|
|
|
|
|
1089
|
|
|
|
|
|
|
Really just a wrapper for some lower-level PDL calls. |
1090
|
|
|
|
|
|
|
|
1091
|
|
|
|
|
|
|
=cut |
1092
|
|
|
|
|
|
|
|
1093
|
|
|
|
|
|
|
sub clusterdec { |
1094
|
0
|
|
|
0
|
1
|
|
my ($clens,$cvals,$crows, $cids2) = @_; |
1095
|
|
|
|
|
|
|
|
1096
|
|
|
|
|
|
|
##-- get $cids |
1097
|
0
|
0
|
|
|
|
|
$cids2 = zeroes($cvals->type, $crows->dims) if (!defined($cids2)); |
1098
|
0
|
|
|
|
|
|
$cids2 .= -1; |
1099
|
|
|
|
|
|
|
|
1100
|
|
|
|
|
|
|
##-- trim $crows |
1101
|
|
|
|
|
|
|
#my $crows_good = $crows->slice("0:".($clens->sum-1)); ##-- assume bad indices are at END of $crows (BAD,inf,...) |
1102
|
0
|
|
|
|
|
|
my $crows_good = $crows->slice(-$clens->sum.":-1"); ##-- assume bad indices are at BEGINNING of $crows (-1, ...) |
1103
|
|
|
|
|
|
|
|
1104
|
|
|
|
|
|
|
##-- decode |
1105
|
0
|
|
|
|
|
|
$clens->rld($cvals, $cids2->index($crows_good)); |
1106
|
|
|
|
|
|
|
|
1107
|
0
|
|
|
|
|
|
return $cids2; |
1108
|
|
|
|
|
|
|
} |
1109
|
|
|
|
|
|
|
|
1110
|
|
|
|
|
|
|
|
1111
|
|
|
|
|
|
|
|
1112
|
|
|
|
|
|
|
|
1113
|
|
|
|
|
|
|
=pod |
1114
|
|
|
|
|
|
|
|
1115
|
|
|
|
|
|
|
=head2 clusteroffsets |
1116
|
|
|
|
|
|
|
|
1117
|
|
|
|
|
|
|
=for sig |
1118
|
|
|
|
|
|
|
|
1119
|
|
|
|
|
|
|
Signature: ( |
1120
|
|
|
|
|
|
|
int clusterids(n); |
1121
|
|
|
|
|
|
|
int [o]clusteroffsets(k1+1); |
1122
|
|
|
|
|
|
|
int [o]clustervals(k1); |
1123
|
|
|
|
|
|
|
int [o]clusterrows(n); |
1124
|
|
|
|
|
|
|
; |
1125
|
|
|
|
|
|
|
int k1; |
1126
|
|
|
|
|
|
|
) |
1127
|
|
|
|
|
|
|
|
1128
|
|
|
|
|
|
|
Encodes datum-to-cluster vector $clusterids() for efficiently mapping |
1129
|
|
|
|
|
|
|
clusters-to-data. Like clusterenc(), but returns cumulative offsets |
1130
|
|
|
|
|
|
|
instead of lengths. |
1131
|
|
|
|
|
|
|
|
1132
|
|
|
|
|
|
|
Really just a wrapper for clusterenc(), cumusumover(), and append(). |
1133
|
|
|
|
|
|
|
|
1134
|
|
|
|
|
|
|
=cut |
1135
|
|
|
|
|
|
|
|
1136
|
|
|
|
|
|
|
sub clusteroffsets { |
1137
|
0
|
|
|
0
|
1
|
|
my ($cids, $coffsets,$cvals,$crows, $kmax) = @_; |
1138
|
0
|
|
|
|
|
|
my ($clens); |
1139
|
0
|
|
|
|
|
|
($clens,$cvals,$crows) = clusterenc($cids,undef,$cvals,$crows,$kmax); |
1140
|
0
|
|
|
|
|
|
$coffsets = $clens->append(0)->rotate(1)->cumusumover; |
1141
|
|
|
|
|
|
|
|
1142
|
0
|
|
|
|
|
|
return ($coffsets,$cvals,$crows); |
1143
|
|
|
|
|
|
|
} |
1144
|
|
|
|
|
|
|
|
1145
|
|
|
|
|
|
|
|
1146
|
|
|
|
|
|
|
|
1147
|
|
|
|
|
|
|
|
1148
|
|
|
|
|
|
|
|
1149
|
|
|
|
|
|
|
=head2 clusterdistancematrixenc |
1150
|
|
|
|
|
|
|
|
1151
|
|
|
|
|
|
|
=for sig |
1152
|
|
|
|
|
|
|
|
1153
|
|
|
|
|
|
|
Signature: ( |
1154
|
|
|
|
|
|
|
double data(d,n); |
1155
|
|
|
|
|
|
|
int mask(d,n); |
1156
|
|
|
|
|
|
|
double weight(d); |
1157
|
|
|
|
|
|
|
int clens1(k1); |
1158
|
|
|
|
|
|
|
int crowids1(nc1); |
1159
|
|
|
|
|
|
|
int clens2(k2); |
1160
|
|
|
|
|
|
|
int crowids2(nc2); |
1161
|
|
|
|
|
|
|
double [o]dist(k1,k2); |
1162
|
|
|
|
|
|
|
; |
1163
|
|
|
|
|
|
|
char *distFlag; |
1164
|
|
|
|
|
|
|
char *methodFlag; |
1165
|
|
|
|
|
|
|
) |
1166
|
|
|
|
|
|
|
|
1167
|
|
|
|
|
|
|
|
1168
|
|
|
|
|
|
|
Computes cluster-distance between each pair of clusters in (sequence($k1) x sequence($k2)), where 'x' |
1169
|
|
|
|
|
|
|
is the Cartesian product. Cluster contents are passed as pairs ($clens(),$crowids()) as returned |
1170
|
|
|
|
|
|
|
by the clusterenc() function (assuming that the $cvals() vector returned by clusterenc() is a flat sequence). |
1171
|
|
|
|
|
|
|
|
1172
|
|
|
|
|
|
|
The deprecated method clusterdistancematrix() can be simulated by this function in the following |
1173
|
|
|
|
|
|
|
manner: if a clusterdistancematrix() call was: |
1174
|
|
|
|
|
|
|
|
1175
|
|
|
|
|
|
|
clustersizes ($cids, $csizes=zeroes(long,$k)); |
1176
|
|
|
|
|
|
|
clusterelements($cids, $celts =zeroes(long,$csizes->max)-1); |
1177
|
|
|
|
|
|
|
clusterdistancematrix($data,$msk,$wt, $rowids, $csizes,$celts, |
1178
|
|
|
|
|
|
|
$cdmat=zeroes(double,$k,$rowids->dim(0)), |
1179
|
|
|
|
|
|
|
$distFlag, $methodFlag |
1180
|
|
|
|
|
|
|
); |
1181
|
|
|
|
|
|
|
|
1182
|
|
|
|
|
|
|
Then the corresponding use of clusterdistancematrixenc() would be: |
1183
|
|
|
|
|
|
|
|
1184
|
|
|
|
|
|
|
($clens,$cvals,$crows) = clusterenc($cids); |
1185
|
|
|
|
|
|
|
clusterdistancematrixenc($data,$msk,$wt, |
1186
|
|
|
|
|
|
|
$clens, $crows, ##-- "real" clusters in output dim 0 |
1187
|
|
|
|
|
|
|
$rowids->ones, $rowids, ##-- $rowids as singleton clusters in output dim 1 |
1188
|
|
|
|
|
|
|
$cdmat=zeroes(double,$clens->dim(0),$rowids->dim(0)), |
1189
|
|
|
|
|
|
|
$distFlag, $methodFlag); |
1190
|
|
|
|
|
|
|
|
1191
|
|
|
|
|
|
|
If your $cvals() are not a flat sequence, you will probably need to do some index-twiddling |
1192
|
|
|
|
|
|
|
to get things into the proper shape: |
1193
|
|
|
|
|
|
|
|
1194
|
|
|
|
|
|
|
if ( !all($cvals==$cvals->sequence) || $cvals->dim(0) != $k ) |
1195
|
|
|
|
|
|
|
{ |
1196
|
|
|
|
|
|
|
my $cdmat0 = $cdmat; |
1197
|
|
|
|
|
|
|
my $nr = $rowids->dim(0); |
1198
|
|
|
|
|
|
|
$cdmat = pdl(double,"inf")->slice("*$k,*$nr")->make_physical(); ##-- "missing" distances are infinite |
1199
|
|
|
|
|
|
|
$cdmat->dice_axis(0,$cvals) .= $cdmat0; |
1200
|
|
|
|
|
|
|
} |
1201
|
|
|
|
|
|
|
|
1202
|
|
|
|
|
|
|
$distFlag and $methodFlag are interpreted as for clusterdistance(). |
1203
|
|
|
|
|
|
|
|
1204
|
|
|
|
|
|
|
See also clusterenc(), clusterdistancematrix(). |
1205
|
|
|
|
|
|
|
|
1206
|
|
|
|
|
|
|
|
1207
|
|
|
|
|
|
|
=for bad |
1208
|
|
|
|
|
|
|
|
1209
|
|
|
|
|
|
|
clusterdistancematrixenc does not process bad values. |
1210
|
|
|
|
|
|
|
It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles. |
1211
|
|
|
|
|
|
|
|
1212
|
|
|
|
|
|
|
|
1213
|
|
|
|
|
|
|
=cut |
1214
|
|
|
|
|
|
|
|
1215
|
|
|
|
|
|
|
|
1216
|
|
|
|
|
|
|
|
1217
|
|
|
|
|
|
|
|
1218
|
|
|
|
|
|
|
|
1219
|
|
|
|
|
|
|
|
1220
|
|
|
|
|
|
|
*clusterdistancematrixenc = \&PDL::clusterdistancematrixenc; |
1221
|
|
|
|
|
|
|
|
1222
|
|
|
|
|
|
|
|
1223
|
|
|
|
|
|
|
|
1224
|
|
|
|
|
|
|
|
1225
|
|
|
|
|
|
|
|
1226
|
|
|
|
|
|
|
=head2 clusterdistancesenc |
1227
|
|
|
|
|
|
|
|
1228
|
|
|
|
|
|
|
=for sig |
1229
|
|
|
|
|
|
|
|
1230
|
|
|
|
|
|
|
Signature: ( |
1231
|
|
|
|
|
|
|
double data(d,n); |
1232
|
|
|
|
|
|
|
int mask(d,n); |
1233
|
|
|
|
|
|
|
double weight(d); |
1234
|
|
|
|
|
|
|
int coffsets1(k1); |
1235
|
|
|
|
|
|
|
int crowids1(nc1); |
1236
|
|
|
|
|
|
|
int cwhich1(ncmps); |
1237
|
|
|
|
|
|
|
int coffsets2(k2); |
1238
|
|
|
|
|
|
|
int crowids2(nc2); |
1239
|
|
|
|
|
|
|
int cwhich2(ncmps); |
1240
|
|
|
|
|
|
|
double [o]dists(ncmps); |
1241
|
|
|
|
|
|
|
; |
1242
|
|
|
|
|
|
|
char *distFlag; |
1243
|
|
|
|
|
|
|
char *methodFlag; |
1244
|
|
|
|
|
|
|
) |
1245
|
|
|
|
|
|
|
|
1246
|
|
|
|
|
|
|
|
1247
|
|
|
|
|
|
|
Computes cluster-distance between selected pairs of co-indexed clusters in ($cwhich1,$cwhich2). |
1248
|
|
|
|
|
|
|
Cluster contents are passed as pairs ($coffsetsX(),$crowidsX()) as returned |
1249
|
|
|
|
|
|
|
by the clusteroffsets() function. |
1250
|
|
|
|
|
|
|
|
1251
|
|
|
|
|
|
|
$distFlag and $methodFlag are interpreted as for clusterdistance(). |
1252
|
|
|
|
|
|
|
|
1253
|
|
|
|
|
|
|
See also clusterenc(), clusterdistancematrixenc(). |
1254
|
|
|
|
|
|
|
|
1255
|
|
|
|
|
|
|
|
1256
|
|
|
|
|
|
|
=for bad |
1257
|
|
|
|
|
|
|
|
1258
|
|
|
|
|
|
|
clusterdistancesenc does not process bad values. |
1259
|
|
|
|
|
|
|
It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles. |
1260
|
|
|
|
|
|
|
|
1261
|
|
|
|
|
|
|
|
1262
|
|
|
|
|
|
|
=cut |
1263
|
|
|
|
|
|
|
|
1264
|
|
|
|
|
|
|
|
1265
|
|
|
|
|
|
|
|
1266
|
|
|
|
|
|
|
|
1267
|
|
|
|
|
|
|
|
1268
|
|
|
|
|
|
|
|
1269
|
|
|
|
|
|
|
*clusterdistancesenc = \&PDL::clusterdistancesenc; |
1270
|
|
|
|
|
|
|
|
1271
|
|
|
|
|
|
|
|
1272
|
|
|
|
|
|
|
|
1273
|
|
|
|
|
|
|
|
1274
|
|
|
|
|
|
|
|
1275
|
|
|
|
|
|
|
=head2 getclusterwsum |
1276
|
|
|
|
|
|
|
|
1277
|
|
|
|
|
|
|
=for sig |
1278
|
|
|
|
|
|
|
|
1279
|
|
|
|
|
|
|
Signature: ( |
1280
|
|
|
|
|
|
|
double data(d,n); |
1281
|
|
|
|
|
|
|
int mask(d,n); |
1282
|
|
|
|
|
|
|
double clusterwts(k,n); |
1283
|
|
|
|
|
|
|
double [o]cdata(d,k); |
1284
|
|
|
|
|
|
|
int [o]cmask(d,k); |
1285
|
|
|
|
|
|
|
) |
1286
|
|
|
|
|
|
|
|
1287
|
|
|
|
|
|
|
|
1288
|
|
|
|
|
|
|
Find cluster centroids by weighted sum. This can be considered an |
1289
|
|
|
|
|
|
|
expensive generalization of the getclustermean() and getclustermedian() |
1290
|
|
|
|
|
|
|
functions. Here, the input PDLs $data() and $mask(), as well as the |
1291
|
|
|
|
|
|
|
output PDL $cdata() are as for getclustermean(). The matrix $clusterwts() |
1292
|
|
|
|
|
|
|
determines the relative weight of each data row in determining the |
1293
|
|
|
|
|
|
|
centroid of each cluster, potentially useful for "fuzzy" clustering. |
1294
|
|
|
|
|
|
|
The equation used to compute cluster means is: |
1295
|
|
|
|
|
|
|
|
1296
|
|
|
|
|
|
|
$cdata(d,k) = sum_{n} $clusterwts(k,n) * $data(d,n) * $mask(d,n) |
1297
|
|
|
|
|
|
|
|
1298
|
|
|
|
|
|
|
For centroids in the same range as data elements, $clusterwts() |
1299
|
|
|
|
|
|
|
should sum to 1 over each column (k): |
1300
|
|
|
|
|
|
|
|
1301
|
|
|
|
|
|
|
all($clusterwts->xchg(0,1)->sumover == 1) |
1302
|
|
|
|
|
|
|
|
1303
|
|
|
|
|
|
|
getclustermean() can be simulated by instantiating $clusterwts() with |
1304
|
|
|
|
|
|
|
a uniform distribution over cluster elements: |
1305
|
|
|
|
|
|
|
|
1306
|
|
|
|
|
|
|
$clusterwts = zeroes($k,$n); |
1307
|
|
|
|
|
|
|
$clusterwts->indexND(cat($clusterids, xvals($clusterids))->xchg(0,1)) .= 1; |
1308
|
|
|
|
|
|
|
$clusterwts /= $clusterwts->xchg(0,1)->sumover; |
1309
|
|
|
|
|
|
|
getclusterwsum($data,$mask, $clusterwts, $cdata=zeroes($d,$k)); |
1310
|
|
|
|
|
|
|
|
1311
|
|
|
|
|
|
|
Similarly, getclustermedian() can be simulated by setting $clusterwts() to |
1312
|
|
|
|
|
|
|
1 for cluster medians and otherwise to 0. More sophisticated centroid |
1313
|
|
|
|
|
|
|
discovery methods can be computed by this function by setting |
1314
|
|
|
|
|
|
|
$clusterwts(k,n) to some estimate of the conditional probability |
1315
|
|
|
|
|
|
|
of the datum at row $n given the cluster with index $k: |
1316
|
|
|
|
|
|
|
p(Elt==n|Cluster==k). One |
1317
|
|
|
|
|
|
|
way to achieve such an estimate is to use (normalized inverses of) the |
1318
|
|
|
|
|
|
|
singleton-row-to-cluster distances as output by clusterdistancematrix(). |
1319
|
|
|
|
|
|
|
|
1320
|
|
|
|
|
|
|
|
1321
|
|
|
|
|
|
|
|
1322
|
|
|
|
|
|
|
=for bad |
1323
|
|
|
|
|
|
|
|
1324
|
|
|
|
|
|
|
getclusterwsum does not process bad values. |
1325
|
|
|
|
|
|
|
It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles. |
1326
|
|
|
|
|
|
|
|
1327
|
|
|
|
|
|
|
|
1328
|
|
|
|
|
|
|
=cut |
1329
|
|
|
|
|
|
|
|
1330
|
|
|
|
|
|
|
|
1331
|
|
|
|
|
|
|
|
1332
|
|
|
|
|
|
|
|
1333
|
|
|
|
|
|
|
|
1334
|
|
|
|
|
|
|
|
1335
|
|
|
|
|
|
|
*getclusterwsum = \&PDL::getclusterwsum; |
1336
|
|
|
|
|
|
|
|
1337
|
|
|
|
|
|
|
|
1338
|
|
|
|
|
|
|
|
1339
|
|
|
|
|
|
|
|
1340
|
|
|
|
|
|
|
|
1341
|
|
|
|
|
|
|
=head2 attachtonearest |
1342
|
|
|
|
|
|
|
|
1343
|
|
|
|
|
|
|
=for sig |
1344
|
|
|
|
|
|
|
|
1345
|
|
|
|
|
|
|
Signature: ( |
1346
|
|
|
|
|
|
|
double data(d,n); |
1347
|
|
|
|
|
|
|
int mask(d,n); |
1348
|
|
|
|
|
|
|
double weight(d); |
1349
|
|
|
|
|
|
|
int rowids(nr); |
1350
|
|
|
|
|
|
|
double cdata(d,k); |
1351
|
|
|
|
|
|
|
int cmask(d,k); |
1352
|
|
|
|
|
|
|
int [o]clusterids(nr); |
1353
|
|
|
|
|
|
|
double [o]cdist(nr); |
1354
|
|
|
|
|
|
|
; |
1355
|
|
|
|
|
|
|
char *distFlag; |
1356
|
|
|
|
|
|
|
char *methodFlag; |
1357
|
|
|
|
|
|
|
) |
1358
|
|
|
|
|
|
|
|
1359
|
|
|
|
|
|
|
|
1360
|
|
|
|
|
|
|
Assigns each specified data row to the nearest cluster centroid. |
1361
|
|
|
|
|
|
|
Data elements are given by $data() and $mask(), feature weights are |
1362
|
|
|
|
|
|
|
given by $weight(), as usual. Cluster centroids are defined by |
1363
|
|
|
|
|
|
|
by $cdata() and $cmask(), and the indices of rows to be attached |
1364
|
|
|
|
|
|
|
are given in the vector $rowids(). The output vector $clusterids() |
1365
|
|
|
|
|
|
|
contains for each specified row index the identifier of the nearest |
1366
|
|
|
|
|
|
|
cluster centroid. The vector $cdist() contains the distance to |
1367
|
|
|
|
|
|
|
the best clusters. |
1368
|
|
|
|
|
|
|
|
1369
|
|
|
|
|
|
|
See also: clusterdistancematrix(), attachtonearestd(). |
1370
|
|
|
|
|
|
|
|
1371
|
|
|
|
|
|
|
|
1372
|
|
|
|
|
|
|
=for bad |
1373
|
|
|
|
|
|
|
|
1374
|
|
|
|
|
|
|
attachtonearest does not process bad values. |
1375
|
|
|
|
|
|
|
It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles. |
1376
|
|
|
|
|
|
|
|
1377
|
|
|
|
|
|
|
|
1378
|
|
|
|
|
|
|
=cut |
1379
|
|
|
|
|
|
|
|
1380
|
|
|
|
|
|
|
|
1381
|
|
|
|
|
|
|
|
1382
|
|
|
|
|
|
|
|
1383
|
|
|
|
|
|
|
|
1384
|
|
|
|
|
|
|
|
1385
|
|
|
|
|
|
|
*attachtonearest = \&PDL::attachtonearest; |
1386
|
|
|
|
|
|
|
|
1387
|
|
|
|
|
|
|
|
1388
|
|
|
|
|
|
|
|
1389
|
|
|
|
|
|
|
|
1390
|
|
|
|
|
|
|
=pod |
1391
|
|
|
|
|
|
|
|
1392
|
|
|
|
|
|
|
=head2 attachtonearestd |
1393
|
|
|
|
|
|
|
|
1394
|
|
|
|
|
|
|
=for sig |
1395
|
|
|
|
|
|
|
|
1396
|
|
|
|
|
|
|
Signature: ( |
1397
|
|
|
|
|
|
|
double cdistmat(k,n); |
1398
|
|
|
|
|
|
|
int rowids(nr); |
1399
|
|
|
|
|
|
|
int [o]clusterids(nr); |
1400
|
|
|
|
|
|
|
double [o]dists(nr); |
1401
|
|
|
|
|
|
|
) |
1402
|
|
|
|
|
|
|
|
1403
|
|
|
|
|
|
|
Assigns each specified data row to the nearest cluster centroid, |
1404
|
|
|
|
|
|
|
as for attachtonearest(), given the datum-to-cluster distance |
1405
|
|
|
|
|
|
|
matrix $cdistmat(). Currently just a wrapper for a few PDL calls. |
1406
|
|
|
|
|
|
|
In scalar context returns $clusterids(), in list context returns |
1407
|
|
|
|
|
|
|
the list ($clusterids(),$dists()). |
1408
|
|
|
|
|
|
|
|
1409
|
|
|
|
|
|
|
=cut |
1410
|
|
|
|
|
|
|
|
1411
|
|
|
|
|
|
|
sub attachtonearestd { |
1412
|
0
|
|
|
0
|
1
|
|
my ($cdm,$rowids,$cids,$dists)=@_; |
1413
|
0
|
0
|
|
|
|
|
$cids = zeroes(long, $rowids->dim(0)) if (!defined($cids)); |
1414
|
0
|
0
|
|
|
|
|
$dists = zeroes(double, $rowids->dim(0)) if (!defined($dists)); |
1415
|
|
|
|
|
|
|
|
1416
|
|
|
|
|
|
|
##-- dice matrix |
1417
|
0
|
|
|
|
|
|
my $cdmr = $cdm->dice_axis(1,$rowids); |
1418
|
|
|
|
|
|
|
|
1419
|
|
|
|
|
|
|
##-- get best |
1420
|
0
|
|
|
|
|
|
$cdmr->minimum_ind($cids); |
1421
|
0
|
|
|
|
|
|
$dists .= $cdmr->index($cids); |
1422
|
|
|
|
|
|
|
|
1423
|
0
|
0
|
|
|
|
|
return wantarray ? ($cids,$dists) : $cids; |
1424
|
|
|
|
|
|
|
} |
1425
|
|
|
|
|
|
|
|
1426
|
|
|
|
|
|
|
|
1427
|
|
|
|
|
|
|
|
1428
|
|
|
|
|
|
|
|
1429
|
|
|
|
|
|
|
|
1430
|
|
|
|
|
|
|
=head2 checkprototypes |
1431
|
|
|
|
|
|
|
|
1432
|
|
|
|
|
|
|
=for sig |
1433
|
|
|
|
|
|
|
|
1434
|
|
|
|
|
|
|
Signature: ( |
1435
|
|
|
|
|
|
|
protos(k); |
1436
|
|
|
|
|
|
|
[o]cprotos(k); |
1437
|
|
|
|
|
|
|
byte [t]otmp(n); |
1438
|
|
|
|
|
|
|
; int nsize => n) |
1439
|
|
|
|
|
|
|
|
1440
|
|
|
|
|
|
|
(Deterministic) |
1441
|
|
|
|
|
|
|
|
1442
|
|
|
|
|
|
|
Ensure that the assignment $protos() from $k objects to |
1443
|
|
|
|
|
|
|
integer "prototype" indices in the range [0,$n( contains no repetitions of any |
1444
|
|
|
|
|
|
|
of the $n possible prototype values. One use for this function is |
1445
|
|
|
|
|
|
|
the restriction of (randomly generated) potential clustering solutions |
1446
|
|
|
|
|
|
|
for $k clusters in which each cluster is represented by a |
1447
|
|
|
|
|
|
|
"prototypical" element from a data sample of size $n. |
1448
|
|
|
|
|
|
|
|
1449
|
|
|
|
|
|
|
Requires: $n >= $k. |
1450
|
|
|
|
|
|
|
|
1451
|
|
|
|
|
|
|
|
1452
|
|
|
|
|
|
|
=for bad |
1453
|
|
|
|
|
|
|
|
1454
|
|
|
|
|
|
|
checkprototypes does not process bad values. |
1455
|
|
|
|
|
|
|
It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles. |
1456
|
|
|
|
|
|
|
|
1457
|
|
|
|
|
|
|
|
1458
|
|
|
|
|
|
|
=cut |
1459
|
|
|
|
|
|
|
|
1460
|
|
|
|
|
|
|
|
1461
|
|
|
|
|
|
|
|
1462
|
|
|
|
|
|
|
|
1463
|
|
|
|
|
|
|
|
1464
|
|
|
|
|
|
|
|
1465
|
|
|
|
|
|
|
*checkprototypes = \&PDL::checkprototypes; |
1466
|
|
|
|
|
|
|
|
1467
|
|
|
|
|
|
|
|
1468
|
|
|
|
|
|
|
|
1469
|
|
|
|
|
|
|
|
1470
|
|
|
|
|
|
|
|
1471
|
|
|
|
|
|
|
=head2 checkpartitions |
1472
|
|
|
|
|
|
|
|
1473
|
|
|
|
|
|
|
=for sig |
1474
|
|
|
|
|
|
|
|
1475
|
|
|
|
|
|
|
Signature: ( |
1476
|
|
|
|
|
|
|
part(n); |
1477
|
|
|
|
|
|
|
[o]cpart(n); |
1478
|
|
|
|
|
|
|
[t]ptmp(k); |
1479
|
|
|
|
|
|
|
; int ksize => k) |
1480
|
|
|
|
|
|
|
|
1481
|
|
|
|
|
|
|
(Deterministic) |
1482
|
|
|
|
|
|
|
|
1483
|
|
|
|
|
|
|
Ensure that the partitioning $part() of $n objects into $k bins |
1484
|
|
|
|
|
|
|
(identified by integer values in the range [0,$k-1]) |
1485
|
|
|
|
|
|
|
contains at least one instance of each of the |
1486
|
|
|
|
|
|
|
$k possible values. One use for this function is |
1487
|
|
|
|
|
|
|
the restriction of (randomly generated) potential clustering solutions |
1488
|
|
|
|
|
|
|
for $n elements into $k clusters to those which assign at least one |
1489
|
|
|
|
|
|
|
element to each cluster. |
1490
|
|
|
|
|
|
|
|
1491
|
|
|
|
|
|
|
Requires: $n >= $k. |
1492
|
|
|
|
|
|
|
|
1493
|
|
|
|
|
|
|
|
1494
|
|
|
|
|
|
|
=for bad |
1495
|
|
|
|
|
|
|
|
1496
|
|
|
|
|
|
|
checkpartitions does not process bad values. |
1497
|
|
|
|
|
|
|
It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles. |
1498
|
|
|
|
|
|
|
|
1499
|
|
|
|
|
|
|
|
1500
|
|
|
|
|
|
|
=cut |
1501
|
|
|
|
|
|
|
|
1502
|
|
|
|
|
|
|
|
1503
|
|
|
|
|
|
|
|
1504
|
|
|
|
|
|
|
|
1505
|
|
|
|
|
|
|
|
1506
|
|
|
|
|
|
|
|
1507
|
|
|
|
|
|
|
*checkpartitions = \&PDL::checkpartitions; |
1508
|
|
|
|
|
|
|
|
1509
|
|
|
|
|
|
|
|
1510
|
|
|
|
|
|
|
|
1511
|
|
|
|
|
|
|
|
1512
|
|
|
|
|
|
|
=pod |
1513
|
|
|
|
|
|
|
|
1514
|
|
|
|
|
|
|
=head2 randomprototypes |
1515
|
|
|
|
|
|
|
|
1516
|
|
|
|
|
|
|
=for sig |
1517
|
|
|
|
|
|
|
|
1518
|
|
|
|
|
|
|
Signature: (int k; int n; [o]prototypes(k)) |
1519
|
|
|
|
|
|
|
|
1520
|
|
|
|
|
|
|
Generate a random set of $k prototype indices drawn from $n objects, |
1521
|
|
|
|
|
|
|
ensuring that no object is used more than once. Calls checkprototypes(). |
1522
|
|
|
|
|
|
|
|
1523
|
|
|
|
|
|
|
See also: checkprototypes(), randomassign(), checkpartitions(), randompartition(). |
1524
|
|
|
|
|
|
|
|
1525
|
|
|
|
|
|
|
=cut |
1526
|
|
|
|
|
|
|
|
1527
|
|
|
|
|
|
|
sub randomprototypes { |
1528
|
0
|
|
|
0
|
1
|
|
my ($k,$n,$protos) = @_; |
1529
|
0
|
0
|
|
|
|
|
$protos = zeroes(long, $k) if (!defined($protos)); |
1530
|
0
|
|
|
|
|
|
$protos .= PDL->random($k)*$n; |
1531
|
0
|
|
|
|
|
|
checkprototypes($protos->inplace, $n); |
1532
|
0
|
|
|
|
|
|
return $protos; |
1533
|
|
|
|
|
|
|
} |
1534
|
|
|
|
|
|
|
|
1535
|
|
|
|
|
|
|
|
1536
|
|
|
|
|
|
|
|
1537
|
|
|
|
|
|
|
|
1538
|
|
|
|
|
|
|
=pod |
1539
|
|
|
|
|
|
|
|
1540
|
|
|
|
|
|
|
=head2 randompartition |
1541
|
|
|
|
|
|
|
|
1542
|
|
|
|
|
|
|
=for sig |
1543
|
|
|
|
|
|
|
|
1544
|
|
|
|
|
|
|
Signature: (int k; int n; [o]partition(n)) |
1545
|
|
|
|
|
|
|
|
1546
|
|
|
|
|
|
|
Generate a partitioning of $n objects into $k clusters, |
1547
|
|
|
|
|
|
|
ensuring that every cluster contains at least one object. |
1548
|
|
|
|
|
|
|
Calls checkpartitions(). |
1549
|
|
|
|
|
|
|
This method is identical in functionality to randomassign(), |
1550
|
|
|
|
|
|
|
but may be faster if $k is significantly smaller than $n. |
1551
|
|
|
|
|
|
|
|
1552
|
|
|
|
|
|
|
See also: randomassign(), checkpartitions(), checkprototypes(), randomprototypes(). |
1553
|
|
|
|
|
|
|
|
1554
|
|
|
|
|
|
|
=cut |
1555
|
|
|
|
|
|
|
|
1556
|
|
|
|
|
|
|
sub randompartition { |
1557
|
0
|
|
|
0
|
1
|
|
my ($k,$n,$part) = @_; |
1558
|
0
|
0
|
|
|
|
|
$part = zeroes(long, $n) if (!defined($part)); |
1559
|
0
|
|
|
|
|
|
$part .= PDL->random($n)*$k; |
1560
|
0
|
|
|
|
|
|
checkpartitions($part->inplace, $k); |
1561
|
0
|
|
|
|
|
|
return $part; |
1562
|
|
|
|
|
|
|
} |
1563
|
|
|
|
|
|
|
|
1564
|
|
|
|
|
|
|
|
1565
|
|
|
|
|
|
|
|
1566
|
|
|
|
|
|
|
|
1567
|
|
|
|
|
|
|
|
1568
|
|
|
|
|
|
|
##--------------------------------------------------------------------- |
1569
|
|
|
|
|
|
|
=pod |
1570
|
|
|
|
|
|
|
|
1571
|
|
|
|
|
|
|
=head1 COMMON ARGUMENTS |
1572
|
|
|
|
|
|
|
|
1573
|
|
|
|
|
|
|
Many of the functions described above require one or |
1574
|
|
|
|
|
|
|
more of the following parameters: |
1575
|
|
|
|
|
|
|
|
1576
|
|
|
|
|
|
|
=over 4 |
1577
|
|
|
|
|
|
|
|
1578
|
|
|
|
|
|
|
=item d |
1579
|
|
|
|
|
|
|
|
1580
|
|
|
|
|
|
|
The number of features defined for each data element. |
1581
|
|
|
|
|
|
|
|
1582
|
|
|
|
|
|
|
=item n |
1583
|
|
|
|
|
|
|
|
1584
|
|
|
|
|
|
|
The number of data elements to be clustered. |
1585
|
|
|
|
|
|
|
|
1586
|
|
|
|
|
|
|
=item k |
1587
|
|
|
|
|
|
|
|
1588
|
|
|
|
|
|
|
=item nclusters |
1589
|
|
|
|
|
|
|
|
1590
|
|
|
|
|
|
|
The number of desired clusters. |
1591
|
|
|
|
|
|
|
|
1592
|
|
|
|
|
|
|
=item data(d,n) |
1593
|
|
|
|
|
|
|
|
1594
|
|
|
|
|
|
|
A matrix representing the data to be clustered, double-valued. |
1595
|
|
|
|
|
|
|
|
1596
|
|
|
|
|
|
|
=item mask(d,n) |
1597
|
|
|
|
|
|
|
|
1598
|
|
|
|
|
|
|
A matrix indicating which data values are missing. If |
1599
|
|
|
|
|
|
|
mask(i,j) == 0, then data(i,j) is treated as missing. |
1600
|
|
|
|
|
|
|
|
1601
|
|
|
|
|
|
|
=item weights(d) |
1602
|
|
|
|
|
|
|
|
1603
|
|
|
|
|
|
|
The (feature-) weights that are used to calculate the distance. |
1604
|
|
|
|
|
|
|
|
1605
|
|
|
|
|
|
|
B Not all distance metrics make use of weights; |
1606
|
|
|
|
|
|
|
you must provide some nonetheless. |
1607
|
|
|
|
|
|
|
|
1608
|
|
|
|
|
|
|
=item clusterids(n) |
1609
|
|
|
|
|
|
|
|
1610
|
|
|
|
|
|
|
A clustering solution. $clusterids() maps data elements |
1611
|
|
|
|
|
|
|
(row indices in $data()) to values in the range [0,$k-1]. |
1612
|
|
|
|
|
|
|
|
1613
|
|
|
|
|
|
|
=back |
1614
|
|
|
|
|
|
|
|
1615
|
|
|
|
|
|
|
=cut |
1616
|
|
|
|
|
|
|
|
1617
|
|
|
|
|
|
|
##--------------------------------------------------------------------- |
1618
|
|
|
|
|
|
|
=pod |
1619
|
|
|
|
|
|
|
|
1620
|
|
|
|
|
|
|
=head2 Distance Metrics |
1621
|
|
|
|
|
|
|
|
1622
|
|
|
|
|
|
|
Distances between data elements (and cluster centroids, where applicable) |
1623
|
|
|
|
|
|
|
are computed using one of a number of built-in metrics. Which metric |
1624
|
|
|
|
|
|
|
is to be used for a given computation is indicated by a character |
1625
|
|
|
|
|
|
|
flag denoted above with $distFlag(). In the following, w[i] represents |
1626
|
|
|
|
|
|
|
a weighting factor in the $weights() matrix, and $W represents the total |
1627
|
|
|
|
|
|
|
of all weights. |
1628
|
|
|
|
|
|
|
|
1629
|
|
|
|
|
|
|
Currently implemented distance |
1630
|
|
|
|
|
|
|
metrics and the corresponding flags are: |
1631
|
|
|
|
|
|
|
|
1632
|
|
|
|
|
|
|
=over 4 |
1633
|
|
|
|
|
|
|
|
1634
|
|
|
|
|
|
|
=item e |
1635
|
|
|
|
|
|
|
|
1636
|
|
|
|
|
|
|
Pseudo-Euclidean distance: |
1637
|
|
|
|
|
|
|
|
1638
|
|
|
|
|
|
|
dist_e(x,y) = 1/W * sum_{i=1..d} w[i] * (x[i] - y[i])^2 |
1639
|
|
|
|
|
|
|
|
1640
|
|
|
|
|
|
|
Note that this is not the "true" Euclidean distance, which is defined as: |
1641
|
|
|
|
|
|
|
|
1642
|
|
|
|
|
|
|
dist_E(x,y) = sqrt( sum_{i=1..d} (x[i] - y[i])^2 ) |
1643
|
|
|
|
|
|
|
|
1644
|
|
|
|
|
|
|
|
1645
|
|
|
|
|
|
|
=item b |
1646
|
|
|
|
|
|
|
|
1647
|
|
|
|
|
|
|
City-block ("Manhattan") distance: |
1648
|
|
|
|
|
|
|
|
1649
|
|
|
|
|
|
|
dist_b(x,y) = 1/W * sum_{i=1..d} w[i] * |x[i] - y[i]| |
1650
|
|
|
|
|
|
|
|
1651
|
|
|
|
|
|
|
|
1652
|
|
|
|
|
|
|
|
1653
|
|
|
|
|
|
|
=item c |
1654
|
|
|
|
|
|
|
|
1655
|
|
|
|
|
|
|
Pearson correlation distance: |
1656
|
|
|
|
|
|
|
|
1657
|
|
|
|
|
|
|
dist_c(x,y) = 1-r(x,y) |
1658
|
|
|
|
|
|
|
|
1659
|
|
|
|
|
|
|
where r is the Pearson correlation coefficient: |
1660
|
|
|
|
|
|
|
|
1661
|
|
|
|
|
|
|
r(x,y) = 1/d * sum_{i=1..d} (x[i]-mean(x))/stddev(x) * (y[i]-mean(y))/stddev(y) |
1662
|
|
|
|
|
|
|
|
1663
|
|
|
|
|
|
|
=item a |
1664
|
|
|
|
|
|
|
|
1665
|
|
|
|
|
|
|
Absolute value of the correlation, |
1666
|
|
|
|
|
|
|
|
1667
|
|
|
|
|
|
|
dist_a(x,y) = 1-|r(x,y)| |
1668
|
|
|
|
|
|
|
|
1669
|
|
|
|
|
|
|
where r(x,y) is the Pearson correlation coefficient. |
1670
|
|
|
|
|
|
|
|
1671
|
|
|
|
|
|
|
=item u |
1672
|
|
|
|
|
|
|
|
1673
|
|
|
|
|
|
|
Uncentered correlation (cosine of the angle): |
1674
|
|
|
|
|
|
|
|
1675
|
|
|
|
|
|
|
dist_u(x,y) = 1-r_u(x,y) |
1676
|
|
|
|
|
|
|
|
1677
|
|
|
|
|
|
|
where: |
1678
|
|
|
|
|
|
|
|
1679
|
|
|
|
|
|
|
r_u(x,y) = 1/d * sum_{i=1..d} (x[i]/sigma0(x)) * (y[i]/sigma0(y)) |
1680
|
|
|
|
|
|
|
|
1681
|
|
|
|
|
|
|
and: |
1682
|
|
|
|
|
|
|
|
1683
|
|
|
|
|
|
|
sigma0(w) = sqrt( 1/d * sum_{i=1..d} w[i]^2 ) |
1684
|
|
|
|
|
|
|
|
1685
|
|
|
|
|
|
|
=item x |
1686
|
|
|
|
|
|
|
|
1687
|
|
|
|
|
|
|
Absolute uncentered correlation, |
1688
|
|
|
|
|
|
|
|
1689
|
|
|
|
|
|
|
dist_x(x,y) = 1-|r_u(x,y)| |
1690
|
|
|
|
|
|
|
|
1691
|
|
|
|
|
|
|
=item s |
1692
|
|
|
|
|
|
|
|
1693
|
|
|
|
|
|
|
Spearman's rank correlation. |
1694
|
|
|
|
|
|
|
|
1695
|
|
|
|
|
|
|
dist_s(x,y) = 1-r_s(x,y) ~= dist_c(ranks(x),ranks(y)) |
1696
|
|
|
|
|
|
|
|
1697
|
|
|
|
|
|
|
where r_s(x,y) is the Spearman rank correlation. Weights are ignored. |
1698
|
|
|
|
|
|
|
|
1699
|
|
|
|
|
|
|
=item k |
1700
|
|
|
|
|
|
|
|
1701
|
|
|
|
|
|
|
Kendall's tau (does not use weights). |
1702
|
|
|
|
|
|
|
|
1703
|
|
|
|
|
|
|
dist_k(x,y) = 1 - tau(x,y) |
1704
|
|
|
|
|
|
|
|
1705
|
|
|
|
|
|
|
=item (other values) |
1706
|
|
|
|
|
|
|
|
1707
|
|
|
|
|
|
|
For other values of dist, the default (Euclidean distance) is used. |
1708
|
|
|
|
|
|
|
|
1709
|
|
|
|
|
|
|
=back |
1710
|
|
|
|
|
|
|
|
1711
|
|
|
|
|
|
|
=cut |
1712
|
|
|
|
|
|
|
|
1713
|
|
|
|
|
|
|
|
1714
|
|
|
|
|
|
|
##--------------------------------------------------------------------- |
1715
|
|
|
|
|
|
|
=pod |
1716
|
|
|
|
|
|
|
|
1717
|
|
|
|
|
|
|
=head2 Link Methods |
1718
|
|
|
|
|
|
|
|
1719
|
|
|
|
|
|
|
For hierarchical clustering, the 'link method' must be specified |
1720
|
|
|
|
|
|
|
by a character flag, denoted above as $methodFlag. |
1721
|
|
|
|
|
|
|
Known link methods are: |
1722
|
|
|
|
|
|
|
|
1723
|
|
|
|
|
|
|
=over 4 |
1724
|
|
|
|
|
|
|
|
1725
|
|
|
|
|
|
|
=item s |
1726
|
|
|
|
|
|
|
|
1727
|
|
|
|
|
|
|
Pairwise minimum-linkage ("single") clustering. |
1728
|
|
|
|
|
|
|
|
1729
|
|
|
|
|
|
|
Defines the distance between two clusters as the |
1730
|
|
|
|
|
|
|
least distance between any two of their respective elements. |
1731
|
|
|
|
|
|
|
|
1732
|
|
|
|
|
|
|
=item m |
1733
|
|
|
|
|
|
|
|
1734
|
|
|
|
|
|
|
Pairwise maximum-linkage ("complete") clustering. |
1735
|
|
|
|
|
|
|
|
1736
|
|
|
|
|
|
|
Defines the distance between two clusters as the |
1737
|
|
|
|
|
|
|
greatest distance between any two of their respective elements. |
1738
|
|
|
|
|
|
|
|
1739
|
|
|
|
|
|
|
=item a |
1740
|
|
|
|
|
|
|
|
1741
|
|
|
|
|
|
|
Pairwise average-linkage clustering (centroid distance using arithmetic mean). |
1742
|
|
|
|
|
|
|
|
1743
|
|
|
|
|
|
|
Defines the distance between two clusters as the |
1744
|
|
|
|
|
|
|
distance between their respective centroids, where each |
1745
|
|
|
|
|
|
|
cluster centroid is defined as the arithmetic mean of |
1746
|
|
|
|
|
|
|
that cluster's elements. |
1747
|
|
|
|
|
|
|
|
1748
|
|
|
|
|
|
|
=item c |
1749
|
|
|
|
|
|
|
|
1750
|
|
|
|
|
|
|
Pairwise centroid-linkage clustering (centroid distance using median). |
1751
|
|
|
|
|
|
|
|
1752
|
|
|
|
|
|
|
Identifies the distance between two clusters as the |
1753
|
|
|
|
|
|
|
distance between their respective centroids, where each |
1754
|
|
|
|
|
|
|
cluster centroid is computed as the median of |
1755
|
|
|
|
|
|
|
that cluster's elements. |
1756
|
|
|
|
|
|
|
|
1757
|
|
|
|
|
|
|
=item (other values) |
1758
|
|
|
|
|
|
|
|
1759
|
|
|
|
|
|
|
Behavior for other values is currently undefined. |
1760
|
|
|
|
|
|
|
|
1761
|
|
|
|
|
|
|
=back |
1762
|
|
|
|
|
|
|
|
1763
|
|
|
|
|
|
|
For the first three, either the distance matrix or the gene expression data is |
1764
|
|
|
|
|
|
|
sufficient to perform the clustering algorithm. For pairwise centroid-linkage |
1765
|
|
|
|
|
|
|
clustering, however, the gene expression data are always needed, even if the |
1766
|
|
|
|
|
|
|
distance matrix itself is available. |
1767
|
|
|
|
|
|
|
|
1768
|
|
|
|
|
|
|
=cut |
1769
|
|
|
|
|
|
|
|
1770
|
|
|
|
|
|
|
##--------------------------------------------------------------------- |
1771
|
|
|
|
|
|
|
=pod |
1772
|
|
|
|
|
|
|
|
1773
|
|
|
|
|
|
|
=head1 ACKNOWLEDGEMENTS |
1774
|
|
|
|
|
|
|
|
1775
|
|
|
|
|
|
|
Perl by Larry Wall. |
1776
|
|
|
|
|
|
|
|
1777
|
|
|
|
|
|
|
PDL by Karl Glazebrook, Tuomas J. Lukka, Christian Soeller, and others. |
1778
|
|
|
|
|
|
|
|
1779
|
|
|
|
|
|
|
C Clustering Library by |
1780
|
|
|
|
|
|
|
Michiel de Hoon, |
1781
|
|
|
|
|
|
|
Seiya Imoto, |
1782
|
|
|
|
|
|
|
and Satoru Miyano. |
1783
|
|
|
|
|
|
|
|
1784
|
|
|
|
|
|
|
Orignal Algorithm::Cluster module by John Nolan and Michiel de Hoon. |
1785
|
|
|
|
|
|
|
|
1786
|
|
|
|
|
|
|
=cut |
1787
|
|
|
|
|
|
|
|
1788
|
|
|
|
|
|
|
##---------------------------------------------------------------------- |
1789
|
|
|
|
|
|
|
=pod |
1790
|
|
|
|
|
|
|
|
1791
|
|
|
|
|
|
|
=head1 KNOWN BUGS |
1792
|
|
|
|
|
|
|
|
1793
|
|
|
|
|
|
|
Dimensional requirements are sometimes too strict. |
1794
|
|
|
|
|
|
|
|
1795
|
|
|
|
|
|
|
Passing weights to Spearman and Kendall link methods wastes space. |
1796
|
|
|
|
|
|
|
|
1797
|
|
|
|
|
|
|
=cut |
1798
|
|
|
|
|
|
|
|
1799
|
|
|
|
|
|
|
|
1800
|
|
|
|
|
|
|
##--------------------------------------------------------------------- |
1801
|
|
|
|
|
|
|
=pod |
1802
|
|
|
|
|
|
|
|
1803
|
|
|
|
|
|
|
=head1 AUTHOR |
1804
|
|
|
|
|
|
|
|
1805
|
|
|
|
|
|
|
Bryan Jurish Emoocow@cpan.orgE wrote and maintains the PDL::Cluster distribution. |
1806
|
|
|
|
|
|
|
|
1807
|
|
|
|
|
|
|
Michiel de Hoon wrote the underlying C clustering library for cDNA microarray data. |
1808
|
|
|
|
|
|
|
|
1809
|
|
|
|
|
|
|
=head1 COPYRIGHT |
1810
|
|
|
|
|
|
|
|
1811
|
|
|
|
|
|
|
PDL::Cluster is a set of wrappers around the C Clustering library for cDNA microarray data. |
1812
|
|
|
|
|
|
|
|
1813
|
|
|
|
|
|
|
=over 4 |
1814
|
|
|
|
|
|
|
|
1815
|
|
|
|
|
|
|
=item * |
1816
|
|
|
|
|
|
|
|
1817
|
|
|
|
|
|
|
The C clustering library for cDNA microarray data. |
1818
|
|
|
|
|
|
|
Copyright (C) 2002-2005 Michiel Jan Laurens de Hoon. |
1819
|
|
|
|
|
|
|
|
1820
|
|
|
|
|
|
|
This library was written at the Laboratory of DNA Information Analysis, |
1821
|
|
|
|
|
|
|
Human Genome Center, Institute of Medical Science, University of Tokyo, |
1822
|
|
|
|
|
|
|
4-6-1 Shirokanedai, Minato-ku, Tokyo 108-8639, Japan. |
1823
|
|
|
|
|
|
|
Contact: michiel.dehoon 'AT' riken.jp |
1824
|
|
|
|
|
|
|
|
1825
|
|
|
|
|
|
|
See the files F, F and F in the PDL::Cluster distribution |
1826
|
|
|
|
|
|
|
for details. |
1827
|
|
|
|
|
|
|
|
1828
|
|
|
|
|
|
|
=item * |
1829
|
|
|
|
|
|
|
|
1830
|
|
|
|
|
|
|
PDL::Cluster wrappers copyright (C) Bryan Jurish 2005-2018. All rights reserved. |
1831
|
|
|
|
|
|
|
This package is free software, and entirely without warranty. |
1832
|
|
|
|
|
|
|
You may redistribute it and/or modify it under the same terms |
1833
|
|
|
|
|
|
|
as Perl itself. |
1834
|
|
|
|
|
|
|
|
1835
|
|
|
|
|
|
|
=back |
1836
|
|
|
|
|
|
|
|
1837
|
|
|
|
|
|
|
=head1 SEE ALSO |
1838
|
|
|
|
|
|
|
|
1839
|
|
|
|
|
|
|
perl(1), PDL(3perl), Algorithm::Cluster(3perl), cluster(1), |
1840
|
|
|
|
|
|
|
L |
1841
|
|
|
|
|
|
|
|
1842
|
|
|
|
|
|
|
=cut |
1843
|
|
|
|
|
|
|
|
1844
|
|
|
|
|
|
|
|
1845
|
|
|
|
|
|
|
|
1846
|
|
|
|
|
|
|
; |
1847
|
|
|
|
|
|
|
|
1848
|
|
|
|
|
|
|
|
1849
|
|
|
|
|
|
|
|
1850
|
|
|
|
|
|
|
# Exit with OK status |
1851
|
|
|
|
|
|
|
|
1852
|
|
|
|
|
|
|
1; |
1853
|
|
|
|
|
|
|
|
1854
|
|
|
|
|
|
|
|