File Coverage

cluster.c
Criterion Covered Total %
statement 759 1929 39.3
branch 504 1472 34.2
condition n/a
subroutine n/a
pod n/a
total 1263 3401 37.1


line stmt bran cond sub pod time code
1             /* The C clustering library.
2             * Copyright (C) 2002 Michiel Jan Laurens de Hoon.
3             *
4             * This library was written at the Laboratory of DNA Information Analysis,
5             * Human Genome Center, Institute of Medical Science, University of Tokyo,
6             * 4-6-1 Shirokanedai, Minato-ku, Tokyo 108-8639, Japan.
7             * Contact: michiel.dehoon 'AT' riken.jp
8             *
9             * Permission to use, copy, modify, and distribute this software and its
10             * documentation with or without modifications and for any purpose and
11             * without fee is hereby granted, provided that any copyright notices
12             * appear in all copies and that both those copyright notices and this
13             * permission notice appear in supporting documentation, and that the
14             * names of the contributors or copyright holders not be used in
15             * advertising or publicity pertaining to distribution of the software
16             * without specific prior permission.
17             *
18             * THE CONTRIBUTORS AND COPYRIGHT HOLDERS OF THIS SOFTWARE DISCLAIM ALL
19             * WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED
20             * WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL THE
21             * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT
22             * OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
23             * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
24             * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
25             * OR PERFORMANCE OF THIS SOFTWARE.
26             *
27             */
28              
29             #include
30             #include
31             #include
32             #include
33             #include
34             #include
35             #include "cluster.h"
36             #ifdef WINDOWS
37             # include
38             #endif
39              
40             /* ************************************************************************ */
41              
42             #ifdef WINDOWS
43             /* Then we make a Windows DLL */
44             int WINAPI
45             clusterdll_init (HANDLE h, DWORD reason, void* foo)
46             {
47             return 1;
48             }
49             #endif
50              
51             /* ************************************************************************ */
52              
53 2           double mean(int n, double x[])
54 2           { double result = 0.;
55             int i;
56 9 100         for (i = 0; i < n; i++) result += x[i];
57 2           result /= n;
58 2           return result;
59             }
60              
61             /* ************************************************************************ */
62              
63 50           double median (int n, double x[])
64             /*
65             Find the median of X(1), ... , X(N), using as much of the quicksort
66             algorithm as is needed to isolate it.
67             N.B. On exit, the array X is partially ordered.
68             Based on Alan J. Miller's median.f90 routine.
69             */
70              
71             { int i, j;
72 50           int nr = n / 2;
73 50           int nl = nr - 1;
74 50           int even = 0;
75             /* hi & lo are position limits encompassing the median. */
76 50           int lo = 0;
77 50           int hi = n-1;
78              
79 50 100         if (n==2*nr) even = 1;
80 50 100         if (n<3)
81 36 50         { if (n<1) return 0.;
82 36 50         if (n == 1) return x[0];
83 0           return 0.5*(x[0]+x[1]);
84             }
85              
86             /* Find median of 1st, middle & last values. */
87             do
88             { int loop;
89 15           int mid = (lo + hi)/2;
90 15           double result = x[mid];
91 15           double xlo = x[lo];
92 15           double xhi = x[hi];
93 15 100         if (xhi
94 1           { double temp = xlo;
95 1           xlo = xhi;
96 1           xhi = temp;
97             }
98 15 50         if (result>xhi) result = xhi;
99 15 50         else if (result
100             /* The basic quicksort algorithm to move all values <= the sort key (XMED)
101             * to the left-hand end, and all higher values to the other end.
102             */
103 15           i = lo;
104 15           j = hi;
105             do
106 30 100         { while (x[i]
107 31 100         while (x[j]>result) j--;
108 16           loop = 0;
109 16 100         if (i
110 1           { double temp = x[i];
111 1           x[i] = x[j];
112 1           x[j] = temp;
113 1           i++;
114 1           j--;
115 1 50         if (i<=j) loop = 1;
116             }
117 16 100         } while (loop); /* Decide which half the median is in. */
118              
119 15 100         if (even)
120 2 100         { if (j==nl && i==nr)
    50          
121             /* Special case, n even, j = n/2 & i = j + 1, so the median is
122             * between the two halves of the series. Find max. of the first
123             * half & min. of the second half, then average.
124             */
125             { int k;
126 0           double xmax = x[0];
127 0           double xmin = x[n-1];
128 0 0         for (k = lo; k <= j; k++) xmax = max(xmax,x[k]);
    0          
129 0 0         for (k = i; k <= hi; k++) xmin = min(xmin,x[k]);
    0          
130 0           return 0.5*(xmin + xmax);
131             }
132 2 50         if (j
133 2 50         if (i>nr) hi = j;
134 2 50         if (i==j)
135 2 100         { if (i==nl) lo = nl;
136 2 100         if (j==nr) hi = nr;
137             }
138             }
139             else
140 13 50         { if (j
141 13 50         if (i>nr) hi = j;
142             /* Test whether median has been isolated. */
143 13 50         if (i==j && i==nr) return result;
    50          
144             }
145             }
146 2 100         while (lo
147              
148 1 50         if (even) return (0.5*(x[nl]+x[nr]));
149 0 0         if (x[lo]>x[hi])
150 0           { double temp = x[lo];
151 0           x[lo] = x[hi];
152 0           x[hi] = temp;
153             }
154 0           return x[nr];
155             }
156              
157             /* ********************************************************************** */
158              
159             static const double* sortdata = NULL; /* used in the quicksort algorithm */
160              
161             /* ---------------------------------------------------------------------- */
162              
163             static
164 0           int compare(const void* a, const void* b)
165             /* Helper function for sort. Previously, this was a nested function under
166             * sort, which is not allowed under ANSI C.
167             */
168 0           { const int i1 = *(const int*)a;
169 0           const int i2 = *(const int*)b;
170 0           const double term1 = sortdata[i1];
171 0           const double term2 = sortdata[i2];
172 0 0         if (term1 < term2) return -1;
173 0 0         if (term1 > term2) return +1;
174 0           return 0;
175             }
176              
177             /* ---------------------------------------------------------------------- */
178              
179 0           void sort(int n, const double data[], int index[])
180             /* Sets up an index table given the data, such that data[index[]] is in
181             * increasing order. Sorting is done on the indices; the array data
182             * is unchanged.
183             */
184             { int i;
185 0           sortdata = data;
186 0 0         for (i = 0; i < n; i++) index[i] = i;
187 0           qsort(index, n, sizeof(int), compare);
188 0           }
189              
190             /* ********************************************************************** */
191              
192 0           static double* getrank (int n, double data[])
193             /* Calculates the ranks of the elements in the array data. Two elements with
194             * the same value get the same rank, equal to the average of the ranks had the
195             * elements different values. The ranks are returned as a newly allocated
196             * array that should be freed by the calling routine. If getrank fails due to
197             * a memory allocation error, it returns NULL.
198             */
199             { int i;
200             double* rank;
201             int* index;
202 0           rank = malloc(n*sizeof(double));
203 0 0         if (!rank) return NULL;
204 0           index = malloc(n*sizeof(int));
205 0 0         if (!index)
206 0           { free(rank);
207 0           return NULL;
208             }
209             /* Call sort to get an index table */
210 0           sort (n, data, index);
211             /* Build a rank table */
212 0 0         for (i = 0; i < n; i++) rank[index[i]] = i;
213             /* Fix for equal ranks */
214 0           i = 0;
215 0 0         while (i < n)
216             { int m;
217 0           double value = data[index[i]];
218 0           int j = i + 1;
219 0 0         while (j < n && data[index[j]] == value) j++;
    0          
220 0           m = j - i; /* number of equal ranks found */
221 0           value = rank[index[i]] + (m-1)/2.;
222 0 0         for (j = i; j < i + m; j++) rank[index[j]] = value;
223 0           i += m;
224             }
225 0           free (index);
226 0           return rank;
227             }
228              
229             /* ---------------------------------------------------------------------- */
230              
231             static int
232 5           makedatamask(int nrows, int ncols, double*** pdata, int*** pmask)
233             { int i;
234             double** data;
235             int** mask;
236 5           data = malloc(nrows*sizeof(double*));
237 5 50         if(!data) return 0;
238 5           mask = malloc(nrows*sizeof(int*));
239 5 50         if(!mask)
240 0           { free(data);
241 0           return 0;
242             }
243 31 100         for (i = 0; i < nrows; i++)
244 26           { data[i] = malloc(ncols*sizeof(double));
245 26 50         if(!data[i]) break;
246 26           mask[i] = malloc(ncols*sizeof(int));
247 26 50         if(!mask[i])
248 0           { free(data[i]);
249 0           break;
250             }
251             }
252 5 50         if (i==nrows) /* break not encountered */
253 5           { *pdata = data;
254 5           *pmask = mask;
255 5           return 1;
256             }
257 0           *pdata = NULL;
258 0           *pmask = NULL;
259 0           nrows = i;
260 0 0         for (i = 0; i < nrows; i++)
261 0           { free(data[i]);
262 0           free(mask[i]);
263             }
264 0           free(data);
265 0           free(mask);
266 0           return 0;
267             }
268              
269             /* ---------------------------------------------------------------------- */
270              
271             static void
272 3           freedatamask(int n, double** data, int** mask)
273             { int i;
274 12 100         for (i = 0; i < n; i++)
275 9           { free(mask[i]);
276 9           free(data[i]);
277             }
278 3           free(mask);
279 3           free(data);
280 3           }
281              
282             /* ---------------------------------------------------------------------- */
283              
284             static
285 45           double find_closest_pair(int n, double** distmatrix, int* ip, int* jp)
286             /*
287             This function searches the distance matrix to find the pair with the shortest
288             distance between them. The indices of the pair are returned in ip and jp; the
289             distance itself is returned by the function.
290              
291             n (input) int
292             The number of elements in the distance matrix.
293              
294             distmatrix (input) double**
295             A ragged array containing the distance matrix. The number of columns in each
296             row is one less than the row index.
297              
298             ip (output) int*
299             A pointer to the integer that is to receive the first index of the pair with
300             the shortest distance.
301              
302             jp (output) int*
303             A pointer to the integer that is to receive the second index of the pair with
304             the shortest distance.
305             */
306             { int i, j;
307             double temp;
308 45           double distance = distmatrix[1][0];
309 45           *ip = 1;
310 45           *jp = 0;
311 297 100         for (i = 1; i < n; i++)
312 1374 100         { for (j = 0; j < i; j++)
313 1122           { temp = distmatrix[i][j];
314 1122 100         if (temp
315 72           { distance = temp;
316 72           *ip = i;
317 72           *jp = j;
318             }
319             }
320             }
321 45           return distance;
322             }
323              
324             /* ********************************************************************* */
325              
326 0           static int svd(int m, int n, double** u, double w[], double** vt)
327             /*
328             * This subroutine is a translation of the Algol procedure svd,
329             * Num. Math. 14, 403-420(1970) by Golub and Reinsch.
330             * Handbook for Auto. Comp., Vol II-Linear Algebra, 134-151(1971).
331             *
332             * This subroutine determines the singular value decomposition
333             * t
334             * A=usv of a real m by n rectangular matrix, where m is greater
335             * than or equal to n. Householder bidiagonalization and a variant
336             * of the QR algorithm are used.
337             *
338             *
339             * On input.
340             *
341             * m is the number of rows of A (and u).
342             *
343             * n is the number of columns of A (and u) and the order of v.
344             *
345             * u contains the rectangular input matrix A to be decomposed.
346             *
347             * On output.
348             *
349             * the routine returns an integer ierr equal to
350             * 0 to indicate a normal return,
351             * k if the k-th singular value has not been
352             * determined after 30 iterations,
353             * -1 if memory allocation fails.
354             *
355             *
356             * w contains the n (non-negative) singular values of a (the
357             * diagonal elements of s). they are unordered. if an
358             * error exit is made, the singular values should be correct
359             * for indices ierr+1,ierr+2,...,n.
360             *
361             *
362             * u contains the matrix u (orthogonal column vectors) of the
363             * decomposition.
364             * if an error exit is made, the columns of u corresponding
365             * to indices of correct singular values should be correct.
366             *
367             * t
368             * vt contains the matrix v (orthogonal) of the decomposition.
369             * if an error exit is made, the columns of v corresponding
370             * to indices of correct singular values should be correct.
371             *
372             *
373             * Questions and comments should be directed to B. S. Garbow,
374             * Applied Mathematics division, Argonne National Laboratory
375             *
376             * Modified to eliminate machep
377             *
378             * Translated to C by Michiel de Hoon, Human Genome Center,
379             * University of Tokyo, for inclusion in the C Clustering Library.
380             * This routine is less general than the original svd routine, as
381             * it focuses on the singular value decomposition as needed for
382             * clustering. In particular,
383             * - We calculate both u and v in all cases
384             * - We pass the input array A via u; this array is subsequently
385             * overwritten.
386             * - We allocate for the array rv1, used as a working space,
387             * internally in this routine, instead of passing it as an
388             * argument. If the allocation fails, svd returns -1.
389             * 2003.06.05
390             */
391             { int i, j, k, i1, k1, l1, its;
392             double c,f,h,s,x,y,z;
393 0           int l = 0;
394 0           int ierr = 0;
395 0           double g = 0.0;
396 0           double scale = 0.0;
397 0           double anorm = 0.0;
398 0           double* rv1 = malloc(n*sizeof(double));
399 0 0         if (!rv1) return -1;
400 0 0         if (m >= n)
401             { /* Householder reduction to bidiagonal form */
402 0 0         for (i = 0; i < n; i++)
403 0           { l = i + 1;
404 0           rv1[i] = scale * g;
405 0           g = 0.0;
406 0           s = 0.0;
407 0           scale = 0.0;
408 0 0         for (k = i; k < m; k++) scale += fabs(u[k][i]);
409 0 0         if (scale != 0.0)
410 0 0         { for (k = i; k < m; k++)
411 0           { u[k][i] /= scale;
412 0           s += u[k][i]*u[k][i];
413             }
414 0           f = u[i][i];
415 0 0         g = (f >= 0) ? -sqrt(s) : sqrt(s);
416 0           h = f * g - s;
417 0           u[i][i] = f - g;
418 0 0         if (i < n-1)
419 0 0         { for (j = l; j < n; j++)
420 0           { s = 0.0;
421 0 0         for (k = i; k < m; k++) s += u[k][i] * u[k][j];
422 0           f = s / h;
423 0 0         for (k = i; k < m; k++) u[k][j] += f * u[k][i];
424             }
425             }
426 0 0         for (k = i; k < m; k++) u[k][i] *= scale;
427             }
428 0           w[i] = scale * g;
429 0           g = 0.0;
430 0           s = 0.0;
431 0           scale = 0.0;
432 0 0         if (i
433 0 0         { for (k = l; k < n; k++) scale += fabs(u[i][k]);
434 0 0         if (scale != 0.0)
435 0 0         { for (k = l; k < n; k++)
436 0           { u[i][k] /= scale;
437 0           s += u[i][k] * u[i][k];
438             }
439 0           f = u[i][l];
440 0 0         g = (f >= 0) ? -sqrt(s) : sqrt(s);
441 0           h = f * g - s;
442 0           u[i][l] = f - g;
443 0 0         for (k = l; k < n; k++) rv1[k] = u[i][k] / h;
444 0 0         for (j = l; j < m; j++)
445 0           { s = 0.0;
446 0 0         for (k = l; k < n; k++) s += u[j][k] * u[i][k];
447 0 0         for (k = l; k < n; k++) u[j][k] += s * rv1[k];
448             }
449 0 0         for (k = l; k < n; k++) u[i][k] *= scale;
450             }
451             }
452 0 0         anorm = max(anorm,fabs(w[i])+fabs(rv1[i]));
453             }
454             /* accumulation of right-hand transformations */
455 0 0         for (i = n-1; i>=0; i--)
456 0 0         { if (i < n-1)
457 0 0         { if (g != 0.0)
458 0 0         { for (j = l; j < n; j++) vt[i][j] = (u[i][j] / u[i][l]) / g;
459             /* double division avoids possible underflow */
460 0 0         for (j = l; j < n; j++)
461 0           { s = 0.0;
462 0 0         for (k = l; k < n; k++) s += u[i][k] * vt[j][k];
463 0 0         for (k = l; k < n; k++) vt[j][k] += s * vt[i][k];
464             }
465             }
466             }
467 0 0         for (j = l; j < n; j++)
468 0           { vt[j][i] = 0.0;
469 0           vt[i][j] = 0.0;
470             }
471 0           vt[i][i] = 1.0;
472 0           g = rv1[i];
473 0           l = i;
474             }
475             /* accumulation of left-hand transformations */
476 0 0         for (i = n-1; i >= 0; i--)
477 0           { l = i + 1;
478 0           g = w[i];
479 0 0         if (i!=n-1)
480 0 0         for (j = l; j < n; j++) u[i][j] = 0.0;
481 0 0         if (g!=0.0)
482 0 0         { if (i!=n-1)
483 0 0         { for (j = l; j < n; j++)
484 0           { s = 0.0;
485 0 0         for (k = l; k < m; k++) s += u[k][i] * u[k][j];
486             /* double division avoids possible underflow */
487 0           f = (s / u[i][i]) / g;
488 0 0         for (k = i; k < m; k++) u[k][j] += f * u[k][i];
489             }
490             }
491 0 0         for (j = i; j < m; j++) u[j][i] /= g;
492             }
493             else
494 0 0         for (j = i; j < m; j++) u[j][i] = 0.0;
495 0           u[i][i] += 1.0;
496             }
497             /* diagonalization of the bidiagonal form */
498 0 0         for (k = n-1; k >= 0; k--)
499 0           { k1 = k-1;
500 0           its = 0;
501             while(1)
502             /* test for splitting */
503 0 0         { for (l = k; l >= 0; l--)
504 0           { l1 = l-1;
505 0 0         if (fabs(rv1[l]) + anorm == anorm) break;
506             /* rv1[0] is always zero, so there is no exit
507             * through the bottom of the loop */
508 0 0         if (fabs(w[l1]) + anorm == anorm)
509             /* cancellation of rv1[l] if l greater than 0 */
510 0           { c = 0.0;
511 0           s = 1.0;
512 0 0         for (i = l; i <= k; i++)
513 0           { f = s * rv1[i];
514 0           rv1[i] *= c;
515 0 0         if (fabs(f) + anorm == anorm) break;
516 0           g = w[i];
517 0           h = sqrt(f*f+g*g);
518 0           w[i] = h;
519 0           c = g / h;
520 0           s = -f / h;
521 0 0         for (j = 0; j < m; j++)
522 0           { y = u[j][l1];
523 0           z = u[j][i];
524 0           u[j][l1] = y * c + z * s;
525 0           u[j][i] = -y * s + z * c;
526             }
527             }
528 0           break;
529             }
530             }
531             /* test for convergence */
532 0           z = w[k];
533 0 0         if (l==k) /* convergence */
534 0 0         { if (z < 0.0)
535             /* w[k] is made non-negative */
536 0           { w[k] = -z;
537 0 0         for (j = 0; j < n; j++) vt[k][j] = -vt[k][j];
538             }
539 0           break;
540             }
541 0 0         else if (its==30)
542 0           { ierr = k;
543 0           break;
544             }
545             else
546             /* shift from bottom 2 by 2 minor */
547 0           { its++;
548 0           x = w[l];
549 0           y = w[k1];
550 0           g = rv1[k1];
551 0           h = rv1[k];
552 0           f = ((y - z) * (y + z) + (g - h) * (g + h)) / (2.0 * h * y);
553 0           g = sqrt(f*f+1.0);
554 0 0         f = ((x - z) * (x + z) + h * (y / (f + (f >= 0 ? g : -g)) - h)) / x;
555             /* next qr transformation */
556 0           c = 1.0;
557 0           s = 1.0;
558 0 0         for (i1 = l; i1 <= k1; i1++)
559 0           { i = i1 + 1;
560 0           g = rv1[i];
561 0           y = w[i];
562 0           h = s * g;
563 0           g = c * g;
564 0           z = sqrt(f*f+h*h);
565 0           rv1[i1] = z;
566 0           c = f / z;
567 0           s = h / z;
568 0           f = x * c + g * s;
569 0           g = -x * s + g * c;
570 0           h = y * s;
571 0           y = y * c;
572 0 0         for (j = 0; j < n; j++)
573 0           { x = vt[i1][j];
574 0           z = vt[i][j];
575 0           vt[i1][j] = x * c + z * s;
576 0           vt[i][j] = -x * s + z * c;
577             }
578 0           z = sqrt(f*f+h*h);
579 0           w[i1] = z;
580             /* rotation can be arbitrary if z is zero */
581 0 0         if (z!=0.0)
582 0           { c = f / z;
583 0           s = h / z;
584             }
585 0           f = c * g + s * y;
586 0           x = -s * g + c * y;
587 0 0         for (j = 0; j < m; j++)
588 0           { y = u[j][i1];
589 0           z = u[j][i];
590 0           u[j][i1] = y * c + z * s;
591 0           u[j][i] = -y * s + z * c;
592             }
593             }
594 0           rv1[l] = 0.0;
595 0           rv1[k] = f;
596 0           w[k] = x;
597             }
598 0           }
599             }
600             }
601             else /* m < n */
602             { /* Householder reduction to bidiagonal form */
603 0 0         for (i = 0; i < m; i++)
604 0           { l = i + 1;
605 0           rv1[i] = scale * g;
606 0           g = 0.0;
607 0           s = 0.0;
608 0           scale = 0.0;
609 0 0         for (k = i; k < n; k++) scale += fabs(u[i][k]);
610 0 0         if (scale != 0.0)
611 0 0         { for (k = i; k < n; k++)
612 0           { u[i][k] /= scale;
613 0           s += u[i][k]*u[i][k];
614             }
615 0           f = u[i][i];
616 0 0         g = (f >= 0) ? -sqrt(s) : sqrt(s);
617 0           h = f * g - s;
618 0           u[i][i] = f - g;
619 0 0         if (i < m-1)
620 0 0         { for (j = l; j < m; j++)
621 0           { s = 0.0;
622 0 0         for (k = i; k < n; k++) s += u[i][k] * u[j][k];
623 0           f = s / h;
624 0 0         for (k = i; k < n; k++) u[j][k] += f * u[i][k];
625             }
626             }
627 0 0         for (k = i; k < n; k++) u[i][k] *= scale;
628             }
629 0           w[i] = scale * g;
630 0           g = 0.0;
631 0           s = 0.0;
632 0           scale = 0.0;
633 0 0         if (i
634 0 0         { for (k = l; k < m; k++) scale += fabs(u[k][i]);
635 0 0         if (scale != 0.0)
636 0 0         { for (k = l; k < m; k++)
637 0           { u[k][i] /= scale;
638 0           s += u[k][i] * u[k][i];
639             }
640 0           f = u[l][i];
641 0 0         g = (f >= 0) ? -sqrt(s) : sqrt(s);
642 0           h = f * g - s;
643 0           u[l][i] = f - g;
644 0 0         for (k = l; k < m; k++) rv1[k] = u[k][i] / h;
645 0 0         for (j = l; j < n; j++)
646 0           { s = 0.0;
647 0 0         for (k = l; k < m; k++) s += u[k][j] * u[k][i];
648 0 0         for (k = l; k < m; k++) u[k][j] += s * rv1[k];
649             }
650 0 0         for (k = l; k < m; k++) u[k][i] *= scale;
651             }
652             }
653 0 0         anorm = max(anorm,fabs(w[i])+fabs(rv1[i]));
654             }
655             /* accumulation of right-hand transformations */
656 0 0         for (i = m-1; i>=0; i--)
657 0 0         { if (i < m-1)
658 0 0         { if (g != 0.0)
659 0 0         { for (j = l; j < m; j++) vt[j][i] = (u[j][i] / u[l][i]) / g;
660             /* double division avoids possible underflow */
661 0 0         for (j = l; j < m; j++)
662 0           { s = 0.0;
663 0 0         for (k = l; k < m; k++) s += u[k][i] * vt[k][j];
664 0 0         for (k = l; k < m; k++) vt[k][j] += s * vt[k][i];
665             }
666             }
667             }
668 0 0         for (j = l; j < m; j++)
669 0           { vt[i][j] = 0.0;
670 0           vt[j][i] = 0.0;
671             }
672 0           vt[i][i] = 1.0;
673 0           g = rv1[i];
674 0           l = i;
675             }
676             /* accumulation of left-hand transformations */
677 0 0         for (i = m-1; i >= 0; i--)
678 0           { l = i + 1;
679 0           g = w[i];
680 0 0         if (i!=m-1)
681 0 0         for (j = l; j < m; j++) u[j][i] = 0.0;
682 0 0         if (g!=0.0)
683 0 0         { if (i!=m-1)
684 0 0         { for (j = l; j < m; j++)
685 0           { s = 0.0;
686 0 0         for (k = l; k < n; k++) s += u[i][k] * u[j][k];
687             /* double division avoids possible underflow */
688 0           f = (s / u[i][i]) / g;
689 0 0         for (k = i; k < n; k++) u[j][k] += f * u[i][k];
690             }
691             }
692 0 0         for (j = i; j < n; j++) u[i][j] /= g;
693             }
694             else
695 0 0         for (j = i; j < n; j++) u[i][j] = 0.0;
696 0           u[i][i] += 1.0;
697             }
698             /* diagonalization of the bidiagonal form */
699 0 0         for (k = m-1; k >= 0; k--)
700 0           { k1 = k-1;
701 0           its = 0;
702             while(1)
703             /* test for splitting */
704 0 0         { for (l = k; l >= 0; l--)
705 0           { l1 = l-1;
706 0 0         if (fabs(rv1[l]) + anorm == anorm) break;
707             /* rv1[0] is always zero, so there is no exit
708             * through the bottom of the loop */
709 0 0         if (fabs(w[l1]) + anorm == anorm)
710             /* cancellation of rv1[l] if l greater than 0 */
711 0           { c = 0.0;
712 0           s = 1.0;
713 0 0         for (i = l; i <= k; i++)
714 0           { f = s * rv1[i];
715 0           rv1[i] *= c;
716 0 0         if (fabs(f) + anorm == anorm) break;
717 0           g = w[i];
718 0           h = sqrt(f*f+g*g);
719 0           w[i] = h;
720 0           c = g / h;
721 0           s = -f / h;
722 0 0         for (j = 0; j < n; j++)
723 0           { y = u[l1][j];
724 0           z = u[i][j];
725 0           u[l1][j] = y * c + z * s;
726 0           u[i][j] = -y * s + z * c;
727             }
728             }
729 0           break;
730             }
731             }
732             /* test for convergence */
733 0           z = w[k];
734 0 0         if (l==k) /* convergence */
735 0 0         { if (z < 0.0)
736             /* w[k] is made non-negative */
737 0           { w[k] = -z;
738 0 0         for (j = 0; j < m; j++) vt[j][k] = -vt[j][k];
739             }
740 0           break;
741             }
742 0 0         else if (its==30)
743 0           { ierr = k;
744 0           break;
745             }
746             else
747             /* shift from bottom 2 by 2 minor */
748 0           { its++;
749 0           x = w[l];
750 0           y = w[k1];
751 0           g = rv1[k1];
752 0           h = rv1[k];
753 0           f = ((y - z) * (y + z) + (g - h) * (g + h)) / (2.0 * h * y);
754 0           g = sqrt(f*f+1.0);
755 0 0         f = ((x - z) * (x + z) + h * (y / (f + (f >= 0 ? g : -g)) - h)) / x;
756             /* next qr transformation */
757 0           c = 1.0;
758 0           s = 1.0;
759 0 0         for (i1 = l; i1 <= k1; i1++)
760 0           { i = i1 + 1;
761 0           g = rv1[i];
762 0           y = w[i];
763 0           h = s * g;
764 0           g = c * g;
765 0           z = sqrt(f*f+h*h);
766 0           rv1[i1] = z;
767 0           c = f / z;
768 0           s = h / z;
769 0           f = x * c + g * s;
770 0           g = -x * s + g * c;
771 0           h = y * s;
772 0           y = y * c;
773 0 0         for (j = 0; j < m; j++)
774 0           { x = vt[j][i1];
775 0           z = vt[j][i];
776 0           vt[j][i1] = x * c + z * s;
777 0           vt[j][i] = -x * s + z * c;
778             }
779 0           z = sqrt(f*f+h*h);
780 0           w[i1] = z;
781             /* rotation can be arbitrary if z is zero */
782 0 0         if (z!=0.0)
783 0           { c = f / z;
784 0           s = h / z;
785             }
786 0           f = c * g + s * y;
787 0           x = -s * g + c * y;
788 0 0         for (j = 0; j < n; j++)
789 0           { y = u[i1][j];
790 0           z = u[i][j];
791 0           u[i1][j] = y * c + z * s;
792 0           u[i][j] = -y * s + z * c;
793             }
794             }
795 0           rv1[l] = 0.0;
796 0           rv1[k] = f;
797 0           w[k] = x;
798             }
799 0           }
800             }
801             }
802 0           free(rv1);
803 0           return ierr;
804             }
805              
806             /* ********************************************************************* */
807              
808 0           int pca(int nrows, int ncolumns, double** u, double** v, double* w)
809             /*
810             Purpose
811             =======
812              
813             This subroutine uses the singular value decomposition to perform principal
814             components analysis of a real nrows by ncolumns rectangular matrix.
815              
816             Arguments
817             =========
818              
819             nrows (input) int
820             The number of rows in the matrix u.
821              
822             ncolumns (input) int
823             The number of columns in the matrix v.
824              
825             u (input) double[nrows][ncolumns]
826             On input, the array containing the data to which the principal component
827             analysis should be applied. The function assumes that the mean has already been
828             subtracted of each column, and hence that the mean of each column is zero.
829             On output, see below.
830              
831             v (input) double[n][n], where n = min(nrows, ncolumns)
832             Not used on input.
833              
834             w (input) double[n], where n = min(nrows, ncolumns)
835             Not used on input.
836              
837              
838             Return value
839             ============
840              
841             On output:
842              
843             If nrows >= ncolumns, then
844              
845             u contains the coordinates with respect to the principal components;
846             v contains the principal component vectors.
847              
848             The dot product u . v reproduces the data that were passed in u.
849              
850              
851             If nrows < ncolumns, then
852              
853             u contains the principal component vectors;
854             v contains the coordinates with respect to the principal components.
855              
856             The dot product v . u reproduces the data that were passed in u.
857              
858             The eigenvalues of the covariance matrix are returned in w.
859              
860             The arrays u, v, and w are sorted according to eigenvalue, with the largest
861             eigenvalues appearing first.
862              
863             The function returns 0 if successful, -1 if memory allocation fails, and a
864             positive integer if the singular value decomposition fails to converge.
865             */
866             {
867             int i;
868             int j;
869             int error;
870 0           int* index = malloc(ncolumns*sizeof(int));
871 0           double* temp = malloc(ncolumns*sizeof(double));
872 0 0         if (!index || !temp)
    0          
873 0 0         { if (index) free(index);
874 0 0         if (temp) free(temp);
875 0           return -1;
876             }
877 0           error = svd(nrows, ncolumns, u, w, v);
878 0 0         if (error==0)
879             {
880 0 0         if (nrows >= ncolumns)
881 0 0         { for (j = 0; j < ncolumns; j++)
882 0           { const double s = w[j];
883 0 0         for (i = 0; i < nrows; i++) u[i][j] *= s;
884             }
885 0           sort(ncolumns, w, index);
886 0 0         for (i = 0; i < ncolumns/2; i++)
887 0           { j = index[i];
888 0           index[i] = index[ncolumns-1-i];
889 0           index[ncolumns-1-i] = j;
890             }
891 0 0         for (i = 0; i < nrows; i++)
892 0 0         { for (j = 0; j < ncolumns; j++) temp[j] = u[i][index[j]];
893 0 0         for (j = 0; j < ncolumns; j++) u[i][j] = temp[j];
894             }
895 0 0         for (i = 0; i < ncolumns; i++)
896 0 0         { for (j = 0; j < ncolumns; j++) temp[j] = v[index[j]][i];
897 0 0         for (j = 0; j < ncolumns; j++) v[j][i] = temp[j];
898             }
899 0 0         for (i = 0; i < ncolumns; i++) temp[i] = w[index[i]];
900 0 0         for (i = 0; i < ncolumns; i++) w[i] = temp[i];
901             }
902             else /* nrows < ncolumns */
903 0 0         { for (j = 0; j < nrows; j++)
904 0           { const double s = w[j];
905 0 0         for (i = 0; i < nrows; i++) v[i][j] *= s;
906             }
907 0           sort(nrows, w, index);
908 0 0         for (i = 0; i < nrows/2; i++)
909 0           { j = index[i];
910 0           index[i] = index[nrows-1-i];
911 0           index[nrows-1-i] = j;
912             }
913 0 0         for (j = 0; j < ncolumns; j++)
914 0 0         { for (i = 0; i < nrows; i++) temp[i] = u[index[i]][j];
915 0 0         for (i = 0; i < nrows; i++) u[i][j] = temp[i];
916             }
917 0 0         for (j = 0; j < nrows; j++)
918 0 0         { for (i = 0; i < nrows; i++) temp[i] = v[j][index[i]];
919 0 0         for (i = 0; i < nrows; i++) v[j][i] = temp[i];
920             }
921 0 0         for (i = 0; i < nrows; i++) temp[i] = w[index[i]];
922 0 0         for (i = 0; i < nrows; i++) w[i] = temp[i];
923             }
924             }
925 0           free(index);
926 0           free(temp);
927 0           return error;
928             }
929              
930             /* ********************************************************************* */
931              
932             static
933 36704           double euclid (int n, double** data1, double** data2, int** mask1, int** mask2,
934             const double weight[], int index1, int index2, int transpose)
935              
936             /*
937             Purpose
938             =======
939              
940             The euclid routine calculates the weighted Euclidean distance between two
941             rows or columns in a matrix.
942              
943             Arguments
944             =========
945              
946             n (input) int
947             The number of elements in a row or column. If transpose==0, then n is the number
948             of columns; otherwise, n is the number of rows.
949              
950             data1 (input) double array
951             The data array containing the first vector.
952              
953             data2 (input) double array
954             The data array containing the second vector.
955              
956             mask1 (input) int array
957             This array which elements in data1 are missing. If mask1[i][j]==0, then
958             data1[i][j] is missing.
959              
960             mask2 (input) int array
961             This array which elements in data2 are missing. If mask2[i][j]==0, then
962             data2[i][j] is missing.
963              
964             weight (input) double[n]
965             The weights that are used to calculate the distance.
966              
967             index1 (input) int
968             Index of the first row or column.
969              
970             index2 (input) int
971             Index of the second row or column.
972              
973             transpose (input) int
974             If transpose==0, the distance between two rows in the matrix is calculated.
975             Otherwise, the distance between two columns in the matrix is calculated.
976              
977             ============================================================================
978             */
979 36704           { double result = 0.;
980 36704           double tweight = 0;
981             int i;
982 36704 50         if (transpose==0) /* Calculate the distance between two rows */
983 145242 100         { for (i = 0; i < n; i++)
984 108538 50         { if (mask1[index1][i] && mask2[index2][i])
    50          
985 108538           { double term = data1[index1][i] - data2[index2][i];
986 108538           result += weight[i]*term*term;
987 108538           tweight += weight[i];
988             }
989             }
990             }
991             else
992 0 0         { for (i = 0; i < n; i++)
993 0 0         { if (mask1[i][index1] && mask2[i][index2])
    0          
994 0           { double term = data1[i][index1] - data2[i][index2];
995 0           result += weight[i]*term*term;
996 0           tweight += weight[i];
997             }
998             }
999             }
1000 36704 50         if (!tweight) return 0; /* usually due to empty clusters */
1001 36704           result /= tweight;
1002 36704           return result;
1003             }
1004              
1005             /* ********************************************************************* */
1006              
1007             static
1008 64           double cityblock (int n, double** data1, double** data2, int** mask1,
1009             int** mask2, const double weight[], int index1, int index2, int transpose)
1010              
1011             /*
1012             Purpose
1013             =======
1014              
1015             The cityblock routine calculates the weighted "City Block" distance between
1016             two rows or columns in a matrix. City Block distance is defined as the
1017             absolute value of X1-X2 plus the absolute value of Y1-Y2 plus..., which is
1018             equivalent to taking an "up and over" path.
1019              
1020             Arguments
1021             =========
1022              
1023             n (input) int
1024             The number of elements in a row or column. If transpose==0, then n is the number
1025             of columns; otherwise, n is the number of rows.
1026              
1027             data1 (input) double array
1028             The data array containing the first vector.
1029              
1030             data2 (input) double array
1031             The data array containing the second vector.
1032              
1033             mask1 (input) int array
1034             This array which elements in data1 are missing. If mask1[i][j]==0, then
1035             data1[i][j] is missing.
1036              
1037             mask2 (input) int array
1038             This array which elements in data2 are missing. If mask2[i][j]==0, then
1039             data2[i][j] is missing.
1040              
1041             weight (input) double[n]
1042             The weights that are used to calculate the distance.
1043              
1044             index1 (input) int
1045             Index of the first row or column.
1046              
1047             index2 (input) int
1048             Index of the second row or column.
1049              
1050             transpose (input) int
1051             If transpose==0, the distance between two rows in the matrix is calculated.
1052             Otherwise, the distance between two columns in the matrix is calculated.
1053              
1054             ============================================================================ */
1055 64           { double result = 0.;
1056 64           double tweight = 0;
1057             int i;
1058 64 50         if (transpose==0) /* Calculate the distance between two rows */
1059 256 100         { for (i = 0; i < n; i++)
1060 192 50         { if (mask1[index1][i] && mask2[index2][i])
    50          
1061 192           { double term = data1[index1][i] - data2[index2][i];
1062 192           result = result + weight[i]*fabs(term);
1063 192           tweight += weight[i];
1064             }
1065             }
1066             }
1067             else
1068 0 0         { for (i = 0; i < n; i++)
1069 0 0         { if (mask1[i][index1] && mask2[i][index2])
    0          
1070 0           { double term = data1[i][index1] - data2[i][index2];
1071 0           result = result + weight[i]*fabs(term);
1072 0           tweight += weight[i];
1073             }
1074             }
1075             }
1076 64 50         if (!tweight) return 0; /* usually due to empty clusters */
1077 64           result /= tweight;
1078 64           return result;
1079             }
1080              
1081             /* ********************************************************************* */
1082              
1083             static
1084 0           double correlation (int n, double** data1, double** data2, int** mask1,
1085             int** mask2, const double weight[], int index1, int index2, int transpose)
1086             /*
1087             Purpose
1088             =======
1089              
1090             The correlation routine calculates the weighted Pearson distance between two
1091             rows or columns in a matrix. We define the Pearson distance as one minus the
1092             Pearson correlation.
1093             This definition yields a semi-metric: d(a,b) >= 0, and d(a,b) = 0 iff a = b.
1094             but the triangular inequality d(a,b) + d(b,c) >= d(a,c) does not hold
1095             (e.g., choose b = a + c).
1096              
1097             Arguments
1098             =========
1099              
1100             n (input) int
1101             The number of elements in a row or column. If transpose==0, then n is the number
1102             of columns; otherwise, n is the number of rows.
1103              
1104             data1 (input) double array
1105             The data array containing the first vector.
1106              
1107             data2 (input) double array
1108             The data array containing the second vector.
1109              
1110             mask1 (input) int array
1111             This array which elements in data1 are missing. If mask1[i][j]==0, then
1112             data1[i][j] is missing.
1113              
1114             mask2 (input) int array
1115             This array which elements in data2 are missing. If mask2[i][j]==0, then
1116             data2[i][j] is missing.
1117              
1118             weight (input) double[n]
1119             The weights that are used to calculate the distance.
1120              
1121             index1 (input) int
1122             Index of the first row or column.
1123              
1124             index2 (input) int
1125             Index of the second row or column.
1126              
1127             transpose (input) int
1128             If transpose==0, the distance between two rows in the matrix is calculated.
1129             Otherwise, the distance between two columns in the matrix is calculated.
1130             ============================================================================
1131             */
1132 0           { double result = 0.;
1133 0           double sum1 = 0.;
1134 0           double sum2 = 0.;
1135 0           double denom1 = 0.;
1136 0           double denom2 = 0.;
1137 0           double tweight = 0.;
1138 0 0         if (transpose==0) /* Calculate the distance between two rows */
1139             { int i;
1140 0 0         for (i = 0; i < n; i++)
1141 0 0         { if (mask1[index1][i] && mask2[index2][i])
    0          
1142 0           { double term1 = data1[index1][i];
1143 0           double term2 = data2[index2][i];
1144 0           double w = weight[i];
1145 0           sum1 += w*term1;
1146 0           sum2 += w*term2;
1147 0           result += w*term1*term2;
1148 0           denom1 += w*term1*term1;
1149 0           denom2 += w*term2*term2;
1150 0           tweight += w;
1151             }
1152             }
1153             }
1154             else
1155             { int i;
1156 0 0         for (i = 0; i < n; i++)
1157 0 0         { if (mask1[i][index1] && mask2[i][index2])
    0          
1158 0           { double term1 = data1[i][index1];
1159 0           double term2 = data2[i][index2];
1160 0           double w = weight[i];
1161 0           sum1 += w*term1;
1162 0           sum2 += w*term2;
1163 0           result += w*term1*term2;
1164 0           denom1 += w*term1*term1;
1165 0           denom2 += w*term2*term2;
1166 0           tweight += w;
1167             }
1168             }
1169             }
1170 0 0         if (!tweight) return 0; /* usually due to empty clusters */
1171 0           result -= sum1 * sum2 / tweight;
1172 0           denom1 -= sum1 * sum1 / tweight;
1173 0           denom2 -= sum2 * sum2 / tweight;
1174 0 0         if (denom1 <= 0) return 1; /* include '<' to deal with roundoff errors */
1175 0 0         if (denom2 <= 0) return 1; /* include '<' to deal with roundoff errors */
1176 0           result = result / sqrt(denom1*denom2);
1177 0           result = 1. - result;
1178 0           return result;
1179             }
1180              
1181             /* ********************************************************************* */
1182              
1183             static
1184 0           double acorrelation (int n, double** data1, double** data2, int** mask1,
1185             int** mask2, const double weight[], int index1, int index2, int transpose)
1186             /*
1187             Purpose
1188             =======
1189              
1190             The acorrelation routine calculates the weighted Pearson distance between two
1191             rows or columns, using the absolute value of the correlation.
1192             This definition yields a semi-metric: d(a,b) >= 0, and d(a,b) = 0 iff a = b.
1193             but the triangular inequality d(a,b) + d(b,c) >= d(a,c) does not hold
1194             (e.g., choose b = a + c).
1195              
1196             Arguments
1197             =========
1198              
1199             n (input) int
1200             The number of elements in a row or column. If transpose==0, then n is the number
1201             of columns; otherwise, n is the number of rows.
1202              
1203             data1 (input) double array
1204             The data array containing the first vector.
1205              
1206             data2 (input) double array
1207             The data array containing the second vector.
1208              
1209             mask1 (input) int array
1210             This array which elements in data1 are missing. If mask1[i][j]==0, then
1211             data1[i][j] is missing.
1212              
1213             mask2 (input) int array
1214             This array which elements in data2 are missing. If mask2[i][j]==0, then
1215             data2[i][j] is missing.
1216              
1217             weight (input) double[n]
1218             The weights that are used to calculate the distance.
1219              
1220             index1 (input) int
1221             Index of the first row or column.
1222              
1223             index2 (input) int
1224             Index of the second row or column.
1225              
1226             transpose (input) int
1227             If transpose==0, the distance between two rows in the matrix is calculated.
1228             Otherwise, the distance between two columns in the matrix is calculated.
1229             ============================================================================
1230             */
1231 0           { double result = 0.;
1232 0           double sum1 = 0.;
1233 0           double sum2 = 0.;
1234 0           double denom1 = 0.;
1235 0           double denom2 = 0.;
1236 0           double tweight = 0.;
1237 0 0         if (transpose==0) /* Calculate the distance between two rows */
1238             { int i;
1239 0 0         for (i = 0; i < n; i++)
1240 0 0         { if (mask1[index1][i] && mask2[index2][i])
    0          
1241 0           { double term1 = data1[index1][i];
1242 0           double term2 = data2[index2][i];
1243 0           double w = weight[i];
1244 0           sum1 += w*term1;
1245 0           sum2 += w*term2;
1246 0           result += w*term1*term2;
1247 0           denom1 += w*term1*term1;
1248 0           denom2 += w*term2*term2;
1249 0           tweight += w;
1250             }
1251             }
1252             }
1253             else
1254             { int i;
1255 0 0         for (i = 0; i < n; i++)
1256 0 0         { if (mask1[i][index1] && mask2[i][index2])
    0          
1257 0           { double term1 = data1[i][index1];
1258 0           double term2 = data2[i][index2];
1259 0           double w = weight[i];
1260 0           sum1 += w*term1;
1261 0           sum2 += w*term2;
1262 0           result += w*term1*term2;
1263 0           denom1 += w*term1*term1;
1264 0           denom2 += w*term2*term2;
1265 0           tweight += w;
1266             }
1267             }
1268             }
1269 0 0         if (!tweight) return 0; /* usually due to empty clusters */
1270 0           result -= sum1 * sum2 / tweight;
1271 0           denom1 -= sum1 * sum1 / tweight;
1272 0           denom2 -= sum2 * sum2 / tweight;
1273 0 0         if (denom1 <= 0) return 1; /* include '<' to deal with roundoff errors */
1274 0 0         if (denom2 <= 0) return 1; /* include '<' to deal with roundoff errors */
1275 0           result = fabs(result) / sqrt(denom1*denom2);
1276 0           result = 1. - result;
1277 0           return result;
1278             }
1279              
1280             /* ********************************************************************* */
1281              
1282             static
1283 0           double ucorrelation (int n, double** data1, double** data2, int** mask1,
1284             int** mask2, const double weight[], int index1, int index2, int transpose)
1285             /*
1286             Purpose
1287             =======
1288              
1289             The ucorrelation routine calculates the weighted Pearson distance between two
1290             rows or columns, using the uncentered version of the Pearson correlation. In the
1291             uncentered Pearson correlation, a zero mean is used for both vectors even if
1292             the actual mean is nonzero.
1293             This definition yields a semi-metric: d(a,b) >= 0, and d(a,b) = 0 iff a = b.
1294             but the triangular inequality d(a,b) + d(b,c) >= d(a,c) does not hold
1295             (e.g., choose b = a + c).
1296              
1297             Arguments
1298             =========
1299              
1300             n (input) int
1301             The number of elements in a row or column. If transpose==0, then n is the number
1302             of columns; otherwise, n is the number of rows.
1303              
1304             data1 (input) double array
1305             The data array containing the first vector.
1306              
1307             data2 (input) double array
1308             The data array containing the second vector.
1309              
1310             mask1 (input) int array
1311             This array which elements in data1 are missing. If mask1[i][j]==0, then
1312             data1[i][j] is missing.
1313              
1314             mask2 (input) int array
1315             This array which elements in data2 are missing. If mask2[i][j]==0, then
1316             data2[i][j] is missing.
1317              
1318             weight (input) double[n]
1319             The weights that are used to calculate the distance.
1320              
1321             index1 (input) int
1322             Index of the first row or column.
1323              
1324             index2 (input) int
1325             Index of the second row or column.
1326              
1327             transpose (input) int
1328             If transpose==0, the distance between two rows in the matrix is calculated.
1329             Otherwise, the distance between two columns in the matrix is calculated.
1330             ============================================================================
1331             */
1332 0           { double result = 0.;
1333 0           double denom1 = 0.;
1334 0           double denom2 = 0.;
1335 0           int flag = 0;
1336             /* flag will remain zero if no nonzero combinations of mask1 and mask2 are
1337             * found.
1338             */
1339 0 0         if (transpose==0) /* Calculate the distance between two rows */
1340             { int i;
1341 0 0         for (i = 0; i < n; i++)
1342 0 0         { if (mask1[index1][i] && mask2[index2][i])
    0          
1343 0           { double term1 = data1[index1][i];
1344 0           double term2 = data2[index2][i];
1345 0           double w = weight[i];
1346 0           result += w*term1*term2;
1347 0           denom1 += w*term1*term1;
1348 0           denom2 += w*term2*term2;
1349 0           flag = 1;
1350             }
1351             }
1352             }
1353             else
1354             { int i;
1355 0 0         for (i = 0; i < n; i++)
1356 0 0         { if (mask1[i][index1] && mask2[i][index2])
    0          
1357 0           { double term1 = data1[i][index1];
1358 0           double term2 = data2[i][index2];
1359 0           double w = weight[i];
1360 0           result += w*term1*term2;
1361 0           denom1 += w*term1*term1;
1362 0           denom2 += w*term2*term2;
1363 0           flag = 1;
1364             }
1365             }
1366             }
1367 0 0         if (!flag) return 0.;
1368 0 0         if (denom1==0.) return 1.;
1369 0 0         if (denom2==0.) return 1.;
1370 0           result = result / sqrt(denom1*denom2);
1371 0           result = 1. - result;
1372 0           return result;
1373             }
1374              
1375             /* ********************************************************************* */
1376              
1377             static
1378 0           double uacorrelation (int n, double** data1, double** data2, int** mask1,
1379             int** mask2, const double weight[], int index1, int index2, int transpose)
1380             /*
1381             Purpose
1382             =======
1383              
1384             The uacorrelation routine calculates the weighted Pearson distance between two
1385             rows or columns, using the absolute value of the uncentered version of the
1386             Pearson correlation. In the uncentered Pearson correlation, a zero mean is used
1387             for both vectors even if the actual mean is nonzero.
1388             This definition yields a semi-metric: d(a,b) >= 0, and d(a,b) = 0 iff a = b.
1389             but the triangular inequality d(a,b) + d(b,c) >= d(a,c) does not hold
1390             (e.g., choose b = a + c).
1391              
1392             Arguments
1393             =========
1394              
1395             n (input) int
1396             The number of elements in a row or column. If transpose==0, then n is the number
1397             of columns; otherwise, n is the number of rows.
1398              
1399             data1 (input) double array
1400             The data array containing the first vector.
1401              
1402             data2 (input) double array
1403             The data array containing the second vector.
1404              
1405             mask1 (input) int array
1406             This array which elements in data1 are missing. If mask1[i][j]==0, then
1407             data1[i][j] is missing.
1408              
1409             mask2 (input) int array
1410             This array which elements in data2 are missing. If mask2[i][j]==0, then
1411             data2[i][j] is missing.
1412              
1413             weight (input) double[n]
1414             The weights that are used to calculate the distance.
1415              
1416             index1 (input) int
1417             Index of the first row or column.
1418              
1419             index2 (input) int
1420             Index of the second row or column.
1421              
1422             transpose (input) int
1423             If transpose==0, the distance between two rows in the matrix is calculated.
1424             Otherwise, the distance between two columns in the matrix is calculated.
1425             ============================================================================
1426             */
1427 0           { double result = 0.;
1428 0           double denom1 = 0.;
1429 0           double denom2 = 0.;
1430 0           int flag = 0;
1431             /* flag will remain zero if no nonzero combinations of mask1 and mask2 are
1432             * found.
1433             */
1434 0 0         if (transpose==0) /* Calculate the distance between two rows */
1435             { int i;
1436 0 0         for (i = 0; i < n; i++)
1437 0 0         { if (mask1[index1][i] && mask2[index2][i])
    0          
1438 0           { double term1 = data1[index1][i];
1439 0           double term2 = data2[index2][i];
1440 0           double w = weight[i];
1441 0           result += w*term1*term2;
1442 0           denom1 += w*term1*term1;
1443 0           denom2 += w*term2*term2;
1444 0           flag = 1;
1445             }
1446             }
1447             }
1448             else
1449             { int i;
1450 0 0         for (i = 0; i < n; i++)
1451 0 0         { if (mask1[i][index1] && mask2[i][index2])
    0          
1452 0           { double term1 = data1[i][index1];
1453 0           double term2 = data2[i][index2];
1454 0           double w = weight[i];
1455 0           result += w*term1*term2;
1456 0           denom1 += w*term1*term1;
1457 0           denom2 += w*term2*term2;
1458 0           flag = 1;
1459             }
1460             }
1461             }
1462 0 0         if (!flag) return 0.;
1463 0 0         if (denom1==0.) return 1.;
1464 0 0         if (denom2==0.) return 1.;
1465 0           result = fabs(result) / sqrt(denom1*denom2);
1466 0           result = 1. - result;
1467 0           return result;
1468             }
1469              
1470             /* ********************************************************************* */
1471              
1472             static
1473 0           double spearman (int n, double** data1, double** data2, int** mask1,
1474             int** mask2, const double weight[], int index1, int index2, int transpose)
1475             /*
1476             Purpose
1477             =======
1478              
1479             The spearman routine calculates the Spearman distance between two rows or
1480             columns. The Spearman distance is defined as one minus the Spearman rank
1481             correlation.
1482              
1483             Arguments
1484             =========
1485              
1486             n (input) int
1487             The number of elements in a row or column. If transpose==0, then n is the number
1488             of columns; otherwise, n is the number of rows.
1489              
1490             data1 (input) double array
1491             The data array containing the first vector.
1492              
1493             data2 (input) double array
1494             The data array containing the second vector.
1495              
1496             mask1 (input) int array
1497             This array which elements in data1 are missing. If mask1[i][j]==0, then
1498             data1[i][j] is missing.
1499              
1500             mask2 (input) int array
1501             This array which elements in data2 are missing. If mask2[i][j]==0, then
1502             data2[i][j] is missing.
1503              
1504             weight (input) double[n]
1505             These weights are ignored, but included for consistency with other distance
1506             measures.
1507              
1508             index1 (input) int
1509             Index of the first row or column.
1510              
1511             index2 (input) int
1512             Index of the second row or column.
1513              
1514             transpose (input) int
1515             If transpose==0, the distance between two rows in the matrix is calculated.
1516             Otherwise, the distance between two columns in the matrix is calculated.
1517             ============================================================================
1518             */
1519             { int i;
1520 0           int m = 0;
1521             double* rank1;
1522             double* rank2;
1523 0           double result = 0.;
1524 0           double denom1 = 0.;
1525 0           double denom2 = 0.;
1526             double avgrank;
1527             double* tdata1;
1528             double* tdata2;
1529 0           tdata1 = malloc(n*sizeof(double));
1530 0 0         if(!tdata1) return 0.0; /* Memory allocation error */
1531 0           tdata2 = malloc(n*sizeof(double));
1532 0 0         if(!tdata2) /* Memory allocation error */
1533 0           { free(tdata1);
1534 0           return 0.0;
1535             }
1536 0 0         if (transpose==0)
1537 0 0         { for (i = 0; i < n; i++)
1538 0 0         { if (mask1[index1][i] && mask2[index2][i])
    0          
1539 0           { tdata1[m] = data1[index1][i];
1540 0           tdata2[m] = data2[index2][i];
1541 0           m++;
1542             }
1543             }
1544             }
1545             else
1546 0 0         { for (i = 0; i < n; i++)
1547 0 0         { if (mask1[i][index1] && mask2[i][index2])
    0          
1548 0           { tdata1[m] = data1[i][index1];
1549 0           tdata2[m] = data2[i][index2];
1550 0           m++;
1551             }
1552             }
1553             }
1554 0 0         if (m==0)
1555 0           { free(tdata1);
1556 0           free(tdata2);
1557 0           return 0;
1558             }
1559 0           rank1 = getrank(m, tdata1);
1560 0           free(tdata1);
1561 0 0         if(!rank1)
1562 0           { free(tdata2);
1563 0           return 0.0; /* Memory allocation error */
1564             }
1565 0           rank2 = getrank(m, tdata2);
1566 0           free(tdata2);
1567 0 0         if(!rank2) /* Memory allocation error */
1568 0           { free(rank1);
1569 0           return 0.0;
1570             }
1571 0           avgrank = 0.5*(m-1); /* Average rank */
1572 0 0         for (i = 0; i < m; i++)
1573 0           { const double value1 = rank1[i];
1574 0           const double value2 = rank2[i];
1575 0           result += value1 * value2;
1576 0           denom1 += value1 * value1;
1577 0           denom2 += value2 * value2;
1578             }
1579             /* Note: denom1 and denom2 cannot be calculated directly from the number
1580             * of elements. If two elements have the same rank, the squared sum of
1581             * their ranks will change.
1582             */
1583 0           free(rank1);
1584 0           free(rank2);
1585 0           result /= m;
1586 0           denom1 /= m;
1587 0           denom2 /= m;
1588 0           result -= avgrank * avgrank;
1589 0           denom1 -= avgrank * avgrank;
1590 0           denom2 -= avgrank * avgrank;
1591 0 0         if (denom1 <= 0) return 1; /* include '<' to deal with roundoff errors */
1592 0 0         if (denom2 <= 0) return 1; /* include '<' to deal with roundoff errors */
1593 0           result = result / sqrt(denom1*denom2);
1594 0           result = 1. - result;
1595 0           return result;
1596             }
1597              
1598             /* ********************************************************************* */
1599              
1600             static
1601 0           double kendall (int n, double** data1, double** data2, int** mask1, int** mask2,
1602             const double weight[], int index1, int index2, int transpose)
1603             /*
1604             Purpose
1605             =======
1606              
1607             The kendall routine calculates the Kendall distance between two
1608             rows or columns. The Kendall distance is defined as one minus Kendall's tau.
1609              
1610             Arguments
1611             =========
1612              
1613             n (input) int
1614             The number of elements in a row or column. If transpose==0, then n is the number
1615             of columns; otherwise, n is the number of rows.
1616              
1617             data1 (input) double array
1618             The data array containing the first vector.
1619              
1620             data2 (input) double array
1621             The data array containing the second vector.
1622              
1623             mask1 (input) int array
1624             This array which elements in data1 are missing. If mask1[i][j]==0, then
1625             data1[i][j] is missing.
1626              
1627             mask2 (input) int array
1628             This array which elements in data2 are missing. If mask2[i][j]==0, then
1629             data2[i][j] is missing.
1630              
1631             weight (input) double[n]
1632             These weights are ignored, but included for consistency with other distance
1633             measures.
1634              
1635             index1 (input) int
1636             Index of the first row or column.
1637              
1638             index2 (input) int
1639             Index of the second row or column.
1640              
1641             transpose (input) int
1642             If transpose==0, the distance between two rows in the matrix is calculated.
1643             Otherwise, the distance between two columns in the matrix is calculated.
1644             ============================================================================
1645             */
1646 0           { int con = 0;
1647 0           int dis = 0;
1648 0           int exx = 0;
1649 0           int exy = 0;
1650 0           int flag = 0;
1651             /* flag will remain zero if no nonzero combinations of mask1 and mask2 are
1652             * found.
1653             */
1654             double denomx;
1655             double denomy;
1656             double tau;
1657             int i, j;
1658 0 0         if (transpose==0)
1659 0 0         { for (i = 0; i < n; i++)
1660 0 0         { if (mask1[index1][i] && mask2[index2][i])
    0          
1661 0 0         { for (j = 0; j < i; j++)
1662 0 0         { if (mask1[index1][j] && mask2[index2][j])
    0          
1663 0           { double x1 = data1[index1][i];
1664 0           double x2 = data1[index1][j];
1665 0           double y1 = data2[index2][i];
1666 0           double y2 = data2[index2][j];
1667 0 0         if (x1 < x2 && y1 < y2) con++;
    0          
1668 0 0         if (x1 > x2 && y1 > y2) con++;
    0          
1669 0 0         if (x1 < x2 && y1 > y2) dis++;
    0          
1670 0 0         if (x1 > x2 && y1 < y2) dis++;
    0          
1671 0 0         if (x1 == x2 && y1 != y2) exx++;
    0          
1672 0 0         if (x1 != x2 && y1 == y2) exy++;
    0          
1673 0           flag = 1;
1674             }
1675             }
1676             }
1677             }
1678             }
1679             else
1680 0 0         { for (i = 0; i < n; i++)
1681 0 0         { if (mask1[i][index1] && mask2[i][index2])
    0          
1682 0 0         { for (j = 0; j < i; j++)
1683 0 0         { if (mask1[j][index1] && mask2[j][index2])
    0          
1684 0           { double x1 = data1[i][index1];
1685 0           double x2 = data1[j][index1];
1686 0           double y1 = data2[i][index2];
1687 0           double y2 = data2[j][index2];
1688 0 0         if (x1 < x2 && y1 < y2) con++;
    0          
1689 0 0         if (x1 > x2 && y1 > y2) con++;
    0          
1690 0 0         if (x1 < x2 && y1 > y2) dis++;
    0          
1691 0 0         if (x1 > x2 && y1 < y2) dis++;
    0          
1692 0 0         if (x1 == x2 && y1 != y2) exx++;
    0          
1693 0 0         if (x1 != x2 && y1 == y2) exy++;
    0          
1694 0           flag = 1;
1695             }
1696             }
1697             }
1698             }
1699             }
1700 0 0         if (!flag) return 0.;
1701 0           denomx = con + dis + exx;
1702 0           denomy = con + dis + exy;
1703 0 0         if (denomx==0) return 1;
1704 0 0         if (denomy==0) return 1;
1705 0           tau = (con-dis)/sqrt(denomx*denomy);
1706 0           return 1.-tau;
1707             }
1708              
1709             /* ********************************************************************* */
1710              
1711 64           static double(*setmetric(char dist))
1712             (int, double**, double**, int**, int**, const double[], int, int, int)
1713 64           { switch(dist)
1714 24           { case 'e': return &euclid;
1715 40           case 'b': return &cityblock;
1716 0           case 'c': return &correlation;
1717 0           case 'a': return &acorrelation;
1718 0           case 'u': return &ucorrelation;
1719 0           case 'x': return &uacorrelation;
1720 0           case 's': return &spearman;
1721 0           case 'k': return &kendall;
1722 0           default: return &euclid;
1723             }
1724             return NULL; /* Never get here */
1725             }
1726              
1727             /* ********************************************************************* */
1728              
1729 4317           static double uniform(void)
1730             /*
1731             Purpose
1732             =======
1733              
1734             This routine returns a uniform random number between 0.0 and 1.0. Both 0.0
1735             and 1.0 are excluded. This random number generator is described in:
1736              
1737             Pierre l'Ecuyer
1738             Efficient and Portable Combined Random Number Generators
1739             Communications of the ACM, Volume 31, Number 6, June 1988, pages 742-749,774.
1740              
1741             The first time this routine is called, it initializes the random number
1742             generator using the current time. First, the current epoch time in seconds is
1743             used as a seed for the random number generator in the C library. The first two
1744             random numbers generated by this generator are used to initialize the random
1745             number generator implemented in this routine.
1746              
1747              
1748             Arguments
1749             =========
1750              
1751             None.
1752              
1753              
1754             Return value
1755             ============
1756              
1757             A double-precison number between 0.0 and 1.0.
1758             ============================================================================
1759             */
1760             { int z;
1761             static const int m1 = 2147483563;
1762             static const int m2 = 2147483399;
1763 4317           const double scale = 1.0/m1;
1764              
1765             static int s1 = 0;
1766             static int s2 = 0;
1767              
1768 4317 100         if (s1==0 || s2==0) /* initialize */
    50          
1769 3           { unsigned int initseed = (unsigned int) time(0);
1770 3           srand(initseed);
1771 3           s1 = rand();
1772 3           s2 = rand();
1773             }
1774              
1775             do
1776             { int k;
1777 4317           k = s1/53668;
1778 4317           s1 = 40014*(s1-k*53668)-k*12211;
1779 4317 100         if (s1 < 0) s1+=m1;
1780 4317           k = s2/52774;
1781 4317           s2 = 40692*(s2-k*52774)-k*3791;
1782 4317 100         if(s2 < 0) s2+=m2;
1783 4317           z = s1-s2;
1784 4317 100         if(z < 1) z+=(m1-1);
1785 4317 50         } while (z==m1); /* To avoid returning 1.0 */
1786              
1787 4317           return z*scale;
1788             }
1789              
1790             /* ************************************************************************ */
1791              
1792 700           static int binomial(int n, double p)
1793             /*
1794             Purpose
1795             =======
1796              
1797             This routine generates a random number between 0 and n inclusive, following
1798             the binomial distribution with probability p and n trials. The routine is
1799             based on the BTPE algorithm, described in:
1800              
1801             Voratas Kachitvichyanukul and Bruce W. Schmeiser:
1802             Binomial Random Variate Generation
1803             Communications of the ACM, Volume 31, Number 2, February 1988, pages 216-222.
1804              
1805              
1806             Arguments
1807             =========
1808              
1809             p (input) double
1810             The probability of a single event. This probability should be less than or
1811             equal to 0.5.
1812              
1813             n (input) int
1814             The number of trials.
1815              
1816              
1817             Return value
1818             ============
1819              
1820             An integer drawn from a binomial distribution with parameters (p, n).
1821              
1822             ============================================================================
1823             */
1824 700           { const double q = 1 - p;
1825 700 50         if (n*p < 30.0) /* Algorithm BINV */
1826 700           { const double s = p/q;
1827 700           const double a = (n+1)*s;
1828 700           double r = exp(n*log(q)); /* pow() causes a crash on AIX */
1829 700           int x = 0;
1830 700           double u = uniform();
1831             while(1)
1832 2002 100         { if (u < r) return x;
1833 1302           u-=r;
1834 1302           x++;
1835 1302           r *= (a/x)-s;
1836 1302           }
1837             }
1838             else /* Algorithm BTPE */
1839             { /* Step 0 */
1840 0           const double fm = n*p + p;
1841 0           const int m = (int) fm;
1842 0           const double p1 = floor(2.195*sqrt(n*p*q) -4.6*q) + 0.5;
1843 0           const double xm = m + 0.5;
1844 0           const double xl = xm - p1;
1845 0           const double xr = xm + p1;
1846 0           const double c = 0.134 + 20.5/(15.3+m);
1847 0           const double a = (fm-xl)/(fm-xl*p);
1848 0           const double b = (xr-fm)/(xr*q);
1849 0           const double lambdal = a*(1.0+0.5*a);
1850 0           const double lambdar = b*(1.0+0.5*b);
1851 0           const double p2 = p1*(1+2*c);
1852 0           const double p3 = p2 + c/lambdal;
1853 0           const double p4 = p3 + c/lambdar;
1854             while (1)
1855             { /* Step 1 */
1856             int y;
1857             int k;
1858 0           double u = uniform();
1859 0           double v = uniform();
1860 0           u *= p4;
1861 0 0         if (u <= p1) return (int)(xm-p1*v+u);
1862             /* Step 2 */
1863 0 0         if (u > p2)
1864             { /* Step 3 */
1865 0 0         if (u > p3)
1866             { /* Step 4 */
1867 0           y = (int)(xr-log(v)/lambdar);
1868 0 0         if (y > n) continue;
1869             /* Go to step 5 */
1870 0           v = v*(u-p3)*lambdar;
1871             }
1872             else
1873 0           { y = (int)(xl+log(v)/lambdal);
1874 0 0         if (y < 0) continue;
1875             /* Go to step 5 */
1876 0           v = v*(u-p2)*lambdal;
1877             }
1878             }
1879             else
1880 0           { const double x = xl + (u-p1)/c;
1881 0           v = v*c + 1.0 - fabs(m-x+0.5)/p1;
1882 0 0         if (v > 1) continue;
1883             /* Go to step 5 */
1884 0           y = (int)x;
1885             }
1886             /* Step 5 */
1887             /* Step 5.0 */
1888 0           k = abs(y-m);
1889 0 0         if (k > 20 && k < 0.5*n*p*q-1.0)
    0          
1890             { /* Step 5.2 */
1891 0           double rho = (k/(n*p*q))*((k*(k/3.0 + 0.625) + 0.1666666666666)/(n*p*q)+0.5);
1892 0           double t = -k*k/(2*n*p*q);
1893 0           double A = log(v);
1894 0 0         if (A < t-rho) return y;
1895 0 0         else if (A > t+rho) continue;
1896             else
1897             { /* Step 5.3 */
1898 0           double x1 = y+1;
1899 0           double f1 = m+1;
1900 0           double z = n+1-m;
1901 0           double w = n-y+1;
1902 0           double x2 = x1*x1;
1903 0           double f2 = f1*f1;
1904 0           double z2 = z*z;
1905 0           double w2 = w*w;
1906 0 0         if (A > xm * log(f1/x1) + (n-m+0.5)*log(z/w)
1907 0           + (y-m)*log(w*p/(x1*q))
1908 0           + (13860.-(462.-(132.-(99.-140./f2)/f2)/f2)/f2)/f1/166320.
1909 0           + (13860.-(462.-(132.-(99.-140./z2)/z2)/z2)/z2)/z/166320.
1910 0           + (13860.-(462.-(132.-(99.-140./x2)/x2)/x2)/x2)/x1/166320.
1911 0           + (13860.-(462.-(132.-(99.-140./w2)/w2)/w2)/w2)/w/166320.)
1912 0           continue;
1913 0           return y;
1914             }
1915             }
1916             else
1917             { /* Step 5.1 */
1918             int i;
1919 0           const double s = p/q;
1920 0           const double aa = s*(n+1);
1921 0           double f = 1.0;
1922 0 0         for (i = m; i < y; f *= (aa/(++i)-s));
1923 0 0         for (i = y; i < m; f /= (aa/(++i)-s));
1924 0 0         if (v > f) continue;
1925 0           return y;
1926             }
1927 0           }
1928             }
1929             /* Never get here */
1930             return -1;
1931             }
1932              
1933             /* ************************************************************************ */
1934              
1935 300           static void randomassign (int nclusters, int nelements, int clusterid[])
1936             /*
1937             Purpose
1938             =======
1939              
1940             The randomassign routine performs an initial random clustering, needed for
1941             k-means or k-median clustering. Elements (genes or microarrays) are randomly
1942             assigned to clusters. The number of elements in each cluster is chosen
1943             randomly, making sure that each cluster will receive at least one element.
1944              
1945              
1946             Arguments
1947             =========
1948              
1949             nclusters (input) int
1950             The number of clusters.
1951              
1952             nelements (input) int
1953             The number of elements to be clustered (i.e., the number of genes or microarrays
1954             to be clustered).
1955              
1956             clusterid (output) int[nelements]
1957             The cluster number to which an element was assigned.
1958              
1959             ============================================================================
1960             */
1961             { int i, j;
1962 300           int k = 0;
1963             double p;
1964 300           int n = nelements-nclusters;
1965             /* Draw the number of elements in each cluster from a multinomial
1966             * distribution, reserving ncluster elements to set independently
1967             * in order to guarantee that none of the clusters are empty.
1968             */
1969 1000 100         for (i = 0; i < nclusters-1; i++)
1970 700           { p = 1.0/(nclusters-i);
1971 700           j = binomial(n, p);
1972 700           n -= j;
1973 700           j += k+1; /* Assign at least one element to cluster i */
1974 2702 100         for ( ; k < j; k++) clusterid[k] = i;
1975             }
1976             /* Assign the remaining elements to the last cluster */
1977 1198 100         for ( ; k < nelements; k++) clusterid[k] = i;
1978              
1979             /* Create a random permutation of the cluster assignments */
1980 3200 100         for (i = 0; i < nelements; i++)
1981 2900           { j = (int) (i + (nelements-i)*uniform());
1982 2900           k = clusterid[j];
1983 2900           clusterid[j] = clusterid[i];
1984 2900           clusterid[i] = k;
1985             }
1986              
1987 300           return;
1988             }
1989              
1990             /* ********************************************************************* */
1991              
1992 555           static void getclustermeans(int nclusters, int nrows, int ncolumns,
1993             double** data, int** mask, int clusterid[], double** cdata, int** cmask,
1994             int transpose)
1995             /*
1996             Purpose
1997             =======
1998              
1999             The getclustermeans routine calculates the cluster centroids, given to which
2000             cluster each element belongs. The centroid is defined as the mean over all
2001             elements for each dimension.
2002              
2003             Arguments
2004             =========
2005              
2006             nclusters (input) int
2007             The number of clusters.
2008              
2009             nrows (input) int
2010             The number of rows in the gene expression data matrix, equal to the number of
2011             genes.
2012              
2013             ncolumns (input) int
2014             The number of columns in the gene expression data matrix, equal to the number of
2015             microarrays.
2016              
2017             data (input) double[nrows][ncolumns]
2018             The array containing the gene expression data.
2019              
2020             mask (input) int[nrows][ncolumns]
2021             This array shows which data values are missing. If mask[i][j]==0, then
2022             data[i][j] is missing.
2023              
2024             clusterid (output) int[nrows] if transpose==0
2025             int[ncolumns] if transpose==1
2026             The cluster number to which each element belongs. If transpose==0, then the
2027             dimension of clusterid is equal to nrows (the number of genes). Otherwise, it
2028             is equal to ncolumns (the number of microarrays).
2029              
2030             cdata (output) double[nclusters][ncolumns] if transpose==0
2031             double[nrows][nclusters] if transpose==1
2032             On exit of getclustermeans, this array contains the cluster centroids.
2033              
2034             cmask (output) int[nclusters][ncolumns] if transpose==0
2035             int[nrows][nclusters] if transpose==1
2036             This array shows which data values of are missing for each centroid. If
2037             cmask[i][j]==0, then cdata[i][j] is missing. A data value is missing for
2038             a centroid if all corresponding data values of the cluster members are missing.
2039              
2040             transpose (input) int
2041             If transpose==0, clusters of rows (genes) are specified. Otherwise, clusters of
2042             columns (microarrays) are specified.
2043              
2044             ========================================================================
2045             */
2046             { int i, j, k;
2047 555 50         if (transpose==0)
2048 2220 100         { for (i = 0; i < nclusters; i++)
2049 6813 100         { for (j = 0; j < ncolumns; j++)
2050 5148           { cmask[i][j] = 0;
2051 5148           cdata[i][j] = 0.;
2052             }
2053             }
2054 5952 100         for (k = 0; k < nrows; k++)
2055 5397           { i = clusterid[k];
2056 18615 100         for (j = 0; j < ncolumns; j++)
2057 13218 50         { if (mask[k][j] != 0)
2058 13218           { cdata[i][j]+=data[k][j];
2059 13218           cmask[i][j]++;
2060             }
2061             }
2062             }
2063 2220 100         for (i = 0; i < nclusters; i++)
2064 6813 100         { for (j = 0; j < ncolumns; j++)
2065 5148 50         { if (cmask[i][j]>0)
2066 5148           { cdata[i][j] /= cmask[i][j];
2067 5148           cmask[i][j] = 1;
2068             }
2069             }
2070             }
2071             }
2072             else
2073 0 0         { for (i = 0; i < nrows; i++)
2074 0 0         { for (j = 0; j < nclusters; j++)
2075 0           { cdata[i][j] = 0.;
2076 0           cmask[i][j] = 0;
2077             }
2078             }
2079 0 0         for (k = 0; k < ncolumns; k++)
2080 0           { i = clusterid[k];
2081 0 0         for (j = 0; j < nrows; j++)
2082 0 0         { if (mask[j][k] != 0)
2083 0           { cdata[j][i]+=data[j][k];
2084 0           cmask[j][i]++;
2085             }
2086             }
2087             }
2088 0 0         for (i = 0; i < nrows; i++)
2089 0 0         { for (j = 0; j < nclusters; j++)
2090 0 0         { if (cmask[i][j]>0)
2091 0           { cdata[i][j] /= cmask[i][j];
2092 0           cmask[i][j] = 1;
2093             }
2094             }
2095             }
2096             }
2097 555           }
2098              
2099             /* ********************************************************************* */
2100              
2101             static void
2102 0           getclustermedians(int nclusters, int nrows, int ncolumns,
2103             double** data, int** mask, int clusterid[], double** cdata, int** cmask,
2104             int transpose, double cache[])
2105             /*
2106             Purpose
2107             =======
2108              
2109             The getclustermedians routine calculates the cluster centroids, given to which
2110             cluster each element belongs. The centroid is defined as the median over all
2111             elements for each dimension.
2112              
2113             Arguments
2114             =========
2115              
2116             nclusters (input) int
2117             The number of clusters.
2118              
2119             nrows (input) int
2120             The number of rows in the gene expression data matrix, equal to the number of
2121             genes.
2122              
2123             ncolumns (input) int
2124             The number of columns in the gene expression data matrix, equal to the number of
2125             microarrays.
2126              
2127             data (input) double[nrows][ncolumns]
2128             The array containing the gene expression data.
2129              
2130             mask (input) int[nrows][ncolumns]
2131             This array shows which data values are missing. If mask[i][j]==0, then
2132             data[i][j] is missing.
2133              
2134             clusterid (output) int[nrows] if transpose==0
2135             int[ncolumns] if transpose==1
2136             The cluster number to which each element belongs. If transpose==0, then the
2137             dimension of clusterid is equal to nrows (the number of genes). Otherwise, it
2138             is equal to ncolumns (the number of microarrays).
2139              
2140             cdata (output) double[nclusters][ncolumns] if transpose==0
2141             double[nrows][nclusters] if transpose==1
2142             On exit of getclustermedians, this array contains the cluster centroids.
2143              
2144             cmask (output) int[nclusters][ncolumns] if transpose==0
2145             int[nrows][nclusters] if transpose==1
2146             This array shows which data values of are missing for each centroid. If
2147             cmask[i][j]==0, then cdata[i][j] is missing. A data value is missing for
2148             a centroid if all corresponding data values of the cluster members are missing.
2149              
2150             transpose (input) int
2151             If transpose==0, clusters of rows (genes) are specified. Otherwise, clusters of
2152             columns (microarrays) are specified.
2153              
2154             cache (input) double[nrows] if transpose==0
2155             double[ncolumns] if transpose==1
2156             This array should be allocated before calling getclustermedians; its contents
2157             on input is not relevant. This array is used as a temporary storage space when
2158             calculating the medians.
2159              
2160             ========================================================================
2161             */
2162             { int i, j, k;
2163 0 0         if (transpose==0)
2164 0 0         { for (i = 0; i < nclusters; i++)
2165 0 0         { for (j = 0; j < ncolumns; j++)
2166 0           { int count = 0;
2167 0 0         for (k = 0; k < nrows; k++)
2168 0 0         { if (i==clusterid[k] && mask[k][j])
    0          
2169 0           { cache[count] = data[k][j];
2170 0           count++;
2171             }
2172             }
2173 0 0         if (count>0)
2174 0           { cdata[i][j] = median(count,cache);
2175 0           cmask[i][j] = 1;
2176             }
2177             else
2178 0           { cdata[i][j] = 0.;
2179 0           cmask[i][j] = 0;
2180             }
2181             }
2182             }
2183             }
2184             else
2185 0 0         { for (i = 0; i < nclusters; i++)
2186 0 0         { for (j = 0; j < nrows; j++)
2187 0           { int count = 0;
2188 0 0         for (k = 0; k < ncolumns; k++)
2189 0 0         { if (i==clusterid[k] && mask[j][k])
    0          
2190 0           { cache[count] = data[j][k];
2191 0           count++;
2192             }
2193             }
2194 0 0         if (count>0)
2195 0           { cdata[j][i] = median(count,cache);
2196 0           cmask[j][i] = 1;
2197             }
2198             else
2199 0           { cdata[j][i] = 0.;
2200 0           cmask[j][i] = 0;
2201             }
2202             }
2203             }
2204             }
2205 0           }
2206              
2207             /* ********************************************************************* */
2208              
2209 0           int getclustercentroids(int nclusters, int nrows, int ncolumns,
2210             double** data, int** mask, int clusterid[], double** cdata, int** cmask,
2211             int transpose, char method)
2212             /*
2213             Purpose
2214             =======
2215              
2216             The getclustercentroids routine calculates the cluster centroids, given to
2217             which cluster each element belongs. Depending on the argument method, the
2218             centroid is defined as either the mean or the median for each dimension over
2219             all elements belonging to a cluster.
2220              
2221             Arguments
2222             =========
2223              
2224             nclusters (input) int
2225             The number of clusters.
2226              
2227             nrows (input) int
2228             The number of rows in the gene expression data matrix, equal to the number of
2229             genes.
2230              
2231             ncolumns (input) int
2232             The number of columns in the gene expression data matrix, equal to the number of
2233             microarrays.
2234              
2235             data (input) double[nrows][ncolumns]
2236             The array containing the gene expression data.
2237              
2238             mask (input) int[nrows][ncolumns]
2239             This array shows which data values are missing. If mask[i][j]==0, then
2240             data[i][j] is missing.
2241              
2242             clusterid (output) int[nrows] if transpose==0
2243             int[ncolumns] if transpose==1
2244             The cluster number to which each element belongs. If transpose==0, then the
2245             dimension of clusterid is equal to nrows (the number of genes). Otherwise, it
2246             is equal to ncolumns (the number of microarrays).
2247              
2248             cdata (output) double[nclusters][ncolumns] if transpose==0
2249             double[nrows][nclusters] if transpose==1
2250             On exit of getclustercentroids, this array contains the cluster centroids.
2251              
2252             cmask (output) int[nclusters][ncolumns] if transpose==0
2253             int[nrows][nclusters] if transpose==1
2254             This array shows which data values of are missing for each centroid. If
2255             cmask[i][j]==0, then cdata[i][j] is missing. A data value is missing for
2256             a centroid if all corresponding data values of the cluster members are missing.
2257              
2258             transpose (input) int
2259             If transpose==0, clusters of rows (genes) are specified. Otherwise, clusters of
2260             columns (microarrays) are specified.
2261              
2262             method (input) char
2263             For method=='a', the centroid is defined as the mean over all elements
2264             belonging to a cluster for each dimension.
2265             For method=='m', the centroid is defined as the median over all elements
2266             belonging to a cluster for each dimension.
2267              
2268             Return value
2269             ============
2270              
2271             The function returns an integer to indicate success or failure. If a
2272             memory error occurs, or if method is not 'm' or 'a', getclustercentroids
2273             returns 0. If successful, getclustercentroids returns 1.
2274             ========================================================================
2275             */
2276 0           { switch(method)
2277             { case 'm':
2278 0 0         { const int nelements = (transpose==0) ? nrows : ncolumns;
2279 0           double* cache = malloc(nelements*sizeof(double));
2280 0 0         if (!cache) return 0;
2281 0           getclustermedians(nclusters, nrows, ncolumns, data, mask, clusterid,
2282             cdata, cmask, transpose, cache);
2283 0           free(cache);
2284 0           return 1;
2285             }
2286             case 'a':
2287 0           { getclustermeans(nclusters, nrows, ncolumns, data, mask, clusterid,
2288             cdata, cmask, transpose);
2289 0           return 1;
2290             }
2291             }
2292 0           return 0;
2293             }
2294              
2295             /* ********************************************************************* */
2296              
2297 343           void getclustermedoids(int nclusters, int nelements, double** distance,
2298             int clusterid[], int centroids[], double errors[])
2299             /*
2300             Purpose
2301             =======
2302              
2303             The getclustermedoids routine calculates the cluster centroids, given to which
2304             cluster each element belongs. The centroid is defined as the element with the
2305             smallest sum of distances to the other elements.
2306              
2307             Arguments
2308             =========
2309              
2310             nclusters (input) int
2311             The number of clusters.
2312              
2313             nelements (input) int
2314             The total number of elements.
2315              
2316             distmatrix (input) double array, ragged
2317             (number of rows is nelements, number of columns is equal to the row number)
2318             The distance matrix. To save space, the distance matrix is given in the
2319             form of a ragged array. The distance matrix is symmetric and has zeros
2320             on the diagonal. See distancematrix for a description of the content.
2321              
2322             clusterid (output) int[nelements]
2323             The cluster number to which each element belongs.
2324              
2325             centroid (output) int[nclusters]
2326             The index of the element that functions as the centroid for each cluster.
2327              
2328             errors (output) double[nclusters]
2329             The within-cluster sum of distances between the items and the cluster
2330             centroid.
2331              
2332             ========================================================================
2333             */
2334             { int i, j, k;
2335 1715 100         for (j = 0; j < nclusters; j++) errors[j] = DBL_MAX;
2336 4459 100         for (i = 0; i < nelements; i++)
2337 4116           { double d = 0.0;
2338 4116           j = clusterid[i];
2339 46379 100         for (k = 0; k < nelements; k++)
2340 43863 100         { if (i==k || clusterid[k]!=j) continue;
    100          
2341 9493 100         d += (i < k ? distance[k][i] : distance[i][k]);
2342 9493 100         if (d > errors[j]) break;
2343             }
2344 4116 100         if (d < errors[j])
2345 2195           { errors[j] = d;
2346 2195           centroids[j] = i;
2347             }
2348             }
2349 343           }
2350              
2351             /* ********************************************************************* */
2352              
2353             static int
2354 3           kmeans(int nclusters, int nrows, int ncolumns, double** data, int** mask,
2355             double weight[], int transpose, int npass, char dist,
2356             double** cdata, int** cmask, int clusterid[], double* error,
2357             int tclusterid[], int counts[], int mapping[])
2358             { int i, j, k;
2359 3 50         const int nelements = (transpose==0) ? nrows : ncolumns;
2360 3 50         const int ndata = (transpose==0) ? ncolumns : nrows;
2361 3           int ifound = 1;
2362 3           int ipass = 0;
2363             /* Set the metric function as indicated by dist */
2364 3           double (*metric)
2365             (int, double**, double**, int**, int**, const double[], int, int, int) =
2366 3           setmetric(dist);
2367              
2368             /* We save the clustering solution periodically and check if it reappears */
2369 3           int* saved = malloc(nelements*sizeof(int));
2370 3 50         if (saved==NULL) return -1;
2371              
2372 3           *error = DBL_MAX;
2373              
2374             do
2375 201           { double total = DBL_MAX;
2376 201           int counter = 0;
2377 201           int period = 10;
2378              
2379             /* Perform the EM algorithm. First, randomly assign elements to clusters. */
2380 201 100         if (npass!=0) randomassign (nclusters, nelements, tclusterid);
2381              
2382 804 100         for (i = 0; i < nclusters; i++) counts[i] = 0;
2383 1914 100         for (i = 0; i < nelements; i++) counts[tclusterid[i]]++;
2384              
2385             /* Start the loop */
2386             while(1)
2387 555           { double previous = total;
2388 555           total = 0.0;
2389              
2390 555 100         if (counter % period == 0) /* Save the current cluster assignments */
2391 1914 100         { for (i = 0; i < nelements; i++) saved[i] = tclusterid[i];
2392 201 50         if (period < INT_MAX / 2) period *= 2;
2393             }
2394 555           counter++;
2395              
2396             /* Find the center */
2397 555           getclustermeans(nclusters, nrows, ncolumns, data, mask, tclusterid,
2398             cdata, cmask, transpose);
2399              
2400 5952 100         for (i = 0; i < nelements; i++)
2401             /* Calculate the distances */
2402             { double distance;
2403 5397           k = tclusterid[i];
2404 5397 100         if (counts[k]==1) continue;
2405             /* No reassignment if that would lead to an empty cluster */
2406             /* Treat the present cluster as a special case */
2407 4790           distance = metric(ndata,data,cdata,mask,cmask,weight,i,k,transpose);
2408 19160 100         for (j = 0; j < nclusters; j++)
2409             { double tdistance;
2410 14370 100         if (j==k) continue;
2411 9580           tdistance = metric(ndata,data,cdata,mask,cmask,weight,i,j,transpose);
2412 9580 100         if (tdistance < distance)
2413 932           { distance = tdistance;
2414 932           counts[tclusterid[i]]--;
2415 932           tclusterid[i] = j;
2416 932           counts[j]++;
2417             }
2418             }
2419 4790           total += distance;
2420             }
2421 555 100         if (total>=previous) break;
2422             /* total>=previous is FALSE on some machines even if total and previous
2423             * are bitwise identical. */
2424 938 100         for (i = 0; i < nelements; i++)
2425 889 100         if (saved[i]!=tclusterid[i]) break;
2426 403 100         if (i==nelements)
2427 49           break; /* Identical solution found; break out of this loop */
2428 354           }
2429              
2430 201 100         if (npass<=1)
2431 1           { *error = total;
2432 1           break;
2433             }
2434              
2435 800 100         for (i = 0; i < nclusters; i++) mapping[i] = -1;
2436 1662 100         for (i = 0; i < nelements; i++)
2437 1544           { j = tclusterid[i];
2438 1544           k = clusterid[i];
2439 1544 100         if (mapping[k] == -1) mapping[k] = j;
2440 999 100         else if (mapping[k] != j)
2441 82 100         { if (total < *error)
2442 2           { ifound = 1;
2443 2           *error = total;
2444 19 100         for (j = 0; j < nelements; j++) clusterid[j] = tclusterid[j];
2445             }
2446 82           break;
2447             }
2448             }
2449 200 100         if (i==nelements) ifound++; /* break statement not encountered */
2450 200 100         } while (++ipass < npass);
2451              
2452 3           free(saved);
2453 3           return ifound;
2454             }
2455              
2456             /* ---------------------------------------------------------------------- */
2457              
2458             static int
2459 0           kmedians(int nclusters, int nrows, int ncolumns, double** data, int** mask,
2460             double weight[], int transpose, int npass, char dist,
2461             double** cdata, int** cmask, int clusterid[], double* error,
2462             int tclusterid[], int counts[], int mapping[], double cache[])
2463             { int i, j, k;
2464 0 0         const int nelements = (transpose==0) ? nrows : ncolumns;
2465 0 0         const int ndata = (transpose==0) ? ncolumns : nrows;
2466 0           int ifound = 1;
2467 0           int ipass = 0;
2468             /* Set the metric function as indicated by dist */
2469 0           double (*metric)
2470             (int, double**, double**, int**, int**, const double[], int, int, int) =
2471 0           setmetric(dist);
2472              
2473             /* We save the clustering solution periodically and check if it reappears */
2474 0           int* saved = malloc(nelements*sizeof(int));
2475 0 0         if (saved==NULL) return -1;
2476              
2477 0           *error = DBL_MAX;
2478              
2479             do
2480 0           { double total = DBL_MAX;
2481 0           int counter = 0;
2482 0           int period = 10;
2483              
2484             /* Perform the EM algorithm. First, randomly assign elements to clusters. */
2485 0 0         if (npass!=0) randomassign (nclusters, nelements, tclusterid);
2486              
2487 0 0         for (i = 0; i < nclusters; i++) counts[i]=0;
2488 0 0         for (i = 0; i < nelements; i++) counts[tclusterid[i]]++;
2489              
2490             /* Start the loop */
2491             while(1)
2492 0           { double previous = total;
2493 0           total = 0.0;
2494              
2495 0 0         if (counter % period == 0) /* Save the current cluster assignments */
2496 0 0         { for (i = 0; i < nelements; i++) saved[i] = tclusterid[i];
2497 0 0         if (period < INT_MAX / 2) period *= 2;
2498             }
2499 0           counter++;
2500              
2501             /* Find the center */
2502 0           getclustermedians(nclusters, nrows, ncolumns, data, mask, tclusterid,
2503             cdata, cmask, transpose, cache);
2504              
2505 0 0         for (i = 0; i < nelements; i++)
2506             /* Calculate the distances */
2507             { double distance;
2508 0           k = tclusterid[i];
2509 0 0         if (counts[k]==1) continue;
2510             /* No reassignment if that would lead to an empty cluster */
2511             /* Treat the present cluster as a special case */
2512 0           distance = metric(ndata,data,cdata,mask,cmask,weight,i,k,transpose);
2513 0 0         for (j = 0; j < nclusters; j++)
2514             { double tdistance;
2515 0 0         if (j==k) continue;
2516 0           tdistance = metric(ndata,data,cdata,mask,cmask,weight,i,j,transpose);
2517 0 0         if (tdistance < distance)
2518 0           { distance = tdistance;
2519 0           counts[tclusterid[i]]--;
2520 0           tclusterid[i] = j;
2521 0           counts[j]++;
2522             }
2523             }
2524 0           total += distance;
2525             }
2526 0 0         if (total>=previous) break;
2527             /* total>=previous is FALSE on some machines even if total and previous
2528             * are bitwise identical. */
2529 0 0         for (i = 0; i < nelements; i++)
2530 0 0         if (saved[i]!=tclusterid[i]) break;
2531 0 0         if (i==nelements)
2532 0           break; /* Identical solution found; break out of this loop */
2533 0           }
2534              
2535 0 0         if (npass<=1)
2536 0           { *error = total;
2537 0           break;
2538             }
2539              
2540 0 0         for (i = 0; i < nclusters; i++) mapping[i] = -1;
2541 0 0         for (i = 0; i < nelements; i++)
2542 0           { j = tclusterid[i];
2543 0           k = clusterid[i];
2544 0 0         if (mapping[k] == -1) mapping[k] = j;
2545 0 0         else if (mapping[k] != j)
2546 0 0         { if (total < *error)
2547 0           { ifound = 1;
2548 0           *error = total;
2549 0 0         for (j = 0; j < nelements; j++) clusterid[j] = tclusterid[j];
2550             }
2551 0           break;
2552             }
2553             }
2554 0 0         if (i==nelements) ifound++; /* break statement not encountered */
2555 0 0         } while (++ipass < npass);
2556              
2557 0           free(saved);
2558 0           return ifound;
2559             }
2560              
2561             /* ********************************************************************* */
2562              
2563 3           void kcluster (int nclusters, int nrows, int ncolumns,
2564             double** data, int** mask, double weight[], int transpose,
2565             int npass, char method, char dist,
2566             int clusterid[], double* error, int* ifound)
2567             /*
2568             Purpose
2569             =======
2570              
2571             The kcluster routine performs k-means or k-median clustering on a given set of
2572             elements, using the specified distance measure. The number of clusters is given
2573             by the user. Multiple passes are being made to find the optimal clustering
2574             solution, each time starting from a different initial clustering.
2575              
2576              
2577             Arguments
2578             =========
2579              
2580             nclusters (input) int
2581             The number of clusters to be found.
2582              
2583             data (input) double[nrows][ncolumns]
2584             The array containing the data of the elements to be clustered (i.e., the gene
2585             expression data).
2586              
2587             mask (input) int[nrows][ncolumns]
2588             This array shows which data values are missing. If
2589             mask[i][j] == 0, then data[i][j] is missing.
2590              
2591             nrows (input) int
2592             The number of rows in the data matrix, equal to the number of genes.
2593              
2594             ncolumns (input) int
2595             The number of columns in the data matrix, equal to the number of microarrays.
2596              
2597             weight (input) double[n]
2598             The weights that are used to calculate the distance.
2599              
2600             transpose (input) int
2601             If transpose==0, the rows of the matrix are clustered. Otherwise, columns
2602             of the matrix are clustered.
2603              
2604             npass (input) int
2605             The number of times clustering is performed. Clustering is performed npass
2606             times, each time starting from a different (random) initial assignment of
2607             genes to clusters. The clustering solution with the lowest within-cluster sum
2608             of distances is chosen.
2609             If npass==0, then the clustering algorithm will be run once, where the initial
2610             assignment of elements to clusters is taken from the clusterid array.
2611              
2612             method (input) char
2613             Defines whether the arithmetic mean (method=='a') or the median
2614             (method=='m') is used to calculate the cluster center.
2615              
2616             dist (input) char
2617             Defines which distance measure is used, as given by the table:
2618             dist=='e': Euclidean distance
2619             dist=='b': City-block distance
2620             dist=='c': correlation
2621             dist=='a': absolute value of the correlation
2622             dist=='u': uncentered correlation
2623             dist=='x': absolute uncentered correlation
2624             dist=='s': Spearman's rank correlation
2625             dist=='k': Kendall's tau
2626             For other values of dist, the default (Euclidean distance) is used.
2627              
2628             clusterid (output; input) int[nrows] if transpose==0
2629             int[ncolumns] if transpose==1
2630             The cluster number to which a gene or microarray was assigned. If npass==0,
2631             then on input clusterid contains the initial clustering assignment from which
2632             the clustering algorithm starts. On output, it contains the clustering solution
2633             that was found.
2634              
2635             error (output) double*
2636             The sum of distances to the cluster center of each item in the optimal k-means
2637             clustering solution that was found.
2638              
2639             ifound (output) int*
2640             The number of times the optimal clustering solution was
2641             found. The value of ifound is at least 1; its maximum value is npass. If the
2642             number of clusters is larger than the number of elements being clustered,
2643             *ifound is set to 0 as an error code. If a memory allocation error occurs,
2644             *ifound is set to -1.
2645              
2646             ========================================================================
2647             */
2648 3 50         { const int nelements = (transpose==0) ? nrows : ncolumns;
2649 3 50         const int ndata = (transpose==0) ? ncolumns : nrows;
2650              
2651             int i;
2652             int ok;
2653             int* tclusterid;
2654 3           int* mapping = NULL;
2655             double** cdata;
2656             int** cmask;
2657             int* counts;
2658              
2659 3 50         if (nelements < nclusters)
2660 0           { *ifound = 0;
2661 0           return;
2662             }
2663             /* More clusters asked for than elements available */
2664              
2665 3           *ifound = -1;
2666              
2667             /* This will contain the number of elements in each cluster, which is
2668             * needed to check for empty clusters. */
2669 3           counts = malloc(nclusters*sizeof(int));
2670 3 50         if(!counts) return;
2671              
2672             /* Find out if the user specified an initial clustering */
2673 3 100         if (npass<=1) tclusterid = clusterid;
2674             else
2675 2           { tclusterid = malloc(nelements*sizeof(int));
2676 2 50         if (!tclusterid)
2677 0           { free(counts);
2678 0           return;
2679             }
2680 2           mapping = malloc(nclusters*sizeof(int));
2681 2 50         if (!mapping)
2682 0           { free(counts);
2683 0           free(tclusterid);
2684 0           return;
2685             }
2686 19 100         for (i = 0; i < nelements; i++) clusterid[i] = 0;
2687             }
2688              
2689             /* Allocate space to store the centroid data */
2690 3 50         if (transpose==0) ok = makedatamask(nclusters, ndata, &cdata, &cmask);
2691 0           else ok = makedatamask(ndata, nclusters, &cdata, &cmask);
2692 3 50         if(!ok)
2693 0           { free(counts);
2694 0 0         if(npass>1)
2695 0           { free(tclusterid);
2696 0           free(mapping);
2697 0           return;
2698             }
2699             }
2700              
2701 3 50         if (method=='m')
2702 0           { double* cache = malloc(nelements*sizeof(double));
2703 0 0         if(cache)
2704 0           { *ifound = kmedians(nclusters, nrows, ncolumns, data, mask, weight,
2705             transpose, npass, dist, cdata, cmask, clusterid, error,
2706             tclusterid, counts, mapping, cache);
2707 0           free(cache);
2708             }
2709             }
2710             else
2711 3           *ifound = kmeans(nclusters, nrows, ncolumns, data, mask, weight,
2712             transpose, npass, dist, cdata, cmask, clusterid, error,
2713             tclusterid, counts, mapping);
2714              
2715             /* Deallocate temporarily used space */
2716 3 100         if (npass > 1)
2717 2           { free(mapping);
2718 2           free(tclusterid);
2719             }
2720              
2721 3 50         if (transpose==0) freedatamask(nclusters, cdata, cmask);
2722 0           else freedatamask(ndata, cdata, cmask);
2723              
2724 3           free(counts);
2725             }
2726              
2727             /* *********************************************************************** */
2728              
2729 2           void kmedoids (int nclusters, int nelements, double** distmatrix,
2730             int npass, int clusterid[], double* error, int* ifound)
2731             /*
2732             Purpose
2733             =======
2734              
2735             The kmedoids routine performs k-medoids clustering on a given set of elements,
2736             using the distance matrix and the number of clusters passed by the user.
2737             Multiple passes are being made to find the optimal clustering solution, each
2738             time starting from a different initial clustering.
2739              
2740              
2741             Arguments
2742             =========
2743              
2744             nclusters (input) int
2745             The number of clusters to be found.
2746              
2747             nelements (input) int
2748             The number of elements to be clustered.
2749              
2750             distmatrix (input) double array, ragged
2751             (number of rows is nelements, number of columns is equal to the row number)
2752             The distance matrix. To save space, the distance matrix is given in the
2753             form of a ragged array. The distance matrix is symmetric and has zeros
2754             on the diagonal. See distancematrix for a description of the content.
2755              
2756             npass (input) int
2757             The number of times clustering is performed. Clustering is performed npass
2758             times, each time starting from a different (random) initial assignment of genes
2759             to clusters. The clustering solution with the lowest within-cluster sum of
2760             distances is chosen.
2761             If npass==0, then the clustering algorithm will be run once, where the initial
2762             assignment of elements to clusters is taken from the clusterid array.
2763              
2764             clusterid (output; input) int[nelements]
2765             On input, if npass==0, then clusterid contains the initial clustering assignment
2766             from which the clustering algorithm starts; all numbers in clusterid should be
2767             between zero and nelements-1 inclusive. If npass!=0, clusterid is ignored on
2768             input.
2769             On output, clusterid contains the clustering solution that was found: clusterid
2770             contains the number of the cluster to which each item was assigned. On output,
2771             the number of a cluster is defined as the item number of the centroid of the
2772             cluster.
2773              
2774             error (output) double
2775             The sum of distances to the cluster center of each item in the optimal k-medoids
2776             clustering solution that was found.
2777              
2778             ifound (output) int
2779             If kmedoids is successful: the number of times the optimal clustering solution
2780             was found. The value of ifound is at least 1; its maximum value is npass.
2781             If the user requested more clusters than elements available, ifound is set
2782             to 0. If kmedoids fails due to a memory allocation error, ifound is set to -1.
2783              
2784             ========================================================================
2785             */
2786             { int i, j, icluster;
2787             int* tclusterid;
2788             int* saved;
2789             int* centroids;
2790             double* errors;
2791 2           int ipass = 0;
2792              
2793 2 50         if (nelements < nclusters)
2794 0           { *ifound = 0;
2795 0           return;
2796             } /* More clusters asked for than elements available */
2797              
2798 2           *ifound = -1;
2799              
2800             /* We save the clustering solution periodically and check if it reappears */
2801 2           saved = malloc(nelements*sizeof(int));
2802 2 50         if (saved==NULL) return;
2803              
2804 2           centroids = malloc(nclusters*sizeof(int));
2805 2 50         if(!centroids)
2806 0           { free(saved);
2807 0           return;
2808             }
2809              
2810 2           errors = malloc(nclusters*sizeof(double));
2811 2 50         if(!errors)
2812 0           { free(saved);
2813 0           free(centroids);
2814 0           return;
2815             }
2816              
2817             /* Find out if the user specified an initial clustering */
2818 2 100         if (npass<=1) tclusterid = clusterid;
2819             else
2820 1           { tclusterid = malloc(nelements*sizeof(int));
2821 1 50         if(!tclusterid)
2822 0           { free(saved);
2823 0           free(centroids);
2824 0           free(errors);
2825 0           return;
2826             }
2827 13 100         for (i = 0; i < nelements; i++) clusterid[i] = -1;
2828             }
2829              
2830 2           *error = DBL_MAX;
2831             do /* Start the loop */
2832 101           { double total = DBL_MAX;
2833 101           int counter = 0;
2834 101           int period = 10;
2835              
2836 101 100         if (npass!=0) randomassign(nclusters, nelements, tclusterid);
2837             while(1)
2838 343           { double previous = total;
2839 343           total = 0.0;
2840              
2841 343 100         if (counter % period == 0) /* Save the current cluster assignments */
2842 1313 100         { for (i = 0; i < nelements; i++) saved[i] = tclusterid[i];
2843 101 50         if (period < INT_MAX / 2) period *= 2;
2844             }
2845 343           counter++;
2846              
2847             /* Find the center */
2848 343           getclustermedoids(nclusters, nelements, distmatrix, tclusterid,
2849             centroids, errors);
2850              
2851 4459 100         for (i = 0; i < nelements; i++)
2852             /* Find the closest cluster */
2853 4116           { double distance = DBL_MAX;
2854 17150 100         for (icluster = 0; icluster < nclusters; icluster++)
2855             { double tdistance;
2856 14406           j = centroids[icluster];
2857 14406 100         if (i==j)
2858 1372           { distance = 0.0;
2859 1372           tclusterid[i] = icluster;
2860 1372           break;
2861             }
2862 13034 100         tdistance = (i > j) ? distmatrix[i][j] : distmatrix[j][i];
2863 13034 100         if (tdistance < distance)
2864 7078           { distance = tdistance;
2865 7078           tclusterid[i] = icluster;
2866             }
2867             }
2868 4116           total += distance;
2869             }
2870 343 100         if (total>=previous) break;
2871             /* total>=previous is FALSE on some machines even if total and previous
2872             * are bitwise identical. */
2873 558 50         for (i = 0; i < nelements; i++)
2874 558 100         if (saved[i]!=tclusterid[i]) break;
2875 242 50         if (i==nelements)
2876 0           break; /* Identical solution found; break out of this loop */
2877 242           }
2878              
2879 101 100         if (npass <= 1) {
2880 1           *ifound = 1;
2881 1           *error = total;
2882             /* Replace by the centroid in each cluster. */
2883 13 100         for (j = 0; j < nelements; j++) {
2884 12           clusterid[j] = centroids[tclusterid[j]];
2885             }
2886 1           break;
2887             }
2888              
2889 453 100         for (i = 0; i < nelements; i++)
2890 438 100         { if (clusterid[i]!=centroids[tclusterid[i]])
2891 85 100         { if (total < *error)
2892 1           { *ifound = 1;
2893 1           *error = total;
2894             /* Replace by the centroid in each cluster. */
2895 13 100         for (j = 0; j < nelements; j++) {
2896 12           clusterid[j] = centroids[tclusterid[j]];
2897             }
2898             }
2899 85           break;
2900             }
2901             }
2902 100 100         if (i==nelements) (*ifound)++; /* break statement not encountered */
2903 100 100         } while (++ipass < npass);
2904              
2905             /* Deallocate temporarily used space */
2906 2 100         if (npass > 1) free(tclusterid);
2907              
2908 2           free(saved);
2909 2           free(centroids);
2910 2           free(errors);
2911              
2912 2           return;
2913             }
2914              
2915             /* ******************************************************************** */
2916              
2917 7           double** distancematrix (int nrows, int ncolumns, double** data,
2918             int** mask, double weights[], char dist, int transpose)
2919             /*
2920             Purpose
2921             =======
2922              
2923             The distancematrix routine calculates the distance matrix between genes or
2924             microarrays using their measured gene expression data. Several distance measures
2925             can be used. The routine returns a pointer to a ragged array containing the
2926             distances between the genes. As the distance matrix is symmetric, with zeros on
2927             the diagonal, only the lower triangular half of the distance matrix is saved.
2928             The distancematrix routine allocates space for the distance matrix. If the
2929             parameter transpose is set to a nonzero value, the distances between the columns
2930             (microarrays) are calculated, otherwise distances between the rows (genes) are
2931             calculated.
2932             If sufficient space in memory cannot be allocated to store the distance matrix,
2933             the routine returns a NULL pointer, and all memory allocated so far for the
2934             distance matrix is freed.
2935              
2936              
2937             Arguments
2938             =========
2939              
2940             nrows (input) int
2941             The number of rows in the gene expression data matrix (i.e., the number of
2942             genes)
2943              
2944             ncolumns (input) int
2945             The number of columns in the gene expression data matrix (i.e., the number of
2946             microarrays)
2947              
2948             data (input) double[nrows][ncolumns]
2949             The array containing the gene expression data.
2950              
2951             mask (input) int[nrows][ncolumns]
2952             This array shows which data values are missing. If mask[i][j]==0, then
2953             data[i][j] is missing.
2954              
2955             weight (input) double[n]
2956             The weights that are used to calculate the distance. The length of this vector
2957             is equal to the number of columns if the distances between genes are calculated,
2958             or the number of rows if the distances between microarrays are calculated.
2959              
2960             dist (input) char
2961             Defines which distance measure is used, as given by the table:
2962             dist=='e': Euclidean distance
2963             dist=='b': City-block distance
2964             dist=='c': correlation
2965             dist=='a': absolute value of the correlation
2966             dist=='u': uncentered correlation
2967             dist=='x': absolute uncentered correlation
2968             dist=='s': Spearman's rank correlation
2969             dist=='k': Kendall's tau
2970             For other values of dist, the default (Euclidean distance) is used.
2971              
2972             transpose (input) int
2973             If transpose is equal to zero, the distances between the rows is
2974             calculated. Otherwise, the distances between the columns is calculated.
2975             The former is needed when genes are being clustered; the latter is used
2976             when microarrays are being clustered.
2977              
2978             ========================================================================
2979             */
2980             { /* First determine the size of the distance matrix */
2981 7 50         const int n = (transpose==0) ? nrows : ncolumns;
2982 7 50         const int ndata = (transpose==0) ? ncolumns : nrows;
2983             int i,j;
2984             double** matrix;
2985              
2986             /* Set the metric function as indicated by dist */
2987 7           double (*metric)
2988             (int, double**, double**, int**, int**, const double[], int, int, int) =
2989 7           setmetric(dist);
2990              
2991 7 50         if (n < 2) return NULL;
2992              
2993             /* Set up the ragged array */
2994 7           matrix = malloc(n*sizeof(double*));
2995 7 50         if(matrix==NULL) return NULL; /* Not enough memory available */
2996 7           matrix[0] = NULL;
2997             /* The zeroth row has zero columns. We allocate it anyway for convenience.*/
2998 55 100         for (i = 1; i < n; i++)
2999 48           { matrix[i] = malloc(i*sizeof(double));
3000 48 50         if (matrix[i]==NULL) break; /* Not enough memory available */
3001             }
3002 7 50         if (i < n) /* break condition encountered */
3003 0           { j = i;
3004 0 0         for (i = 1; i < j; i++) free(matrix[i]);
3005 0           return NULL;
3006             }
3007              
3008             /* Calculate the distances and save them in the ragged array */
3009 55 100         for (i = 1; i < n; i++)
3010 306 100         for (j = 0; j < i; j++)
3011 258           matrix[i][j]=metric(ndata,data,data,mask,mask,weights,i,j,transpose);
3012              
3013 7           return matrix;
3014             }
3015              
3016             /* ******************************************************************** */
3017              
3018 0           double* calculate_weights(int nrows, int ncolumns, double** data, int** mask,
3019             double weights[], int transpose, char dist, double cutoff, double exponent)
3020              
3021             /*
3022             Purpose
3023             =======
3024              
3025             This function calculates the weights using the weighting scheme proposed by
3026             Michael Eisen:
3027             w[i] = 1.0 / sum_{j where d[i][j]
3028             where the cutoff and the exponent are specified by the user.
3029              
3030              
3031             Arguments
3032             =========
3033              
3034             nrows (input) int
3035             The number of rows in the gene expression data matrix, equal to the number of
3036             genes.
3037              
3038             ncolumns (input) int
3039             The number of columns in the gene expression data matrix, equal to the number of
3040             microarrays.
3041              
3042             data (input) double[nrows][ncolumns]
3043             The array containing the gene expression data.
3044              
3045             mask (input) int[nrows][ncolumns]
3046             This array shows which data values are missing. If mask[i][j]==0, then
3047             data[i][j] is missing.
3048              
3049             weight (input) int[ncolumns] if transpose==0,
3050             int[nrows] if transpose==1
3051             The weights that are used to calculate the distance. The length of this vector
3052             is ncolumns if gene weights are being clustered, and nrows if microarrays
3053             weights are being clustered.
3054              
3055             transpose (input) int
3056             If transpose==0, the weights of the rows of the data matrix are calculated.
3057             Otherwise, the weights of the columns of the data matrix are calculated.
3058              
3059             dist (input) char
3060             Defines which distance measure is used, as given by the table:
3061             dist=='e': Euclidean distance
3062             dist=='b': City-block distance
3063             dist=='c': correlation
3064             dist=='a': absolute value of the correlation
3065             dist=='u': uncentered correlation
3066             dist=='x': absolute uncentered correlation
3067             dist=='s': Spearman's rank correlation
3068             dist=='k': Kendall's tau
3069             For other values of dist, the default (Euclidean distance) is used.
3070              
3071             cutoff (input) double
3072             The cutoff to be used to calculate the weights.
3073              
3074             exponent (input) double
3075             The exponent to be used to calculate the weights.
3076              
3077              
3078             Return value
3079             ============
3080              
3081             The function returns a pointer to a newly allocated array containing the
3082             calculated weights for the rows (if transpose==0) or columns (if
3083             transpose==1). If not enough memory could be allocated to store the
3084             weights array, the function returns NULL.
3085              
3086             ========================================================================
3087             */
3088             { int i,j;
3089 0 0         const int ndata = (transpose==0) ? ncolumns : nrows;
3090 0 0         const int nelements = (transpose==0) ? nrows : ncolumns;
3091              
3092             /* Set the metric function as indicated by dist */
3093 0           double (*metric)
3094             (int, double**, double**, int**, int**, const double[], int, int, int) =
3095 0           setmetric(dist);
3096              
3097 0           double* result = malloc(nelements*sizeof(double));
3098 0 0         if (!result) return NULL;
3099 0           memset(result, 0, nelements*sizeof(double));
3100              
3101 0 0         for (i = 0; i < nelements; i++)
3102 0           { result[i] += 1.0;
3103 0 0         for (j = 0; j < i; j++)
3104 0           { const double distance = metric(ndata, data, data, mask, mask, weights,
3105             i, j, transpose);
3106 0 0         if (distance < cutoff)
3107 0           { const double dweight = exp(exponent*log(1-distance/cutoff));
3108             /* pow() causes a crash on AIX */
3109 0           result[i] += dweight;
3110 0           result[j] += dweight;
3111             }
3112             }
3113             }
3114 0 0         for (i = 0; i < nelements; i++) result[i] = 1.0/result[i];
3115 0           return result;
3116             }
3117              
3118             /* ******************************************************************** */
3119              
3120 0           void cuttree (int nelements, Node* tree, int nclusters, int clusterid[])
3121              
3122             /*
3123             Purpose
3124             =======
3125              
3126             The cuttree routine takes the output of a hierarchical clustering routine, and
3127             divides the elements in the tree structure into clusters based on the
3128             hierarchical clustering result. The number of clusters is specified by the user.
3129              
3130             Arguments
3131             =========
3132              
3133             nelements (input) int
3134             The number of elements that were clustered.
3135              
3136             tree (input) Node[nelements-1]
3137             The clustering solution. Each node in the array describes one linking event,
3138             with tree[i].left and tree[i].right representing the elements that were joined.
3139             The original elements are numbered 0..nelements-1, nodes are numbered
3140             -1..-(nelements-1).
3141              
3142             nclusters (input) int
3143             The number of clusters to be formed.
3144              
3145             clusterid (output) int[nelements]
3146             The number of the cluster to which each element was assigned. Clusters are
3147             numbered 0..nclusters-1 in the left-to-right order in which they appear in the
3148             hierarchical clustering tree. Space for the clusterid array should be allocated
3149             before calling the cuttree routine. If a memory error occured, all elements in
3150             clusterid are set to -1.
3151              
3152             ========================================================================
3153             */
3154 0           { int i = -nelements+1; /* top node */
3155             int j;
3156 0           int k = -1;
3157 0           int previous = nelements;
3158 0           const int n = nelements-nclusters; /* number of nodes to join */
3159             int* parents;
3160 0 0         if (nclusters==1) {
3161 0 0         for (i = 0; i < nelements; i++) clusterid[i] = 0;
3162 0           return;
3163             }
3164 0           parents = malloc((nelements-1)*sizeof(int));
3165 0 0         if (!parents)
3166 0 0         { for (i = 0; i < nelements; i++) clusterid[i] = -1;
3167 0           return;
3168             }
3169             while (1) {
3170 0 0         if (i >= 0) {
3171 0           clusterid[i] = k;
3172 0           j = i;
3173 0           i = previous;
3174 0           previous = j;
3175             }
3176             else {
3177 0           j = -i-1;
3178 0 0         if (previous == tree[j].left) {
3179 0           previous = i;
3180 0           i = tree[j].right;
3181 0 0         if (j >= n && (i >= 0 || -i-1 < n)) k++;
    0          
    0          
3182             }
3183 0 0         else if (previous == tree[j].right) {
3184 0           previous = i;
3185 0           i = parents[j];
3186 0 0         if (i==nelements) break;
3187             }
3188             else {
3189 0           parents[j] = previous;
3190 0           previous = i;
3191 0           i = tree[j].left;
3192 0 0         if (j >= n && (i >= 0 || -i-1 < n)) k++;
    0          
    0          
3193             }
3194             }
3195 0           }
3196 0           free(parents);
3197             }
3198              
3199             /* ******************************************************************** */
3200              
3201             static
3202 2           Node* pclcluster (int nrows, int ncolumns, double** data, int** mask,
3203             double weight[], double** distmatrix, char dist, int transpose)
3204              
3205             /*
3206              
3207             Purpose
3208             =======
3209              
3210             The pclcluster routine performs clustering using pairwise centroid-linking
3211             on a given set of gene expression data, using the distance metric given by dist.
3212              
3213             Arguments
3214             =========
3215              
3216             nrows (input) int
3217             The number of rows in the gene expression data matrix, equal to the number of
3218             genes.
3219              
3220             ncolumns (input) int
3221             The number of columns in the gene expression data matrix, equal to the number of
3222             microarrays.
3223              
3224             data (input) double[nrows][ncolumns]
3225             The array containing the gene expression data.
3226              
3227             mask (input) int[nrows][ncolumns]
3228             This array shows which data values are missing. If
3229             mask[i][j] == 0, then data[i][j] is missing.
3230              
3231             weight (input) double[ncolumns] if transpose==0;
3232             double[nrows] if transpose==1
3233             The weights that are used to calculate the distance. The length of this vector
3234             is ncolumns if genes are being clustered, and nrows if microarrays are being
3235             clustered.
3236              
3237             transpose (input) int
3238             If transpose==0, the rows of the matrix are clustered. Otherwise, columns
3239             of the matrix are clustered.
3240              
3241             dist (input) char
3242             Defines which distance measure is used, as given by the table:
3243             dist=='e': Euclidean distance
3244             dist=='b': City-block distance
3245             dist=='c': correlation
3246             dist=='a': absolute value of the correlation
3247             dist=='u': uncentered correlation
3248             dist=='x': absolute uncentered correlation
3249             dist=='s': Spearman's rank correlation
3250             dist=='k': Kendall's tau
3251             For other values of dist, the default (Euclidean distance) is used.
3252              
3253             distmatrix (input) double**
3254             The distance matrix. This matrix is precalculated by the calling routine
3255             treecluster. The pclcluster routine modifies the contents of distmatrix, but
3256             does not deallocate it.
3257              
3258             Return value
3259             ============
3260              
3261             A pointer to a newly allocated array of Node structs, describing the
3262             hierarchical clustering solution consisting of nelements-1 nodes. Depending on
3263             whether genes (rows) or microarrays (columns) were clustered, nelements is
3264             equal to nrows or ncolumns. See src/cluster.h for a description of the Node
3265             structure.
3266             If a memory error occurs, pclcluster returns NULL.
3267             ========================================================================
3268             */
3269             { int i, j;
3270 2 50         const int nelements = (transpose==0) ? nrows : ncolumns;
3271             int inode;
3272 2 50         const int ndata = transpose ? nrows : ncolumns;
3273 2           const int nnodes = nelements - 1;
3274              
3275             /* Set the metric function as indicated by dist */
3276 2           double (*metric)
3277             (int, double**, double**, int**, int**, const double[], int, int, int) =
3278 2           setmetric(dist);
3279              
3280             Node* result;
3281             double** newdata;
3282             int** newmask;
3283 2           int* distid = malloc(nelements*sizeof(int));
3284 2 50         if(!distid) return NULL;
3285 2           result = malloc(nnodes*sizeof(Node));
3286 2 50         if(!result)
3287 0           { free(distid);
3288 0           return NULL;
3289             }
3290 2 50         if(!makedatamask(nelements, ndata, &newdata, &newmask))
3291 0           { free(result);
3292 0           free(distid);
3293 0           return NULL;
3294             }
3295              
3296 19 100         for (i = 0; i < nelements; i++) distid[i] = i;
3297             /* To remember which row/column in the distance matrix contains what */
3298              
3299             /* Storage for node data */
3300 2 50         if (transpose)
3301 0 0         { for (i = 0; i < nelements; i++)
3302 0 0         { for (j = 0; j < ndata; j++)
3303 0           { newdata[i][j] = data[j][i];
3304 0           newmask[i][j] = mask[j][i];
3305             }
3306             }
3307 0           data = newdata;
3308 0           mask = newmask;
3309             }
3310             else
3311 19 100         { for (i = 0; i < nelements; i++)
3312 17           { memcpy(newdata[i], data[i], ndata*sizeof(double));
3313 17           memcpy(newmask[i], mask[i], ndata*sizeof(int));
3314             }
3315 2           data = newdata;
3316 2           mask = newmask;
3317             }
3318              
3319 17 100         for (inode = 0; inode < nnodes; inode++)
3320             { /* Find the pair with the shortest distance */
3321 15           int is = 1;
3322 15           int js = 0;
3323 15           result[inode].distance = find_closest_pair(nelements-inode, distmatrix, &is, &js);
3324 15           result[inode].left = distid[js];
3325 15           result[inode].right = distid[is];
3326              
3327             /* Make node js the new node */
3328 54 100         for (i = 0; i < ndata; i++)
3329 39           { data[js][i] = data[js][i]*mask[js][i] + data[is][i]*mask[is][i];
3330 39           mask[js][i] += mask[is][i];
3331 39 50         if (mask[js][i]) data[js][i] /= mask[js][i];
3332             }
3333 15           free(data[is]);
3334 15           free(mask[is]);
3335 15           data[is] = data[nnodes-inode];
3336 15           mask[is] = mask[nnodes-inode];
3337              
3338             /* Fix the distances */
3339 15           distid[is] = distid[nnodes-inode];
3340 70 100         for (i = 0; i < is; i++)
3341 55           distmatrix[is][i] = distmatrix[nnodes-inode][i];
3342 35 100         for (i = is + 1; i < nnodes-inode; i++)
3343 20           distmatrix[i][is] = distmatrix[nnodes-inode][i];
3344              
3345 15           distid[js] = -inode-1;
3346 39 100         for (i = 0; i < js; i++)
3347 24           distmatrix[js][i] = metric(ndata,data,data,mask,mask,weight,js,i,0);
3348 60 100         for (i = js + 1; i < nnodes-inode; i++)
3349 45           distmatrix[i][js] = metric(ndata,data,data,mask,mask,weight,js,i,0);
3350             }
3351              
3352             /* Free temporarily allocated space */
3353 2           free(data[0]);
3354 2           free(mask[0]);
3355 2           free(data);
3356 2           free(mask);
3357 2           free(distid);
3358              
3359 2           return result;
3360             }
3361              
3362             /* ******************************************************************** */
3363              
3364             static
3365 31           int nodecompare(const void* a, const void* b)
3366             /* Helper function for qsort. */
3367 31           { const Node* node1 = (const Node*)a;
3368 31           const Node* node2 = (const Node*)b;
3369 31           const double term1 = node1->distance;
3370 31           const double term2 = node2->distance;
3371 31 100         if (term1 < term2) return -1;
3372 14 50         if (term1 > term2) return +1;
3373 0           return 0;
3374             }
3375              
3376             /* ---------------------------------------------------------------------- */
3377              
3378             static
3379 2           Node* pslcluster (int nrows, int ncolumns, double** data, int** mask,
3380             double weight[], double** distmatrix, char dist, int transpose)
3381              
3382             /*
3383              
3384             Purpose
3385             =======
3386              
3387             The pslcluster routine performs single-linkage hierarchical clustering, using
3388             either the distance matrix directly, if available, or by calculating the
3389             distances from the data array. This implementation is based on the SLINK
3390             algorithm, described in:
3391             Sibson, R. (1973). SLINK: An optimally efficient algorithm for the single-link
3392             cluster method. The Computer Journal, 16(1): 30-34.
3393             The output of this algorithm is identical to conventional single-linkage
3394             hierarchical clustering, but is much more memory-efficient and faster. Hence,
3395             it can be applied to large data sets, for which the conventional single-
3396             linkage algorithm fails due to lack of memory.
3397              
3398              
3399             Arguments
3400             =========
3401              
3402             nrows (input) int
3403             The number of rows in the gene expression data matrix, equal to the number of
3404             genes.
3405              
3406             ncolumns (input) int
3407             The number of columns in the gene expression data matrix, equal to the number of
3408             microarrays.
3409              
3410             data (input) double[nrows][ncolumns]
3411             The array containing the gene expression data.
3412              
3413             mask (input) int[nrows][ncolumns]
3414             This array shows which data values are missing. If
3415             mask[i][j] == 0, then data[i][j] is missing.
3416              
3417             weight (input) double[n]
3418             The weights that are used to calculate the distance. The length of this vector
3419             is ncolumns if genes are being clustered, and nrows if microarrays are being
3420             clustered.
3421              
3422             transpose (input) int
3423             If transpose==0, the rows of the matrix are clustered. Otherwise, columns
3424             of the matrix are clustered.
3425              
3426             dist (input) char
3427             Defines which distance measure is used, as given by the table:
3428             dist=='e': Euclidean distance
3429             dist=='b': City-block distance
3430             dist=='c': correlation
3431             dist=='a': absolute value of the correlation
3432             dist=='u': uncentered correlation
3433             dist=='x': absolute uncentered correlation
3434             dist=='s': Spearman's rank correlation
3435             dist=='k': Kendall's tau
3436             For other values of dist, the default (Euclidean distance) is used.
3437              
3438             distmatrix (input) double**
3439             The distance matrix. If the distance matrix is passed by the calling routine
3440             treecluster, it is used by pslcluster to speed up the clustering calculation.
3441             The pslcluster routine does not modify the contents of distmatrix, and does
3442             not deallocate it. If distmatrix is NULL, the pairwise distances are calculated
3443             by the pslcluster routine from the gene expression data (the data and mask
3444             arrays) and stored in temporary arrays. If distmatrix is passed, the original
3445             gene expression data (specified by the data and mask arguments) are not needed
3446             and are therefore ignored.
3447              
3448              
3449             Return value
3450             ============
3451              
3452             A pointer to a newly allocated array of Node structs, describing the
3453             hierarchical clustering solution consisting of nelements-1 nodes. Depending on
3454             whether genes (rows) or microarrays (columns) were clustered, nelements is
3455             equal to nrows or ncolumns. See src/cluster.h for a description of the Node
3456             structure.
3457             If a memory error occurs, pslcluster returns NULL.
3458              
3459             ========================================================================
3460             */
3461             { int i, j, k;
3462 2 50         const int nelements = transpose ? ncolumns : nrows;
3463 2           const int nnodes = nelements - 1;
3464             int* vector;
3465             double* temp;
3466             int* index;
3467             Node* result;
3468 2           temp = malloc(nnodes*sizeof(double));
3469 2 50         if(!temp) return NULL;
3470 2           index = malloc(nelements*sizeof(int));
3471 2 50         if(!index)
3472 0           { free(temp);
3473 0           return NULL;
3474             }
3475 2           vector = malloc(nnodes*sizeof(int));
3476 2 50         if(!vector)
3477 0           { free(index);
3478 0           free(temp);
3479 0           return NULL;
3480             }
3481 2           result = malloc(nelements*sizeof(Node));
3482 2 50         if(!result)
3483 0           { free(vector);
3484 0           free(index);
3485 0           free(temp);
3486 0           return NULL;
3487             }
3488              
3489 17 100         for (i = 0; i < nnodes; i++) vector[i] = i;
3490              
3491 2 50         if(distmatrix)
3492 0 0         { for (i = 0; i < nrows; i++)
3493 0           { result[i].distance = DBL_MAX;
3494 0 0         for (j = 0; j < i; j++) temp[j] = distmatrix[i][j];
3495 0 0         for (j = 0; j < i; j++)
3496 0           { k = vector[j];
3497 0 0         if (result[j].distance >= temp[j])
3498 0 0         { if (result[j].distance < temp[k]) temp[k] = result[j].distance;
3499 0           result[j].distance = temp[j];
3500 0           vector[j] = i;
3501             }
3502 0 0         else if (temp[j] < temp[k]) temp[k] = temp[j];
3503             }
3504 0 0         for (j = 0; j < i; j++)
3505             {
3506 0 0         if (result[j].distance >= result[vector[j]].distance) vector[j] = i;
3507             }
3508             }
3509             }
3510             else
3511 2 50         { const int ndata = transpose ? nrows : ncolumns;
3512             /* Set the metric function as indicated by dist */
3513 2           double (*metric)
3514             (int, double**, double**, int**, int**, const double[], int, int, int) =
3515 2           setmetric(dist);
3516              
3517 19 100         for (i = 0; i < nelements; i++)
3518 17           { result[i].distance = DBL_MAX;
3519 101 100         for (j = 0; j < i; j++) temp[j] =
3520 84           metric(ndata, data, data, mask, mask, weight, i, j, transpose);
3521 101 100         for (j = 0; j < i; j++)
3522 84           { k = vector[j];
3523 84 100         if (result[j].distance >= temp[j])
3524 25 100         { if (result[j].distance < temp[k]) temp[k] = result[j].distance;
3525 25           result[j].distance = temp[j];
3526 25           vector[j] = i;
3527             }
3528 59 100         else if (temp[j] < temp[k]) temp[k] = temp[j];
3529             }
3530 101 100         for (j = 0; j < i; j++)
3531 84 100         if (result[j].distance >= result[vector[j]].distance) vector[j] = i;
3532             }
3533             }
3534 2           free(temp);
3535              
3536 17 100         for (i = 0; i < nnodes; i++) result[i].left = i;
3537 2           qsort(result, nnodes, sizeof(Node), nodecompare);
3538              
3539 19 100         for (i = 0; i < nelements; i++) index[i] = i;
3540 17 100         for (i = 0; i < nnodes; i++)
3541 15           { j = result[i].left;
3542 15           k = vector[j];
3543 15           result[i].left = index[j];
3544 15           result[i].right = index[k];
3545 15           index[k] = -i-1;
3546             }
3547 2           free(vector);
3548 2           free(index);
3549              
3550 2           result = realloc(result, nnodes*sizeof(Node));
3551              
3552 2           return result;
3553             }
3554             /* ******************************************************************** */
3555              
3556 2           static Node* pmlcluster (int nelements, double** distmatrix)
3557             /*
3558              
3559             Purpose
3560             =======
3561              
3562             The pmlcluster routine performs clustering using pairwise maximum- (complete-)
3563             linking on the given distance matrix.
3564              
3565             Arguments
3566             =========
3567              
3568             nelements (input) int
3569             The number of elements to be clustered.
3570              
3571             distmatrix (input) double**
3572             The distance matrix, with nelements rows, each row being filled up to the
3573             diagonal. The elements on the diagonal are not used, as they are assumed to be
3574             zero. The distance matrix will be modified by this routine.
3575              
3576             Return value
3577             ============
3578              
3579             A pointer to a newly allocated array of Node structs, describing the
3580             hierarchical clustering solution consisting of nelements-1 nodes. Depending on
3581             whether genes (rows) or microarrays (columns) were clustered, nelements is
3582             equal to nrows or ncolumns. See src/cluster.h for a description of the Node
3583             structure.
3584             If a memory error occurs, pmlcluster returns NULL.
3585             ========================================================================
3586             */
3587             { int j;
3588             int n;
3589             int* clusterid;
3590             Node* result;
3591              
3592 2           clusterid = malloc(nelements*sizeof(int));
3593 2 50         if(!clusterid) return NULL;
3594 2           result = malloc((nelements-1)*sizeof(Node));
3595 2 50         if (!result)
3596 0           { free(clusterid);
3597 0           return NULL;
3598             }
3599              
3600             /* Setup a list specifying to which cluster a gene belongs */
3601 19 100         for (j = 0; j < nelements; j++) clusterid[j] = j;
3602              
3603 17 100         for (n = nelements; n > 1; n--)
3604 15           { int is = 1;
3605 15           int js = 0;
3606 15           result[nelements-n].distance = find_closest_pair(n, distmatrix, &is, &js);
3607              
3608             /* Fix the distances */
3609 37 100         for (j = 0; j < js; j++)
3610 22 100         distmatrix[js][j] = max(distmatrix[is][j],distmatrix[js][j]);
3611 28 100         for (j = js+1; j < is; j++)
3612 13 100         distmatrix[j][js] = max(distmatrix[is][j],distmatrix[j][js]);
3613 49 100         for (j = is+1; j < n; j++)
3614 34 100         distmatrix[j][js] = max(distmatrix[j][is],distmatrix[j][js]);
3615              
3616 65 100         for (j = 0; j < is; j++) distmatrix[is][j] = distmatrix[n-1][j];
3617 39 100         for (j = is+1; j < n-1; j++) distmatrix[j][is] = distmatrix[n-1][j];
3618              
3619             /* Update clusterids */
3620 15           result[nelements-n].left = clusterid[is];
3621 15           result[nelements-n].right = clusterid[js];
3622 15           clusterid[js] = n-nelements-1;
3623 15           clusterid[is] = clusterid[n-1];
3624             }
3625 2           free(clusterid);
3626              
3627 2           return result;
3628             }
3629              
3630             /* ******************************************************************* */
3631              
3632 2           static Node* palcluster (int nelements, double** distmatrix)
3633             /*
3634             Purpose
3635             =======
3636              
3637             The palcluster routine performs clustering using pairwise average
3638             linking on the given distance matrix.
3639              
3640             Arguments
3641             =========
3642              
3643             nelements (input) int
3644             The number of elements to be clustered.
3645              
3646             distmatrix (input) double**
3647             The distance matrix, with nelements rows, each row being filled up to the
3648             diagonal. The elements on the diagonal are not used, as they are assumed to be
3649             zero. The distance matrix will be modified by this routine.
3650              
3651             Return value
3652             ============
3653              
3654             A pointer to a newly allocated array of Node structs, describing the
3655             hierarchical clustering solution consisting of nelements-1 nodes. Depending on
3656             whether genes (rows) or microarrays (columns) were clustered, nelements is
3657             equal to nrows or ncolumns. See src/cluster.h for a description of the Node
3658             structure.
3659             If a memory error occurs, palcluster returns NULL.
3660             ========================================================================
3661             */
3662             { int j;
3663             int n;
3664             int* clusterid;
3665             int* number;
3666             Node* result;
3667              
3668 2           clusterid = malloc(nelements*sizeof(int));
3669 2 50         if(!clusterid) return NULL;
3670 2           number = malloc(nelements*sizeof(int));
3671 2 50         if(!number)
3672 0           { free(clusterid);
3673 0           return NULL;
3674             }
3675 2           result = malloc((nelements-1)*sizeof(Node));
3676 2 50         if (!result)
3677 0           { free(clusterid);
3678 0           free(number);
3679 0           return NULL;
3680             }
3681              
3682             /* Setup a list specifying to which cluster a gene belongs, and keep track
3683             * of the number of elements in each cluster (needed to calculate the
3684             * average). */
3685 19 100         for (j = 0; j < nelements; j++)
3686 17           { number[j] = 1;
3687 17           clusterid[j] = j;
3688             }
3689              
3690 17 100         for (n = nelements; n > 1; n--)
3691             { int sum;
3692 15           int is = 1;
3693 15           int js = 0;
3694 15           result[nelements-n].distance = find_closest_pair(n, distmatrix, &is, &js);
3695              
3696             /* Save result */
3697 15           result[nelements-n].left = clusterid[is];
3698 15           result[nelements-n].right = clusterid[js];
3699              
3700             /* Fix the distances */
3701 15           sum = number[is] + number[js];
3702 39 100         for (j = 0; j < js; j++)
3703 48           { distmatrix[js][j] = distmatrix[is][j]*number[is]
3704 24           + distmatrix[js][j]*number[js];
3705 24           distmatrix[js][j] /= sum;
3706             }
3707 31 100         for (j = js+1; j < is; j++)
3708 32           { distmatrix[j][js] = distmatrix[is][j]*number[is]
3709 16           + distmatrix[j][js]*number[js];
3710 16           distmatrix[j][js] /= sum;
3711             }
3712 44 100         for (j = is+1; j < n; j++)
3713 58           { distmatrix[j][js] = distmatrix[j][is]*number[is]
3714 29           + distmatrix[j][js]*number[js];
3715 29           distmatrix[j][js] /= sum;
3716             }
3717              
3718 70 100         for (j = 0; j < is; j++) distmatrix[is][j] = distmatrix[n-1][j];
3719 35 100         for (j = is+1; j < n-1; j++) distmatrix[j][is] = distmatrix[n-1][j];
3720              
3721             /* Update number of elements in the clusters */
3722 15           number[js] = sum;
3723 15           number[is] = number[n-1];
3724              
3725             /* Update clusterids */
3726 15           clusterid[js] = n-nelements-1;
3727 15           clusterid[is] = clusterid[n-1];
3728             }
3729 2           free(clusterid);
3730 2           free(number);
3731              
3732 2           return result;
3733             }
3734              
3735             /* ******************************************************************* */
3736              
3737 8           Node* treecluster (int nrows, int ncolumns, double** data, int** mask,
3738             double weight[], int transpose, char dist, char method, double** distmatrix)
3739             /*
3740             Purpose
3741             =======
3742              
3743             The treecluster routine performs hierarchical clustering using pairwise
3744             single-, maximum-, centroid-, or average-linkage, as defined by method, on a
3745             given set of gene expression data, using the distance metric given by dist.
3746             If successful, the function returns a pointer to a newly allocated Tree struct
3747             containing the hierarchical clustering solution, and NULL if a memory error
3748             occurs. The pointer should be freed by the calling routine to prevent memory
3749             leaks.
3750              
3751             Arguments
3752             =========
3753              
3754             nrows (input) int
3755             The number of rows in the data matrix, equal to the number of genes.
3756              
3757             ncolumns (input) int
3758             The number of columns in the data matrix, equal to the number of microarrays.
3759              
3760             data (input) double[nrows][ncolumns]
3761             The array containing the data of the vectors to be clustered.
3762              
3763             mask (input) int[nrows][ncolumns]
3764             This array shows which data values are missing. If mask[i][j]==0, then
3765             data[i][j] is missing.
3766              
3767             weight (input) double array[n]
3768             The weights that are used to calculate the distance.
3769              
3770             transpose (input) int
3771             If transpose==0, the rows of the matrix are clustered. Otherwise, columns
3772             of the matrix are clustered.
3773              
3774             dist (input) char
3775             Defines which distance measure is used, as given by the table:
3776             dist=='e': Euclidean distance
3777             dist=='b': City-block distance
3778             dist=='c': correlation
3779             dist=='a': absolute value of the correlation
3780             dist=='u': uncentered correlation
3781             dist=='x': absolute uncentered correlation
3782             dist=='s': Spearman's rank correlation
3783             dist=='k': Kendall's tau
3784             For other values of dist, the default (Euclidean distance) is used.
3785              
3786             method (input) char
3787             Defines which hierarchical clustering method is used:
3788             method=='s': pairwise single-linkage clustering
3789             method=='m': pairwise maximum- (or complete-) linkage clustering
3790             method=='a': pairwise average-linkage clustering
3791             method=='c': pairwise centroid-linkage clustering
3792             For the first three, either the distance matrix or the gene expression data is
3793             sufficient to perform the clustering algorithm. For pairwise centroid-linkage
3794             clustering, however, the gene expression data are always needed, even if the
3795             distance matrix itself is available.
3796              
3797             distmatrix (input) double**
3798             The distance matrix. If the distance matrix is zero initially, the distance
3799             matrix will be allocated and calculated from the data by treecluster, and
3800             deallocated before treecluster returns. If the distance matrix is passed by the
3801             calling routine, treecluster will modify the contents of the distance matrix as
3802             part of the clustering algorithm, but will not deallocate it. The calling
3803             routine should deallocate the distance matrix after the return from treecluster.
3804              
3805             Return value
3806             ============
3807              
3808             A pointer to a newly allocated array of Node structs, describing the
3809             hierarchical clustering solution consisting of nelements-1 nodes. Depending on
3810             whether genes (rows) or microarrays (columns) were clustered, nelements is
3811             equal to nrows or ncolumns. See src/cluster.h for a description of the Node
3812             structure.
3813             If a memory error occurs, treecluster returns NULL.
3814              
3815             ========================================================================
3816             */
3817 8           { Node* result = NULL;
3818 8 50         const int nelements = (transpose==0) ? nrows : ncolumns;
3819 8 50         const int ldistmatrix = (distmatrix==NULL && method!='s') ? 1 : 0;
    100          
3820              
3821 8 50         if (nelements < 2) return NULL;
3822              
3823             /* Calculate the distance matrix if the user didn't give it */
3824 8 100         if(ldistmatrix)
3825 6           { distmatrix =
3826 6           distancematrix(nrows, ncolumns, data, mask, weight, dist, transpose);
3827 6 50         if (!distmatrix) return NULL; /* Insufficient memory */
3828             }
3829              
3830 8           switch(method)
3831             { case 's':
3832 2           result = pslcluster(nrows, ncolumns, data, mask, weight, distmatrix,
3833             dist, transpose);
3834 2           break;
3835             case 'm':
3836 2           result = pmlcluster(nelements, distmatrix);
3837 2           break;
3838             case 'a':
3839 2           result = palcluster(nelements, distmatrix);
3840 2           break;
3841             case 'c':
3842 2           result = pclcluster(nrows, ncolumns, data, mask, weight, distmatrix,
3843             dist, transpose);
3844 2           break;
3845             }
3846              
3847             /* Deallocate space for distance matrix, if it was allocated by treecluster */
3848 8 100         if(ldistmatrix)
3849             { int i;
3850 51 100         for (i = 1; i < nelements; i++) free(distmatrix[i]);
3851 6           free (distmatrix);
3852             }
3853              
3854 8           return result;
3855             }
3856              
3857             /* ******************************************************************* */
3858              
3859 0           int sorttree(const int nnodes, Node* tree, const double order[], int indices[])
3860             /*
3861             Purpose
3862             =======
3863              
3864             The sorttree routine sorts the items in a hierarchical clustering solution
3865             based on their order values, while remaining consistent with the hierchical
3866             clustering solution.
3867              
3868             Arguments
3869             =========
3870              
3871             nnodes (input) int
3872             The number of nodes in the hierarchical clustering tree.
3873              
3874             tree (input) Node[nnodes]
3875             The hierarchical clustering tree describing the clustering solution.
3876              
3877             order (input) double[nnodes+1]
3878             The preferred order of the items.
3879              
3880             indices (output) int*
3881             The indices of each item after sorting, with item i appearing at indices[i]
3882             after sorting.
3883              
3884             Return value
3885             ============
3886              
3887             If no errors occur, sorttree returns 1.
3888             If a memory error occurs, sorttree returns 0.
3889              
3890             ========================================================================
3891             */
3892              
3893             { int i;
3894             int index;
3895             int i1, i2;
3896             double order1, order2;
3897             int counts1, counts2;
3898 0           int* nodecounts = malloc(nnodes*sizeof(int));
3899 0 0         if (!nodecounts) return 0;
3900 0 0         if (order) {
3901 0           double* nodeorder = malloc(nnodes*sizeof(double));
3902 0 0         if (!nodeorder) {
3903 0           free(nodecounts);
3904 0           return 0;
3905             }
3906 0 0         for (i = 0; i < nnodes; i++)
3907 0           { i1 = tree[i].left;
3908 0           i2 = tree[i].right;
3909             /* i1 and i2 are the elements that are to be joined */
3910 0 0         if (i1 < 0)
3911 0           { index = -i1-1;
3912 0           order1 = nodeorder[index];
3913 0           counts1 = nodecounts[index];
3914             }
3915             else
3916 0           { order1 = order[i1];
3917 0           counts1 = 1;
3918             }
3919 0 0         if (i2 < 0)
3920 0           { index = -i2-1;
3921 0           order2 = nodeorder[index];
3922 0           counts2 = nodecounts[index];
3923             }
3924             else
3925 0           { order2 = order[i2];
3926 0           counts2 = 1;
3927             }
3928 0 0         if (order1 > order2) {
3929 0           tree[i].left = i2;
3930 0           tree[i].right = i1;
3931             }
3932 0           nodecounts[i] = counts1 + counts2;
3933 0           nodeorder[i] = (counts1*order1 + counts2*order2) / (counts1 + counts2);
3934             }
3935 0           free(nodeorder);
3936             }
3937             else
3938 0 0         { for (i = 0; i < nnodes; i++)
3939 0           { i1 = tree[i].left;
3940 0           i2 = tree[i].right;
3941             /* i1 and i2 are the elements that are to be joined */
3942 0 0         counts1 = (i1 < 0) ? nodecounts[-i1-1] : 1;
3943 0 0         counts2 = (i2 < 0) ? nodecounts[-i2-1] : 1;
3944 0           nodecounts[i] = counts1 + counts2;
3945             }
3946             }
3947 0           i--;
3948 0           nodecounts[i] = 0;
3949 0 0         for ( ; i >= 0; i--)
3950 0           { i1 = tree[i].left;
3951 0           i2 = tree[i].right;
3952 0 0         counts1 = (i1<0) ? nodecounts[-i1-1] : 1;
3953 0           index = nodecounts[i];
3954 0 0         if (i1 >= 0) indices[index] = i1;
3955 0           else nodecounts[-i1-1] = index;
3956 0           index += counts1;
3957 0 0         if (i2 >= 0) indices[index] = i2;
3958 0           else nodecounts[-i2-1] = index;
3959             }
3960 0           free(nodecounts);
3961 0           return 1;
3962             }
3963              
3964             /* ******************************************************************* */
3965              
3966             static
3967 2           void somworker (int nrows, int ncolumns, double** data, int** mask,
3968             const double weights[], int transpose, int nxgrid, int nygrid,
3969             double inittau, double*** celldata, int niter, char dist)
3970              
3971 2 50         { const int nelements = (transpose==0) ? nrows : ncolumns;
3972 2 50         const int ndata = (transpose==0) ? ncolumns : nrows;
3973             int i, j;
3974 2           double* stddata = calloc(nelements,sizeof(double));
3975             int** dummymask;
3976             int ix, iy;
3977             int* index;
3978             int iter;
3979             /* Maximum radius in which nodes are adjusted */
3980 2           double maxradius = sqrt(nxgrid*nxgrid+nygrid*nygrid);
3981              
3982             /* Set the metric function as indicated by dist */
3983 2           double (*metric)
3984             (int, double**, double**, int**, int**, const double[], int, int, int) =
3985 2           setmetric(dist);
3986              
3987             /* Calculate the standard deviation for each row or column */
3988 2 50         if (transpose==0)
3989 19 100         { for (i = 0; i < nelements; i++)
3990 17           { int n = 0;
3991 63 100         for (j = 0; j < ndata; j++)
3992 46 50         { if (mask[i][j])
3993 46           { double term = data[i][j];
3994 46           term = term * term;
3995 46           stddata[i] += term;
3996 46           n++;
3997             }
3998             }
3999 17 50         if (stddata[i] > 0) stddata[i] = sqrt(stddata[i]/n);
4000 0           else stddata[i] = 1;
4001             }
4002             }
4003             else
4004 0 0         { for (i = 0; i < nelements; i++)
4005 0           { int n = 0;
4006 0 0         for (j = 0; j < ndata; j++)
4007 0 0         { if (mask[j][i])
4008 0           { double term = data[j][i];
4009 0           term = term * term;
4010 0           stddata[i] += term;
4011 0           n++;
4012             }
4013             }
4014 0 0         if (stddata[i] > 0) stddata[i] = sqrt(stddata[i]/n);
4015 0           else stddata[i] = 1;
4016             }
4017             }
4018              
4019 2 50         if (transpose==0)
4020 2           { dummymask = malloc(nygrid*sizeof(int*));
4021 22 100         for (i = 0; i < nygrid; i++)
4022 20           { dummymask[i] = malloc(ndata*sizeof(int));
4023 90 100         for (j = 0; j < ndata; j++) dummymask[i][j] = 1;
4024             }
4025             }
4026             else
4027 0           { dummymask = malloc(ndata*sizeof(int*));
4028 0 0         for (i = 0; i < ndata; i++)
4029 0           { dummymask[i] = malloc(sizeof(int));
4030 0           dummymask[i][0] = 1;
4031             }
4032             }
4033              
4034             /* Randomly initialize the nodes */
4035 22 100         for (ix = 0; ix < nxgrid; ix++)
4036 220 100         { for (iy = 0; iy < nygrid; iy++)
4037 200           { double sum = 0.;
4038 900 100         for (i = 0; i < ndata; i++)
4039 700           { double term = -1.0 + 2.0*uniform();
4040 700           celldata[ix][iy][i] = term;
4041 700           sum += term * term;
4042             }
4043 200           sum = sqrt(sum/ndata);
4044 900 100         for (i = 0; i < ndata; i++) celldata[ix][iy][i] /= sum;
4045             }
4046             }
4047              
4048             /* Randomize the order in which genes or arrays will be used */
4049 2           index = malloc(nelements*sizeof(int));
4050 19 100         for (i = 0; i < nelements; i++) index[i] = i;
4051 19 100         for (i = 0; i < nelements; i++)
4052 17           { j = (int) (i + (nelements-i)*uniform());
4053 17           ix = index[j];
4054 17           index[j] = index[i];
4055 17           index[i] = ix;
4056             }
4057              
4058             /* Start the iteration */
4059 202 100         for (iter = 0; iter < niter; iter++)
4060 200           { int ixbest = 0;
4061 200           int iybest = 0;
4062 200           int iobject = iter % nelements;
4063 200           iobject = index[iobject];
4064 200 50         if (transpose==0)
4065 200           { double closest = metric(ndata,data,celldata[ixbest],
4066             mask,dummymask,weights,iobject,iybest,transpose);
4067 200           double radius = maxradius * (1. - ((double)iter)/((double)niter));
4068 200           double tau = inittau * (1. - ((double)iter)/((double)niter));
4069              
4070 2200 100         for (ix = 0; ix < nxgrid; ix++)
4071 22000 100         { for (iy = 0; iy < nygrid; iy++)
4072 20000           { double distance =
4073 20000           metric (ndata,data,celldata[ix],
4074             mask,dummymask,weights,iobject,iy,transpose);
4075 20000 100         if (distance < closest)
4076 589           { ixbest = ix;
4077 589           iybest = iy;
4078 589           closest = distance;
4079             }
4080             }
4081             }
4082 2200 100         for (ix = 0; ix < nxgrid; ix++)
4083 22000 100         { for (iy = 0; iy < nygrid; iy++)
4084 20000 100         { if (sqrt((ix-ixbest)*(ix-ixbest)+(iy-iybest)*(iy-iybest))
4085 13018           { double sum = 0.;
4086 59142 100         for (i = 0; i < ndata; i++)
4087 46124 50         { if (mask[iobject][i]==0) continue;
4088 46124           celldata[ix][iy][i] +=
4089 46124           tau * (data[iobject][i]/stddata[iobject]-celldata[ix][iy][i]);
4090             }
4091 59142 100         for (i = 0; i < ndata; i++)
4092 46124           { double term = celldata[ix][iy][i];
4093 46124           term = term * term;
4094 46124           sum += term;
4095             }
4096 13018 50         if (sum>0)
4097 13018           { sum = sqrt(sum/ndata);
4098 59142 100         for (i = 0; i < ndata; i++) celldata[ix][iy][i] /= sum;
4099             }
4100             }
4101             }
4102             }
4103             }
4104             else
4105             { double closest;
4106 0           double** celldatavector = malloc(ndata*sizeof(double*));
4107 0           double radius = maxradius * (1. - ((double)iter)/((double)niter));
4108 0           double tau = inittau * (1. - ((double)iter)/((double)niter));
4109              
4110 0 0         for (i = 0; i < ndata; i++)
4111 0           celldatavector[i] = &(celldata[ixbest][iybest][i]);
4112 0           closest = metric(ndata,data,celldatavector,
4113             mask,dummymask,weights,iobject,0,transpose);
4114 0 0         for (ix = 0; ix < nxgrid; ix++)
4115 0 0         { for (iy = 0; iy < nygrid; iy++)
4116             { double distance;
4117 0 0         for (i = 0; i < ndata; i++)
4118 0           celldatavector[i] = &(celldata[ixbest][iybest][i]);
4119 0           distance =
4120             metric (ndata,data,celldatavector,
4121             mask,dummymask,weights,iobject,0,transpose);
4122 0 0         if (distance < closest)
4123 0           { ixbest = ix;
4124 0           iybest = iy;
4125 0           closest = distance;
4126             }
4127             }
4128             }
4129 0           free(celldatavector);
4130 0 0         for (ix = 0; ix < nxgrid; ix++)
4131 0 0         { for (iy = 0; iy < nygrid; iy++)
4132 0 0         { if (sqrt((ix-ixbest)*(ix-ixbest)+(iy-iybest)*(iy-iybest))
4133 0           { double sum = 0.;
4134 0 0         for (i = 0; i < ndata; i++)
4135 0 0         { if (mask[i][iobject]==0) continue;
4136 0           celldata[ix][iy][i] +=
4137 0           tau * (data[i][iobject]/stddata[iobject]-celldata[ix][iy][i]);
4138             }
4139 0 0         for (i = 0; i < ndata; i++)
4140 0           { double term = celldata[ix][iy][i];
4141 0           term = term * term;
4142 0           sum += term;
4143             }
4144 0 0         if (sum>0)
4145 0           { sum = sqrt(sum/ndata);
4146 0 0         for (i = 0; i < ndata; i++) celldata[ix][iy][i] /= sum;
4147             }
4148             }
4149             }
4150             }
4151             }
4152             }
4153 2 50         if (transpose==0)
4154 22 100         for (i = 0; i < nygrid; i++) free(dummymask[i]);
4155             else
4156 0 0         for (i = 0; i < ndata; i++) free(dummymask[i]);
4157 2           free(dummymask);
4158 2           free(stddata);
4159 2           free(index);
4160 2           return;
4161             }
4162              
4163             /* ******************************************************************* */
4164              
4165             static
4166 2           void somassign (int nrows, int ncolumns, double** data, int** mask,
4167             const double weights[], int transpose, int nxgrid, int nygrid,
4168             double*** celldata, char dist, int clusterid[][2])
4169             /* Collect clusterids */
4170 2 50         { const int ndata = (transpose==0) ? ncolumns : nrows;
4171             int i,j;
4172              
4173             /* Set the metric function as indicated by dist */
4174 2           double (*metric)
4175             (int, double**, double**, int**, int**, const double[], int, int, int) =
4176 2           setmetric(dist);
4177              
4178 2 50         if (transpose==0)
4179 2           { int** dummymask = malloc(nygrid*sizeof(int*));
4180 22 100         for (i = 0; i < nygrid; i++)
4181 20           { dummymask[i] = malloc(ncolumns*sizeof(int));
4182 90 100         for (j = 0; j < ncolumns; j++) dummymask[i][j] = 1;
4183             }
4184 19 100         for (i = 0; i < nrows; i++)
4185 17           { int ixbest = 0;
4186 17           int iybest = 0;
4187 17           double closest = metric(ndata,data,celldata[ixbest],
4188             mask,dummymask,weights,i,iybest,transpose);
4189             int ix, iy;
4190 187 100         for (ix = 0; ix < nxgrid; ix++)
4191 1870 100         { for (iy = 0; iy < nygrid; iy++)
4192 1700           { double distance =
4193 1700           metric (ndata,data,celldata[ix],
4194             mask,dummymask,weights,i,iy,transpose);
4195 1700 100         if (distance < closest)
4196 57           { ixbest = ix;
4197 57           iybest = iy;
4198 57           closest = distance;
4199             }
4200             }
4201             }
4202 17           clusterid[i][0] = ixbest;
4203 17           clusterid[i][1] = iybest;
4204             }
4205 22 100         for (i = 0; i < nygrid; i++) free(dummymask[i]);
4206 2           free(dummymask);
4207             }
4208             else
4209 0           { double** celldatavector = malloc(ndata*sizeof(double*));
4210 0           int** dummymask = malloc(nrows*sizeof(int*));
4211 0           int ixbest = 0;
4212 0           int iybest = 0;
4213 0 0         for (i = 0; i < nrows; i++)
4214 0           { dummymask[i] = malloc(sizeof(int));
4215 0           dummymask[i][0] = 1;
4216             }
4217 0 0         for (i = 0; i < ncolumns; i++)
4218             { double closest;
4219             int ix, iy;
4220 0 0         for (j = 0; j < ndata; j++)
4221 0           celldatavector[j] = &(celldata[ixbest][iybest][j]);
4222 0           closest = metric(ndata,data,celldatavector,
4223             mask,dummymask,weights,i,0,transpose);
4224 0 0         for (ix = 0; ix < nxgrid; ix++)
4225 0 0         { for (iy = 0; iy < nygrid; iy++)
4226             { double distance;
4227 0 0         for(j = 0; j < ndata; j++)
4228 0           celldatavector[j] = &(celldata[ix][iy][j]);
4229 0           distance = metric(ndata,data,celldatavector,
4230             mask,dummymask,weights,i,0,transpose);
4231 0 0         if (distance < closest)
4232 0           { ixbest = ix;
4233 0           iybest = iy;
4234 0           closest = distance;
4235             }
4236             }
4237             }
4238 0           clusterid[i][0] = ixbest;
4239 0           clusterid[i][1] = iybest;
4240             }
4241 0           free(celldatavector);
4242 0 0         for (i = 0; i < nrows; i++) free(dummymask[i]);
4243 0           free(dummymask);
4244             }
4245 2           return;
4246             }
4247              
4248             /* ******************************************************************* */
4249              
4250 2           void somcluster (int nrows, int ncolumns, double** data, int** mask,
4251             const double weight[], int transpose, int nxgrid, int nygrid,
4252             double inittau, int niter, char dist, double*** celldata, int clusterid[][2])
4253             /*
4254              
4255             Purpose
4256             =======
4257              
4258             The somcluster routine implements a self-organizing map (Kohonen) on a
4259             rectangular grid, using a given set of vectors. The distance measure to be
4260             used to find the similarity between genes and nodes is given by dist.
4261              
4262             Arguments
4263             =========
4264              
4265             nrows (input) int
4266             The number of rows in the data matrix, equal to the number of genes.
4267              
4268             ncolumns (input) int
4269             The number of columns in the data matrix, equal to the number of microarrays.
4270              
4271             data (input) double[nrows][ncolumns]
4272             The array containing the gene expression data.
4273              
4274             mask (input) int[nrows][ncolumns]
4275             This array shows which data values are missing. If
4276             mask[i][j] == 0, then data[i][j] is missing.
4277              
4278             weights (input) double[ncolumns] if transpose==0;
4279             double[nrows] if transpose==1
4280             The weights that are used to calculate the distance. The length of this vector
4281             is ncolumns if genes are being clustered, or nrows if microarrays are being
4282             clustered.
4283              
4284             transpose (input) int
4285             If transpose==0, the rows (genes) of the matrix are clustered. Otherwise,
4286             columns (microarrays) of the matrix are clustered.
4287              
4288             nxgrid (input) int
4289             The number of grid cells horizontally in the rectangular topology of clusters.
4290              
4291             nygrid (input) int
4292             The number of grid cells horizontally in the rectangular topology of clusters.
4293              
4294             inittau (input) double
4295             The initial value of tau, representing the neighborhood function.
4296              
4297             niter (input) int
4298             The number of iterations to be performed.
4299              
4300             dist (input) char
4301             Defines which distance measure is used, as given by the table:
4302             dist=='e': Euclidean distance
4303             dist=='b': City-block distance
4304             dist=='c': correlation
4305             dist=='a': absolute value of the correlation
4306             dist=='u': uncentered correlation
4307             dist=='x': absolute uncentered correlation
4308             dist=='s': Spearman's rank correlation
4309             dist=='k': Kendall's tau
4310             For other values of dist, the default (Euclidean distance) is used.
4311              
4312             celldata (output) double[nxgrid][nygrid][ncolumns] if transpose==0;
4313             double[nxgrid][nygrid][nrows] if tranpose==1
4314             The gene expression data for each node (cell) in the 2D grid. This can be
4315             interpreted as the centroid for the cluster corresponding to that cell. If
4316             celldata is NULL, then the centroids are not returned. If celldata is not
4317             NULL, enough space should be allocated to store the centroid data before callingsomcluster.
4318              
4319             clusterid (output), int[nrows][2] if transpose==0;
4320             int[ncolumns][2] if transpose==1
4321             For each item (gene or microarray) that is clustered, the coordinates of the
4322             cell in the 2D grid to which the item was assigned. If clusterid is NULL, the
4323             cluster assignments are not returned. If clusterid is not NULL, enough memory
4324             should be allocated to store the clustering information before calling
4325             somcluster.
4326              
4327             ========================================================================
4328             */
4329 2 50         { const int nobjects = (transpose==0) ? nrows : ncolumns;
4330 2 50         const int ndata = (transpose==0) ? ncolumns : nrows;
4331             int i,j;
4332 2           const int lcelldata = (celldata==NULL) ? 0 : 1;
4333              
4334 2 50         if (nobjects < 2) return;
4335              
4336 2 50         if (lcelldata==0)
4337 2           { celldata = malloc(nxgrid*nygrid*ndata*sizeof(double**));
4338 22 100         for (i = 0; i < nxgrid; i++)
4339 20           { celldata[i] = malloc(nygrid*ndata*sizeof(double*));
4340 220 100         for (j = 0; j < nygrid; j++)
4341 200           celldata[i][j] = malloc(ndata*sizeof(double));
4342             }
4343             }
4344              
4345 2           somworker (nrows, ncolumns, data, mask, weight, transpose, nxgrid, nygrid,
4346             inittau, celldata, niter, dist);
4347 2 50         if (clusterid)
4348 2           somassign (nrows, ncolumns, data, mask, weight, transpose,
4349             nxgrid, nygrid, celldata, dist, clusterid);
4350 2 50         if(lcelldata==0)
4351 22 100         { for (i = 0; i < nxgrid; i++)
4352 220 100         for (j = 0; j < nygrid; j++)
4353 200           free(celldata[i][j]);
4354 22 100         for (i = 0; i < nxgrid; i++)
4355 20           free(celldata[i]);
4356 2           free(celldata);
4357             }
4358 2           return;
4359             }
4360              
4361             /* ******************************************************************** */
4362              
4363 46           double clusterdistance (int nrows, int ncolumns, double** data,
4364             int** mask, double weight[], int n1, int n2, int index1[], int index2[],
4365             char dist, char method, int transpose)
4366              
4367             /*
4368             Purpose
4369             =======
4370              
4371             The clusterdistance routine calculates the distance between two clusters
4372             containing genes or microarrays using the measured gene expression vectors. The
4373             distance between clusters, given the genes/microarrays in each cluster, can be
4374             defined in several ways. Several distance measures can be used.
4375              
4376             The routine returns the distance in double precision.
4377             If the parameter transpose is set to a nonzero value, the clusters are
4378             interpreted as clusters of microarrays, otherwise as clusters of gene.
4379              
4380             Arguments
4381             =========
4382              
4383             nrows (input) int
4384             The number of rows (i.e., the number of genes) in the gene expression data
4385             matrix.
4386              
4387             ncolumns (input) int
4388             The number of columns (i.e., the number of microarrays) in the gene expression
4389             data matrix.
4390              
4391             data (input) double[nrows][ncolumns]
4392             The array containing the data of the vectors.
4393              
4394             mask (input) int[nrows][ncolumns]
4395             This array shows which data values are missing. If mask[i][j]==0, then
4396             data[i][j] is missing.
4397              
4398             weight (input) double[ncolumns] if transpose==0;
4399             double[nrows] if transpose==1
4400             The weights that are used to calculate the distance.
4401              
4402             n1 (input) int
4403             The number of elements in the first cluster.
4404              
4405             n2 (input) int
4406             The number of elements in the second cluster.
4407              
4408             index1 (input) int[n1]
4409             Identifies which genes/microarrays belong to the first cluster.
4410              
4411             index2 (input) int[n2]
4412             Identifies which genes/microarrays belong to the second cluster.
4413              
4414             dist (input) char
4415             Defines which distance measure is used, as given by the table:
4416             dist=='e': Euclidean distance
4417             dist=='b': City-block distance
4418             dist=='c': correlation
4419             dist=='a': absolute value of the correlation
4420             dist=='u': uncentered correlation
4421             dist=='x': absolute uncentered correlation
4422             dist=='s': Spearman's rank correlation
4423             dist=='k': Kendall's tau
4424             For other values of dist, the default (Euclidean distance) is used.
4425              
4426             method (input) char
4427             Defines how the distance between two clusters is defined, given which genes
4428             belong to which cluster:
4429             method=='a': the distance between the arithmetic means of the two clusters
4430             method=='m': the distance between the medians of the two clusters
4431             method=='s': the smallest pairwise distance between members of the two clusters
4432             method=='x': the largest pairwise distance between members of the two clusters
4433             method=='v': average of the pairwise distances between members of the clusters
4434              
4435             transpose (input) int
4436             If transpose is equal to zero, the distances between the rows is
4437             calculated. Otherwise, the distances between the columns is calculated.
4438             The former is needed when genes are being clustered; the latter is used
4439             when microarrays are being clustered.
4440              
4441             ========================================================================
4442             */
4443             { /* Set the metric function as indicated by dist */
4444 46           double (*metric)
4445             (int, double**, double**, int**, int**, const double[], int, int, int) =
4446 46           setmetric(dist);
4447              
4448             /* if one or both clusters are empty, return */
4449 46 50         if (n1 < 1 || n2 < 1) return -1.0;
    50          
4450             /* Check the indices */
4451 46 50         if (transpose==0)
4452             { int i;
4453 102 100         for (i = 0; i < n1; i++)
4454 56           { int index = index1[i];
4455 56 50         if (index < 0 || index >= nrows) return -1.0;
    50          
4456             }
4457 136 100         for (i = 0; i < n2; i++)
4458 90           { int index = index2[i];
4459 90 50         if (index < 0 || index >= nrows) return -1.0;
    50          
4460             }
4461             }
4462             else
4463             { int i;
4464 0 0         for (i = 0; i < n1; i++)
4465 0           { int index = index1[i];
4466 0 0         if (index < 0 || index >= ncolumns) return -1.0;
    0          
4467             }
4468 0 0         for (i = 0; i < n2; i++)
4469 0           { int index = index2[i];
4470 0 0         if (index < 0 || index >= ncolumns) return -1.0;
    0          
4471             }
4472             }
4473              
4474 46           switch (method)
4475             { case 'a':
4476             { /* Find the center */
4477             int i,j,k;
4478 14 50         if (transpose==0)
4479             { double distance;
4480             double* cdata[2];
4481             int* cmask[2];
4482             int* count[2];
4483 14           count[0] = calloc(ncolumns,sizeof(int));
4484 14           count[1] = calloc(ncolumns,sizeof(int));
4485 14           cdata[0] = calloc(ncolumns,sizeof(double));
4486 14           cdata[1] = calloc(ncolumns,sizeof(double));
4487 14           cmask[0] = malloc(ncolumns*sizeof(int));
4488 14           cmask[1] = malloc(ncolumns*sizeof(int));
4489 38 100         for (i = 0; i < n1; i++)
4490 24           { k = index1[i];
4491 92 100         for (j = 0; j < ncolumns; j++)
4492 68 50         if (mask[k][j] != 0)
4493 68           { cdata[0][j] = cdata[0][j] + data[k][j];
4494 68           count[0][j] = count[0][j] + 1;
4495             }
4496             }
4497 40 100         for (i = 0; i < n2; i++)
4498 26           { k = index2[i];
4499 106 100         for (j = 0; j < ncolumns; j++)
4500 80 50         if (mask[k][j] != 0)
4501 80           { cdata[1][j] = cdata[1][j] + data[k][j];
4502 80           count[1][j] = count[1][j] + 1;
4503             }
4504             }
4505 42 100         for (i = 0; i < 2; i++)
4506 118 100         for (j = 0; j < ncolumns; j++)
4507 90 50         { if (count[i][j]>0)
4508 90           { cdata[i][j] = cdata[i][j] / count[i][j];
4509 90           cmask[i][j] = 1;
4510             }
4511             else
4512 0           cmask[i][j] = 0;
4513             }
4514 14           distance =
4515             metric (ncolumns,cdata,cdata,cmask,cmask,weight,0,1,0);
4516 42 100         for (i = 0; i < 2; i++)
4517 28           { free (cdata[i]);
4518 28           free (cmask[i]);
4519 28           free (count[i]);
4520             }
4521 14           return distance;
4522             }
4523             else
4524             { double distance;
4525 0           int** count = malloc(nrows*sizeof(int*));
4526 0           double** cdata = malloc(nrows*sizeof(double*));
4527 0           int** cmask = malloc(nrows*sizeof(int*));
4528 0 0         for (i = 0; i < nrows; i++)
4529 0           { count[i] = calloc(2,sizeof(int));
4530 0           cdata[i] = calloc(2,sizeof(double));
4531 0           cmask[i] = malloc(2*sizeof(int));
4532             }
4533 0 0         for (i = 0; i < n1; i++)
4534 0           { k = index1[i];
4535 0 0         for (j = 0; j < nrows; j++)
4536 0 0         { if (mask[j][k] != 0)
4537 0           { cdata[j][0] = cdata[j][0] + data[j][k];
4538 0           count[j][0] = count[j][0] + 1;
4539             }
4540             }
4541             }
4542 0 0         for (i = 0; i < n2; i++)
4543 0           { k = index2[i];
4544 0 0         for (j = 0; j < nrows; j++)
4545 0 0         { if (mask[j][k] != 0)
4546 0           { cdata[j][1] = cdata[j][1] + data[j][k];
4547 0           count[j][1] = count[j][1] + 1;
4548             }
4549             }
4550             }
4551 0 0         for (i = 0; i < nrows; i++)
4552 0 0         for (j = 0; j < 2; j++)
4553 0 0         if (count[i][j]>0)
4554 0           { cdata[i][j] = cdata[i][j] / count[i][j];
4555 0           cmask[i][j] = 1;
4556             }
4557             else
4558 0           cmask[i][j] = 0;
4559 0           distance = metric (nrows,cdata,cdata,cmask,cmask,weight,0,1,1);
4560 0 0         for (i = 0; i < nrows; i++)
4561 0           { free (count[i]);
4562 0           free (cdata[i]);
4563 0           free (cmask[i]);
4564             }
4565 0           free (count);
4566 0           free (cdata);
4567 0           free (cmask);
4568 0           return distance;
4569             }
4570             }
4571             case 'm':
4572             { int i, j, k;
4573 8 50         if (transpose==0)
4574             { double distance;
4575 8           double* temp = malloc(nrows*sizeof(double));
4576             double* cdata[2];
4577             int* cmask[2];
4578 24 100         for (i = 0; i < 2; i++)
4579 16           { cdata[i] = malloc(ncolumns*sizeof(double));
4580 16           cmask[i] = malloc(ncolumns*sizeof(int));
4581             }
4582 32 100         for (j = 0; j < ncolumns; j++)
4583 24           { int count = 0;
4584 48 100         for (k = 0; k < n1; k++)
4585 24           { i = index1[k];
4586 24 50         if (mask[i][j])
4587 24           { temp[count] = data[i][j];
4588 24           count++;
4589             }
4590             }
4591 24 50         if (count>0)
4592 24           { cdata[0][j] = median (count,temp);
4593 24           cmask[0][j] = 1;
4594             }
4595             else
4596 0           { cdata[0][j] = 0.;
4597 0           cmask[0][j] = 0;
4598             }
4599             }
4600 32 100         for (j = 0; j < ncolumns; j++)
4601 24           { int count = 0;
4602 72 100         for (k = 0; k < n2; k++)
4603 48           { i = index2[k];
4604 48 50         if (mask[i][j])
4605 48           { temp[count] = data[i][j];
4606 48           count++;
4607             }
4608             }
4609 24 50         if (count>0)
4610 24           { cdata[1][j] = median (count,temp);
4611 24           cmask[1][j] = 1;
4612             }
4613             else
4614 0           { cdata[1][j] = 0.;
4615 0           cmask[1][j] = 0;
4616             }
4617             }
4618 8           distance = metric (ncolumns,cdata,cdata,cmask,cmask,weight,0,1,0);
4619 24 100         for (i = 0; i < 2; i++)
4620 16           { free (cdata[i]);
4621 16           free (cmask[i]);
4622             }
4623 8           free(temp);
4624 8           return distance;
4625             }
4626             else
4627             { double distance;
4628 0           double* temp = malloc(ncolumns*sizeof(double));
4629 0           double** cdata = malloc(nrows*sizeof(double*));
4630 0           int** cmask = malloc(nrows*sizeof(int*));
4631 0 0         for (i = 0; i < nrows; i++)
4632 0           { cdata[i] = malloc(2*sizeof(double));
4633 0           cmask[i] = malloc(2*sizeof(int));
4634             }
4635 0 0         for (j = 0; j < nrows; j++)
4636 0           { int count = 0;
4637 0 0         for (k = 0; k < n1; k++)
4638 0           { i = index1[k];
4639 0 0         if (mask[j][i])
4640 0           { temp[count] = data[j][i];
4641 0           count++;
4642             }
4643             }
4644 0 0         if (count>0)
4645 0           { cdata[j][0] = median (count,temp);
4646 0           cmask[j][0] = 1;
4647             }
4648             else
4649 0           { cdata[j][0] = 0.;
4650 0           cmask[j][0] = 0;
4651             }
4652             }
4653 0 0         for (j = 0; j < nrows; j++)
4654 0           { int count = 0;
4655 0 0         for (k = 0; k < n2; k++)
4656 0           { i = index2[k];
4657 0 0         if (mask[j][i])
4658 0           { temp[count] = data[j][i];
4659 0           count++;
4660             }
4661             }
4662 0 0         if (count>0)
4663 0           { cdata[j][1] = median (count,temp);
4664 0           cmask[j][1] = 1;
4665             }
4666             else
4667 0           { cdata[j][1] = 0.;
4668 0           cmask[j][1] = 0;
4669             }
4670             }
4671 0           distance = metric (nrows,cdata,cdata,cmask,cmask,weight,0,1,1);
4672 0 0         for (i = 0; i < nrows; i++)
4673 0           { free (cdata[i]);
4674 0           free (cmask[i]);
4675             }
4676 0           free(cdata);
4677 0           free(cmask);
4678 0           free(temp);
4679 0           return distance;
4680             }
4681             }
4682             case 's':
4683             { int i1, i2, j1, j2;
4684 8 50         const int n = (transpose==0) ? ncolumns : nrows;
4685 8           double mindistance = DBL_MAX;
4686 16 100         for (i1 = 0; i1 < n1; i1++)
4687 24 100         for (i2 = 0; i2 < n2; i2++)
4688             { double distance;
4689 16           j1 = index1[i1];
4690 16           j2 = index2[i2];
4691 16           distance = metric (n,data,data,mask,mask,weight,j1,j2,transpose);
4692 16 100         if (distance < mindistance) mindistance = distance;
4693             }
4694 8           return mindistance;
4695             }
4696             case 'x':
4697             { int i1, i2, j1, j2;
4698 8 50         const int n = (transpose==0) ? ncolumns : nrows;
4699 8           double maxdistance = 0;
4700 16 100         for (i1 = 0; i1 < n1; i1++)
4701 24 100         for (i2 = 0; i2 < n2; i2++)
4702             { double distance;
4703 16           j1 = index1[i1];
4704 16           j2 = index2[i2];
4705 16           distance = metric (n,data,data,mask,mask,weight,j1,j2,transpose);
4706 16 100         if (distance > maxdistance) maxdistance = distance;
4707             }
4708 8           return maxdistance;
4709             }
4710             case 'v':
4711             { int i1, i2, j1, j2;
4712 8 50         const int n = (transpose==0) ? ncolumns : nrows;
4713 8           double distance = 0;
4714 16 100         for (i1 = 0; i1 < n1; i1++)
4715 24 100         for (i2 = 0; i2 < n2; i2++)
4716 16           { j1 = index1[i1];
4717 16           j2 = index2[i2];
4718 16           distance += metric (n,data,data,mask,mask,weight,j1,j2,transpose);
4719             }
4720 8           distance /= (n1*n2);
4721 8           return distance;
4722             }
4723             }
4724             /* Never get here */
4725 0           return -2.0;
4726             }