File Coverage

cluster.c

Criterion	Covered	Total	%
statement	759	1929	39.3
branch	504	1472	34.2
condition			n/a
subroutine			n/a
pod			n/a
total	1263	3401	37.1

line	stmt	bran	code
1			/* The C clustering library.
2			* Copyright (C) 2002 Michiel Jan Laurens de Hoon.
3			*
4			* This library was written at the Laboratory of DNA Information Analysis,
5			* Human Genome Center, Institute of Medical Science, University of Tokyo,
6			* 4-6-1 Shirokanedai, Minato-ku, Tokyo 108-8639, Japan.
7			* Contact: michiel.dehoon 'AT' riken.jp
8			*
9			* Permission to use, copy, modify, and distribute this software and its
10			* documentation with or without modifications and for any purpose and
11			* without fee is hereby granted, provided that any copyright notices
12			* appear in all copies and that both those copyright notices and this
13			* permission notice appear in supporting documentation, and that the
14			* names of the contributors or copyright holders not be used in
15			* advertising or publicity pertaining to distribution of the software
16			* without specific prior permission.
17			*
18			* THE CONTRIBUTORS AND COPYRIGHT HOLDERS OF THIS SOFTWARE DISCLAIM ALL
19			* WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED
20			* WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL THE
21			* CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT
22			* OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
23			* OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
24			* OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
25			* OR PERFORMANCE OF THIS SOFTWARE.
26			*
27			*/
28
29			#include
30			#include
31			#include
32			#include
33			#include
34			#include
35			#include "cluster.h"
36			#ifdef WINDOWS
37			# include
38			#endif
39
40			/* ************************************************************************ */
41
42			#ifdef WINDOWS
43			/* Then we make a Windows DLL */
44			int WINAPI
45			clusterdll_init (HANDLE h, DWORD reason, void* foo)
46			{
47			return 1;
48			}
49			#endif
50
51			/* ************************************************************************ */
52
53	2		double mean(int n, double x[])
54	2		{ double result = 0.;
55			int i;
56	9	100	for (i = 0; i < n; i++) result += x[i];
57	2		result /= n;
58	2		return result;
59			}
60
61			/* ************************************************************************ */
62
63	50		double median (int n, double x[])
64			/*
65			Find the median of X(1), ... , X(N), using as much of the quicksort
66			algorithm as is needed to isolate it.
67			N.B. On exit, the array X is partially ordered.
68			Based on Alan J. Miller's median.f90 routine.
69			*/
70
71			{ int i, j;
72	50		int nr = n / 2;
73	50		int nl = nr - 1;
74	50		int even = 0;
75			/* hi & lo are position limits encompassing the median. */
76	50		int lo = 0;
77	50		int hi = n-1;
78
79	50	100	if (n==2*nr) even = 1;
80	50	100	if (n<3)
81	36	50	{ if (n<1) return 0.;
82	36	50	if (n == 1) return x[0];
83	0		return 0.5*(x[0]+x[1]);
84			}
85
86			/* Find median of 1st, middle & last values. */
87			do
88			{ int loop;
89	15		int mid = (lo + hi)/2;
90	15		double result = x[mid];
91	15		double xlo = x[lo];
92	15		double xhi = x[hi];
93	15	100	if (xhi
94	1		{ double temp = xlo;
95	1		xlo = xhi;
96	1		xhi = temp;
97			}
98	15	50	if (result>xhi) result = xhi;
99	15	50	else if (result
100			/* The basic quicksort algorithm to move all values <= the sort key (XMED)
101			* to the left-hand end, and all higher values to the other end.
102			*/
103	15		i = lo;
104	15		j = hi;
105			do
106	30	100	{ while (x[i]
107	31	100	while (x[j]>result) j--;
108	16		loop = 0;
109	16	100	if (i
110	1		{ double temp = x[i];
111	1		x[i] = x[j];
112	1		x[j] = temp;
113	1		i++;
114	1		j--;
115	1	50	if (i<=j) loop = 1;
116			}
117	16	100	} while (loop); /* Decide which half the median is in. */
118
119	15	100	if (even)
120	2	100	{ if (j==nl && i==nr)
		50
121			/* Special case, n even, j = n/2 & i = j + 1, so the median is
122			* between the two halves of the series. Find max. of the first
123			* half & min. of the second half, then average.
124			*/
125			{ int k;
126	0		double xmax = x[0];
127	0		double xmin = x[n-1];
128	0	0	for (k = lo; k <= j; k++) xmax = max(xmax,x[k]);
		0
129	0	0	for (k = i; k <= hi; k++) xmin = min(xmin,x[k]);
		0
130	0		return 0.5*(xmin + xmax);
131			}
132	2	50	if (j
133	2	50	if (i>nr) hi = j;
134	2	50	if (i==j)
135	2	100	{ if (i==nl) lo = nl;
136	2	100	if (j==nr) hi = nr;
137			}
138			}
139			else
140	13	50	{ if (j
141	13	50	if (i>nr) hi = j;
142			/* Test whether median has been isolated. */
143	13	50	if (i==j && i==nr) return result;
		50
144			}
145			}
146	2	100	while (lo
147
148	1	50	if (even) return (0.5*(x[nl]+x[nr]));
149	0	0	if (x[lo]>x[hi])
150	0		{ double temp = x[lo];
151	0		x[lo] = x[hi];
152	0		x[hi] = temp;
153			}
154	0		return x[nr];
155			}
156
157			/* ********************************************************************** */
158
159			static const double* sortdata = NULL; /* used in the quicksort algorithm */
160
161			/* ---------------------------------------------------------------------- */
162
163			static
164	0		int compare(const void* a, const void* b)
165			/* Helper function for sort. Previously, this was a nested function under
166			* sort, which is not allowed under ANSI C.
167			*/
168	0		{ const int i1 = (const int)a;
169	0		const int i2 = (const int)b;
170	0		const double term1 = sortdata[i1];
171	0		const double term2 = sortdata[i2];
172	0	0	if (term1 < term2) return -1;
173	0	0	if (term1 > term2) return +1;
174	0		return 0;
175			}
176
177			/* ---------------------------------------------------------------------- */
178
179	0		void sort(int n, const double data[], int index[])
180			/* Sets up an index table given the data, such that data[index[]] is in
181			* increasing order. Sorting is done on the indices; the array data
182			* is unchanged.
183			*/
184			{ int i;
185	0		sortdata = data;
186	0	0	for (i = 0; i < n; i++) index[i] = i;
187	0		qsort(index, n, sizeof(int), compare);
188	0		}
189
190			/* ********************************************************************** */
191
192	0		static double* getrank (int n, double data[])
193			/* Calculates the ranks of the elements in the array data. Two elements with
194			* the same value get the same rank, equal to the average of the ranks had the
195			* elements different values. The ranks are returned as a newly allocated
196			* array that should be freed by the calling routine. If getrank fails due to
197			* a memory allocation error, it returns NULL.
198			*/
199			{ int i;
200			double* rank;
201			int* index;
202	0		rank = malloc(n*sizeof(double));
203	0	0	if (!rank) return NULL;
204	0		index = malloc(n*sizeof(int));
205	0	0	if (!index)
206	0		{ free(rank);
207	0		return NULL;
208			}
209			/* Call sort to get an index table */
210	0		sort (n, data, index);
211			/* Build a rank table */
212	0	0	for (i = 0; i < n; i++) rank[index[i]] = i;
213			/* Fix for equal ranks */
214	0		i = 0;
215	0	0	while (i < n)
216			{ int m;
217	0		double value = data[index[i]];
218	0		int j = i + 1;
219	0	0	while (j < n && data[index[j]] == value) j++;
		0
220	0		m = j - i; /* number of equal ranks found */
221	0		value = rank[index[i]] + (m-1)/2.;
222	0	0	for (j = i; j < i + m; j++) rank[index[j]] = value;
223	0		i += m;
224			}
225	0		free (index);
226	0		return rank;
227			}
228
229			/* ---------------------------------------------------------------------- */
230
231			static int
232	5		makedatamask(int nrows, int ncols, double* pdata, int* pmask)
233			{ int i;
234			double** data;
235			int** mask;
236	5		data = malloc(nrowssizeof(double));
237	5	50	if(!data) return 0;
238	5		mask = malloc(nrowssizeof(int));
239	5	50	if(!mask)
240	0		{ free(data);
241	0		return 0;
242			}
243	31	100	for (i = 0; i < nrows; i++)
244	26		{ data[i] = malloc(ncols*sizeof(double));
245	26	50	if(!data[i]) break;
246	26		mask[i] = malloc(ncols*sizeof(int));
247	26	50	if(!mask[i])
248	0		{ free(data[i]);
249	0		break;
250			}
251			}
252	5	50	if (i==nrows) /* break not encountered */
253	5		{ *pdata = data;
254	5		*pmask = mask;
255	5		return 1;
256			}
257	0		*pdata = NULL;
258	0		*pmask = NULL;
259	0		nrows = i;
260	0	0	for (i = 0; i < nrows; i++)
261	0		{ free(data[i]);
262	0		free(mask[i]);
263			}
264	0		free(data);
265	0		free(mask);
266	0		return 0;
267			}
268
269			/* ---------------------------------------------------------------------- */
270
271			static void
272	3		freedatamask(int n, double data, int mask)
273			{ int i;
274	12	100	for (i = 0; i < n; i++)
275	9		{ free(mask[i]);
276	9		free(data[i]);
277			}
278	3		free(mask);
279	3		free(data);
280	3		}
281
282			/* ---------------------------------------------------------------------- */
283
284			static
285	45		double find_closest_pair(int n, double** distmatrix, int* ip, int* jp)
286			/*
287			This function searches the distance matrix to find the pair with the shortest
288			distance between them. The indices of the pair are returned in ip and jp; the
289			distance itself is returned by the function.
290
291			n (input) int
292			The number of elements in the distance matrix.
293
294			distmatrix (input) double**
295			A ragged array containing the distance matrix. The number of columns in each
296			row is one less than the row index.
297
298			ip (output) int*
299			A pointer to the integer that is to receive the first index of the pair with
300			the shortest distance.
301
302			jp (output) int*
303			A pointer to the integer that is to receive the second index of the pair with
304			the shortest distance.
305			*/
306			{ int i, j;
307			double temp;
308	45		double distance = distmatrix[1][0];
309	45		*ip = 1;
310	45		*jp = 0;
311	297	100	for (i = 1; i < n; i++)
312	1374	100	{ for (j = 0; j < i; j++)
313	1122		{ temp = distmatrix[i][j];
314	1122	100	if (temp
315	72		{ distance = temp;
316	72		*ip = i;
317	72		*jp = j;
318			}
319			}
320			}
321	45		return distance;
322			}
323
324			/* ********************************************************************* */
325
326	0		static int svd(int m, int n, double u, double w[], double vt)
327			/*
328			* This subroutine is a translation of the Algol procedure svd,
329			* Num. Math. 14, 403-420(1970) by Golub and Reinsch.
330			* Handbook for Auto. Comp., Vol II-Linear Algebra, 134-151(1971).
331			*
332			* This subroutine determines the singular value decomposition
333			* t
334			* A=usv of a real m by n rectangular matrix, where m is greater
335			* than or equal to n. Householder bidiagonalization and a variant
336			* of the QR algorithm are used.
337			*
338			*
339			* On input.
340			*
341			* m is the number of rows of A (and u).
342			*
343			* n is the number of columns of A (and u) and the order of v.
344			*
345			* u contains the rectangular input matrix A to be decomposed.
346			*
347			* On output.
348			*
349			* the routine returns an integer ierr equal to
350			* 0 to indicate a normal return,
351			* k if the k-th singular value has not been
352			* determined after 30 iterations,
353			* -1 if memory allocation fails.
354			*
355			*
356			* w contains the n (non-negative) singular values of a (the
357			* diagonal elements of s). they are unordered. if an
358			* error exit is made, the singular values should be correct
359			* for indices ierr+1,ierr+2,...,n.
360			*
361			*
362			* u contains the matrix u (orthogonal column vectors) of the
363			* decomposition.
364			* if an error exit is made, the columns of u corresponding
365			* to indices of correct singular values should be correct.
366			*
367			* t
368			* vt contains the matrix v (orthogonal) of the decomposition.
369			* if an error exit is made, the columns of v corresponding
370			* to indices of correct singular values should be correct.
371			*
372			*
373			* Questions and comments should be directed to B. S. Garbow,
374			* Applied Mathematics division, Argonne National Laboratory
375			*
376			* Modified to eliminate machep
377			*
378			* Translated to C by Michiel de Hoon, Human Genome Center,
379			* University of Tokyo, for inclusion in the C Clustering Library.
380			* This routine is less general than the original svd routine, as
381			* it focuses on the singular value decomposition as needed for
382			* clustering. In particular,
383			* - We calculate both u and v in all cases
384			* - We pass the input array A via u; this array is subsequently
385			* overwritten.
386			* - We allocate for the array rv1, used as a working space,
387			* internally in this routine, instead of passing it as an
388			* argument. If the allocation fails, svd returns -1.
389			* 2003.06.05
390			*/
391			{ int i, j, k, i1, k1, l1, its;
392			double c,f,h,s,x,y,z;
393	0		int l = 0;
394	0		int ierr = 0;
395	0		double g = 0.0;
396	0		double scale = 0.0;
397	0		double anorm = 0.0;
398	0		double* rv1 = malloc(n*sizeof(double));
399	0	0	if (!rv1) return -1;
400	0	0	if (m >= n)
401			{ /* Householder reduction to bidiagonal form */
402	0	0	for (i = 0; i < n; i++)
403	0		{ l = i + 1;
404	0		rv1[i] = scale * g;
405	0		g = 0.0;
406	0		s = 0.0;
407	0		scale = 0.0;
408	0	0	for (k = i; k < m; k++) scale += fabs(u[k][i]);
409	0	0	if (scale != 0.0)
410	0	0	{ for (k = i; k < m; k++)
411	0		{ u[k][i] /= scale;
412	0		s += u[k][i]*u[k][i];
413			}
414	0		f = u[i][i];
415	0	0	g = (f >= 0) ? -sqrt(s) : sqrt(s);
416	0		h = f * g - s;
417	0		u[i][i] = f - g;
418	0	0	if (i < n-1)
419	0	0	{ for (j = l; j < n; j++)
420	0		{ s = 0.0;
421	0	0	for (k = i; k < m; k++) s += u[k][i] * u[k][j];
422	0		f = s / h;
423	0	0	for (k = i; k < m; k++) u[k][j] += f * u[k][i];
424			}
425			}
426	0	0	for (k = i; k < m; k++) u[k][i] *= scale;
427			}
428	0		w[i] = scale * g;
429	0		g = 0.0;
430	0		s = 0.0;
431	0		scale = 0.0;
432	0	0	if (i
433	0	0	{ for (k = l; k < n; k++) scale += fabs(u[i][k]);
434	0	0	if (scale != 0.0)
435	0	0	{ for (k = l; k < n; k++)
436	0		{ u[i][k] /= scale;
437	0		s += u[i][k] * u[i][k];
438			}
439	0		f = u[i][l];
440	0	0	g = (f >= 0) ? -sqrt(s) : sqrt(s);
441	0		h = f * g - s;
442	0		u[i][l] = f - g;
443	0	0	for (k = l; k < n; k++) rv1[k] = u[i][k] / h;
444	0	0	for (j = l; j < m; j++)
445	0		{ s = 0.0;
446	0	0	for (k = l; k < n; k++) s += u[j][k] * u[i][k];
447	0	0	for (k = l; k < n; k++) u[j][k] += s * rv1[k];
448			}
449	0	0	for (k = l; k < n; k++) u[i][k] *= scale;
450			}
451			}
452	0	0	anorm = max(anorm,fabs(w[i])+fabs(rv1[i]));
453			}
454			/* accumulation of right-hand transformations */
455	0	0	for (i = n-1; i>=0; i--)
456	0	0	{ if (i < n-1)
457	0	0	{ if (g != 0.0)
458	0	0	{ for (j = l; j < n; j++) vt[i][j] = (u[i][j] / u[i][l]) / g;
459			/* double division avoids possible underflow */
460	0	0	for (j = l; j < n; j++)
461	0		{ s = 0.0;
462	0	0	for (k = l; k < n; k++) s += u[i][k] * vt[j][k];
463	0	0	for (k = l; k < n; k++) vt[j][k] += s * vt[i][k];
464			}
465			}
466			}
467	0	0	for (j = l; j < n; j++)
468	0		{ vt[j][i] = 0.0;
469	0		vt[i][j] = 0.0;
470			}
471	0		vt[i][i] = 1.0;
472	0		g = rv1[i];
473	0		l = i;
474			}
475			/* accumulation of left-hand transformations */
476	0	0	for (i = n-1; i >= 0; i--)
477	0		{ l = i + 1;
478	0		g = w[i];
479	0	0	if (i!=n-1)
480	0	0	for (j = l; j < n; j++) u[i][j] = 0.0;
481	0	0	if (g!=0.0)
482	0	0	{ if (i!=n-1)
483	0	0	{ for (j = l; j < n; j++)
484	0		{ s = 0.0;
485	0	0	for (k = l; k < m; k++) s += u[k][i] * u[k][j];
486			/* double division avoids possible underflow */
487	0		f = (s / u[i][i]) / g;
488	0	0	for (k = i; k < m; k++) u[k][j] += f * u[k][i];
489			}
490			}
491	0	0	for (j = i; j < m; j++) u[j][i] /= g;
492			}
493			else
494	0	0	for (j = i; j < m; j++) u[j][i] = 0.0;
495	0		u[i][i] += 1.0;
496			}
497			/* diagonalization of the bidiagonal form */
498	0	0	for (k = n-1; k >= 0; k--)
499	0		{ k1 = k-1;
500	0		its = 0;
501			while(1)
502			/* test for splitting */
503	0	0	{ for (l = k; l >= 0; l--)
504	0		{ l1 = l-1;
505	0	0	if (fabs(rv1[l]) + anorm == anorm) break;
506			/* rv1[0] is always zero, so there is no exit
507			* through the bottom of the loop */
508	0	0	if (fabs(w[l1]) + anorm == anorm)
509			/* cancellation of rv1[l] if l greater than 0 */
510	0		{ c = 0.0;
511	0		s = 1.0;
512	0	0	for (i = l; i <= k; i++)
513	0		{ f = s * rv1[i];
514	0		rv1[i] *= c;
515	0	0	if (fabs(f) + anorm == anorm) break;
516	0		g = w[i];
517	0		h = sqrt(ff+gg);
518	0		w[i] = h;
519	0		c = g / h;
520	0		s = -f / h;
521	0	0	for (j = 0; j < m; j++)
522	0		{ y = u[j][l1];
523	0		z = u[j][i];
524	0		u[j][l1] = y * c + z * s;
525	0		u[j][i] = -y * s + z * c;
526			}
527			}
528	0		break;
529			}
530			}
531			/* test for convergence */
532	0		z = w[k];
533	0	0	if (l==k) /* convergence */
534	0	0	{ if (z < 0.0)
535			/* w[k] is made non-negative */
536	0		{ w[k] = -z;
537	0	0	for (j = 0; j < n; j++) vt[k][j] = -vt[k][j];
538			}
539	0		break;
540			}
541	0	0	else if (its==30)
542	0		{ ierr = k;
543	0		break;
544			}
545			else
546			/* shift from bottom 2 by 2 minor */
547	0		{ its++;
548	0		x = w[l];
549	0		y = w[k1];
550	0		g = rv1[k1];
551	0		h = rv1[k];
552	0		f = ((y - z) * (y + z) + (g - h) * (g + h)) / (2.0 * h * y);
553	0		g = sqrt(f*f+1.0);
554	0	0	f = ((x - z) * (x + z) + h * (y / (f + (f >= 0 ? g : -g)) - h)) / x;
555			/* next qr transformation */
556	0		c = 1.0;
557	0		s = 1.0;
558	0	0	for (i1 = l; i1 <= k1; i1++)
559	0		{ i = i1 + 1;
560	0		g = rv1[i];
561	0		y = w[i];
562	0		h = s * g;
563	0		g = c * g;
564	0		z = sqrt(ff+hh);
565	0		rv1[i1] = z;
566	0		c = f / z;
567	0		s = h / z;
568	0		f = x * c + g * s;
569	0		g = -x * s + g * c;
570	0		h = y * s;
571	0		y = y * c;
572	0	0	for (j = 0; j < n; j++)
573	0		{ x = vt[i1][j];
574	0		z = vt[i][j];
575	0		vt[i1][j] = x * c + z * s;
576	0		vt[i][j] = -x * s + z * c;
577			}
578	0		z = sqrt(ff+hh);
579	0		w[i1] = z;
580			/* rotation can be arbitrary if z is zero */
581	0	0	if (z!=0.0)
582	0		{ c = f / z;
583	0		s = h / z;
584			}
585	0		f = c * g + s * y;
586	0		x = -s * g + c * y;
587	0	0	for (j = 0; j < m; j++)
588	0		{ y = u[j][i1];
589	0		z = u[j][i];
590	0		u[j][i1] = y * c + z * s;
591	0		u[j][i] = -y * s + z * c;
592			}
593			}
594	0		rv1[l] = 0.0;
595	0		rv1[k] = f;
596	0		w[k] = x;
597			}
598	0		}
599			}
600			}
601			else /* m < n */
602			{ /* Householder reduction to bidiagonal form */
603	0	0	for (i = 0; i < m; i++)
604	0		{ l = i + 1;
605	0		rv1[i] = scale * g;
606	0		g = 0.0;
607	0		s = 0.0;
608	0		scale = 0.0;
609	0	0	for (k = i; k < n; k++) scale += fabs(u[i][k]);
610	0	0	if (scale != 0.0)
611	0	0	{ for (k = i; k < n; k++)
612	0		{ u[i][k] /= scale;
613	0		s += u[i][k]*u[i][k];
614			}
615	0		f = u[i][i];
616	0	0	g = (f >= 0) ? -sqrt(s) : sqrt(s);
617	0		h = f * g - s;
618	0		u[i][i] = f - g;
619	0	0	if (i < m-1)
620	0	0	{ for (j = l; j < m; j++)
621	0		{ s = 0.0;
622	0	0	for (k = i; k < n; k++) s += u[i][k] * u[j][k];
623	0		f = s / h;
624	0	0	for (k = i; k < n; k++) u[j][k] += f * u[i][k];
625			}
626			}
627	0	0	for (k = i; k < n; k++) u[i][k] *= scale;
628			}
629	0		w[i] = scale * g;
630	0		g = 0.0;
631	0		s = 0.0;
632	0		scale = 0.0;
633	0	0	if (i
634	0	0	{ for (k = l; k < m; k++) scale += fabs(u[k][i]);
635	0	0	if (scale != 0.0)
636	0	0	{ for (k = l; k < m; k++)
637	0		{ u[k][i] /= scale;
638	0		s += u[k][i] * u[k][i];
639			}
640	0		f = u[l][i];
641	0	0	g = (f >= 0) ? -sqrt(s) : sqrt(s);
642	0		h = f * g - s;
643	0		u[l][i] = f - g;
644	0	0	for (k = l; k < m; k++) rv1[k] = u[k][i] / h;
645	0	0	for (j = l; j < n; j++)
646	0		{ s = 0.0;
647	0	0	for (k = l; k < m; k++) s += u[k][j] * u[k][i];
648	0	0	for (k = l; k < m; k++) u[k][j] += s * rv1[k];
649			}
650	0	0	for (k = l; k < m; k++) u[k][i] *= scale;
651			}
652			}
653	0	0	anorm = max(anorm,fabs(w[i])+fabs(rv1[i]));
654			}
655			/* accumulation of right-hand transformations */
656	0	0	for (i = m-1; i>=0; i--)
657	0	0	{ if (i < m-1)
658	0	0	{ if (g != 0.0)
659	0	0	{ for (j = l; j < m; j++) vt[j][i] = (u[j][i] / u[l][i]) / g;
660			/* double division avoids possible underflow */
661	0	0	for (j = l; j < m; j++)
662	0		{ s = 0.0;
663	0	0	for (k = l; k < m; k++) s += u[k][i] * vt[k][j];
664	0	0	for (k = l; k < m; k++) vt[k][j] += s * vt[k][i];
665			}
666			}
667			}
668	0	0	for (j = l; j < m; j++)
669	0		{ vt[i][j] = 0.0;
670	0		vt[j][i] = 0.0;
671			}
672	0		vt[i][i] = 1.0;
673	0		g = rv1[i];
674	0		l = i;
675			}
676			/* accumulation of left-hand transformations */
677	0	0	for (i = m-1; i >= 0; i--)
678	0		{ l = i + 1;
679	0		g = w[i];
680	0	0	if (i!=m-1)
681	0	0	for (j = l; j < m; j++) u[j][i] = 0.0;
682	0	0	if (g!=0.0)
683	0	0	{ if (i!=m-1)
684	0	0	{ for (j = l; j < m; j++)
685	0		{ s = 0.0;
686	0	0	for (k = l; k < n; k++) s += u[i][k] * u[j][k];
687			/* double division avoids possible underflow */
688	0		f = (s / u[i][i]) / g;
689	0	0	for (k = i; k < n; k++) u[j][k] += f * u[i][k];
690			}
691			}
692	0	0	for (j = i; j < n; j++) u[i][j] /= g;
693			}
694			else
695	0	0	for (j = i; j < n; j++) u[i][j] = 0.0;
696	0		u[i][i] += 1.0;
697			}
698			/* diagonalization of the bidiagonal form */
699	0	0	for (k = m-1; k >= 0; k--)
700	0		{ k1 = k-1;
701	0		its = 0;
702			while(1)
703			/* test for splitting */
704	0	0	{ for (l = k; l >= 0; l--)
705	0		{ l1 = l-1;
706	0	0	if (fabs(rv1[l]) + anorm == anorm) break;
707			/* rv1[0] is always zero, so there is no exit
708			* through the bottom of the loop */
709	0	0	if (fabs(w[l1]) + anorm == anorm)
710			/* cancellation of rv1[l] if l greater than 0 */
711	0		{ c = 0.0;
712	0		s = 1.0;
713	0	0	for (i = l; i <= k; i++)
714	0		{ f = s * rv1[i];
715	0		rv1[i] *= c;
716	0	0	if (fabs(f) + anorm == anorm) break;
717	0		g = w[i];
718	0		h = sqrt(ff+gg);
719	0		w[i] = h;
720	0		c = g / h;
721	0		s = -f / h;
722	0	0	for (j = 0; j < n; j++)
723	0		{ y = u[l1][j];
724	0		z = u[i][j];
725	0		u[l1][j] = y * c + z * s;
726	0		u[i][j] = -y * s + z * c;
727			}
728			}
729	0		break;
730			}
731			}
732			/* test for convergence */
733	0		z = w[k];
734	0	0	if (l==k) /* convergence */
735	0	0	{ if (z < 0.0)
736			/* w[k] is made non-negative */
737	0		{ w[k] = -z;
738	0	0	for (j = 0; j < m; j++) vt[j][k] = -vt[j][k];
739			}
740	0		break;
741			}
742	0	0	else if (its==30)
743	0		{ ierr = k;
744	0		break;
745			}
746			else
747			/* shift from bottom 2 by 2 minor */
748	0		{ its++;
749	0		x = w[l];
750	0		y = w[k1];
751	0		g = rv1[k1];
752	0		h = rv1[k];
753	0		f = ((y - z) * (y + z) + (g - h) * (g + h)) / (2.0 * h * y);
754	0		g = sqrt(f*f+1.0);
755	0	0	f = ((x - z) * (x + z) + h * (y / (f + (f >= 0 ? g : -g)) - h)) / x;
756			/* next qr transformation */
757	0		c = 1.0;
758	0		s = 1.0;
759	0	0	for (i1 = l; i1 <= k1; i1++)
760	0		{ i = i1 + 1;
761	0		g = rv1[i];
762	0		y = w[i];
763	0		h = s * g;
764	0		g = c * g;
765	0		z = sqrt(ff+hh);
766	0		rv1[i1] = z;
767	0		c = f / z;
768	0		s = h / z;
769	0		f = x * c + g * s;
770	0		g = -x * s + g * c;
771	0		h = y * s;
772	0		y = y * c;
773	0	0	for (j = 0; j < m; j++)
774	0		{ x = vt[j][i1];
775	0		z = vt[j][i];
776	0		vt[j][i1] = x * c + z * s;
777	0		vt[j][i] = -x * s + z * c;
778			}
779	0		z = sqrt(ff+hh);
780	0		w[i1] = z;
781			/* rotation can be arbitrary if z is zero */
782	0	0	if (z!=0.0)
783	0		{ c = f / z;
784	0		s = h / z;
785			}
786	0		f = c * g + s * y;
787	0		x = -s * g + c * y;
788	0	0	for (j = 0; j < n; j++)
789	0		{ y = u[i1][j];
790	0		z = u[i][j];
791	0		u[i1][j] = y * c + z * s;
792	0		u[i][j] = -y * s + z * c;
793			}
794			}
795	0		rv1[l] = 0.0;
796	0		rv1[k] = f;
797	0		w[k] = x;
798			}
799	0		}
800			}
801			}
802	0		free(rv1);
803	0		return ierr;
804			}
805
806			/* ********************************************************************* */
807
808	0		int pca(int nrows, int ncolumns, double u, double v, double* w)
809			/*
810			Purpose
811			=======
812
813			This subroutine uses the singular value decomposition to perform principal
814			components analysis of a real nrows by ncolumns rectangular matrix.
815
816			Arguments
817			=========
818
819			nrows (input) int
820			The number of rows in the matrix u.
821
822			ncolumns (input) int
823			The number of columns in the matrix v.
824
825			u (input) double[nrows][ncolumns]
826			On input, the array containing the data to which the principal component
827			analysis should be applied. The function assumes that the mean has already been
828			subtracted of each column, and hence that the mean of each column is zero.
829			On output, see below.
830
831			v (input) double[n][n], where n = min(nrows, ncolumns)
832			Not used on input.
833
834			w (input) double[n], where n = min(nrows, ncolumns)
835			Not used on input.
836
837
838			Return value
839			============
840
841			On output:
842
843			If nrows >= ncolumns, then
844
845			u contains the coordinates with respect to the principal components;
846			v contains the principal component vectors.
847
848			The dot product u . v reproduces the data that were passed in u.
849
850
851			If nrows < ncolumns, then
852
853			u contains the principal component vectors;
854			v contains the coordinates with respect to the principal components.
855
856			The dot product v . u reproduces the data that were passed in u.
857
858			The eigenvalues of the covariance matrix are returned in w.
859
860			The arrays u, v, and w are sorted according to eigenvalue, with the largest
861			eigenvalues appearing first.
862
863			The function returns 0 if successful, -1 if memory allocation fails, and a
864			positive integer if the singular value decomposition fails to converge.
865			*/
866			{
867			int i;
868			int j;
869			int error;
870	0		int* index = malloc(ncolumns*sizeof(int));
871	0		double* temp = malloc(ncolumns*sizeof(double));
872	0	0	if (!index \|\| !temp)
		0
873	0	0	{ if (index) free(index);
874	0	0	if (temp) free(temp);
875	0		return -1;
876			}
877	0		error = svd(nrows, ncolumns, u, w, v);
878	0	0	if (error==0)
879			{
880	0	0	if (nrows >= ncolumns)
881	0	0	{ for (j = 0; j < ncolumns; j++)
882	0		{ const double s = w[j];
883	0	0	for (i = 0; i < nrows; i++) u[i][j] *= s;
884			}
885	0		sort(ncolumns, w, index);
886	0	0	for (i = 0; i < ncolumns/2; i++)
887	0		{ j = index[i];
888	0		index[i] = index[ncolumns-1-i];
889	0		index[ncolumns-1-i] = j;
890			}
891	0	0	for (i = 0; i < nrows; i++)
892	0	0	{ for (j = 0; j < ncolumns; j++) temp[j] = u[i][index[j]];
893	0	0	for (j = 0; j < ncolumns; j++) u[i][j] = temp[j];
894			}
895	0	0	for (i = 0; i < ncolumns; i++)
896	0	0	{ for (j = 0; j < ncolumns; j++) temp[j] = v[index[j]][i];
897	0	0	for (j = 0; j < ncolumns; j++) v[j][i] = temp[j];
898			}
899	0	0	for (i = 0; i < ncolumns; i++) temp[i] = w[index[i]];
900	0	0	for (i = 0; i < ncolumns; i++) w[i] = temp[i];
901			}
902			else /* nrows < ncolumns */
903	0	0	{ for (j = 0; j < nrows; j++)
904	0		{ const double s = w[j];
905	0	0	for (i = 0; i < nrows; i++) v[i][j] *= s;
906			}
907	0		sort(nrows, w, index);
908	0	0	for (i = 0; i < nrows/2; i++)
909	0		{ j = index[i];
910	0		index[i] = index[nrows-1-i];
911	0		index[nrows-1-i] = j;
912			}
913	0	0	for (j = 0; j < ncolumns; j++)
914	0	0	{ for (i = 0; i < nrows; i++) temp[i] = u[index[i]][j];
915	0	0	for (i = 0; i < nrows; i++) u[i][j] = temp[i];
916			}
917	0	0	for (j = 0; j < nrows; j++)
918	0	0	{ for (i = 0; i < nrows; i++) temp[i] = v[j][index[i]];
919	0	0	for (i = 0; i < nrows; i++) v[j][i] = temp[i];
920			}
921	0	0	for (i = 0; i < nrows; i++) temp[i] = w[index[i]];
922	0	0	for (i = 0; i < nrows; i++) w[i] = temp[i];
923			}
924			}
925	0		free(index);
926	0		free(temp);
927	0		return error;
928			}
929
930			/* ********************************************************************* */
931
932			static
933	36704		double euclid (int n, double data1, double data2, int mask1, int mask2,
934			const double weight[], int index1, int index2, int transpose)
935
936			/*
937			Purpose
938			=======
939
940			The euclid routine calculates the weighted Euclidean distance between two
941			rows or columns in a matrix.
942
943			Arguments
944			=========
945
946			n (input) int
947			The number of elements in a row or column. If transpose==0, then n is the number
948			of columns; otherwise, n is the number of rows.
949
950			data1 (input) double array
951			The data array containing the first vector.
952
953			data2 (input) double array
954			The data array containing the second vector.
955
956			mask1 (input) int array
957			This array which elements in data1 are missing. If mask1[i][j]==0, then
958			data1[i][j] is missing.
959
960			mask2 (input) int array
961			This array which elements in data2 are missing. If mask2[i][j]==0, then
962			data2[i][j] is missing.
963
964			weight (input) double[n]
965			The weights that are used to calculate the distance.
966
967			index1 (input) int
968			Index of the first row or column.
969
970			index2 (input) int
971			Index of the second row or column.
972
973			transpose (input) int
974			If transpose==0, the distance between two rows in the matrix is calculated.
975			Otherwise, the distance between two columns in the matrix is calculated.
976
977			============================================================================
978			*/
979	36704		{ double result = 0.;
980	36704		double tweight = 0;
981			int i;
982	36704	50	if (transpose==0) /* Calculate the distance between two rows */
983	145242	100	{ for (i = 0; i < n; i++)
984	108538	50	{ if (mask1[index1][i] && mask2[index2][i])
		50
985	108538		{ double term = data1[index1][i] - data2[index2][i];
986	108538		result += weight[i]termterm;
987	108538		tweight += weight[i];
988			}
989			}
990			}
991			else
992	0	0	{ for (i = 0; i < n; i++)
993	0	0	{ if (mask1[i][index1] && mask2[i][index2])
		0
994	0		{ double term = data1[i][index1] - data2[i][index2];
995	0		result += weight[i]termterm;
996	0		tweight += weight[i];
997			}
998			}
999			}
1000	36704	50	if (!tweight) return 0; /* usually due to empty clusters */
1001	36704		result /= tweight;
1002	36704		return result;
1003			}
1004
1005			/* ********************************************************************* */
1006
1007			static
1008	64		double cityblock (int n, double data1, double data2, int** mask1,
1009			int** mask2, const double weight[], int index1, int index2, int transpose)
1010
1011			/*
1012			Purpose
1013			=======
1014
1015			The cityblock routine calculates the weighted "City Block" distance between
1016			two rows or columns in a matrix. City Block distance is defined as the
1017			absolute value of X1-X2 plus the absolute value of Y1-Y2 plus..., which is
1018			equivalent to taking an "up and over" path.
1019
1020			Arguments
1021			=========
1022
1023			n (input) int
1024			The number of elements in a row or column. If transpose==0, then n is the number
1025			of columns; otherwise, n is the number of rows.
1026
1027			data1 (input) double array
1028			The data array containing the first vector.
1029
1030			data2 (input) double array
1031			The data array containing the second vector.
1032
1033			mask1 (input) int array
1034			This array which elements in data1 are missing. If mask1[i][j]==0, then
1035			data1[i][j] is missing.
1036
1037			mask2 (input) int array
1038			This array which elements in data2 are missing. If mask2[i][j]==0, then
1039			data2[i][j] is missing.
1040
1041			weight (input) double[n]
1042			The weights that are used to calculate the distance.
1043
1044			index1 (input) int
1045			Index of the first row or column.
1046
1047			index2 (input) int
1048			Index of the second row or column.
1049
1050			transpose (input) int
1051			If transpose==0, the distance between two rows in the matrix is calculated.
1052			Otherwise, the distance between two columns in the matrix is calculated.
1053
1054			============================================================================ */
1055	64		{ double result = 0.;
1056	64		double tweight = 0;
1057			int i;
1058	64	50	if (transpose==0) /* Calculate the distance between two rows */
1059	256	100	{ for (i = 0; i < n; i++)
1060	192	50	{ if (mask1[index1][i] && mask2[index2][i])
		50
1061	192		{ double term = data1[index1][i] - data2[index2][i];
1062	192		result = result + weight[i]*fabs(term);
1063	192		tweight += weight[i];
1064			}
1065			}
1066			}
1067			else
1068	0	0	{ for (i = 0; i < n; i++)
1069	0	0	{ if (mask1[i][index1] && mask2[i][index2])
		0
1070	0		{ double term = data1[i][index1] - data2[i][index2];
1071	0		result = result + weight[i]*fabs(term);
1072	0		tweight += weight[i];
1073			}
1074			}
1075			}
1076	64	50	if (!tweight) return 0; /* usually due to empty clusters */
1077	64		result /= tweight;
1078	64		return result;
1079			}
1080
1081			/* ********************************************************************* */
1082
1083			static
1084	0		double correlation (int n, double data1, double data2, int** mask1,
1085			int** mask2, const double weight[], int index1, int index2, int transpose)
1086			/*
1087			Purpose
1088			=======
1089
1090			The correlation routine calculates the weighted Pearson distance between two
1091			rows or columns in a matrix. We define the Pearson distance as one minus the
1092			Pearson correlation.
1093			This definition yields a semi-metric: d(a,b) >= 0, and d(a,b) = 0 iff a = b.
1094			but the triangular inequality d(a,b) + d(b,c) >= d(a,c) does not hold
1095			(e.g., choose b = a + c).
1096
1097			Arguments
1098			=========
1099
1100			n (input) int
1101			The number of elements in a row or column. If transpose==0, then n is the number
1102			of columns; otherwise, n is the number of rows.
1103
1104			data1 (input) double array
1105			The data array containing the first vector.
1106
1107			data2 (input) double array
1108			The data array containing the second vector.
1109
1110			mask1 (input) int array
1111			This array which elements in data1 are missing. If mask1[i][j]==0, then
1112			data1[i][j] is missing.
1113
1114			mask2 (input) int array
1115			This array which elements in data2 are missing. If mask2[i][j]==0, then
1116			data2[i][j] is missing.
1117
1118			weight (input) double[n]
1119			The weights that are used to calculate the distance.
1120
1121			index1 (input) int
1122			Index of the first row or column.
1123
1124			index2 (input) int
1125			Index of the second row or column.
1126
1127			transpose (input) int
1128			If transpose==0, the distance between two rows in the matrix is calculated.
1129			Otherwise, the distance between two columns in the matrix is calculated.
1130			============================================================================
1131			*/
1132	0		{ double result = 0.;
1133	0		double sum1 = 0.;
1134	0		double sum2 = 0.;
1135	0		double denom1 = 0.;
1136	0		double denom2 = 0.;
1137	0		double tweight = 0.;
1138	0	0	if (transpose==0) /* Calculate the distance between two rows */
1139			{ int i;
1140	0	0	for (i = 0; i < n; i++)
1141	0	0	{ if (mask1[index1][i] && mask2[index2][i])
		0
1142	0		{ double term1 = data1[index1][i];
1143	0		double term2 = data2[index2][i];
1144	0		double w = weight[i];
1145	0		sum1 += w*term1;
1146	0		sum2 += w*term2;
1147	0		result += wterm1term2;
1148	0		denom1 += wterm1term1;
1149	0		denom2 += wterm2term2;
1150	0		tweight += w;
1151			}
1152			}
1153			}
1154			else
1155			{ int i;
1156	0	0	for (i = 0; i < n; i++)
1157	0	0	{ if (mask1[i][index1] && mask2[i][index2])
		0
1158	0		{ double term1 = data1[i][index1];
1159	0		double term2 = data2[i][index2];
1160	0		double w = weight[i];
1161	0		sum1 += w*term1;
1162	0		sum2 += w*term2;
1163	0		result += wterm1term2;
1164	0		denom1 += wterm1term1;
1165	0		denom2 += wterm2term2;
1166	0		tweight += w;
1167			}
1168			}
1169			}
1170	0	0	if (!tweight) return 0; /* usually due to empty clusters */
1171	0		result -= sum1 * sum2 / tweight;
1172	0		denom1 -= sum1 * sum1 / tweight;
1173	0		denom2 -= sum2 * sum2 / tweight;
1174	0	0	if (denom1 <= 0) return 1; /* include '<' to deal with roundoff errors */
1175	0	0	if (denom2 <= 0) return 1; /* include '<' to deal with roundoff errors */
1176	0		result = result / sqrt(denom1*denom2);
1177	0		result = 1. - result;
1178	0		return result;
1179			}
1180
1181			/* ********************************************************************* */
1182
1183			static
1184	0		double acorrelation (int n, double data1, double data2, int** mask1,
1185			int** mask2, const double weight[], int index1, int index2, int transpose)
1186			/*
1187			Purpose
1188			=======
1189
1190			The acorrelation routine calculates the weighted Pearson distance between two
1191			rows or columns, using the absolute value of the correlation.
1192			This definition yields a semi-metric: d(a,b) >= 0, and d(a,b) = 0 iff a = b.
1193			but the triangular inequality d(a,b) + d(b,c) >= d(a,c) does not hold
1194			(e.g., choose b = a + c).
1195
1196			Arguments
1197			=========
1198
1199			n (input) int
1200			The number of elements in a row or column. If transpose==0, then n is the number
1201			of columns; otherwise, n is the number of rows.
1202
1203			data1 (input) double array
1204			The data array containing the first vector.
1205
1206			data2 (input) double array
1207			The data array containing the second vector.
1208
1209			mask1 (input) int array
1210			This array which elements in data1 are missing. If mask1[i][j]==0, then
1211			data1[i][j] is missing.
1212
1213			mask2 (input) int array
1214			This array which elements in data2 are missing. If mask2[i][j]==0, then
1215			data2[i][j] is missing.
1216
1217			weight (input) double[n]
1218			The weights that are used to calculate the distance.
1219
1220			index1 (input) int
1221			Index of the first row or column.
1222
1223			index2 (input) int
1224			Index of the second row or column.
1225
1226			transpose (input) int
1227			If transpose==0, the distance between two rows in the matrix is calculated.
1228			Otherwise, the distance between two columns in the matrix is calculated.
1229			============================================================================
1230			*/
1231	0		{ double result = 0.;
1232	0		double sum1 = 0.;
1233	0		double sum2 = 0.;
1234	0		double denom1 = 0.;
1235	0		double denom2 = 0.;
1236	0		double tweight = 0.;
1237	0	0	if (transpose==0) /* Calculate the distance between two rows */
1238			{ int i;
1239	0	0	for (i = 0; i < n; i++)
1240	0	0	{ if (mask1[index1][i] && mask2[index2][i])
		0
1241	0		{ double term1 = data1[index1][i];
1242	0		double term2 = data2[index2][i];
1243	0		double w = weight[i];
1244	0		sum1 += w*term1;
1245	0		sum2 += w*term2;
1246	0		result += wterm1term2;
1247	0		denom1 += wterm1term1;
1248	0		denom2 += wterm2term2;
1249	0		tweight += w;
1250			}
1251			}
1252			}
1253			else
1254			{ int i;
1255	0	0	for (i = 0; i < n; i++)
1256	0	0	{ if (mask1[i][index1] && mask2[i][index2])
		0
1257	0		{ double term1 = data1[i][index1];
1258	0		double term2 = data2[i][index2];
1259	0		double w = weight[i];
1260	0		sum1 += w*term1;
1261	0		sum2 += w*term2;
1262	0		result += wterm1term2;
1263	0		denom1 += wterm1term1;
1264	0		denom2 += wterm2term2;
1265	0		tweight += w;
1266			}
1267			}
1268			}
1269	0	0	if (!tweight) return 0; /* usually due to empty clusters */
1270	0		result -= sum1 * sum2 / tweight;
1271	0		denom1 -= sum1 * sum1 / tweight;
1272	0		denom2 -= sum2 * sum2 / tweight;
1273	0	0	if (denom1 <= 0) return 1; /* include '<' to deal with roundoff errors */
1274	0	0	if (denom2 <= 0) return 1; /* include '<' to deal with roundoff errors */
1275	0		result = fabs(result) / sqrt(denom1*denom2);
1276	0		result = 1. - result;
1277	0		return result;
1278			}
1279
1280			/* ********************************************************************* */
1281
1282			static
1283	0		double ucorrelation (int n, double data1, double data2, int** mask1,
1284			int** mask2, const double weight[], int index1, int index2, int transpose)
1285			/*
1286			Purpose
1287			=======
1288
1289			The ucorrelation routine calculates the weighted Pearson distance between two
1290			rows or columns, using the uncentered version of the Pearson correlation. In the
1291			uncentered Pearson correlation, a zero mean is used for both vectors even if
1292			the actual mean is nonzero.
1293			This definition yields a semi-metric: d(a,b) >= 0, and d(a,b) = 0 iff a = b.
1294			but the triangular inequality d(a,b) + d(b,c) >= d(a,c) does not hold
1295			(e.g., choose b = a + c).
1296
1297			Arguments
1298			=========
1299
1300			n (input) int
1301			The number of elements in a row or column. If transpose==0, then n is the number
1302			of columns; otherwise, n is the number of rows.
1303
1304			data1 (input) double array
1305			The data array containing the first vector.
1306
1307			data2 (input) double array
1308			The data array containing the second vector.
1309
1310			mask1 (input) int array
1311			This array which elements in data1 are missing. If mask1[i][j]==0, then
1312			data1[i][j] is missing.
1313
1314			mask2 (input) int array
1315			This array which elements in data2 are missing. If mask2[i][j]==0, then
1316			data2[i][j] is missing.
1317
1318			weight (input) double[n]
1319			The weights that are used to calculate the distance.
1320
1321			index1 (input) int
1322			Index of the first row or column.
1323
1324			index2 (input) int
1325			Index of the second row or column.
1326
1327			transpose (input) int
1328			If transpose==0, the distance between two rows in the matrix is calculated.
1329			Otherwise, the distance between two columns in the matrix is calculated.
1330			============================================================================
1331			*/
1332	0		{ double result = 0.;
1333	0		double denom1 = 0.;
1334	0		double denom2 = 0.;
1335	0		int flag = 0;
1336			/* flag will remain zero if no nonzero combinations of mask1 and mask2 are
1337			* found.
1338			*/
1339	0	0	if (transpose==0) /* Calculate the distance between two rows */
1340			{ int i;
1341	0	0	for (i = 0; i < n; i++)
1342	0	0	{ if (mask1[index1][i] && mask2[index2][i])
		0
1343	0		{ double term1 = data1[index1][i];
1344	0		double term2 = data2[index2][i];
1345	0		double w = weight[i];
1346	0		result += wterm1term2;
1347	0		denom1 += wterm1term1;
1348	0		denom2 += wterm2term2;
1349	0		flag = 1;
1350			}
1351			}
1352			}
1353			else
1354			{ int i;
1355	0	0	for (i = 0; i < n; i++)
1356	0	0	{ if (mask1[i][index1] && mask2[i][index2])
		0
1357	0		{ double term1 = data1[i][index1];
1358	0		double term2 = data2[i][index2];
1359	0		double w = weight[i];
1360	0		result += wterm1term2;
1361	0		denom1 += wterm1term1;
1362	0		denom2 += wterm2term2;
1363	0		flag = 1;
1364			}
1365			}
1366			}
1367	0	0	if (!flag) return 0.;
1368	0	0	if (denom1==0.) return 1.;
1369	0	0	if (denom2==0.) return 1.;
1370	0		result = result / sqrt(denom1*denom2);
1371	0		result = 1. - result;
1372	0		return result;
1373			}
1374
1375			/* ********************************************************************* */
1376
1377			static
1378	0		double uacorrelation (int n, double data1, double data2, int** mask1,
1379			int** mask2, const double weight[], int index1, int index2, int transpose)
1380			/*
1381			Purpose
1382			=======
1383
1384			The uacorrelation routine calculates the weighted Pearson distance between two
1385			rows or columns, using the absolute value of the uncentered version of the
1386			Pearson correlation. In the uncentered Pearson correlation, a zero mean is used
1387			for both vectors even if the actual mean is nonzero.
1388			This definition yields a semi-metric: d(a,b) >= 0, and d(a,b) = 0 iff a = b.
1389			but the triangular inequality d(a,b) + d(b,c) >= d(a,c) does not hold
1390			(e.g., choose b = a + c).
1391
1392			Arguments
1393			=========
1394
1395			n (input) int
1396			The number of elements in a row or column. If transpose==0, then n is the number
1397			of columns; otherwise, n is the number of rows.
1398
1399			data1 (input) double array
1400			The data array containing the first vector.
1401
1402			data2 (input) double array
1403			The data array containing the second vector.
1404
1405			mask1 (input) int array
1406			This array which elements in data1 are missing. If mask1[i][j]==0, then
1407			data1[i][j] is missing.
1408
1409			mask2 (input) int array
1410			This array which elements in data2 are missing. If mask2[i][j]==0, then
1411			data2[i][j] is missing.
1412
1413			weight (input) double[n]
1414			The weights that are used to calculate the distance.
1415
1416			index1 (input) int
1417			Index of the first row or column.
1418
1419			index2 (input) int
1420			Index of the second row or column.
1421
1422			transpose (input) int
1423			If transpose==0, the distance between two rows in the matrix is calculated.
1424			Otherwise, the distance between two columns in the matrix is calculated.
1425			============================================================================
1426			*/
1427	0		{ double result = 0.;
1428	0		double denom1 = 0.;
1429	0		double denom2 = 0.;
1430	0		int flag = 0;
1431			/* flag will remain zero if no nonzero combinations of mask1 and mask2 are
1432			* found.
1433			*/
1434	0	0	if (transpose==0) /* Calculate the distance between two rows */
1435			{ int i;
1436	0	0	for (i = 0; i < n; i++)
1437	0	0	{ if (mask1[index1][i] && mask2[index2][i])
		0
1438	0		{ double term1 = data1[index1][i];
1439	0		double term2 = data2[index2][i];
1440	0		double w = weight[i];
1441	0		result += wterm1term2;
1442	0		denom1 += wterm1term1;
1443	0		denom2 += wterm2term2;
1444	0		flag = 1;
1445			}
1446			}
1447			}
1448			else
1449			{ int i;
1450	0	0	for (i = 0; i < n; i++)
1451	0	0	{ if (mask1[i][index1] && mask2[i][index2])
		0
1452	0		{ double term1 = data1[i][index1];
1453	0		double term2 = data2[i][index2];
1454	0		double w = weight[i];
1455	0		result += wterm1term2;
1456	0		denom1 += wterm1term1;
1457	0		denom2 += wterm2term2;
1458	0		flag = 1;
1459			}
1460			}
1461			}
1462	0	0	if (!flag) return 0.;
1463	0	0	if (denom1==0.) return 1.;
1464	0	0	if (denom2==0.) return 1.;
1465	0		result = fabs(result) / sqrt(denom1*denom2);
1466	0		result = 1. - result;
1467	0		return result;
1468			}
1469
1470			/* ********************************************************************* */
1471
1472			static
1473	0		double spearman (int n, double data1, double data2, int** mask1,
1474			int** mask2, const double weight[], int index1, int index2, int transpose)
1475			/*
1476			Purpose
1477			=======
1478
1479			The spearman routine calculates the Spearman distance between two rows or
1480			columns. The Spearman distance is defined as one minus the Spearman rank
1481			correlation.
1482
1483			Arguments
1484			=========
1485
1486			n (input) int
1487			The number of elements in a row or column. If transpose==0, then n is the number
1488			of columns; otherwise, n is the number of rows.
1489
1490			data1 (input) double array
1491			The data array containing the first vector.
1492
1493			data2 (input) double array
1494			The data array containing the second vector.
1495
1496			mask1 (input) int array
1497			This array which elements in data1 are missing. If mask1[i][j]==0, then
1498			data1[i][j] is missing.
1499
1500			mask2 (input) int array
1501			This array which elements in data2 are missing. If mask2[i][j]==0, then
1502			data2[i][j] is missing.
1503
1504			weight (input) double[n]
1505			These weights are ignored, but included for consistency with other distance
1506			measures.
1507
1508			index1 (input) int
1509			Index of the first row or column.
1510
1511			index2 (input) int
1512			Index of the second row or column.
1513
1514			transpose (input) int
1515			If transpose==0, the distance between two rows in the matrix is calculated.
1516			Otherwise, the distance between two columns in the matrix is calculated.
1517			============================================================================
1518			*/
1519			{ int i;
1520	0		int m = 0;
1521			double* rank1;
1522			double* rank2;
1523	0		double result = 0.;
1524	0		double denom1 = 0.;
1525	0		double denom2 = 0.;
1526			double avgrank;
1527			double* tdata1;
1528			double* tdata2;
1529	0		tdata1 = malloc(n*sizeof(double));
1530	0	0	if(!tdata1) return 0.0; /* Memory allocation error */
1531	0		tdata2 = malloc(n*sizeof(double));
1532	0	0	if(!tdata2) /* Memory allocation error */
1533	0		{ free(tdata1);
1534	0		return 0.0;
1535			}
1536	0	0	if (transpose==0)
1537	0	0	{ for (i = 0; i < n; i++)
1538	0	0	{ if (mask1[index1][i] && mask2[index2][i])
		0
1539	0		{ tdata1[m] = data1[index1][i];
1540	0		tdata2[m] = data2[index2][i];
1541	0		m++;
1542			}
1543			}
1544			}
1545			else
1546	0	0	{ for (i = 0; i < n; i++)
1547	0	0	{ if (mask1[i][index1] && mask2[i][index2])
		0
1548	0		{ tdata1[m] = data1[i][index1];
1549	0		tdata2[m] = data2[i][index2];
1550	0		m++;
1551			}
1552			}
1553			}
1554	0	0	if (m==0)
1555	0		{ free(tdata1);
1556	0		free(tdata2);
1557	0		return 0;
1558			}
1559	0		rank1 = getrank(m, tdata1);
1560	0		free(tdata1);
1561	0	0	if(!rank1)
1562	0		{ free(tdata2);
1563	0		return 0.0; /* Memory allocation error */
1564			}
1565	0		rank2 = getrank(m, tdata2);
1566	0		free(tdata2);
1567	0	0	if(!rank2) /* Memory allocation error */
1568	0		{ free(rank1);
1569	0		return 0.0;
1570			}
1571	0		avgrank = 0.5(m-1); / Average rank */
1572	0	0	for (i = 0; i < m; i++)
1573	0		{ const double value1 = rank1[i];
1574	0		const double value2 = rank2[i];
1575	0		result += value1 * value2;
1576	0		denom1 += value1 * value1;
1577	0		denom2 += value2 * value2;
1578			}
1579			/* Note: denom1 and denom2 cannot be calculated directly from the number
1580			* of elements. If two elements have the same rank, the squared sum of
1581			* their ranks will change.
1582			*/
1583	0		free(rank1);
1584	0		free(rank2);
1585	0		result /= m;
1586	0		denom1 /= m;
1587	0		denom2 /= m;
1588	0		result -= avgrank * avgrank;
1589	0		denom1 -= avgrank * avgrank;
1590	0		denom2 -= avgrank * avgrank;
1591	0	0	if (denom1 <= 0) return 1; /* include '<' to deal with roundoff errors */
1592	0	0	if (denom2 <= 0) return 1; /* include '<' to deal with roundoff errors */
1593	0		result = result / sqrt(denom1*denom2);
1594	0		result = 1. - result;
1595	0		return result;
1596			}
1597
1598			/* ********************************************************************* */
1599
1600			static
1601	0		double kendall (int n, double data1, double data2, int mask1, int mask2,
1602			const double weight[], int index1, int index2, int transpose)
1603			/*
1604			Purpose
1605			=======
1606
1607			The kendall routine calculates the Kendall distance between two
1608			rows or columns. The Kendall distance is defined as one minus Kendall's tau.
1609
1610			Arguments
1611			=========
1612
1613			n (input) int
1614			The number of elements in a row or column. If transpose==0, then n is the number
1615			of columns; otherwise, n is the number of rows.
1616
1617			data1 (input) double array
1618			The data array containing the first vector.
1619
1620			data2 (input) double array
1621			The data array containing the second vector.
1622
1623			mask1 (input) int array
1624			This array which elements in data1 are missing. If mask1[i][j]==0, then
1625			data1[i][j] is missing.
1626
1627			mask2 (input) int array
1628			This array which elements in data2 are missing. If mask2[i][j]==0, then
1629			data2[i][j] is missing.
1630
1631			weight (input) double[n]
1632			These weights are ignored, but included for consistency with other distance
1633			measures.
1634
1635			index1 (input) int
1636			Index of the first row or column.
1637
1638			index2 (input) int
1639			Index of the second row or column.
1640
1641			transpose (input) int
1642			If transpose==0, the distance between two rows in the matrix is calculated.
1643			Otherwise, the distance between two columns in the matrix is calculated.
1644			============================================================================
1645			*/
1646	0		{ int con = 0;
1647	0		int dis = 0;
1648	0		int exx = 0;
1649	0		int exy = 0;
1650	0		int flag = 0;
1651			/* flag will remain zero if no nonzero combinations of mask1 and mask2 are
1652			* found.
1653			*/
1654			double denomx;
1655			double denomy;
1656			double tau;
1657			int i, j;
1658	0	0	if (transpose==0)
1659	0	0	{ for (i = 0; i < n; i++)
1660	0	0	{ if (mask1[index1][i] && mask2[index2][i])
		0
1661	0	0	{ for (j = 0; j < i; j++)
1662	0	0	{ if (mask1[index1][j] && mask2[index2][j])
		0
1663	0		{ double x1 = data1[index1][i];
1664	0		double x2 = data1[index1][j];
1665	0		double y1 = data2[index2][i];
1666	0		double y2 = data2[index2][j];
1667	0	0	if (x1 < x2 && y1 < y2) con++;
		0
1668	0	0	if (x1 > x2 && y1 > y2) con++;
		0
1669	0	0	if (x1 < x2 && y1 > y2) dis++;
		0
1670	0	0	if (x1 > x2 && y1 < y2) dis++;
		0
1671	0	0	if (x1 == x2 && y1 != y2) exx++;
		0
1672	0	0	if (x1 != x2 && y1 == y2) exy++;
		0
1673	0		flag = 1;
1674			}
1675			}
1676			}
1677			}
1678			}
1679			else
1680	0	0	{ for (i = 0; i < n; i++)
1681	0	0	{ if (mask1[i][index1] && mask2[i][index2])
		0
1682	0	0	{ for (j = 0; j < i; j++)
1683	0	0	{ if (mask1[j][index1] && mask2[j][index2])
		0
1684	0		{ double x1 = data1[i][index1];
1685	0		double x2 = data1[j][index1];
1686	0		double y1 = data2[i][index2];
1687	0		double y2 = data2[j][index2];
1688	0	0	if (x1 < x2 && y1 < y2) con++;
		0
1689	0	0	if (x1 > x2 && y1 > y2) con++;
		0
1690	0	0	if (x1 < x2 && y1 > y2) dis++;
		0
1691	0	0	if (x1 > x2 && y1 < y2) dis++;
		0
1692	0	0	if (x1 == x2 && y1 != y2) exx++;
		0
1693	0	0	if (x1 != x2 && y1 == y2) exy++;
		0
1694	0		flag = 1;
1695			}
1696			}
1697			}
1698			}
1699			}
1700	0	0	if (!flag) return 0.;
1701	0		denomx = con + dis + exx;
1702	0		denomy = con + dis + exy;
1703	0	0	if (denomx==0) return 1;
1704	0	0	if (denomy==0) return 1;
1705	0		tau = (con-dis)/sqrt(denomx*denomy);
1706	0		return 1.-tau;
1707			}
1708
1709			/* ********************************************************************* */
1710
1711	64		static double(*setmetric(char dist))
1712			(int, double, double, int, int, const double[], int, int, int)
1713	64		{ switch(dist)
1714	24		{ case 'e': return &euclid;
1715	40		case 'b': return &cityblock;
1716	0		case 'c': return &correlation;
1717	0		case 'a': return &acorrelation;
1718	0		case 'u': return &ucorrelation;
1719	0		case 'x': return &uacorrelation;
1720	0		case 's': return &spearman;
1721	0		case 'k': return &kendall;
1722	0		default: return &euclid;
1723			}
1724			return NULL; /* Never get here */
1725			}
1726
1727			/* ********************************************************************* */
1728
1729	4317		static double uniform(void)
1730			/*
1731			Purpose
1732			=======
1733
1734			This routine returns a uniform random number between 0.0 and 1.0. Both 0.0
1735			and 1.0 are excluded. This random number generator is described in:
1736
1737			Pierre l'Ecuyer
1738			Efficient and Portable Combined Random Number Generators
1739			Communications of the ACM, Volume 31, Number 6, June 1988, pages 742-749,774.
1740
1741			The first time this routine is called, it initializes the random number
1742			generator using the current time. First, the current epoch time in seconds is
1743			used as a seed for the random number generator in the C library. The first two
1744			random numbers generated by this generator are used to initialize the random
1745			number generator implemented in this routine.
1746
1747
1748			Arguments
1749			=========
1750
1751			None.
1752
1753
1754			Return value
1755			============
1756
1757			A double-precison number between 0.0 and 1.0.
1758			============================================================================
1759			*/
1760			{ int z;
1761			static const int m1 = 2147483563;
1762			static const int m2 = 2147483399;
1763	4317		const double scale = 1.0/m1;
1764
1765			static int s1 = 0;
1766			static int s2 = 0;
1767
1768	4317	100	if (s1==0 \|\| s2==0) /* initialize */
		50
1769	3		{ unsigned int initseed = (unsigned int) time(0);
1770	3		srand(initseed);
1771	3		s1 = rand();
1772	3		s2 = rand();
1773			}
1774
1775			do
1776			{ int k;
1777	4317		k = s1/53668;
1778	4317		s1 = 40014(s1-k53668)-k*12211;
1779	4317	100	if (s1 < 0) s1+=m1;
1780	4317		k = s2/52774;
1781	4317		s2 = 40692(s2-k52774)-k*3791;
1782	4317	100	if(s2 < 0) s2+=m2;
1783	4317		z = s1-s2;
1784	4317	100	if(z < 1) z+=(m1-1);
1785	4317	50	} while (z==m1); /* To avoid returning 1.0 */
1786
1787	4317		return z*scale;
1788			}
1789
1790			/* ************************************************************************ */
1791
1792	700		static int binomial(int n, double p)
1793			/*
1794			Purpose
1795			=======
1796
1797			This routine generates a random number between 0 and n inclusive, following
1798			the binomial distribution with probability p and n trials. The routine is
1799			based on the BTPE algorithm, described in:
1800
1801			Voratas Kachitvichyanukul and Bruce W. Schmeiser:
1802			Binomial Random Variate Generation
1803			Communications of the ACM, Volume 31, Number 2, February 1988, pages 216-222.
1804
1805
1806			Arguments
1807			=========
1808
1809			p (input) double
1810			The probability of a single event. This probability should be less than or
1811			equal to 0.5.
1812
1813			n (input) int
1814			The number of trials.
1815
1816
1817			Return value
1818			============
1819
1820			An integer drawn from a binomial distribution with parameters (p, n).
1821
1822			============================================================================
1823			*/
1824	700		{ const double q = 1 - p;
1825	700	50	if (np < 30.0) / Algorithm BINV */
1826	700		{ const double s = p/q;
1827	700		const double a = (n+1)*s;
1828	700		double r = exp(nlog(q)); / pow() causes a crash on AIX */
1829	700		int x = 0;
1830	700		double u = uniform();
1831			while(1)
1832	2002	100	{ if (u < r) return x;
1833	1302		u-=r;
1834	1302		x++;
1835	1302		r *= (a/x)-s;
1836	1302		}
1837			}
1838			else /* Algorithm BTPE */
1839			{ /* Step 0 */
1840	0		const double fm = n*p + p;
1841	0		const int m = (int) fm;
1842	0		const double p1 = floor(2.195sqrt(npq) -4.6q) + 0.5;
1843	0		const double xm = m + 0.5;
1844	0		const double xl = xm - p1;
1845	0		const double xr = xm + p1;
1846	0		const double c = 0.134 + 20.5/(15.3+m);
1847	0		const double a = (fm-xl)/(fm-xl*p);
1848	0		const double b = (xr-fm)/(xr*q);
1849	0		const double lambdal = a(1.0+0.5a);
1850	0		const double lambdar = b(1.0+0.5b);
1851	0		const double p2 = p1(1+2c);
1852	0		const double p3 = p2 + c/lambdal;
1853	0		const double p4 = p3 + c/lambdar;
1854			while (1)
1855			{ /* Step 1 */
1856			int y;
1857			int k;
1858	0		double u = uniform();
1859	0		double v = uniform();
1860	0		u *= p4;
1861	0	0	if (u <= p1) return (int)(xm-p1*v+u);
1862			/* Step 2 */
1863	0	0	if (u > p2)
1864			{ /* Step 3 */
1865	0	0	if (u > p3)
1866			{ /* Step 4 */
1867	0		y = (int)(xr-log(v)/lambdar);
1868	0	0	if (y > n) continue;
1869			/* Go to step 5 */
1870	0		v = v(u-p3)lambdar;
1871			}
1872			else
1873	0		{ y = (int)(xl+log(v)/lambdal);
1874	0	0	if (y < 0) continue;
1875			/* Go to step 5 */
1876	0		v = v(u-p2)lambdal;
1877			}
1878			}
1879			else
1880	0		{ const double x = xl + (u-p1)/c;
1881	0		v = v*c + 1.0 - fabs(m-x+0.5)/p1;
1882	0	0	if (v > 1) continue;
1883			/* Go to step 5 */
1884	0		y = (int)x;
1885			}
1886			/* Step 5 */
1887			/* Step 5.0 */
1888	0		k = abs(y-m);
1889	0	0	if (k > 20 && k < 0.5np*q-1.0)
		0
1890			{ /* Step 5.2 */
1891	0		double rho = (k/(npq))((k(k/3.0 + 0.625) + 0.1666666666666)/(npq)+0.5);
1892	0		double t = -kk/(2npq);
1893	0		double A = log(v);
1894	0	0	if (A < t-rho) return y;
1895	0	0	else if (A > t+rho) continue;
1896			else
1897			{ /* Step 5.3 */
1898	0		double x1 = y+1;
1899	0		double f1 = m+1;
1900	0		double z = n+1-m;
1901	0		double w = n-y+1;
1902	0		double x2 = x1*x1;
1903	0		double f2 = f1*f1;
1904	0		double z2 = z*z;
1905	0		double w2 = w*w;
1906	0	0	if (A > xm * log(f1/x1) + (n-m+0.5)*log(z/w)
1907	0		+ (y-m)log(wp/(x1*q))
1908	0		+ (13860.-(462.-(132.-(99.-140./f2)/f2)/f2)/f2)/f1/166320.
1909	0		+ (13860.-(462.-(132.-(99.-140./z2)/z2)/z2)/z2)/z/166320.
1910	0		+ (13860.-(462.-(132.-(99.-140./x2)/x2)/x2)/x2)/x1/166320.
1911	0		+ (13860.-(462.-(132.-(99.-140./w2)/w2)/w2)/w2)/w/166320.)
1912	0		continue;
1913	0		return y;
1914			}
1915			}
1916			else
1917			{ /* Step 5.1 */
1918			int i;
1919	0		const double s = p/q;
1920	0		const double aa = s*(n+1);
1921	0		double f = 1.0;
1922	0	0	for (i = m; i < y; f *= (aa/(++i)-s));
1923	0	0	for (i = y; i < m; f /= (aa/(++i)-s));
1924	0	0	if (v > f) continue;
1925	0		return y;
1926			}
1927	0		}
1928			}
1929			/* Never get here */
1930			return -1;
1931			}
1932
1933			/* ************************************************************************ */
1934
1935	300		static void randomassign (int nclusters, int nelements, int clusterid[])
1936			/*
1937			Purpose
1938			=======
1939
1940			The randomassign routine performs an initial random clustering, needed for
1941			k-means or k-median clustering. Elements (genes or microarrays) are randomly
1942			assigned to clusters. The number of elements in each cluster is chosen
1943			randomly, making sure that each cluster will receive at least one element.
1944
1945
1946			Arguments
1947			=========
1948
1949			nclusters (input) int
1950			The number of clusters.
1951
1952			nelements (input) int
1953			The number of elements to be clustered (i.e., the number of genes or microarrays
1954			to be clustered).
1955
1956			clusterid (output) int[nelements]
1957			The cluster number to which an element was assigned.
1958
1959			============================================================================
1960			*/
1961			{ int i, j;
1962	300		int k = 0;
1963			double p;
1964	300		int n = nelements-nclusters;
1965			/* Draw the number of elements in each cluster from a multinomial
1966			* distribution, reserving ncluster elements to set independently
1967			* in order to guarantee that none of the clusters are empty.
1968			*/
1969	1000	100	for (i = 0; i < nclusters-1; i++)
1970	700		{ p = 1.0/(nclusters-i);
1971	700		j = binomial(n, p);
1972	700		n -= j;
1973	700		j += k+1; /* Assign at least one element to cluster i */
1974	2702	100	for ( ; k < j; k++) clusterid[k] = i;
1975			}
1976			/* Assign the remaining elements to the last cluster */
1977	1198	100	for ( ; k < nelements; k++) clusterid[k] = i;
1978
1979			/* Create a random permutation of the cluster assignments */
1980	3200	100	for (i = 0; i < nelements; i++)
1981	2900		{ j = (int) (i + (nelements-i)*uniform());
1982	2900		k = clusterid[j];
1983	2900		clusterid[j] = clusterid[i];
1984	2900		clusterid[i] = k;
1985			}
1986
1987	300		return;
1988			}
1989
1990			/* ********************************************************************* */
1991
1992	555		static void getclustermeans(int nclusters, int nrows, int ncolumns,
1993			double data, int mask, int clusterid[], double cdata, int cmask,
1994			int transpose)
1995			/*
1996			Purpose
1997			=======
1998
1999			The getclustermeans routine calculates the cluster centroids, given to which
2000			cluster each element belongs. The centroid is defined as the mean over all
2001			elements for each dimension.
2002
2003			Arguments
2004			=========
2005
2006			nclusters (input) int
2007			The number of clusters.
2008
2009			nrows (input) int
2010			The number of rows in the gene expression data matrix, equal to the number of
2011			genes.
2012
2013			ncolumns (input) int
2014			The number of columns in the gene expression data matrix, equal to the number of
2015			microarrays.
2016
2017			data (input) double[nrows][ncolumns]
2018			The array containing the gene expression data.
2019
2020			mask (input) int[nrows][ncolumns]
2021			This array shows which data values are missing. If mask[i][j]==0, then
2022			data[i][j] is missing.
2023
2024			clusterid (output) int[nrows] if transpose==0
2025			int[ncolumns] if transpose==1
2026			The cluster number to which each element belongs. If transpose==0, then the
2027			dimension of clusterid is equal to nrows (the number of genes). Otherwise, it
2028			is equal to ncolumns (the number of microarrays).
2029
2030			cdata (output) double[nclusters][ncolumns] if transpose==0
2031			double[nrows][nclusters] if transpose==1
2032			On exit of getclustermeans, this array contains the cluster centroids.
2033
2034			cmask (output) int[nclusters][ncolumns] if transpose==0
2035			int[nrows][nclusters] if transpose==1
2036			This array shows which data values of are missing for each centroid. If
2037			cmask[i][j]==0, then cdata[i][j] is missing. A data value is missing for
2038			a centroid if all corresponding data values of the cluster members are missing.
2039
2040			transpose (input) int
2041			If transpose==0, clusters of rows (genes) are specified. Otherwise, clusters of
2042			columns (microarrays) are specified.
2043
2044			========================================================================
2045			*/
2046			{ int i, j, k;
2047	555	50	if (transpose==0)
2048	2220	100	{ for (i = 0; i < nclusters; i++)
2049	6813	100	{ for (j = 0; j < ncolumns; j++)
2050	5148		{ cmask[i][j] = 0;
2051	5148		cdata[i][j] = 0.;
2052			}
2053			}
2054	5952	100	for (k = 0; k < nrows; k++)
2055	5397		{ i = clusterid[k];
2056	18615	100	for (j = 0; j < ncolumns; j++)
2057	13218	50	{ if (mask[k][j] != 0)
2058	13218		{ cdata[i][j]+=data[k][j];
2059	13218		cmask[i][j]++;
2060			}
2061			}
2062			}
2063	2220	100	for (i = 0; i < nclusters; i++)
2064	6813	100	{ for (j = 0; j < ncolumns; j++)
2065	5148	50	{ if (cmask[i][j]>0)
2066	5148		{ cdata[i][j] /= cmask[i][j];
2067	5148		cmask[i][j] = 1;
2068			}
2069			}
2070			}
2071			}
2072			else
2073	0	0	{ for (i = 0; i < nrows; i++)
2074	0	0	{ for (j = 0; j < nclusters; j++)
2075	0		{ cdata[i][j] = 0.;
2076	0		cmask[i][j] = 0;
2077			}
2078			}
2079	0	0	for (k = 0; k < ncolumns; k++)
2080	0		{ i = clusterid[k];
2081	0	0	for (j = 0; j < nrows; j++)
2082	0	0	{ if (mask[j][k] != 0)
2083	0		{ cdata[j][i]+=data[j][k];
2084	0		cmask[j][i]++;
2085			}
2086			}
2087			}
2088	0	0	for (i = 0; i < nrows; i++)
2089	0	0	{ for (j = 0; j < nclusters; j++)
2090	0	0	{ if (cmask[i][j]>0)
2091	0		{ cdata[i][j] /= cmask[i][j];
2092	0		cmask[i][j] = 1;
2093			}
2094			}
2095			}
2096			}
2097	555		}
2098
2099			/* ********************************************************************* */
2100
2101			static void
2102	0		getclustermedians(int nclusters, int nrows, int ncolumns,
2103			double data, int mask, int clusterid[], double cdata, int cmask,
2104			int transpose, double cache[])
2105			/*
2106			Purpose
2107			=======
2108
2109			The getclustermedians routine calculates the cluster centroids, given to which
2110			cluster each element belongs. The centroid is defined as the median over all
2111			elements for each dimension.
2112
2113			Arguments
2114			=========
2115
2116			nclusters (input) int
2117			The number of clusters.
2118
2119			nrows (input) int
2120			The number of rows in the gene expression data matrix, equal to the number of
2121			genes.
2122
2123			ncolumns (input) int
2124			The number of columns in the gene expression data matrix, equal to the number of
2125			microarrays.
2126
2127			data (input) double[nrows][ncolumns]
2128			The array containing the gene expression data.
2129
2130			mask (input) int[nrows][ncolumns]
2131			This array shows which data values are missing. If mask[i][j]==0, then
2132			data[i][j] is missing.
2133
2134			clusterid (output) int[nrows] if transpose==0
2135			int[ncolumns] if transpose==1
2136			The cluster number to which each element belongs. If transpose==0, then the
2137			dimension of clusterid is equal to nrows (the number of genes). Otherwise, it
2138			is equal to ncolumns (the number of microarrays).
2139
2140			cdata (output) double[nclusters][ncolumns] if transpose==0
2141			double[nrows][nclusters] if transpose==1
2142			On exit of getclustermedians, this array contains the cluster centroids.
2143
2144			cmask (output) int[nclusters][ncolumns] if transpose==0
2145			int[nrows][nclusters] if transpose==1
2146			This array shows which data values of are missing for each centroid. If
2147			cmask[i][j]==0, then cdata[i][j] is missing. A data value is missing for
2148			a centroid if all corresponding data values of the cluster members are missing.
2149
2150			transpose (input) int
2151			If transpose==0, clusters of rows (genes) are specified. Otherwise, clusters of
2152			columns (microarrays) are specified.
2153
2154			cache (input) double[nrows] if transpose==0
2155			double[ncolumns] if transpose==1
2156			This array should be allocated before calling getclustermedians; its contents
2157			on input is not relevant. This array is used as a temporary storage space when
2158			calculating the medians.
2159
2160			========================================================================
2161			*/
2162			{ int i, j, k;
2163	0	0	if (transpose==0)
2164	0	0	{ for (i = 0; i < nclusters; i++)
2165	0	0	{ for (j = 0; j < ncolumns; j++)
2166	0		{ int count = 0;
2167	0	0	for (k = 0; k < nrows; k++)
2168	0	0	{ if (i==clusterid[k] && mask[k][j])
		0
2169	0		{ cache[count] = data[k][j];
2170	0		count++;
2171			}
2172			}
2173	0	0	if (count>0)
2174	0		{ cdata[i][j] = median(count,cache);
2175	0		cmask[i][j] = 1;
2176			}
2177			else
2178	0		{ cdata[i][j] = 0.;
2179	0		cmask[i][j] = 0;
2180			}
2181			}
2182			}
2183			}
2184			else
2185	0	0	{ for (i = 0; i < nclusters; i++)
2186	0	0	{ for (j = 0; j < nrows; j++)
2187	0		{ int count = 0;
2188	0	0	for (k = 0; k < ncolumns; k++)
2189	0	0	{ if (i==clusterid[k] && mask[j][k])
		0
2190	0		{ cache[count] = data[j][k];
2191	0		count++;
2192			}
2193			}
2194	0	0	if (count>0)
2195	0		{ cdata[j][i] = median(count,cache);
2196	0		cmask[j][i] = 1;
2197			}
2198			else
2199	0		{ cdata[j][i] = 0.;
2200	0		cmask[j][i] = 0;
2201			}
2202			}
2203			}
2204			}
2205	0		}
2206
2207			/* ********************************************************************* */
2208
2209	0		int getclustercentroids(int nclusters, int nrows, int ncolumns,
2210			double data, int mask, int clusterid[], double cdata, int cmask,
2211			int transpose, char method)
2212			/*
2213			Purpose
2214			=======
2215
2216			The getclustercentroids routine calculates the cluster centroids, given to
2217			which cluster each element belongs. Depending on the argument method, the
2218			centroid is defined as either the mean or the median for each dimension over
2219			all elements belonging to a cluster.
2220
2221			Arguments
2222			=========
2223
2224			nclusters (input) int
2225			The number of clusters.
2226
2227			nrows (input) int
2228			The number of rows in the gene expression data matrix, equal to the number of
2229			genes.
2230
2231			ncolumns (input) int
2232			The number of columns in the gene expression data matrix, equal to the number of
2233			microarrays.
2234
2235			data (input) double[nrows][ncolumns]
2236			The array containing the gene expression data.
2237
2238			mask (input) int[nrows][ncolumns]
2239			This array shows which data values are missing. If mask[i][j]==0, then
2240			data[i][j] is missing.
2241
2242			clusterid (output) int[nrows] if transpose==0
2243			int[ncolumns] if transpose==1
2244			The cluster number to which each element belongs. If transpose==0, then the
2245			dimension of clusterid is equal to nrows (the number of genes). Otherwise, it
2246			is equal to ncolumns (the number of microarrays).
2247
2248			cdata (output) double[nclusters][ncolumns] if transpose==0
2249			double[nrows][nclusters] if transpose==1
2250			On exit of getclustercentroids, this array contains the cluster centroids.
2251
2252			cmask (output) int[nclusters][ncolumns] if transpose==0
2253			int[nrows][nclusters] if transpose==1
2254			This array shows which data values of are missing for each centroid. If
2255			cmask[i][j]==0, then cdata[i][j] is missing. A data value is missing for
2256			a centroid if all corresponding data values of the cluster members are missing.
2257
2258			transpose (input) int
2259			If transpose==0, clusters of rows (genes) are specified. Otherwise, clusters of
2260			columns (microarrays) are specified.
2261
2262			method (input) char
2263			For method=='a', the centroid is defined as the mean over all elements
2264			belonging to a cluster for each dimension.
2265			For method=='m', the centroid is defined as the median over all elements
2266			belonging to a cluster for each dimension.
2267
2268			Return value
2269			============
2270
2271			The function returns an integer to indicate success or failure. If a
2272			memory error occurs, or if method is not 'm' or 'a', getclustercentroids
2273			returns 0. If successful, getclustercentroids returns 1.
2274			========================================================================
2275			*/
2276	0		{ switch(method)
2277			{ case 'm':
2278	0	0	{ const int nelements = (transpose==0) ? nrows : ncolumns;
2279	0		double* cache = malloc(nelements*sizeof(double));
2280	0	0	if (!cache) return 0;
2281	0		getclustermedians(nclusters, nrows, ncolumns, data, mask, clusterid,
2282			cdata, cmask, transpose, cache);
2283	0		free(cache);
2284	0		return 1;
2285			}
2286			case 'a':
2287	0		{ getclustermeans(nclusters, nrows, ncolumns, data, mask, clusterid,
2288			cdata, cmask, transpose);
2289	0		return 1;
2290			}
2291			}
2292	0		return 0;
2293			}
2294
2295			/* ********************************************************************* */
2296
2297	343		void getclustermedoids(int nclusters, int nelements, double** distance,
2298			int clusterid[], int centroids[], double errors[])
2299			/*
2300			Purpose
2301			=======
2302
2303			The getclustermedoids routine calculates the cluster centroids, given to which
2304			cluster each element belongs. The centroid is defined as the element with the
2305			smallest sum of distances to the other elements.
2306
2307			Arguments
2308			=========
2309
2310			nclusters (input) int
2311			The number of clusters.
2312
2313			nelements (input) int
2314			The total number of elements.
2315
2316			distmatrix (input) double array, ragged
2317			(number of rows is nelements, number of columns is equal to the row number)
2318			The distance matrix. To save space, the distance matrix is given in the
2319			form of a ragged array. The distance matrix is symmetric and has zeros
2320			on the diagonal. See distancematrix for a description of the content.
2321
2322			clusterid (output) int[nelements]
2323			The cluster number to which each element belongs.
2324
2325			centroid (output) int[nclusters]
2326			The index of the element that functions as the centroid for each cluster.
2327
2328			errors (output) double[nclusters]
2329			The within-cluster sum of distances between the items and the cluster
2330			centroid.
2331
2332			========================================================================
2333			*/
2334			{ int i, j, k;
2335	1715	100	for (j = 0; j < nclusters; j++) errors[j] = DBL_MAX;
2336	4459	100	for (i = 0; i < nelements; i++)
2337	4116		{ double d = 0.0;
2338	4116		j = clusterid[i];
2339	46379	100	for (k = 0; k < nelements; k++)
2340	43863	100	{ if (i==k \|\| clusterid[k]!=j) continue;
		100
2341	9493	100	d += (i < k ? distance[k][i] : distance[i][k]);
2342	9493	100	if (d > errors[j]) break;
2343			}
2344	4116	100	if (d < errors[j])
2345	2195		{ errors[j] = d;
2346	2195		centroids[j] = i;
2347			}
2348			}
2349	343		}
2350
2351			/* ********************************************************************* */
2352
2353			static int
2354	3		kmeans(int nclusters, int nrows, int ncolumns, double data, int mask,
2355			double weight[], int transpose, int npass, char dist,
2356			double cdata, int cmask, int clusterid[], double* error,
2357			int tclusterid[], int counts[], int mapping[])
2358			{ int i, j, k;
2359	3	50	const int nelements = (transpose==0) ? nrows : ncolumns;
2360	3	50	const int ndata = (transpose==0) ? ncolumns : nrows;
2361	3		int ifound = 1;
2362	3		int ipass = 0;
2363			/* Set the metric function as indicated by dist */
2364	3		double (*metric)
2365			(int, double, double, int, int, const double[], int, int, int) =
2366	3		setmetric(dist);
2367
2368			/* We save the clustering solution periodically and check if it reappears */
2369	3		int* saved = malloc(nelements*sizeof(int));
2370	3	50	if (saved==NULL) return -1;
2371
2372	3		*error = DBL_MAX;
2373
2374			do
2375	201		{ double total = DBL_MAX;
2376	201		int counter = 0;
2377	201		int period = 10;
2378
2379			/* Perform the EM algorithm. First, randomly assign elements to clusters. */
2380	201	100	if (npass!=0) randomassign (nclusters, nelements, tclusterid);
2381
2382	804	100	for (i = 0; i < nclusters; i++) counts[i] = 0;
2383	1914	100	for (i = 0; i < nelements; i++) counts[tclusterid[i]]++;
2384
2385			/* Start the loop */
2386			while(1)
2387	555		{ double previous = total;
2388	555		total = 0.0;
2389
2390	555	100	if (counter % period == 0) /* Save the current cluster assignments */
2391	1914	100	{ for (i = 0; i < nelements; i++) saved[i] = tclusterid[i];
2392	201	50	if (period < INT_MAX / 2) period *= 2;
2393			}
2394	555		counter++;
2395
2396			/* Find the center */
2397	555		getclustermeans(nclusters, nrows, ncolumns, data, mask, tclusterid,
2398			cdata, cmask, transpose);
2399
2400	5952	100	for (i = 0; i < nelements; i++)
2401			/* Calculate the distances */
2402			{ double distance;
2403	5397		k = tclusterid[i];
2404	5397	100	if (counts[k]==1) continue;
2405			/* No reassignment if that would lead to an empty cluster */
2406			/* Treat the present cluster as a special case */
2407	4790		distance = metric(ndata,data,cdata,mask,cmask,weight,i,k,transpose);
2408	19160	100	for (j = 0; j < nclusters; j++)
2409			{ double tdistance;
2410	14370	100	if (j==k) continue;
2411	9580		tdistance = metric(ndata,data,cdata,mask,cmask,weight,i,j,transpose);
2412	9580	100	if (tdistance < distance)
2413	932		{ distance = tdistance;
2414	932		counts[tclusterid[i]]--;
2415	932		tclusterid[i] = j;
2416	932		counts[j]++;
2417			}
2418			}
2419	4790		total += distance;
2420			}
2421	555	100	if (total>=previous) break;
2422			/* total>=previous is FALSE on some machines even if total and previous
2423			* are bitwise identical. */
2424	938	100	for (i = 0; i < nelements; i++)
2425	889	100	if (saved[i]!=tclusterid[i]) break;
2426	403	100	if (i==nelements)
2427	49		break; /* Identical solution found; break out of this loop */
2428	354		}
2429
2430	201	100	if (npass<=1)
2431	1		{ *error = total;
2432	1		break;
2433			}
2434
2435	800	100	for (i = 0; i < nclusters; i++) mapping[i] = -1;
2436	1662	100	for (i = 0; i < nelements; i++)
2437	1544		{ j = tclusterid[i];
2438	1544		k = clusterid[i];
2439	1544	100	if (mapping[k] == -1) mapping[k] = j;
2440	999	100	else if (mapping[k] != j)
2441	82	100	{ if (total < *error)
2442	2		{ ifound = 1;
2443	2		*error = total;
2444	19	100	for (j = 0; j < nelements; j++) clusterid[j] = tclusterid[j];
2445			}
2446	82		break;
2447			}
2448			}
2449	200	100	if (i==nelements) ifound++; /* break statement not encountered */
2450	200	100	} while (++ipass < npass);
2451
2452	3		free(saved);
2453	3		return ifound;
2454			}
2455
2456			/* ---------------------------------------------------------------------- */
2457
2458			static int
2459	0		kmedians(int nclusters, int nrows, int ncolumns, double data, int mask,
2460			double weight[], int transpose, int npass, char dist,
2461			double cdata, int cmask, int clusterid[], double* error,
2462			int tclusterid[], int counts[], int mapping[], double cache[])
2463			{ int i, j, k;
2464	0	0	const int nelements = (transpose==0) ? nrows : ncolumns;
2465	0	0	const int ndata = (transpose==0) ? ncolumns : nrows;
2466	0		int ifound = 1;
2467	0		int ipass = 0;
2468			/* Set the metric function as indicated by dist */
2469	0		double (*metric)
2470			(int, double, double, int, int, const double[], int, int, int) =
2471	0		setmetric(dist);
2472
2473			/* We save the clustering solution periodically and check if it reappears */
2474	0		int* saved = malloc(nelements*sizeof(int));
2475	0	0	if (saved==NULL) return -1;
2476
2477	0		*error = DBL_MAX;
2478
2479			do
2480	0		{ double total = DBL_MAX;
2481	0		int counter = 0;
2482	0		int period = 10;
2483
2484			/* Perform the EM algorithm. First, randomly assign elements to clusters. */
2485	0	0	if (npass!=0) randomassign (nclusters, nelements, tclusterid);
2486
2487	0	0	for (i = 0; i < nclusters; i++) counts[i]=0;
2488	0	0	for (i = 0; i < nelements; i++) counts[tclusterid[i]]++;
2489
2490			/* Start the loop */
2491			while(1)
2492	0		{ double previous = total;
2493	0		total = 0.0;
2494
2495	0	0	if (counter % period == 0) /* Save the current cluster assignments */
2496	0	0	{ for (i = 0; i < nelements; i++) saved[i] = tclusterid[i];
2497	0	0	if (period < INT_MAX / 2) period *= 2;
2498			}
2499	0		counter++;
2500
2501			/* Find the center */
2502	0		getclustermedians(nclusters, nrows, ncolumns, data, mask, tclusterid,
2503			cdata, cmask, transpose, cache);
2504
2505	0	0	for (i = 0; i < nelements; i++)
2506			/* Calculate the distances */
2507			{ double distance;
2508	0		k = tclusterid[i];
2509	0	0	if (counts[k]==1) continue;
2510			/* No reassignment if that would lead to an empty cluster */
2511			/* Treat the present cluster as a special case */
2512	0		distance = metric(ndata,data,cdata,mask,cmask,weight,i,k,transpose);
2513	0	0	for (j = 0; j < nclusters; j++)
2514			{ double tdistance;
2515	0	0	if (j==k) continue;
2516	0		tdistance = metric(ndata,data,cdata,mask,cmask,weight,i,j,transpose);
2517	0	0	if (tdistance < distance)
2518	0		{ distance = tdistance;
2519	0		counts[tclusterid[i]]--;
2520	0		tclusterid[i] = j;
2521	0		counts[j]++;
2522			}
2523			}
2524	0		total += distance;
2525			}
2526	0	0	if (total>=previous) break;
2527			/* total>=previous is FALSE on some machines even if total and previous
2528			* are bitwise identical. */
2529	0	0	for (i = 0; i < nelements; i++)
2530	0	0	if (saved[i]!=tclusterid[i]) break;
2531	0	0	if (i==nelements)
2532	0		break; /* Identical solution found; break out of this loop */
2533	0		}
2534
2535	0	0	if (npass<=1)
2536	0		{ *error = total;
2537	0		break;
2538			}
2539
2540	0	0	for (i = 0; i < nclusters; i++) mapping[i] = -1;
2541	0	0	for (i = 0; i < nelements; i++)
2542	0		{ j = tclusterid[i];
2543	0		k = clusterid[i];
2544	0	0	if (mapping[k] == -1) mapping[k] = j;
2545	0	0	else if (mapping[k] != j)
2546	0	0	{ if (total < *error)
2547	0		{ ifound = 1;
2548	0		*error = total;
2549	0	0	for (j = 0; j < nelements; j++) clusterid[j] = tclusterid[j];
2550			}
2551	0		break;
2552			}
2553			}
2554	0	0	if (i==nelements) ifound++; /* break statement not encountered */
2555	0	0	} while (++ipass < npass);
2556
2557	0		free(saved);
2558	0		return ifound;
2559			}
2560
2561			/* ********************************************************************* */
2562
2563	3		void kcluster (int nclusters, int nrows, int ncolumns,
2564			double data, int mask, double weight[], int transpose,
2565			int npass, char method, char dist,
2566			int clusterid[], double* error, int* ifound)
2567			/*
2568			Purpose
2569			=======
2570
2571			The kcluster routine performs k-means or k-median clustering on a given set of
2572			elements, using the specified distance measure. The number of clusters is given
2573			by the user. Multiple passes are being made to find the optimal clustering
2574			solution, each time starting from a different initial clustering.
2575
2576
2577			Arguments
2578			=========
2579
2580			nclusters (input) int
2581			The number of clusters to be found.
2582
2583			data (input) double[nrows][ncolumns]
2584			The array containing the data of the elements to be clustered (i.e., the gene
2585			expression data).
2586
2587			mask (input) int[nrows][ncolumns]
2588			This array shows which data values are missing. If
2589			mask[i][j] == 0, then data[i][j] is missing.
2590
2591			nrows (input) int
2592			The number of rows in the data matrix, equal to the number of genes.
2593
2594			ncolumns (input) int
2595			The number of columns in the data matrix, equal to the number of microarrays.
2596
2597			weight (input) double[n]
2598			The weights that are used to calculate the distance.
2599
2600			transpose (input) int
2601			If transpose==0, the rows of the matrix are clustered. Otherwise, columns
2602			of the matrix are clustered.
2603
2604			npass (input) int
2605			The number of times clustering is performed. Clustering is performed npass
2606			times, each time starting from a different (random) initial assignment of
2607			genes to clusters. The clustering solution with the lowest within-cluster sum
2608			of distances is chosen.
2609			If npass==0, then the clustering algorithm will be run once, where the initial
2610			assignment of elements to clusters is taken from the clusterid array.
2611
2612			method (input) char
2613			Defines whether the arithmetic mean (method=='a') or the median
2614			(method=='m') is used to calculate the cluster center.
2615
2616			dist (input) char
2617			Defines which distance measure is used, as given by the table:
2618			dist=='e': Euclidean distance
2619			dist=='b': City-block distance
2620			dist=='c': correlation
2621			dist=='a': absolute value of the correlation
2622			dist=='u': uncentered correlation
2623			dist=='x': absolute uncentered correlation
2624			dist=='s': Spearman's rank correlation
2625			dist=='k': Kendall's tau
2626			For other values of dist, the default (Euclidean distance) is used.
2627
2628			clusterid (output; input) int[nrows] if transpose==0
2629			int[ncolumns] if transpose==1
2630			The cluster number to which a gene or microarray was assigned. If npass==0,
2631			then on input clusterid contains the initial clustering assignment from which
2632			the clustering algorithm starts. On output, it contains the clustering solution
2633			that was found.
2634
2635			error (output) double*
2636			The sum of distances to the cluster center of each item in the optimal k-means
2637			clustering solution that was found.
2638
2639			ifound (output) int*
2640			The number of times the optimal clustering solution was
2641			found. The value of ifound is at least 1; its maximum value is npass. If the
2642			number of clusters is larger than the number of elements being clustered,
2643			*ifound is set to 0 as an error code. If a memory allocation error occurs,
2644			*ifound is set to -1.
2645
2646			========================================================================
2647			*/
2648	3	50	{ const int nelements = (transpose==0) ? nrows : ncolumns;
2649	3	50	const int ndata = (transpose==0) ? ncolumns : nrows;
2650
2651			int i;
2652			int ok;
2653			int* tclusterid;
2654	3		int* mapping = NULL;
2655			double** cdata;
2656			int** cmask;
2657			int* counts;
2658
2659	3	50	if (nelements < nclusters)
2660	0		{ *ifound = 0;
2661	0		return;
2662			}
2663			/* More clusters asked for than elements available */
2664
2665	3		*ifound = -1;
2666
2667			/* This will contain the number of elements in each cluster, which is
2668			* needed to check for empty clusters. */
2669	3		counts = malloc(nclusters*sizeof(int));
2670	3	50	if(!counts) return;
2671
2672			/* Find out if the user specified an initial clustering */
2673	3	100	if (npass<=1) tclusterid = clusterid;
2674			else
2675	2		{ tclusterid = malloc(nelements*sizeof(int));
2676	2	50	if (!tclusterid)
2677	0		{ free(counts);
2678	0		return;
2679			}
2680	2		mapping = malloc(nclusters*sizeof(int));
2681	2	50	if (!mapping)
2682	0		{ free(counts);
2683	0		free(tclusterid);
2684	0		return;
2685			}
2686	19	100	for (i = 0; i < nelements; i++) clusterid[i] = 0;
2687			}
2688
2689			/* Allocate space to store the centroid data */
2690	3	50	if (transpose==0) ok = makedatamask(nclusters, ndata, &cdata, &cmask);
2691	0		else ok = makedatamask(ndata, nclusters, &cdata, &cmask);
2692	3	50	if(!ok)
2693	0		{ free(counts);
2694	0	0	if(npass>1)
2695	0		{ free(tclusterid);
2696	0		free(mapping);
2697	0		return;
2698			}
2699			}
2700
2701	3	50	if (method=='m')
2702	0		{ double* cache = malloc(nelements*sizeof(double));
2703	0	0	if(cache)
2704	0		{ *ifound = kmedians(nclusters, nrows, ncolumns, data, mask, weight,
2705			transpose, npass, dist, cdata, cmask, clusterid, error,
2706			tclusterid, counts, mapping, cache);
2707	0		free(cache);
2708			}
2709			}
2710			else
2711	3		*ifound = kmeans(nclusters, nrows, ncolumns, data, mask, weight,
2712			transpose, npass, dist, cdata, cmask, clusterid, error,
2713			tclusterid, counts, mapping);
2714
2715			/* Deallocate temporarily used space */
2716	3	100	if (npass > 1)
2717	2		{ free(mapping);
2718	2		free(tclusterid);
2719			}
2720
2721	3	50	if (transpose==0) freedatamask(nclusters, cdata, cmask);
2722	0		else freedatamask(ndata, cdata, cmask);
2723
2724	3		free(counts);
2725			}
2726
2727			/* *********************************************************************** */
2728
2729	2		void kmedoids (int nclusters, int nelements, double** distmatrix,
2730			int npass, int clusterid[], double* error, int* ifound)
2731			/*
2732			Purpose
2733			=======
2734
2735			The kmedoids routine performs k-medoids clustering on a given set of elements,
2736			using the distance matrix and the number of clusters passed by the user.
2737			Multiple passes are being made to find the optimal clustering solution, each
2738			time starting from a different initial clustering.
2739
2740
2741			Arguments
2742			=========
2743
2744			nclusters (input) int
2745			The number of clusters to be found.
2746
2747			nelements (input) int
2748			The number of elements to be clustered.
2749
2750			distmatrix (input) double array, ragged
2751			(number of rows is nelements, number of columns is equal to the row number)
2752			The distance matrix. To save space, the distance matrix is given in the
2753			form of a ragged array. The distance matrix is symmetric and has zeros
2754			on the diagonal. See distancematrix for a description of the content.
2755
2756			npass (input) int
2757			The number of times clustering is performed. Clustering is performed npass
2758			times, each time starting from a different (random) initial assignment of genes
2759			to clusters. The clustering solution with the lowest within-cluster sum of
2760			distances is chosen.
2761			If npass==0, then the clustering algorithm will be run once, where the initial
2762			assignment of elements to clusters is taken from the clusterid array.
2763
2764			clusterid (output; input) int[nelements]
2765			On input, if npass==0, then clusterid contains the initial clustering assignment
2766			from which the clustering algorithm starts; all numbers in clusterid should be
2767			between zero and nelements-1 inclusive. If npass!=0, clusterid is ignored on
2768			input.
2769			On output, clusterid contains the clustering solution that was found: clusterid
2770			contains the number of the cluster to which each item was assigned. On output,
2771			the number of a cluster is defined as the item number of the centroid of the
2772			cluster.
2773
2774			error (output) double
2775			The sum of distances to the cluster center of each item in the optimal k-medoids
2776			clustering solution that was found.
2777
2778			ifound (output) int
2779			If kmedoids is successful: the number of times the optimal clustering solution
2780			was found. The value of ifound is at least 1; its maximum value is npass.
2781			If the user requested more clusters than elements available, ifound is set
2782			to 0. If kmedoids fails due to a memory allocation error, ifound is set to -1.
2783
2784			========================================================================
2785			*/
2786			{ int i, j, icluster;
2787			int* tclusterid;
2788			int* saved;
2789			int* centroids;
2790			double* errors;
2791	2		int ipass = 0;
2792
2793	2	50	if (nelements < nclusters)
2794	0		{ *ifound = 0;
2795	0		return;
2796			} /* More clusters asked for than elements available */
2797
2798	2		*ifound = -1;
2799
2800			/* We save the clustering solution periodically and check if it reappears */
2801	2		saved = malloc(nelements*sizeof(int));
2802	2	50	if (saved==NULL) return;
2803
2804	2		centroids = malloc(nclusters*sizeof(int));
2805	2	50	if(!centroids)
2806	0		{ free(saved);
2807	0		return;
2808			}
2809
2810	2		errors = malloc(nclusters*sizeof(double));
2811	2	50	if(!errors)
2812	0		{ free(saved);
2813	0		free(centroids);
2814	0		return;
2815			}
2816
2817			/* Find out if the user specified an initial clustering */
2818	2	100	if (npass<=1) tclusterid = clusterid;
2819			else
2820	1		{ tclusterid = malloc(nelements*sizeof(int));
2821	1	50	if(!tclusterid)
2822	0		{ free(saved);
2823	0		free(centroids);
2824	0		free(errors);
2825	0		return;
2826			}
2827	13	100	for (i = 0; i < nelements; i++) clusterid[i] = -1;
2828			}
2829
2830	2		*error = DBL_MAX;
2831			do /* Start the loop */
2832	101		{ double total = DBL_MAX;
2833	101		int counter = 0;
2834	101		int period = 10;
2835
2836	101	100	if (npass!=0) randomassign(nclusters, nelements, tclusterid);
2837			while(1)
2838	343		{ double previous = total;
2839	343		total = 0.0;
2840
2841	343	100	if (counter % period == 0) /* Save the current cluster assignments */
2842	1313	100	{ for (i = 0; i < nelements; i++) saved[i] = tclusterid[i];
2843	101	50	if (period < INT_MAX / 2) period *= 2;
2844			}
2845	343		counter++;
2846
2847			/* Find the center */
2848	343		getclustermedoids(nclusters, nelements, distmatrix, tclusterid,
2849			centroids, errors);
2850
2851	4459	100	for (i = 0; i < nelements; i++)
2852			/* Find the closest cluster */
2853	4116		{ double distance = DBL_MAX;
2854	17150	100	for (icluster = 0; icluster < nclusters; icluster++)
2855			{ double tdistance;
2856	14406		j = centroids[icluster];
2857	14406	100	if (i==j)
2858	1372		{ distance = 0.0;
2859	1372		tclusterid[i] = icluster;
2860	1372		break;
2861			}
2862	13034	100	tdistance = (i > j) ? distmatrix[i][j] : distmatrix[j][i];
2863	13034	100	if (tdistance < distance)
2864	7078		{ distance = tdistance;
2865	7078		tclusterid[i] = icluster;
2866			}
2867			}
2868	4116		total += distance;
2869			}
2870	343	100	if (total>=previous) break;
2871			/* total>=previous is FALSE on some machines even if total and previous
2872			* are bitwise identical. */
2873	558	50	for (i = 0; i < nelements; i++)
2874	558	100	if (saved[i]!=tclusterid[i]) break;
2875	242	50	if (i==nelements)
2876	0		break; /* Identical solution found; break out of this loop */
2877	242		}
2878
2879	101	100	if (npass <= 1) {
2880	1		*ifound = 1;
2881	1		*error = total;
2882			/* Replace by the centroid in each cluster. */
2883	13	100	for (j = 0; j < nelements; j++) {
2884	12		clusterid[j] = centroids[tclusterid[j]];
2885			}
2886	1		break;
2887			}
2888
2889	453	100	for (i = 0; i < nelements; i++)
2890	438	100	{ if (clusterid[i]!=centroids[tclusterid[i]])
2891	85	100	{ if (total < *error)
2892	1		{ *ifound = 1;
2893	1		*error = total;
2894			/* Replace by the centroid in each cluster. */
2895	13	100	for (j = 0; j < nelements; j++) {
2896	12		clusterid[j] = centroids[tclusterid[j]];
2897			}
2898			}
2899	85		break;
2900			}
2901			}
2902	100	100	if (i==nelements) (ifound)++; / break statement not encountered */
2903	100	100	} while (++ipass < npass);
2904
2905			/* Deallocate temporarily used space */
2906	2	100	if (npass > 1) free(tclusterid);
2907
2908	2		free(saved);
2909	2		free(centroids);
2910	2		free(errors);
2911
2912	2		return;
2913			}
2914
2915			/* ******************************************************************** */
2916
2917	7		double distancematrix (int nrows, int ncolumns, double data,
2918			int** mask, double weights[], char dist, int transpose)
2919			/*
2920			Purpose
2921			=======
2922
2923			The distancematrix routine calculates the distance matrix between genes or
2924			microarrays using their measured gene expression data. Several distance measures
2925			can be used. The routine returns a pointer to a ragged array containing the
2926			distances between the genes. As the distance matrix is symmetric, with zeros on
2927			the diagonal, only the lower triangular half of the distance matrix is saved.
2928			The distancematrix routine allocates space for the distance matrix. If the
2929			parameter transpose is set to a nonzero value, the distances between the columns
2930			(microarrays) are calculated, otherwise distances between the rows (genes) are
2931			calculated.
2932			If sufficient space in memory cannot be allocated to store the distance matrix,
2933			the routine returns a NULL pointer, and all memory allocated so far for the
2934			distance matrix is freed.
2935
2936
2937			Arguments
2938			=========
2939
2940			nrows (input) int
2941			The number of rows in the gene expression data matrix (i.e., the number of
2942			genes)
2943
2944			ncolumns (input) int
2945			The number of columns in the gene expression data matrix (i.e., the number of
2946			microarrays)
2947
2948			data (input) double[nrows][ncolumns]
2949			The array containing the gene expression data.
2950
2951			mask (input) int[nrows][ncolumns]
2952			This array shows which data values are missing. If mask[i][j]==0, then
2953			data[i][j] is missing.
2954
2955			weight (input) double[n]
2956			The weights that are used to calculate the distance. The length of this vector
2957			is equal to the number of columns if the distances between genes are calculated,
2958			or the number of rows if the distances between microarrays are calculated.
2959
2960			dist (input) char
2961			Defines which distance measure is used, as given by the table:
2962			dist=='e': Euclidean distance
2963			dist=='b': City-block distance
2964			dist=='c': correlation
2965			dist=='a': absolute value of the correlation
2966			dist=='u': uncentered correlation
2967			dist=='x': absolute uncentered correlation
2968			dist=='s': Spearman's rank correlation
2969			dist=='k': Kendall's tau
2970			For other values of dist, the default (Euclidean distance) is used.
2971
2972			transpose (input) int
2973			If transpose is equal to zero, the distances between the rows is
2974			calculated. Otherwise, the distances between the columns is calculated.
2975			The former is needed when genes are being clustered; the latter is used
2976			when microarrays are being clustered.
2977
2978			========================================================================
2979			*/
2980			{ /* First determine the size of the distance matrix */
2981	7	50	const int n = (transpose==0) ? nrows : ncolumns;
2982	7	50	const int ndata = (transpose==0) ? ncolumns : nrows;
2983			int i,j;
2984			double** matrix;
2985
2986			/* Set the metric function as indicated by dist */
2987	7		double (*metric)
2988			(int, double, double, int, int, const double[], int, int, int) =
2989	7		setmetric(dist);
2990
2991	7	50	if (n < 2) return NULL;
2992
2993			/* Set up the ragged array */
2994	7		matrix = malloc(nsizeof(double));
2995	7	50	if(matrix==NULL) return NULL; /* Not enough memory available */
2996	7		matrix[0] = NULL;
2997			/* The zeroth row has zero columns. We allocate it anyway for convenience.*/
2998	55	100	for (i = 1; i < n; i++)
2999	48		{ matrix[i] = malloc(i*sizeof(double));
3000	48	50	if (matrix[i]==NULL) break; /* Not enough memory available */
3001			}
3002	7	50	if (i < n) /* break condition encountered */
3003	0		{ j = i;
3004	0	0	for (i = 1; i < j; i++) free(matrix[i]);
3005	0		return NULL;
3006			}
3007
3008			/* Calculate the distances and save them in the ragged array */
3009	55	100	for (i = 1; i < n; i++)
3010	306	100	for (j = 0; j < i; j++)
3011	258		matrix[i][j]=metric(ndata,data,data,mask,mask,weights,i,j,transpose);
3012
3013	7		return matrix;
3014			}
3015
3016			/* ******************************************************************** */
3017
3018	0		double* calculate_weights(int nrows, int ncolumns, double data, int mask,
3019			double weights[], int transpose, char dist, double cutoff, double exponent)
3020
3021			/*
3022			Purpose
3023			=======
3024
3025			This function calculates the weights using the weighting scheme proposed by
3026			Michael Eisen:
3027			w[i] = 1.0 / sum_{j where d[i][j]
3028			where the cutoff and the exponent are specified by the user.
3029
3030
3031			Arguments
3032			=========
3033
3034			nrows (input) int
3035			The number of rows in the gene expression data matrix, equal to the number of
3036			genes.
3037
3038			ncolumns (input) int
3039			The number of columns in the gene expression data matrix, equal to the number of
3040			microarrays.
3041
3042			data (input) double[nrows][ncolumns]
3043			The array containing the gene expression data.
3044
3045			mask (input) int[nrows][ncolumns]
3046			This array shows which data values are missing. If mask[i][j]==0, then
3047			data[i][j] is missing.
3048
3049			weight (input) int[ncolumns] if transpose==0,
3050			int[nrows] if transpose==1
3051			The weights that are used to calculate the distance. The length of this vector
3052			is ncolumns if gene weights are being clustered, and nrows if microarrays
3053			weights are being clustered.
3054
3055			transpose (input) int
3056			If transpose==0, the weights of the rows of the data matrix are calculated.
3057			Otherwise, the weights of the columns of the data matrix are calculated.
3058
3059			dist (input) char
3060			Defines which distance measure is used, as given by the table:
3061			dist=='e': Euclidean distance
3062			dist=='b': City-block distance
3063			dist=='c': correlation
3064			dist=='a': absolute value of the correlation
3065			dist=='u': uncentered correlation
3066			dist=='x': absolute uncentered correlation
3067			dist=='s': Spearman's rank correlation
3068			dist=='k': Kendall's tau
3069			For other values of dist, the default (Euclidean distance) is used.
3070
3071			cutoff (input) double
3072			The cutoff to be used to calculate the weights.
3073
3074			exponent (input) double
3075			The exponent to be used to calculate the weights.
3076
3077
3078			Return value
3079			============
3080
3081			The function returns a pointer to a newly allocated array containing the
3082			calculated weights for the rows (if transpose==0) or columns (if
3083			transpose==1). If not enough memory could be allocated to store the
3084			weights array, the function returns NULL.
3085
3086			========================================================================
3087			*/
3088			{ int i,j;
3089	0	0	const int ndata = (transpose==0) ? ncolumns : nrows;
3090	0	0	const int nelements = (transpose==0) ? nrows : ncolumns;
3091
3092			/* Set the metric function as indicated by dist */
3093	0		double (*metric)
3094			(int, double, double, int, int, const double[], int, int, int) =
3095	0		setmetric(dist);
3096
3097	0		double* result = malloc(nelements*sizeof(double));
3098	0	0	if (!result) return NULL;
3099	0		memset(result, 0, nelements*sizeof(double));
3100
3101	0	0	for (i = 0; i < nelements; i++)
3102	0		{ result[i] += 1.0;
3103	0	0	for (j = 0; j < i; j++)
3104	0		{ const double distance = metric(ndata, data, data, mask, mask, weights,
3105			i, j, transpose);
3106	0	0	if (distance < cutoff)
3107	0		{ const double dweight = exp(exponent*log(1-distance/cutoff));
3108			/* pow() causes a crash on AIX */
3109	0		result[i] += dweight;
3110	0		result[j] += dweight;
3111			}
3112			}
3113			}
3114	0	0	for (i = 0; i < nelements; i++) result[i] = 1.0/result[i];
3115	0		return result;
3116			}
3117
3118			/* ******************************************************************** */
3119
3120	0		void cuttree (int nelements, Node* tree, int nclusters, int clusterid[])
3121
3122			/*
3123			Purpose
3124			=======
3125
3126			The cuttree routine takes the output of a hierarchical clustering routine, and
3127			divides the elements in the tree structure into clusters based on the
3128			hierarchical clustering result. The number of clusters is specified by the user.
3129
3130			Arguments
3131			=========
3132
3133			nelements (input) int
3134			The number of elements that were clustered.
3135
3136			tree (input) Node[nelements-1]
3137			The clustering solution. Each node in the array describes one linking event,
3138			with tree[i].left and tree[i].right representing the elements that were joined.
3139			The original elements are numbered 0..nelements-1, nodes are numbered
3140			-1..-(nelements-1).
3141
3142			nclusters (input) int
3143			The number of clusters to be formed.
3144
3145			clusterid (output) int[nelements]
3146			The number of the cluster to which each element was assigned. Clusters are
3147			numbered 0..nclusters-1 in the left-to-right order in which they appear in the
3148			hierarchical clustering tree. Space for the clusterid array should be allocated
3149			before calling the cuttree routine. If a memory error occured, all elements in
3150			clusterid are set to -1.
3151
3152			========================================================================
3153			*/
3154	0		{ int i = -nelements+1; /* top node */
3155			int j;
3156	0		int k = -1;
3157	0		int previous = nelements;
3158	0		const int n = nelements-nclusters; /* number of nodes to join */
3159			int* parents;
3160	0	0	if (nclusters==1) {
3161	0	0	for (i = 0; i < nelements; i++) clusterid[i] = 0;
3162	0		return;
3163			}
3164	0		parents = malloc((nelements-1)*sizeof(int));
3165	0	0	if (!parents)
3166	0	0	{ for (i = 0; i < nelements; i++) clusterid[i] = -1;
3167	0		return;
3168			}
3169			while (1) {
3170	0	0	if (i >= 0) {
3171	0		clusterid[i] = k;
3172	0		j = i;
3173	0		i = previous;
3174	0		previous = j;
3175			}
3176			else {
3177	0		j = -i-1;
3178	0	0	if (previous == tree[j].left) {
3179	0		previous = i;
3180	0		i = tree[j].right;
3181	0	0	if (j >= n && (i >= 0 \|\| -i-1 < n)) k++;
		0
		0
3182			}
3183	0	0	else if (previous == tree[j].right) {
3184	0		previous = i;
3185	0		i = parents[j];
3186	0	0	if (i==nelements) break;
3187			}
3188			else {
3189	0		parents[j] = previous;
3190	0		previous = i;
3191	0		i = tree[j].left;
3192	0	0	if (j >= n && (i >= 0 \|\| -i-1 < n)) k++;
		0
		0
3193			}
3194			}
3195	0		}
3196	0		free(parents);
3197			}
3198
3199			/* ******************************************************************** */
3200
3201			static
3202	2		Node* pclcluster (int nrows, int ncolumns, double data, int mask,
3203			double weight[], double** distmatrix, char dist, int transpose)
3204
3205			/*
3206
3207			Purpose
3208			=======
3209
3210			The pclcluster routine performs clustering using pairwise centroid-linking
3211			on a given set of gene expression data, using the distance metric given by dist.
3212
3213			Arguments
3214			=========
3215
3216			nrows (input) int
3217			The number of rows in the gene expression data matrix, equal to the number of
3218			genes.
3219
3220			ncolumns (input) int
3221			The number of columns in the gene expression data matrix, equal to the number of
3222			microarrays.
3223
3224			data (input) double[nrows][ncolumns]
3225			The array containing the gene expression data.
3226
3227			mask (input) int[nrows][ncolumns]
3228			This array shows which data values are missing. If
3229			mask[i][j] == 0, then data[i][j] is missing.
3230
3231			weight (input) double[ncolumns] if transpose==0;
3232			double[nrows] if transpose==1
3233			The weights that are used to calculate the distance. The length of this vector
3234			is ncolumns if genes are being clustered, and nrows if microarrays are being
3235			clustered.
3236
3237			transpose (input) int
3238			If transpose==0, the rows of the matrix are clustered. Otherwise, columns
3239			of the matrix are clustered.
3240
3241			dist (input) char
3242			Defines which distance measure is used, as given by the table:
3243			dist=='e': Euclidean distance
3244			dist=='b': City-block distance
3245			dist=='c': correlation
3246			dist=='a': absolute value of the correlation
3247			dist=='u': uncentered correlation
3248			dist=='x': absolute uncentered correlation
3249			dist=='s': Spearman's rank correlation
3250			dist=='k': Kendall's tau
3251			For other values of dist, the default (Euclidean distance) is used.
3252
3253			distmatrix (input) double**
3254			The distance matrix. This matrix is precalculated by the calling routine
3255			treecluster. The pclcluster routine modifies the contents of distmatrix, but
3256			does not deallocate it.
3257
3258			Return value
3259			============
3260
3261			A pointer to a newly allocated array of Node structs, describing the
3262			hierarchical clustering solution consisting of nelements-1 nodes. Depending on
3263			whether genes (rows) or microarrays (columns) were clustered, nelements is
3264			equal to nrows or ncolumns. See src/cluster.h for a description of the Node
3265			structure.
3266			If a memory error occurs, pclcluster returns NULL.
3267			========================================================================
3268			*/
3269			{ int i, j;
3270	2	50	const int nelements = (transpose==0) ? nrows : ncolumns;
3271			int inode;
3272	2	50	const int ndata = transpose ? nrows : ncolumns;
3273	2		const int nnodes = nelements - 1;
3274
3275			/* Set the metric function as indicated by dist */
3276	2		double (*metric)
3277			(int, double, double, int, int, const double[], int, int, int) =
3278	2		setmetric(dist);
3279
3280			Node* result;
3281			double** newdata;
3282			int** newmask;
3283	2		int* distid = malloc(nelements*sizeof(int));
3284	2	50	if(!distid) return NULL;
3285	2		result = malloc(nnodes*sizeof(Node));
3286	2	50	if(!result)
3287	0		{ free(distid);
3288	0		return NULL;
3289			}
3290	2	50	if(!makedatamask(nelements, ndata, &newdata, &newmask))
3291	0		{ free(result);
3292	0		free(distid);
3293	0		return NULL;
3294			}
3295
3296	19	100	for (i = 0; i < nelements; i++) distid[i] = i;
3297			/* To remember which row/column in the distance matrix contains what */
3298
3299			/* Storage for node data */
3300	2	50	if (transpose)
3301	0	0	{ for (i = 0; i < nelements; i++)
3302	0	0	{ for (j = 0; j < ndata; j++)
3303	0		{ newdata[i][j] = data[j][i];
3304	0		newmask[i][j] = mask[j][i];
3305			}
3306			}
3307	0		data = newdata;
3308	0		mask = newmask;
3309			}
3310			else
3311	19	100	{ for (i = 0; i < nelements; i++)
3312	17		{ memcpy(newdata[i], data[i], ndata*sizeof(double));
3313	17		memcpy(newmask[i], mask[i], ndata*sizeof(int));
3314			}
3315	2		data = newdata;
3316	2		mask = newmask;
3317			}
3318
3319	17	100	for (inode = 0; inode < nnodes; inode++)
3320			{ /* Find the pair with the shortest distance */
3321	15		int is = 1;
3322	15		int js = 0;
3323	15		result[inode].distance = find_closest_pair(nelements-inode, distmatrix, &is, &js);
3324	15		result[inode].left = distid[js];
3325	15		result[inode].right = distid[is];
3326
3327			/* Make node js the new node */
3328	54	100	for (i = 0; i < ndata; i++)
3329	39		{ data[js][i] = data[js][i]mask[js][i] + data[is][i]mask[is][i];
3330	39		mask[js][i] += mask[is][i];
3331	39	50	if (mask[js][i]) data[js][i] /= mask[js][i];
3332			}
3333	15		free(data[is]);
3334	15		free(mask[is]);
3335	15		data[is] = data[nnodes-inode];
3336	15		mask[is] = mask[nnodes-inode];
3337
3338			/* Fix the distances */
3339	15		distid[is] = distid[nnodes-inode];
3340	70	100	for (i = 0; i < is; i++)
3341	55		distmatrix[is][i] = distmatrix[nnodes-inode][i];
3342	35	100	for (i = is + 1; i < nnodes-inode; i++)
3343	20		distmatrix[i][is] = distmatrix[nnodes-inode][i];
3344
3345	15		distid[js] = -inode-1;
3346	39	100	for (i = 0; i < js; i++)
3347	24		distmatrix[js][i] = metric(ndata,data,data,mask,mask,weight,js,i,0);
3348	60	100	for (i = js + 1; i < nnodes-inode; i++)
3349	45		distmatrix[i][js] = metric(ndata,data,data,mask,mask,weight,js,i,0);
3350			}
3351
3352			/* Free temporarily allocated space */
3353	2		free(data[0]);
3354	2		free(mask[0]);
3355	2		free(data);
3356	2		free(mask);
3357	2		free(distid);
3358
3359	2		return result;
3360			}
3361
3362			/* ******************************************************************** */
3363
3364			static
3365	31		int nodecompare(const void* a, const void* b)
3366			/* Helper function for qsort. */
3367	31		{ const Node* node1 = (const Node*)a;
3368	31		const Node* node2 = (const Node*)b;
3369	31		const double term1 = node1->distance;
3370	31		const double term2 = node2->distance;
3371	31	100	if (term1 < term2) return -1;
3372	14	50	if (term1 > term2) return +1;
3373	0		return 0;
3374			}
3375
3376			/* ---------------------------------------------------------------------- */
3377
3378			static
3379	2		Node* pslcluster (int nrows, int ncolumns, double data, int mask,
3380			double weight[], double** distmatrix, char dist, int transpose)
3381
3382			/*
3383
3384			Purpose
3385			=======
3386
3387			The pslcluster routine performs single-linkage hierarchical clustering, using
3388			either the distance matrix directly, if available, or by calculating the
3389			distances from the data array. This implementation is based on the SLINK
3390			algorithm, described in:
3391			Sibson, R. (1973). SLINK: An optimally efficient algorithm for the single-link
3392			cluster method. The Computer Journal, 16(1): 30-34.
3393			The output of this algorithm is identical to conventional single-linkage
3394			hierarchical clustering, but is much more memory-efficient and faster. Hence,
3395			it can be applied to large data sets, for which the conventional single-
3396			linkage algorithm fails due to lack of memory.
3397
3398
3399			Arguments
3400			=========
3401
3402			nrows (input) int
3403			The number of rows in the gene expression data matrix, equal to the number of
3404			genes.
3405
3406			ncolumns (input) int
3407			The number of columns in the gene expression data matrix, equal to the number of
3408			microarrays.
3409
3410			data (input) double[nrows][ncolumns]
3411			The array containing the gene expression data.
3412
3413			mask (input) int[nrows][ncolumns]
3414			This array shows which data values are missing. If
3415			mask[i][j] == 0, then data[i][j] is missing.
3416
3417			weight (input) double[n]
3418			The weights that are used to calculate the distance. The length of this vector
3419			is ncolumns if genes are being clustered, and nrows if microarrays are being
3420			clustered.
3421
3422			transpose (input) int
3423			If transpose==0, the rows of the matrix are clustered. Otherwise, columns
3424			of the matrix are clustered.
3425
3426			dist (input) char
3427			Defines which distance measure is used, as given by the table:
3428			dist=='e': Euclidean distance
3429			dist=='b': City-block distance
3430			dist=='c': correlation
3431			dist=='a': absolute value of the correlation
3432			dist=='u': uncentered correlation
3433			dist=='x': absolute uncentered correlation
3434			dist=='s': Spearman's rank correlation
3435			dist=='k': Kendall's tau
3436			For other values of dist, the default (Euclidean distance) is used.
3437
3438			distmatrix (input) double**
3439			The distance matrix. If the distance matrix is passed by the calling routine
3440			treecluster, it is used by pslcluster to speed up the clustering calculation.
3441			The pslcluster routine does not modify the contents of distmatrix, and does
3442			not deallocate it. If distmatrix is NULL, the pairwise distances are calculated
3443			by the pslcluster routine from the gene expression data (the data and mask
3444			arrays) and stored in temporary arrays. If distmatrix is passed, the original
3445			gene expression data (specified by the data and mask arguments) are not needed
3446			and are therefore ignored.
3447
3448
3449			Return value
3450			============
3451
3452			A pointer to a newly allocated array of Node structs, describing the
3453			hierarchical clustering solution consisting of nelements-1 nodes. Depending on
3454			whether genes (rows) or microarrays (columns) were clustered, nelements is
3455			equal to nrows or ncolumns. See src/cluster.h for a description of the Node
3456			structure.
3457			If a memory error occurs, pslcluster returns NULL.
3458
3459			========================================================================
3460			*/
3461			{ int i, j, k;
3462	2	50	const int nelements = transpose ? ncolumns : nrows;
3463	2		const int nnodes = nelements - 1;
3464			int* vector;
3465			double* temp;
3466			int* index;
3467			Node* result;
3468	2		temp = malloc(nnodes*sizeof(double));
3469	2	50	if(!temp) return NULL;
3470	2		index = malloc(nelements*sizeof(int));
3471	2	50	if(!index)
3472	0		{ free(temp);
3473	0		return NULL;
3474			}
3475	2		vector = malloc(nnodes*sizeof(int));
3476	2	50	if(!vector)
3477	0		{ free(index);
3478	0		free(temp);
3479	0		return NULL;
3480			}
3481	2		result = malloc(nelements*sizeof(Node));
3482	2	50	if(!result)
3483	0		{ free(vector);
3484	0		free(index);
3485	0		free(temp);
3486	0		return NULL;
3487			}
3488
3489	17	100	for (i = 0; i < nnodes; i++) vector[i] = i;
3490
3491	2	50	if(distmatrix)
3492	0	0	{ for (i = 0; i < nrows; i++)
3493	0		{ result[i].distance = DBL_MAX;
3494	0	0	for (j = 0; j < i; j++) temp[j] = distmatrix[i][j];
3495	0	0	for (j = 0; j < i; j++)
3496	0		{ k = vector[j];
3497	0	0	if (result[j].distance >= temp[j])
3498	0	0	{ if (result[j].distance < temp[k]) temp[k] = result[j].distance;
3499	0		result[j].distance = temp[j];
3500	0		vector[j] = i;
3501			}
3502	0	0	else if (temp[j] < temp[k]) temp[k] = temp[j];
3503			}
3504	0	0	for (j = 0; j < i; j++)
3505			{
3506	0	0	if (result[j].distance >= result[vector[j]].distance) vector[j] = i;
3507			}
3508			}
3509			}
3510			else
3511	2	50	{ const int ndata = transpose ? nrows : ncolumns;
3512			/* Set the metric function as indicated by dist */
3513	2		double (*metric)
3514			(int, double, double, int, int, const double[], int, int, int) =
3515	2		setmetric(dist);
3516
3517	19	100	for (i = 0; i < nelements; i++)
3518	17		{ result[i].distance = DBL_MAX;
3519	101	100	for (j = 0; j < i; j++) temp[j] =
3520	84		metric(ndata, data, data, mask, mask, weight, i, j, transpose);
3521	101	100	for (j = 0; j < i; j++)
3522	84		{ k = vector[j];
3523	84	100	if (result[j].distance >= temp[j])
3524	25	100	{ if (result[j].distance < temp[k]) temp[k] = result[j].distance;
3525	25		result[j].distance = temp[j];
3526	25		vector[j] = i;
3527			}
3528	59	100	else if (temp[j] < temp[k]) temp[k] = temp[j];
3529			}
3530	101	100	for (j = 0; j < i; j++)
3531	84	100	if (result[j].distance >= result[vector[j]].distance) vector[j] = i;
3532			}
3533			}
3534	2		free(temp);
3535
3536	17	100	for (i = 0; i < nnodes; i++) result[i].left = i;
3537	2		qsort(result, nnodes, sizeof(Node), nodecompare);
3538
3539	19	100	for (i = 0; i < nelements; i++) index[i] = i;
3540	17	100	for (i = 0; i < nnodes; i++)
3541	15		{ j = result[i].left;
3542	15		k = vector[j];
3543	15		result[i].left = index[j];
3544	15		result[i].right = index[k];
3545	15		index[k] = -i-1;
3546			}
3547	2		free(vector);
3548	2		free(index);
3549
3550	2		result = realloc(result, nnodes*sizeof(Node));
3551
3552	2		return result;
3553			}
3554			/* ******************************************************************** */
3555
3556	2		static Node* pmlcluster (int nelements, double** distmatrix)
3557			/*
3558
3559			Purpose
3560			=======
3561
3562			The pmlcluster routine performs clustering using pairwise maximum- (complete-)
3563			linking on the given distance matrix.
3564
3565			Arguments
3566			=========
3567
3568			nelements (input) int
3569			The number of elements to be clustered.
3570
3571			distmatrix (input) double**
3572			The distance matrix, with nelements rows, each row being filled up to the
3573			diagonal. The elements on the diagonal are not used, as they are assumed to be
3574			zero. The distance matrix will be modified by this routine.
3575
3576			Return value
3577			============
3578
3579			A pointer to a newly allocated array of Node structs, describing the
3580			hierarchical clustering solution consisting of nelements-1 nodes. Depending on
3581			whether genes (rows) or microarrays (columns) were clustered, nelements is
3582			equal to nrows or ncolumns. See src/cluster.h for a description of the Node
3583			structure.
3584			If a memory error occurs, pmlcluster returns NULL.
3585			========================================================================
3586			*/
3587			{ int j;
3588			int n;
3589			int* clusterid;
3590			Node* result;
3591
3592	2		clusterid = malloc(nelements*sizeof(int));
3593	2	50	if(!clusterid) return NULL;
3594	2		result = malloc((nelements-1)*sizeof(Node));
3595	2	50	if (!result)
3596	0		{ free(clusterid);
3597	0		return NULL;
3598			}
3599
3600			/* Setup a list specifying to which cluster a gene belongs */
3601	19	100	for (j = 0; j < nelements; j++) clusterid[j] = j;
3602
3603	17	100	for (n = nelements; n > 1; n--)
3604	15		{ int is = 1;
3605	15		int js = 0;
3606	15		result[nelements-n].distance = find_closest_pair(n, distmatrix, &is, &js);
3607
3608			/* Fix the distances */
3609	37	100	for (j = 0; j < js; j++)
3610	22	100	distmatrix[js][j] = max(distmatrix[is][j],distmatrix[js][j]);
3611	28	100	for (j = js+1; j < is; j++)
3612	13	100	distmatrix[j][js] = max(distmatrix[is][j],distmatrix[j][js]);
3613	49	100	for (j = is+1; j < n; j++)
3614	34	100	distmatrix[j][js] = max(distmatrix[j][is],distmatrix[j][js]);
3615
3616	65	100	for (j = 0; j < is; j++) distmatrix[is][j] = distmatrix[n-1][j];
3617	39	100	for (j = is+1; j < n-1; j++) distmatrix[j][is] = distmatrix[n-1][j];
3618
3619			/* Update clusterids */
3620	15		result[nelements-n].left = clusterid[is];
3621	15		result[nelements-n].right = clusterid[js];
3622	15		clusterid[js] = n-nelements-1;
3623	15		clusterid[is] = clusterid[n-1];
3624			}
3625	2		free(clusterid);
3626
3627	2		return result;
3628			}
3629
3630			/* ******************************************************************* */
3631
3632	2		static Node* palcluster (int nelements, double** distmatrix)
3633			/*
3634			Purpose
3635			=======
3636
3637			The palcluster routine performs clustering using pairwise average
3638			linking on the given distance matrix.
3639
3640			Arguments
3641			=========
3642
3643			nelements (input) int
3644			The number of elements to be clustered.
3645
3646			distmatrix (input) double**
3647			The distance matrix, with nelements rows, each row being filled up to the
3648			diagonal. The elements on the diagonal are not used, as they are assumed to be
3649			zero. The distance matrix will be modified by this routine.
3650
3651			Return value
3652			============
3653
3654			A pointer to a newly allocated array of Node structs, describing the
3655			hierarchical clustering solution consisting of nelements-1 nodes. Depending on
3656			whether genes (rows) or microarrays (columns) were clustered, nelements is
3657			equal to nrows or ncolumns. See src/cluster.h for a description of the Node
3658			structure.
3659			If a memory error occurs, palcluster returns NULL.
3660			========================================================================
3661			*/
3662			{ int j;
3663			int n;
3664			int* clusterid;
3665			int* number;
3666			Node* result;
3667
3668	2		clusterid = malloc(nelements*sizeof(int));
3669	2	50	if(!clusterid) return NULL;
3670	2		number = malloc(nelements*sizeof(int));
3671	2	50	if(!number)
3672	0		{ free(clusterid);
3673	0		return NULL;
3674			}
3675	2		result = malloc((nelements-1)*sizeof(Node));
3676	2	50	if (!result)
3677	0		{ free(clusterid);
3678	0		free(number);
3679	0		return NULL;
3680			}
3681
3682			/* Setup a list specifying to which cluster a gene belongs, and keep track
3683			* of the number of elements in each cluster (needed to calculate the
3684			* average). */
3685	19	100	for (j = 0; j < nelements; j++)
3686	17		{ number[j] = 1;
3687	17		clusterid[j] = j;
3688			}
3689
3690	17	100	for (n = nelements; n > 1; n--)
3691			{ int sum;
3692	15		int is = 1;
3693	15		int js = 0;
3694	15		result[nelements-n].distance = find_closest_pair(n, distmatrix, &is, &js);
3695
3696			/* Save result */
3697	15		result[nelements-n].left = clusterid[is];
3698	15		result[nelements-n].right = clusterid[js];
3699
3700			/* Fix the distances */
3701	15		sum = number[is] + number[js];
3702	39	100	for (j = 0; j < js; j++)
3703	48		{ distmatrix[js][j] = distmatrix[is][j]*number[is]
3704	24		+ distmatrix[js][j]*number[js];
3705	24		distmatrix[js][j] /= sum;
3706			}
3707	31	100	for (j = js+1; j < is; j++)
3708	32		{ distmatrix[j][js] = distmatrix[is][j]*number[is]
3709	16		+ distmatrix[j][js]*number[js];
3710	16		distmatrix[j][js] /= sum;
3711			}
3712	44	100	for (j = is+1; j < n; j++)
3713	58		{ distmatrix[j][js] = distmatrix[j][is]*number[is]
3714	29		+ distmatrix[j][js]*number[js];
3715	29		distmatrix[j][js] /= sum;
3716			}
3717
3718	70	100	for (j = 0; j < is; j++) distmatrix[is][j] = distmatrix[n-1][j];
3719	35	100	for (j = is+1; j < n-1; j++) distmatrix[j][is] = distmatrix[n-1][j];
3720
3721			/* Update number of elements in the clusters */
3722	15		number[js] = sum;
3723	15		number[is] = number[n-1];
3724
3725			/* Update clusterids */
3726	15		clusterid[js] = n-nelements-1;
3727	15		clusterid[is] = clusterid[n-1];
3728			}
3729	2		free(clusterid);
3730	2		free(number);
3731
3732	2		return result;
3733			}
3734
3735			/* ******************************************************************* */
3736
3737	8		Node* treecluster (int nrows, int ncolumns, double data, int mask,
3738			double weight[], int transpose, char dist, char method, double** distmatrix)
3739			/*
3740			Purpose
3741			=======
3742
3743			The treecluster routine performs hierarchical clustering using pairwise
3744			single-, maximum-, centroid-, or average-linkage, as defined by method, on a
3745			given set of gene expression data, using the distance metric given by dist.
3746			If successful, the function returns a pointer to a newly allocated Tree struct
3747			containing the hierarchical clustering solution, and NULL if a memory error
3748			occurs. The pointer should be freed by the calling routine to prevent memory
3749			leaks.
3750
3751			Arguments
3752			=========
3753
3754			nrows (input) int
3755			The number of rows in the data matrix, equal to the number of genes.
3756
3757			ncolumns (input) int
3758			The number of columns in the data matrix, equal to the number of microarrays.
3759
3760			data (input) double[nrows][ncolumns]
3761			The array containing the data of the vectors to be clustered.
3762
3763			mask (input) int[nrows][ncolumns]
3764			This array shows which data values are missing. If mask[i][j]==0, then
3765			data[i][j] is missing.
3766
3767			weight (input) double array[n]
3768			The weights that are used to calculate the distance.
3769
3770			transpose (input) int
3771			If transpose==0, the rows of the matrix are clustered. Otherwise, columns
3772			of the matrix are clustered.
3773
3774			dist (input) char
3775			Defines which distance measure is used, as given by the table:
3776			dist=='e': Euclidean distance
3777			dist=='b': City-block distance
3778			dist=='c': correlation
3779			dist=='a': absolute value of the correlation
3780			dist=='u': uncentered correlation
3781			dist=='x': absolute uncentered correlation
3782			dist=='s': Spearman's rank correlation
3783			dist=='k': Kendall's tau
3784			For other values of dist, the default (Euclidean distance) is used.
3785
3786			method (input) char
3787			Defines which hierarchical clustering method is used:
3788			method=='s': pairwise single-linkage clustering
3789			method=='m': pairwise maximum- (or complete-) linkage clustering
3790			method=='a': pairwise average-linkage clustering
3791			method=='c': pairwise centroid-linkage clustering
3792			For the first three, either the distance matrix or the gene expression data is
3793			sufficient to perform the clustering algorithm. For pairwise centroid-linkage
3794			clustering, however, the gene expression data are always needed, even if the
3795			distance matrix itself is available.
3796
3797			distmatrix (input) double**
3798			The distance matrix. If the distance matrix is zero initially, the distance
3799			matrix will be allocated and calculated from the data by treecluster, and
3800			deallocated before treecluster returns. If the distance matrix is passed by the
3801			calling routine, treecluster will modify the contents of the distance matrix as
3802			part of the clustering algorithm, but will not deallocate it. The calling
3803			routine should deallocate the distance matrix after the return from treecluster.
3804
3805			Return value
3806			============
3807
3808			A pointer to a newly allocated array of Node structs, describing the
3809			hierarchical clustering solution consisting of nelements-1 nodes. Depending on
3810			whether genes (rows) or microarrays (columns) were clustered, nelements is
3811			equal to nrows or ncolumns. See src/cluster.h for a description of the Node
3812			structure.
3813			If a memory error occurs, treecluster returns NULL.
3814
3815			========================================================================
3816			*/
3817	8		{ Node* result = NULL;
3818	8	50	const int nelements = (transpose==0) ? nrows : ncolumns;
3819	8	50	const int ldistmatrix = (distmatrix==NULL && method!='s') ? 1 : 0;
		100
3820
3821	8	50	if (nelements < 2) return NULL;
3822
3823			/* Calculate the distance matrix if the user didn't give it */
3824	8	100	if(ldistmatrix)
3825	6		{ distmatrix =
3826	6		distancematrix(nrows, ncolumns, data, mask, weight, dist, transpose);
3827	6	50	if (!distmatrix) return NULL; /* Insufficient memory */
3828			}
3829
3830	8		switch(method)
3831			{ case 's':
3832	2		result = pslcluster(nrows, ncolumns, data, mask, weight, distmatrix,
3833			dist, transpose);
3834	2		break;
3835			case 'm':
3836	2		result = pmlcluster(nelements, distmatrix);
3837	2		break;
3838			case 'a':
3839	2		result = palcluster(nelements, distmatrix);
3840	2		break;
3841			case 'c':
3842	2		result = pclcluster(nrows, ncolumns, data, mask, weight, distmatrix,
3843			dist, transpose);
3844	2		break;
3845			}
3846
3847			/* Deallocate space for distance matrix, if it was allocated by treecluster */
3848	8	100	if(ldistmatrix)
3849			{ int i;
3850	51	100	for (i = 1; i < nelements; i++) free(distmatrix[i]);
3851	6		free (distmatrix);
3852			}
3853
3854	8		return result;
3855			}
3856
3857			/* ******************************************************************* */
3858
3859	0		int sorttree(const int nnodes, Node* tree, const double order[], int indices[])
3860			/*
3861			Purpose
3862			=======
3863
3864			The sorttree routine sorts the items in a hierarchical clustering solution
3865			based on their order values, while remaining consistent with the hierchical
3866			clustering solution.
3867
3868			Arguments
3869			=========
3870
3871			nnodes (input) int
3872			The number of nodes in the hierarchical clustering tree.
3873
3874			tree (input) Node[nnodes]
3875			The hierarchical clustering tree describing the clustering solution.
3876
3877			order (input) double[nnodes+1]
3878			The preferred order of the items.
3879
3880			indices (output) int*
3881			The indices of each item after sorting, with item i appearing at indices[i]
3882			after sorting.
3883
3884			Return value
3885			============
3886
3887			If no errors occur, sorttree returns 1.
3888			If a memory error occurs, sorttree returns 0.
3889
3890			========================================================================
3891			*/
3892
3893			{ int i;
3894			int index;
3895			int i1, i2;
3896			double order1, order2;
3897			int counts1, counts2;
3898	0		int* nodecounts = malloc(nnodes*sizeof(int));
3899	0	0	if (!nodecounts) return 0;
3900	0	0	if (order) {
3901	0		double* nodeorder = malloc(nnodes*sizeof(double));
3902	0	0	if (!nodeorder) {
3903	0		free(nodecounts);
3904	0		return 0;
3905			}
3906	0	0	for (i = 0; i < nnodes; i++)
3907	0		{ i1 = tree[i].left;
3908	0		i2 = tree[i].right;
3909			/* i1 and i2 are the elements that are to be joined */
3910	0	0	if (i1 < 0)
3911	0		{ index = -i1-1;
3912	0		order1 = nodeorder[index];
3913	0		counts1 = nodecounts[index];
3914			}
3915			else
3916	0		{ order1 = order[i1];
3917	0		counts1 = 1;
3918			}
3919	0	0	if (i2 < 0)
3920	0		{ index = -i2-1;
3921	0		order2 = nodeorder[index];
3922	0		counts2 = nodecounts[index];
3923			}
3924			else
3925	0		{ order2 = order[i2];
3926	0		counts2 = 1;
3927			}
3928	0	0	if (order1 > order2) {
3929	0		tree[i].left = i2;
3930	0		tree[i].right = i1;
3931			}
3932	0		nodecounts[i] = counts1 + counts2;
3933	0		nodeorder[i] = (counts1order1 + counts2order2) / (counts1 + counts2);
3934			}
3935	0		free(nodeorder);
3936			}
3937			else
3938	0	0	{ for (i = 0; i < nnodes; i++)
3939	0		{ i1 = tree[i].left;
3940	0		i2 = tree[i].right;
3941			/* i1 and i2 are the elements that are to be joined */
3942	0	0	counts1 = (i1 < 0) ? nodecounts[-i1-1] : 1;
3943	0	0	counts2 = (i2 < 0) ? nodecounts[-i2-1] : 1;
3944	0		nodecounts[i] = counts1 + counts2;
3945			}
3946			}
3947	0		i--;
3948	0		nodecounts[i] = 0;
3949	0	0	for ( ; i >= 0; i--)
3950	0		{ i1 = tree[i].left;
3951	0		i2 = tree[i].right;
3952	0	0	counts1 = (i1<0) ? nodecounts[-i1-1] : 1;
3953	0		index = nodecounts[i];
3954	0	0	if (i1 >= 0) indices[index] = i1;
3955	0		else nodecounts[-i1-1] = index;
3956	0		index += counts1;
3957	0	0	if (i2 >= 0) indices[index] = i2;
3958	0		else nodecounts[-i2-1] = index;
3959			}
3960	0		free(nodecounts);
3961	0		return 1;
3962			}
3963
3964			/* ******************************************************************* */
3965
3966			static
3967	2		void somworker (int nrows, int ncolumns, double data, int mask,
3968			const double weights[], int transpose, int nxgrid, int nygrid,
3969			double inittau, double*** celldata, int niter, char dist)
3970
3971	2	50	{ const int nelements = (transpose==0) ? nrows : ncolumns;
3972	2	50	const int ndata = (transpose==0) ? ncolumns : nrows;
3973			int i, j;
3974	2		double* stddata = calloc(nelements,sizeof(double));
3975			int** dummymask;
3976			int ix, iy;
3977			int* index;
3978			int iter;
3979			/* Maximum radius in which nodes are adjusted */
3980	2		double maxradius = sqrt(nxgridnxgrid+nygridnygrid);
3981
3982			/* Set the metric function as indicated by dist */
3983	2		double (*metric)
3984			(int, double, double, int, int, const double[], int, int, int) =
3985	2		setmetric(dist);
3986
3987			/* Calculate the standard deviation for each row or column */
3988	2	50	if (transpose==0)
3989	19	100	{ for (i = 0; i < nelements; i++)
3990	17		{ int n = 0;
3991	63	100	for (j = 0; j < ndata; j++)
3992	46	50	{ if (mask[i][j])
3993	46		{ double term = data[i][j];
3994	46		term = term * term;
3995	46		stddata[i] += term;
3996	46		n++;
3997			}
3998			}
3999	17	50	if (stddata[i] > 0) stddata[i] = sqrt(stddata[i]/n);
4000	0		else stddata[i] = 1;
4001			}
4002			}
4003			else
4004	0	0	{ for (i = 0; i < nelements; i++)
4005	0		{ int n = 0;
4006	0	0	for (j = 0; j < ndata; j++)
4007	0	0	{ if (mask[j][i])
4008	0		{ double term = data[j][i];
4009	0		term = term * term;
4010	0		stddata[i] += term;
4011	0		n++;
4012			}
4013			}
4014	0	0	if (stddata[i] > 0) stddata[i] = sqrt(stddata[i]/n);
4015	0		else stddata[i] = 1;
4016			}
4017			}
4018
4019	2	50	if (transpose==0)
4020	2		{ dummymask = malloc(nygridsizeof(int));
4021	22	100	for (i = 0; i < nygrid; i++)
4022	20		{ dummymask[i] = malloc(ndata*sizeof(int));
4023	90	100	for (j = 0; j < ndata; j++) dummymask[i][j] = 1;
4024			}
4025			}
4026			else
4027	0		{ dummymask = malloc(ndatasizeof(int));
4028	0	0	for (i = 0; i < ndata; i++)
4029	0		{ dummymask[i] = malloc(sizeof(int));
4030	0		dummymask[i][0] = 1;
4031			}
4032			}
4033
4034			/* Randomly initialize the nodes */
4035	22	100	for (ix = 0; ix < nxgrid; ix++)
4036	220	100	{ for (iy = 0; iy < nygrid; iy++)
4037	200		{ double sum = 0.;
4038	900	100	for (i = 0; i < ndata; i++)
4039	700		{ double term = -1.0 + 2.0*uniform();
4040	700		celldata[ix][iy][i] = term;
4041	700		sum += term * term;
4042			}
4043	200		sum = sqrt(sum/ndata);
4044	900	100	for (i = 0; i < ndata; i++) celldata[ix][iy][i] /= sum;
4045			}
4046			}
4047
4048			/* Randomize the order in which genes or arrays will be used */
4049	2		index = malloc(nelements*sizeof(int));
4050	19	100	for (i = 0; i < nelements; i++) index[i] = i;
4051	19	100	for (i = 0; i < nelements; i++)
4052	17		{ j = (int) (i + (nelements-i)*uniform());
4053	17		ix = index[j];
4054	17		index[j] = index[i];
4055	17		index[i] = ix;
4056			}
4057
4058			/* Start the iteration */
4059	202	100	for (iter = 0; iter < niter; iter++)
4060	200		{ int ixbest = 0;
4061	200		int iybest = 0;
4062	200		int iobject = iter % nelements;
4063	200		iobject = index[iobject];
4064	200	50	if (transpose==0)
4065	200		{ double closest = metric(ndata,data,celldata[ixbest],
4066			mask,dummymask,weights,iobject,iybest,transpose);
4067	200		double radius = maxradius * (1. - ((double)iter)/((double)niter));
4068	200		double tau = inittau * (1. - ((double)iter)/((double)niter));
4069
4070	2200	100	for (ix = 0; ix < nxgrid; ix++)
4071	22000	100	{ for (iy = 0; iy < nygrid; iy++)
4072	20000		{ double distance =
4073	20000		metric (ndata,data,celldata[ix],
4074			mask,dummymask,weights,iobject,iy,transpose);
4075	20000	100	if (distance < closest)
4076	589		{ ixbest = ix;
4077	589		iybest = iy;
4078	589		closest = distance;
4079			}
4080			}
4081			}
4082	2200	100	for (ix = 0; ix < nxgrid; ix++)
4083	22000	100	{ for (iy = 0; iy < nygrid; iy++)
4084	20000	100	{ if (sqrt((ix-ixbest)(ix-ixbest)+(iy-iybest)(iy-iybest))
4085	13018		{ double sum = 0.;
4086	59142	100	for (i = 0; i < ndata; i++)
4087	46124	50	{ if (mask[iobject][i]==0) continue;
4088	46124		celldata[ix][iy][i] +=
4089	46124		tau * (data[iobject][i]/stddata[iobject]-celldata[ix][iy][i]);
4090			}
4091	59142	100	for (i = 0; i < ndata; i++)
4092	46124		{ double term = celldata[ix][iy][i];
4093	46124		term = term * term;
4094	46124		sum += term;
4095			}
4096	13018	50	if (sum>0)
4097	13018		{ sum = sqrt(sum/ndata);
4098	59142	100	for (i = 0; i < ndata; i++) celldata[ix][iy][i] /= sum;
4099			}
4100			}
4101			}
4102			}
4103			}
4104			else
4105			{ double closest;
4106	0		double** celldatavector = malloc(ndatasizeof(double));
4107	0		double radius = maxradius * (1. - ((double)iter)/((double)niter));
4108	0		double tau = inittau * (1. - ((double)iter)/((double)niter));
4109
4110	0	0	for (i = 0; i < ndata; i++)
4111	0		celldatavector[i] = &(celldata[ixbest][iybest][i]);
4112	0		closest = metric(ndata,data,celldatavector,
4113			mask,dummymask,weights,iobject,0,transpose);
4114	0	0	for (ix = 0; ix < nxgrid; ix++)
4115	0	0	{ for (iy = 0; iy < nygrid; iy++)
4116			{ double distance;
4117	0	0	for (i = 0; i < ndata; i++)
4118	0		celldatavector[i] = &(celldata[ixbest][iybest][i]);
4119	0		distance =
4120			metric (ndata,data,celldatavector,
4121			mask,dummymask,weights,iobject,0,transpose);
4122	0	0	if (distance < closest)
4123	0		{ ixbest = ix;
4124	0		iybest = iy;
4125	0		closest = distance;
4126			}
4127			}
4128			}
4129	0		free(celldatavector);
4130	0	0	for (ix = 0; ix < nxgrid; ix++)
4131	0	0	{ for (iy = 0; iy < nygrid; iy++)
4132	0	0	{ if (sqrt((ix-ixbest)(ix-ixbest)+(iy-iybest)(iy-iybest))
4133	0		{ double sum = 0.;
4134	0	0	for (i = 0; i < ndata; i++)
4135	0	0	{ if (mask[i][iobject]==0) continue;
4136	0		celldata[ix][iy][i] +=
4137	0		tau * (data[i][iobject]/stddata[iobject]-celldata[ix][iy][i]);
4138			}
4139	0	0	for (i = 0; i < ndata; i++)
4140	0		{ double term = celldata[ix][iy][i];
4141	0		term = term * term;
4142	0		sum += term;
4143			}
4144	0	0	if (sum>0)
4145	0		{ sum = sqrt(sum/ndata);
4146	0	0	for (i = 0; i < ndata; i++) celldata[ix][iy][i] /= sum;
4147			}
4148			}
4149			}
4150			}
4151			}
4152			}
4153	2	50	if (transpose==0)
4154	22	100	for (i = 0; i < nygrid; i++) free(dummymask[i]);
4155			else
4156	0	0	for (i = 0; i < ndata; i++) free(dummymask[i]);
4157	2		free(dummymask);
4158	2		free(stddata);
4159	2		free(index);
4160	2		return;
4161			}
4162
4163			/* ******************************************************************* */
4164
4165			static
4166	2		void somassign (int nrows, int ncolumns, double data, int mask,
4167			const double weights[], int transpose, int nxgrid, int nygrid,
4168			double*** celldata, char dist, int clusterid[][2])
4169			/* Collect clusterids */
4170	2	50	{ const int ndata = (transpose==0) ? ncolumns : nrows;
4171			int i,j;
4172
4173			/* Set the metric function as indicated by dist */
4174	2		double (*metric)
4175			(int, double, double, int, int, const double[], int, int, int) =
4176	2		setmetric(dist);
4177
4178	2	50	if (transpose==0)
4179	2		{ int** dummymask = malloc(nygridsizeof(int));
4180	22	100	for (i = 0; i < nygrid; i++)
4181	20		{ dummymask[i] = malloc(ncolumns*sizeof(int));
4182	90	100	for (j = 0; j < ncolumns; j++) dummymask[i][j] = 1;
4183			}
4184	19	100	for (i = 0; i < nrows; i++)
4185	17		{ int ixbest = 0;
4186	17		int iybest = 0;
4187	17		double closest = metric(ndata,data,celldata[ixbest],
4188			mask,dummymask,weights,i,iybest,transpose);
4189			int ix, iy;
4190	187	100	for (ix = 0; ix < nxgrid; ix++)
4191	1870	100	{ for (iy = 0; iy < nygrid; iy++)
4192	1700		{ double distance =
4193	1700		metric (ndata,data,celldata[ix],
4194			mask,dummymask,weights,i,iy,transpose);
4195	1700	100	if (distance < closest)
4196	57		{ ixbest = ix;
4197	57		iybest = iy;
4198	57		closest = distance;
4199			}
4200			}
4201			}
4202	17		clusterid[i][0] = ixbest;
4203	17		clusterid[i][1] = iybest;
4204			}
4205	22	100	for (i = 0; i < nygrid; i++) free(dummymask[i]);
4206	2		free(dummymask);
4207			}
4208			else
4209	0		{ double** celldatavector = malloc(ndatasizeof(double));
4210	0		int** dummymask = malloc(nrowssizeof(int));
4211	0		int ixbest = 0;
4212	0		int iybest = 0;
4213	0	0	for (i = 0; i < nrows; i++)
4214	0		{ dummymask[i] = malloc(sizeof(int));
4215	0		dummymask[i][0] = 1;
4216			}
4217	0	0	for (i = 0; i < ncolumns; i++)
4218			{ double closest;
4219			int ix, iy;
4220	0	0	for (j = 0; j < ndata; j++)
4221	0		celldatavector[j] = &(celldata[ixbest][iybest][j]);
4222	0		closest = metric(ndata,data,celldatavector,
4223			mask,dummymask,weights,i,0,transpose);
4224	0	0	for (ix = 0; ix < nxgrid; ix++)
4225	0	0	{ for (iy = 0; iy < nygrid; iy++)
4226			{ double distance;
4227	0	0	for(j = 0; j < ndata; j++)
4228	0		celldatavector[j] = &(celldata[ix][iy][j]);
4229	0		distance = metric(ndata,data,celldatavector,
4230			mask,dummymask,weights,i,0,transpose);
4231	0	0	if (distance < closest)
4232	0		{ ixbest = ix;
4233	0		iybest = iy;
4234	0		closest = distance;
4235			}
4236			}
4237			}
4238	0		clusterid[i][0] = ixbest;
4239	0		clusterid[i][1] = iybest;
4240			}
4241	0		free(celldatavector);
4242	0	0	for (i = 0; i < nrows; i++) free(dummymask[i]);
4243	0		free(dummymask);
4244			}
4245	2		return;
4246			}
4247
4248			/* ******************************************************************* */
4249
4250	2		void somcluster (int nrows, int ncolumns, double data, int mask,
4251			const double weight[], int transpose, int nxgrid, int nygrid,
4252			double inittau, int niter, char dist, double*** celldata, int clusterid[][2])
4253			/*
4254
4255			Purpose
4256			=======
4257
4258			The somcluster routine implements a self-organizing map (Kohonen) on a
4259			rectangular grid, using a given set of vectors. The distance measure to be
4260			used to find the similarity between genes and nodes is given by dist.
4261
4262			Arguments
4263			=========
4264
4265			nrows (input) int
4266			The number of rows in the data matrix, equal to the number of genes.
4267
4268			ncolumns (input) int
4269			The number of columns in the data matrix, equal to the number of microarrays.
4270
4271			data (input) double[nrows][ncolumns]
4272			The array containing the gene expression data.
4273
4274			mask (input) int[nrows][ncolumns]
4275			This array shows which data values are missing. If
4276			mask[i][j] == 0, then data[i][j] is missing.
4277
4278			weights (input) double[ncolumns] if transpose==0;
4279			double[nrows] if transpose==1
4280			The weights that are used to calculate the distance. The length of this vector
4281			is ncolumns if genes are being clustered, or nrows if microarrays are being
4282			clustered.
4283
4284			transpose (input) int
4285			If transpose==0, the rows (genes) of the matrix are clustered. Otherwise,
4286			columns (microarrays) of the matrix are clustered.
4287
4288			nxgrid (input) int
4289			The number of grid cells horizontally in the rectangular topology of clusters.
4290
4291			nygrid (input) int
4292			The number of grid cells horizontally in the rectangular topology of clusters.
4293
4294			inittau (input) double
4295			The initial value of tau, representing the neighborhood function.
4296
4297			niter (input) int
4298			The number of iterations to be performed.
4299
4300			dist (input) char
4301			Defines which distance measure is used, as given by the table:
4302			dist=='e': Euclidean distance
4303			dist=='b': City-block distance
4304			dist=='c': correlation
4305			dist=='a': absolute value of the correlation
4306			dist=='u': uncentered correlation
4307			dist=='x': absolute uncentered correlation
4308			dist=='s': Spearman's rank correlation
4309			dist=='k': Kendall's tau
4310			For other values of dist, the default (Euclidean distance) is used.
4311
4312			celldata (output) double[nxgrid][nygrid][ncolumns] if transpose==0;
4313			double[nxgrid][nygrid][nrows] if tranpose==1
4314			The gene expression data for each node (cell) in the 2D grid. This can be
4315			interpreted as the centroid for the cluster corresponding to that cell. If
4316			celldata is NULL, then the centroids are not returned. If celldata is not
4317			NULL, enough space should be allocated to store the centroid data before callingsomcluster.
4318
4319			clusterid (output), int[nrows][2] if transpose==0;
4320			int[ncolumns][2] if transpose==1
4321			For each item (gene or microarray) that is clustered, the coordinates of the
4322			cell in the 2D grid to which the item was assigned. If clusterid is NULL, the
4323			cluster assignments are not returned. If clusterid is not NULL, enough memory
4324			should be allocated to store the clustering information before calling
4325			somcluster.
4326
4327			========================================================================
4328			*/
4329	2	50	{ const int nobjects = (transpose==0) ? nrows : ncolumns;
4330	2	50	const int ndata = (transpose==0) ? ncolumns : nrows;
4331			int i,j;
4332	2		const int lcelldata = (celldata==NULL) ? 0 : 1;
4333
4334	2	50	if (nobjects < 2) return;
4335
4336	2	50	if (lcelldata==0)
4337	2		{ celldata = malloc(nxgridnygridndatasizeof(double*));
4338	22	100	for (i = 0; i < nxgrid; i++)
4339	20		{ celldata[i] = malloc(nygridndatasizeof(double*));
4340	220	100	for (j = 0; j < nygrid; j++)
4341	200		celldata[i][j] = malloc(ndata*sizeof(double));
4342			}
4343			}
4344
4345	2		somworker (nrows, ncolumns, data, mask, weight, transpose, nxgrid, nygrid,
4346			inittau, celldata, niter, dist);
4347	2	50	if (clusterid)
4348	2		somassign (nrows, ncolumns, data, mask, weight, transpose,
4349			nxgrid, nygrid, celldata, dist, clusterid);
4350	2	50	if(lcelldata==0)
4351	22	100	{ for (i = 0; i < nxgrid; i++)
4352	220	100	for (j = 0; j < nygrid; j++)
4353	200		free(celldata[i][j]);
4354	22	100	for (i = 0; i < nxgrid; i++)
4355	20		free(celldata[i]);
4356	2		free(celldata);
4357			}
4358	2		return;
4359			}
4360
4361			/* ******************************************************************** */
4362
4363	46		double clusterdistance (int nrows, int ncolumns, double** data,
4364			int** mask, double weight[], int n1, int n2, int index1[], int index2[],
4365			char dist, char method, int transpose)
4366
4367			/*
4368			Purpose
4369			=======
4370
4371			The clusterdistance routine calculates the distance between two clusters
4372			containing genes or microarrays using the measured gene expression vectors. The
4373			distance between clusters, given the genes/microarrays in each cluster, can be
4374			defined in several ways. Several distance measures can be used.
4375
4376			The routine returns the distance in double precision.
4377			If the parameter transpose is set to a nonzero value, the clusters are
4378			interpreted as clusters of microarrays, otherwise as clusters of gene.
4379
4380			Arguments
4381			=========
4382
4383			nrows (input) int
4384			The number of rows (i.e., the number of genes) in the gene expression data
4385			matrix.
4386
4387			ncolumns (input) int
4388			The number of columns (i.e., the number of microarrays) in the gene expression
4389			data matrix.
4390
4391			data (input) double[nrows][ncolumns]
4392			The array containing the data of the vectors.
4393
4394			mask (input) int[nrows][ncolumns]
4395			This array shows which data values are missing. If mask[i][j]==0, then
4396			data[i][j] is missing.
4397
4398			weight (input) double[ncolumns] if transpose==0;
4399			double[nrows] if transpose==1
4400			The weights that are used to calculate the distance.
4401
4402			n1 (input) int
4403			The number of elements in the first cluster.
4404
4405			n2 (input) int
4406			The number of elements in the second cluster.
4407
4408			index1 (input) int[n1]
4409			Identifies which genes/microarrays belong to the first cluster.
4410
4411			index2 (input) int[n2]
4412			Identifies which genes/microarrays belong to the second cluster.
4413
4414			dist (input) char
4415			Defines which distance measure is used, as given by the table:
4416			dist=='e': Euclidean distance
4417			dist=='b': City-block distance
4418			dist=='c': correlation
4419			dist=='a': absolute value of the correlation
4420			dist=='u': uncentered correlation
4421			dist=='x': absolute uncentered correlation
4422			dist=='s': Spearman's rank correlation
4423			dist=='k': Kendall's tau
4424			For other values of dist, the default (Euclidean distance) is used.
4425
4426			method (input) char
4427			Defines how the distance between two clusters is defined, given which genes
4428			belong to which cluster:
4429			method=='a': the distance between the arithmetic means of the two clusters
4430			method=='m': the distance between the medians of the two clusters
4431			method=='s': the smallest pairwise distance between members of the two clusters
4432			method=='x': the largest pairwise distance between members of the two clusters
4433			method=='v': average of the pairwise distances between members of the clusters
4434
4435			transpose (input) int
4436			If transpose is equal to zero, the distances between the rows is
4437			calculated. Otherwise, the distances between the columns is calculated.
4438			The former is needed when genes are being clustered; the latter is used
4439			when microarrays are being clustered.
4440
4441			========================================================================
4442			*/
4443			{ /* Set the metric function as indicated by dist */
4444	46		double (*metric)
4445			(int, double, double, int, int, const double[], int, int, int) =
4446	46		setmetric(dist);
4447
4448			/* if one or both clusters are empty, return */
4449	46	50	if (n1 < 1 \|\| n2 < 1) return -1.0;
		50
4450			/* Check the indices */
4451	46	50	if (transpose==0)
4452			{ int i;
4453	102	100	for (i = 0; i < n1; i++)
4454	56		{ int index = index1[i];
4455	56	50	if (index < 0 \|\| index >= nrows) return -1.0;
		50
4456			}
4457	136	100	for (i = 0; i < n2; i++)
4458	90		{ int index = index2[i];
4459	90	50	if (index < 0 \|\| index >= nrows) return -1.0;
		50
4460			}
4461			}
4462			else
4463			{ int i;
4464	0	0	for (i = 0; i < n1; i++)
4465	0		{ int index = index1[i];
4466	0	0	if (index < 0 \|\| index >= ncolumns) return -1.0;
		0
4467			}
4468	0	0	for (i = 0; i < n2; i++)
4469	0		{ int index = index2[i];
4470	0	0	if (index < 0 \|\| index >= ncolumns) return -1.0;
		0
4471			}
4472			}
4473
4474	46		switch (method)
4475			{ case 'a':
4476			{ /* Find the center */
4477			int i,j,k;
4478	14	50	if (transpose==0)
4479			{ double distance;
4480			double* cdata[2];
4481			int* cmask[2];
4482			int* count[2];
4483	14		count[0] = calloc(ncolumns,sizeof(int));
4484	14		count[1] = calloc(ncolumns,sizeof(int));
4485	14		cdata[0] = calloc(ncolumns,sizeof(double));
4486	14		cdata[1] = calloc(ncolumns,sizeof(double));
4487	14		cmask[0] = malloc(ncolumns*sizeof(int));
4488	14		cmask[1] = malloc(ncolumns*sizeof(int));
4489	38	100	for (i = 0; i < n1; i++)
4490	24		{ k = index1[i];
4491	92	100	for (j = 0; j < ncolumns; j++)
4492	68	50	if (mask[k][j] != 0)
4493	68		{ cdata[0][j] = cdata[0][j] + data[k][j];
4494	68		count[0][j] = count[0][j] + 1;
4495			}
4496			}
4497	40	100	for (i = 0; i < n2; i++)
4498	26		{ k = index2[i];
4499	106	100	for (j = 0; j < ncolumns; j++)
4500	80	50	if (mask[k][j] != 0)
4501	80		{ cdata[1][j] = cdata[1][j] + data[k][j];
4502	80		count[1][j] = count[1][j] + 1;
4503			}
4504			}
4505	42	100	for (i = 0; i < 2; i++)
4506	118	100	for (j = 0; j < ncolumns; j++)
4507	90	50	{ if (count[i][j]>0)
4508	90		{ cdata[i][j] = cdata[i][j] / count[i][j];
4509	90		cmask[i][j] = 1;
4510			}
4511			else
4512	0		cmask[i][j] = 0;
4513			}
4514	14		distance =
4515			metric (ncolumns,cdata,cdata,cmask,cmask,weight,0,1,0);
4516	42	100	for (i = 0; i < 2; i++)
4517	28		{ free (cdata[i]);
4518	28		free (cmask[i]);
4519	28		free (count[i]);
4520			}
4521	14		return distance;
4522			}
4523			else
4524			{ double distance;
4525	0		int** count = malloc(nrowssizeof(int));
4526	0		double** cdata = malloc(nrowssizeof(double));
4527	0		int** cmask = malloc(nrowssizeof(int));
4528	0	0	for (i = 0; i < nrows; i++)
4529	0		{ count[i] = calloc(2,sizeof(int));
4530	0		cdata[i] = calloc(2,sizeof(double));
4531	0		cmask[i] = malloc(2*sizeof(int));
4532			}
4533	0	0	for (i = 0; i < n1; i++)
4534	0		{ k = index1[i];
4535	0	0	for (j = 0; j < nrows; j++)
4536	0	0	{ if (mask[j][k] != 0)
4537	0		{ cdata[j][0] = cdata[j][0] + data[j][k];
4538	0		count[j][0] = count[j][0] + 1;
4539			}
4540			}
4541			}
4542	0	0	for (i = 0; i < n2; i++)
4543	0		{ k = index2[i];
4544	0	0	for (j = 0; j < nrows; j++)
4545	0	0	{ if (mask[j][k] != 0)
4546	0		{ cdata[j][1] = cdata[j][1] + data[j][k];
4547	0		count[j][1] = count[j][1] + 1;
4548			}
4549			}
4550			}
4551	0	0	for (i = 0; i < nrows; i++)
4552	0	0	for (j = 0; j < 2; j++)
4553	0	0	if (count[i][j]>0)
4554	0		{ cdata[i][j] = cdata[i][j] / count[i][j];
4555	0		cmask[i][j] = 1;
4556			}
4557			else
4558	0		cmask[i][j] = 0;
4559	0		distance = metric (nrows,cdata,cdata,cmask,cmask,weight,0,1,1);
4560	0	0	for (i = 0; i < nrows; i++)
4561	0		{ free (count[i]);
4562	0		free (cdata[i]);
4563	0		free (cmask[i]);
4564			}
4565	0		free (count);
4566	0		free (cdata);
4567	0		free (cmask);
4568	0		return distance;
4569			}
4570			}
4571			case 'm':
4572			{ int i, j, k;
4573	8	50	if (transpose==0)
4574			{ double distance;
4575	8		double* temp = malloc(nrows*sizeof(double));
4576			double* cdata[2];
4577			int* cmask[2];
4578	24	100	for (i = 0; i < 2; i++)
4579	16		{ cdata[i] = malloc(ncolumns*sizeof(double));
4580	16		cmask[i] = malloc(ncolumns*sizeof(int));
4581			}
4582	32	100	for (j = 0; j < ncolumns; j++)
4583	24		{ int count = 0;
4584	48	100	for (k = 0; k < n1; k++)
4585	24		{ i = index1[k];
4586	24	50	if (mask[i][j])
4587	24		{ temp[count] = data[i][j];
4588	24		count++;
4589			}
4590			}
4591	24	50	if (count>0)
4592	24		{ cdata[0][j] = median (count,temp);
4593	24		cmask[0][j] = 1;
4594			}
4595			else
4596	0		{ cdata[0][j] = 0.;
4597	0		cmask[0][j] = 0;
4598			}
4599			}
4600	32	100	for (j = 0; j < ncolumns; j++)
4601	24		{ int count = 0;
4602	72	100	for (k = 0; k < n2; k++)
4603	48		{ i = index2[k];
4604	48	50	if (mask[i][j])
4605	48		{ temp[count] = data[i][j];
4606	48		count++;
4607			}
4608			}
4609	24	50	if (count>0)
4610	24		{ cdata[1][j] = median (count,temp);
4611	24		cmask[1][j] = 1;
4612			}
4613			else
4614	0		{ cdata[1][j] = 0.;
4615	0		cmask[1][j] = 0;
4616			}
4617			}
4618	8		distance = metric (ncolumns,cdata,cdata,cmask,cmask,weight,0,1,0);
4619	24	100	for (i = 0; i < 2; i++)
4620	16		{ free (cdata[i]);
4621	16		free (cmask[i]);
4622			}
4623	8		free(temp);
4624	8		return distance;
4625			}
4626			else
4627			{ double distance;
4628	0		double* temp = malloc(ncolumns*sizeof(double));
4629	0		double** cdata = malloc(nrowssizeof(double));
4630	0		int** cmask = malloc(nrowssizeof(int));
4631	0	0	for (i = 0; i < nrows; i++)
4632	0		{ cdata[i] = malloc(2*sizeof(double));
4633	0		cmask[i] = malloc(2*sizeof(int));
4634			}
4635	0	0	for (j = 0; j < nrows; j++)
4636	0		{ int count = 0;
4637	0	0	for (k = 0; k < n1; k++)
4638	0		{ i = index1[k];
4639	0	0	if (mask[j][i])
4640	0		{ temp[count] = data[j][i];
4641	0		count++;
4642			}
4643			}
4644	0	0	if (count>0)
4645	0		{ cdata[j][0] = median (count,temp);
4646	0		cmask[j][0] = 1;
4647			}
4648			else
4649	0		{ cdata[j][0] = 0.;
4650	0		cmask[j][0] = 0;
4651			}
4652			}
4653	0	0	for (j = 0; j < nrows; j++)
4654	0		{ int count = 0;
4655	0	0	for (k = 0; k < n2; k++)
4656	0		{ i = index2[k];
4657	0	0	if (mask[j][i])
4658	0		{ temp[count] = data[j][i];
4659	0		count++;
4660			}
4661			}
4662	0	0	if (count>0)
4663	0		{ cdata[j][1] = median (count,temp);
4664	0		cmask[j][1] = 1;
4665			}
4666			else
4667	0		{ cdata[j][1] = 0.;
4668	0		cmask[j][1] = 0;
4669			}
4670			}
4671	0		distance = metric (nrows,cdata,cdata,cmask,cmask,weight,0,1,1);
4672	0	0	for (i = 0; i < nrows; i++)
4673	0		{ free (cdata[i]);
4674	0		free (cmask[i]);
4675			}
4676	0		free(cdata);
4677	0		free(cmask);
4678	0		free(temp);
4679	0		return distance;
4680			}
4681			}
4682			case 's':
4683			{ int i1, i2, j1, j2;
4684	8	50	const int n = (transpose==0) ? ncolumns : nrows;
4685	8		double mindistance = DBL_MAX;
4686	16	100	for (i1 = 0; i1 < n1; i1++)
4687	24	100	for (i2 = 0; i2 < n2; i2++)
4688			{ double distance;
4689	16		j1 = index1[i1];
4690	16		j2 = index2[i2];
4691	16		distance = metric (n,data,data,mask,mask,weight,j1,j2,transpose);
4692	16	100	if (distance < mindistance) mindistance = distance;
4693			}
4694	8		return mindistance;
4695			}
4696			case 'x':
4697			{ int i1, i2, j1, j2;
4698	8	50	const int n = (transpose==0) ? ncolumns : nrows;
4699	8		double maxdistance = 0;
4700	16	100	for (i1 = 0; i1 < n1; i1++)
4701	24	100	for (i2 = 0; i2 < n2; i2++)
4702			{ double distance;
4703	16		j1 = index1[i1];
4704	16		j2 = index2[i2];
4705	16		distance = metric (n,data,data,mask,mask,weight,j1,j2,transpose);
4706	16	100	if (distance > maxdistance) maxdistance = distance;
4707			}
4708	8		return maxdistance;
4709			}
4710			case 'v':
4711			{ int i1, i2, j1, j2;
4712	8	50	const int n = (transpose==0) ? ncolumns : nrows;
4713	8		double distance = 0;
4714	16	100	for (i1 = 0; i1 < n1; i1++)
4715	24	100	for (i2 = 0; i2 < n2; i2++)
4716	16		{ j1 = index1[i1];
4717	16		j2 = index2[i2];
4718	16		distance += metric (n,data,data,mask,mask,weight,j1,j2,transpose);
4719			}
4720	8		distance /= (n1*n2);
4721	8		return distance;
4722			}
4723			}
4724			/* Never get here */
4725	0		return -2.0;
4726			}