File Coverage

blib/lib/Statistics/CalinskiHarabasz.pm

Criterion	Covered	Total	%
statement	12	123	9.7
branch	0	24	0.0
condition			n/a
subroutine	4	7	57.1
pod	0	3	0.0
total	16	157	10.1

line	stmt	bran	sub	pod	time	code
1						package Statistics::CalinskiHarabasz;
2
3	1		1		21972	use 5.008005;
	1				4
	1				49
4	1		1		8	use strict;
	1				2
	1				38
5	1		1		5	use warnings;
	1				7
	1				51
6
7						require Exporter;
8	1		1		1059	use AutoLoader qw(AUTOLOAD);
	1				1806
	1				6
9
10						our @ISA = qw(Exporter);
11
12						our @EXPORT = qw( ch );
13
14						our $VERSION = '0.01';
15
16						# global variable
17						my @d = ();
18						my $g_mean = 0;
19						my $rcnt = 0;
20
21						sub ch
22						{
23						# Input params
24	0		0	0		my $matrixfile = shift;
25	0					my $clustmtd = shift;
26	0					my $K = shift;
27
28	0					my $i = 0;
29	0					my $j = 0;
30
31						# Read the matrix file into a 2 dimensional array.
32	0					my @inpmat = ();
33	0	0				open(INP,"<$matrixfile") \|\| die "Error opening input matrix file!";
34
35						# Extract the number of rows from the first line in the file.
36	0					my $ccnt = 0;
37	0					my $line;
38
39	0					$line = ;
40	0					chomp($line);
41	0					$line=~s/\s+/ /;
42
43	0					($rcnt,$ccnt) = split(/\s+/,$line);
44
45						# Not a valid condition:
46						# If maximum number of clusters requested (k) is greater than the
47						# number of observations.
48	0	0				if($K > $rcnt)
49						{
50	0					print STDERR "The K value ($K) cannot be greater than the number of observations present in the input data ($rcnt). \n";
51	0					exit 1;
52						}
53
54						# Copy the complete matrix to a 2D array
55	0					while()
56						{
57						# remove the newline at the end of the input line
58	0					chomp;
59
60						# skip empty lines
61	0	0				if(m/^\s\s\s*$/)
62						{
63	0					next;
64						}
65
66						# remove leading white spaces
67	0					s/^\s+//;
68
69						# seperate individual values in a line
70	0					my @tmp = ();
71	0					@tmp = split(/\s+/);
72
73						# populate them into the 2D matrix
74	0					push @inpmat, [ @tmp ];
75						}
76
77	0					close INP;
78
79	0					my @row1 = ();
80	0					my @row2 = ();
81	0					my $acc = 0;
82
83						# Calculate all possible unique pairwise distances between the vectors
84	0					for($i = 0; $i < $rcnt; $i++)
85						{
86						# for all the rows in the cluster
87	0					for($j = $i+1; $j < $rcnt; $j++)
88						{
89	0					@row1 = @{$inpmat[$i]};
	0
90	0					@row2 = @{$inpmat[$j]};
	0
91	0					$d[$i][$j] = &dist_euclidean_sqr(\@row1, \@row2);
92	0					$acc += $d[$i][$j];
93						}
94						}
95
96						# Calculate general mean (d^2)
97	0					$g_mean = ($acc * 2)/($rcnt * ($rcnt - 1));
98
99						# Calculate mean for each cluster
100						# Calculate Ak
101						# Calculate VRC (Variance Ratio Criterion)
102
103						# For each K
104	0					my $k = 0;
105	0					my @VRC = ();
106
107	0					for($k=2; $k<=$K; $k++)
108						{
109						# avoid the case K = #ofContexts because then the denominator of VRC (n-k)
110						# become 0 and gives "division by 0" error.
111	0	0				if($k == $rcnt)
112						{
113	0					last;
114						}
115
116	0					my $lineNo = 0;
117	0					my %hash = ();
118
119						# Cluster the input dataset into k clusters
120	0					my $out_filename = "tmp.op" . $k . time();
121	0					my $status = 0;
122
123	0					$status = system("vcluster --clmethod $clustmtd $matrixfile $k >& $out_filename ");
124	0	0				die "Error running vcluster \n" unless $status==0;
125
126						# read the clustering output file
127	0	0				open(CO,"<$matrixfile.clustering.$k") \|\| die "Error opening clustering output file.";
128
129	0					my $clust = 0;
130	0					while($clust = )
131						{
132						# hash on the cluster# and append the observation#
133	0					chomp($clust);
134	0	0				if(exists $hash{$clust})
135						{
136	0					$hash{$clust} .= " $lineNo";
137						}
138						else
139						{
140	0					$hash{$clust} = $lineNo;
141						}
142
143						# increment the line number
144	0					$lineNo++;
145						}
146
147	0					close CO;
148
149						# Calculate the "Within Cluster Dispersion Measure / Error Measure" Wk
150						# for given matrix and k value.
151	0					$VRC[$k] = &variance_ratio(\%hash,$k);
152
153	0					unlink "$out_filename","$matrixfile.clustering.$k";
154						}
155
156						# Calculate smallest k for which VRC is maximum
157	0					my $max = 0;
158	0					my $ans = 0;
159	0					for($k=2; $k<=$K; $k++)
160						{
161						# avoid the case K = #ofContexts because then the denominator of VRC (n-k)
162						# become 0 and gives "division by 0" error.
163	0	0				if($k == $rcnt)
164						{
165	0					last;
166						}
167	0	0				if($VRC[$k] > $max)
168						{
169	0					$max = $VRC[$k];
170	0					$ans = $k;
171						}
172						}
173	0					print "$ans\n";
174						}
175
176						sub dist_euclidean_sqr
177						{
178						# arguments
179	0		0	0		my @i = @{(shift)};
	0
180	0					my @j = @{(shift)};
	0
181
182						# local variables
183	0					my $a;
184	0					my $dist = 0;
185	0					my $retvalue = 0;
186
187						# Squared Euclidean measure
188						# summation on all j (xij - xi'j)^2 where i, i' are the rows indicies.
189	0					for $a (0 .. $#i)
190						{
191	0					$dist += (($i[$a] - $j[$a])**2);
192						}
193
194	0					$retvalue = sprintf("%.4f",$dist);
195	0					return $retvalue;
196						}
197
198						sub variance_ratio
199						{
200						# Input arguments
201	0		0	0		my %clustout = %{(shift)};
	0
202	0					my $k = shift;
203
204						# Local variables
205	0					my $i;
206						my $j;
207	0					my @rownum;
208	0					my $key;
209	0					my $row1;
210	0					my $row2;
211	0					my $VRC = 0;
212	0					my @D = ();
213	0					my $tmp;
214	0					my $c_mean = ();
215	0					my $A = 0;
216
217						# For each cluster
218	0					foreach $key (sort keys %clustout)
219						{
220	0					$D[$key] = 0;
221
222	0					@rownum = split(/\s+/,$clustout{$key});
223
224						# for each instance in the cluster
225	0					for($i = 0; $i < $#rownum; $i++)
226						{
227						# for all the rows in the cluster
228	0					for($j = $i+1; $j <= $#rownum; $j++)
229						{
230						# find the distance between the 2 rows of the matrix.
231	0					$row1 = $rownum[$i];
232	0					$row2 = $rownum[$j];
233
234						# store the Dr value
235	0	0				if(exists $d[$row1][$row2])
236						{
237	0					$D[$key] += $d[$row1][$row2];
238						}
239						else
240						{
241	0					$D[$key] += $d[$row2][$row1];
242						}
243						}
244						}
245
246						# Calculate individual cluster mean
247	0	0				if($#rownum == 0)
248						{
249	0					$c_mean = 0;
250						}
251						else
252						{
253	0					$c_mean = ($D[$key] * 2)/(($#rownum + 1) * $#rownum);
254						}
255
256	0					$A += $#rownum * ($g_mean - $c_mean);
257						}
258
259	0					$A = $A/($rcnt - $k);
260
261	0	0				if($g_mean == $A)
262						{
263	0					$VRC = 99999;
264						}
265						else
266						{
267	0					$VRC = ( $g_mean + ($rcnt - $k) / ($k-1) * $A ) / ( $g_mean - $A );
268						}
269	0					return $VRC;
270						}
271
272						1;
273						__END__