File Coverage

blib/lib/Statistics/Hartigan.pm

Criterion	Covered	Total	%
statement	12	116	10.3
branch	0	20	0.0
condition			n/a
subroutine	4	7	57.1
pod	0	3	0.0
total	16	146	10.9

line	stmt	bran	sub	pod	time	code
1						package Statistics::Hartigan;
2
3	1		1		32580	use 5.008005;
	1				4
	1				50
4	1		1		7	use strict;
	1				2
	1				37
5	1		1		5	use warnings;
	1				8
	1				61
6
7						require Exporter;
8	1		1		1155	use AutoLoader qw(AUTOLOAD);
	1				2234
	1				5
9
10						our @ISA = qw(Exporter);
11
12						our @EXPORT = qw( hartigan );
13
14						our $VERSION = '0.01';
15
16						# global variable
17						my @H = ();
18						my @d = ();
19						my @W = ();
20						my $rcnt = 0;
21
22						sub hartigan
23						{
24						# Input params
25	0		0	0		my $matrixfile = shift;
26	0					my $clustmtd = shift;
27	0					my $K = shift;
28	0					my $threshold = shift;
29
30	0					my $i = 0;
31	0					my $j = 0;
32
33						# Read the matrix file into a 2 dimensional array.
34	0					my @inpmat = ();
35	0	0				open(INP,"<$matrixfile") \|\| die "Error opening input matrix file!";
36
37						# Extract the number of rows from the first line in the file.
38	0					my $ccnt = 0;
39	0					my $line;
40
41	0					$line = ;
42	0					chomp($line);
43	0					$line=~s/\s+/ /;
44
45	0					($rcnt,$ccnt) = split(/\s+/,$line);
46
47						# Not a valid condition:
48						# If maximum number of clusters requested (k) is greater than the
49						# number of observations.
50	0	0				if($K > $rcnt)
51						{
52	0					print STDERR "The K value ($K) cannot be greater than the number of observations present in the input data ($rcnt). \n";
53	0					exit 1;
54						}
55
56						# Copy the complete matrix to a 2D array
57	0					while()
58						{
59						# remove the newline at the end of the input line
60	0					chomp;
61
62						# skip empty lines
63	0	0				if(m/^\s\s\s*$/)
64						{
65	0					next;
66						}
67
68						# remove leading white spaces
69	0					s/^\s+//;
70
71						# seperate individual values in a line
72	0					my @tmp = ();
73	0					@tmp = split(/\s+/);
74
75						# populate them into the 2D matrix
76	0					push @inpmat, [ @tmp ];
77						}
78
79	0					close INP;
80
81	0					my @row1 = ();
82	0					my @row2 = ();
83
84	0					my %hash = ();
85
86						# Calculate all possible unique pairwise distances between the vectors
87	0					for($i = 0; $i < $rcnt; $i++)
88						{
89						# for all the rows in the cluster
90	0					for($j = $i+1; $j < $rcnt; $j++)
91						{
92	0					@row1 = @{$inpmat[$i]};
	0
93	0					@row2 = @{$inpmat[$j]};
	0
94	0					$d[$i][$j] = &dist_euclidean_sqr(\@row1, \@row2);
95						}
96	0					$hash{0} .= "$i ";
97						}
98
99	0					$hash{0} = substr($hash{0},0,length($hash{0})-1);
100	0					$W[1] = &WGSS(\%hash);
101
102	0					my $k = 0;
103	0					my $flag = 0;
104
105						# For each K
106	0					for($k=1; $k<$K; $k++)
107						{
108	0					my $lineNo = 0;
109	0					my %hash = ();
110
111						# Cluster the input dataset into k+1 clusters
112	0					my $status = 0;
113	0					my $next_k = $k + 1;
114	0					$status = system("vcluster --clmethod $clustmtd $matrixfile $next_k >& tmpfile");
115	0	0				die "Error running vcluster \n" unless $status==0;
116
117						# read the clustering output file
118	0	0				open(CO,"<$matrixfile.clustering.$next_k") \|\| die "Error opening clustering output file.";
119
120	0					my $clust = 0;
121	0					while($clust = )
122						{
123						# hash on the cluster# and append the observation#
124	0					chomp($clust);
125	0	0				if(exists $hash{$clust})
126						{
127	0					$hash{$clust} .= " $lineNo";
128						}
129						else
130						{
131	0					$hash{$clust} = $lineNo;
132						}
133
134						# increment the line number
135	0					$lineNo++;
136						}
137
138	0					close CO;
139
140						# Calculate the "Within Cluster Sum of Squared W(k+1)
141						# for given matrix and k+1 value.
142	0					$W[$next_k] = &WGSS(\%hash);
143
144	0					unlink "$matrixfile.clustering.$next_k", "tmpfile","$matrixfile.tree";
145
146	0	0				if($W[$next_k] == 0)
147						{
148	0					print $next_k . "\n";
149	0					$flag = 1;
150	0					last;
151						}
152						else
153						{
154	0					$H[$k] = ( $W[$k]/$W[$next_k] - 1 ) * ( $rcnt - $k - 1 );
155	0					$H[$k] = sprintf("%.4f",$H[$k]);
156						}
157
158	0	0				if($H[$k] <= $threshold)
159						{
160	0					print $k . "\n";
161	0					$flag = 1;
162	0					last;
163						}
164						}
165
166	0	0				if(!$flag)
167						{
168	0					print "NAN\n";
169						}
170						}
171
172						sub WGSS
173						{
174						# Input arguments
175	0		0	0		my %clustout = %{(shift)};
	0
176
177						# Local variables
178	0					my $i;
179						my $j;
180	0					my @rownum;
181	0					my $key;
182	0					my $row1;
183	0					my $row2;
184	0					my @D = ();
185	0					my $W = 0;
186
187						# For each cluster
188	0					foreach $key (sort keys %clustout)
189						{
190	0					$D[$key] = 0;
191
192	0					@rownum = split(/\s+/,$clustout{$key});
193
194						# for each instance in the cluster
195	0					for($i = 0; $i < $#rownum; $i++)
196						{
197						# for all the rows in the cluster
198	0					for($j = $i+1; $j <= $#rownum; $j++)
199						{
200						# find the distance between the 2 rows of the matrix.
201	0					$row1 = $rownum[$i];
202	0					$row2 = $rownum[$j];
203
204						# store the Dr value
205	0	0				if(exists $d[$row1][$row2])
206						{
207	0					$D[$key] += $d[$row1][$row2];
208						}
209						else
210						{
211	0					$D[$key] += $d[$row2][$row1];
212						}
213						}
214						}
215
216	0					$W += ( $#rownum - 1 ) * $D[$key];
217						}
218
219	0					$W = $W/2;
220
221	0					return $W;
222						}
223
224						sub dist_euclidean_sqr
225						{
226						# arguments
227	0		0	0		my @i = @{(shift)};
	0
228	0					my @j = @{(shift)};
	0
229
230						# local variables
231	0					my $a;
232	0					my $dist = 0;
233	0					my $retvalue = 0;
234
235						# Squared Euclidean measure
236						# summation on all j (xij - xi'j)^2 where i, i' are the rows indicies.
237	0					for $a (0 .. $#i)
238						{
239	0					$dist += (($i[$a] - $j[$a])**2);
240						}
241
242	0					$retvalue = sprintf("%.4f",$dist);
243	0					return $retvalue;
244						}
245
246
247						1;
248						__END__