File Coverage

blib/lib/Cluster/Similarity.pm

Criterion	Covered	Total	%
statement	365	393	92.8
branch	62	104	59.6
condition	26	66	39.3
subroutine	28	30	93.3
pod	17	17	100.0
total	498	610	81.6

line	stmt	bran	cond	sub	pod	time	code
1							package Cluster::Similarity;
2
3
4	10			10		513517	use English;
	10					4590
	10					59
5	10			10		4660	use warnings;
	10					23
	10					312
6	10			10		52	use strict;
	10					22
	10					355
7	10			10		55	use Carp;
	10					15
	10					742
8
9	10			10		10982	use Math::Combinatorics;
	10					47677
	10					859
10	10			10		84	use List::Util qw(sum min);
	10					19
	10					1133
11
12	10			10		13464	use Class::Std;
	10					125325
	10					68
13
14							=head1 NAME
15
16							Cluster::Similarity - compute the similarity of two classifications.
17
18							=head1 VERSION
19
20							Version 0.02
21
22							=cut
23
24	10			10		10283	use version; our $VERSION = qv('0.02');
	10					23808
	10					78
25
26
27							=head1 SYNOPSIS
28
29							Compute similarity of two classifications following various cluster similarity evaluation schemes based on contingency tables.
30
31
32							use Cluster::Similarity;
33
34
35							my $sim_calculator = Cluster::Similarity->new( $classification_1, $classification_2 );
36
37
38							my $pair_wise_recall = $sim_calculator->pair_wise_recall();
39							my $pair_wise_precision = $sim_calculator->pair_wise_precision();
40							my $pair_wise_f_score = $sim_calculator->pair_wise_fscore();
41
42							my $mutual_information = $sim_calculator->mutual_information();
43
44							my $rand_index = $sim_calculator->rand_index();
45
46							my $rand_adj = $sim_calculator->rand_adjusted($max_index);
47
48							my $matching = $sim_calculator->matching_index();
49
50
51							my $contingency_table = $sim_calculator->contingency();
52
53							my $pairs_matrix = $sim_calculator->pairs_matrix();
54
55							my $pair_of_cell_12 = $sim_calculator->pairs(1,2);
56
57
58							=head1 DESCRIPTION
59
60							Computes the similarity of two word clusterings using several
61							clustering similarity measures.
62
63							Consider for eg. the following groupings:
64
65							clustering_1: { {a, b, c}, {d, e, f} }
66							clustering_2: { {a, b}, {c, d, e}, {f} }
67
68							Cluster similarity measures provide a numerical value helping to
69							assess the alikeness of two such groupings.
70
71							All cluster similarity measures implemented in this module are based
72							on the so-called contingency table of the two classifications
73							(clusterings). The contingency table is a matrix with a cell for each
74							pair of classes (one from each classification), containing the number
75							of objects present in both classes.
76
77							The similarity measures (and also examples and tests) are taken from
78							Chapter 4 of Susanne Schulte im Walde's Phd thesis:
79
80							Sabine Schulte im Walde. Experiments on the Automatic Induction of
81							German Semantic Verb Classes. PhD thesis, Institut für Maschinelle
82							Sprachverarbeitung, Universität Stuttgart, 2003. Published as AIMS
83							Report 9(2) L
84
85							Please see there for a more in depth description of the similarity
86							measures and further details.
87
88							=head1 INTERFACE
89
90							=head2 Constructor
91
92							=over
93
94							=item new()
95
96							Builds a new Cluster::Similarity object.
97
98							=back
99
100							=cut
101
102							{
103
104							############ Data ######################################################################
105
106							my %classification1_of : ATTR( :get ); # hash of hashes
107							my %classification2_of : ATTR( :get ); # hash of hashes
108
109							my %contingency_of : ATTR( :get );
110							my %pairs_contingency_of : ATTR( :get );
111							my %object_nbr_of : ATTR;
112							my %objects_of : ATTR;
113
114							my %tp_of : ATTR( :get );
115							my %pairs_classification_1_of : ATTR;
116							my %pairs_classification_2_of : ATTR;
117							my %pair_wise_precision_of : ATTR;
118							my %pair_wise_recall_of : ATTR;
119							my %pair_wise_fscore_of : ATTR;
120
121							my %mutual_information_of : ATTR;
122
123							my %rand_index_of : ATTR;
124
125							my %rand_index_adj_of : ATTR;
126
127							my %matching_index_of : ATTR;
128
129							############ Utility subroutines #######################################################
130
131							sub _check_dataset {
132	32			32		51	my ($dataset_ref) = @_;
133
134	32	50				85	croak "Need reference to classification\n"
135							unless ($dataset_ref);
136
137
138	32	50				92	if (ref($dataset_ref) eq 'ARRAY') {
		0
139	32					55	return { map { my $index = $_+1; "c_$index" => $dataset_ref->[$_] } 0 .. $#{ $dataset_ref } };
	78					108
	78					317
	32					75
140							} elsif (ref($dataset_ref) eq 'HASH') {
141	0					0	return $dataset_ref;
142							} else {
143	0					0	croak "Classifications must be passed as array or hash references\n";
144							}
145	0					0	return;
146
147							}
148
149							sub _reset_dependant_datastructures {
150	16			16		31	my ($id) = @_;
151
152	16					42	delete $contingency_of{$id};
153	16					213	delete $pairs_contingency_of{$id};
154	16					32	delete $object_nbr_of{$id};
155	16					28	delete $objects_of{$id};
156	16					23	delete $tp_of{$id};
157	16					25	delete $pairs_classification_1_of{$id};
158	16					23	delete $pairs_classification_2_of{$id};
159	16					22	delete $pair_wise_precision_of{$id};
160	16					35	delete $pair_wise_recall_of{$id};
161	16					24	delete $pair_wise_fscore_of{$id};
162	16					22	delete $mutual_information_of{$id};
163	16					23	delete $rand_index_of{$id};
164	16					22	delete $rand_index_adj_of{$id};
165	16					23	delete $matching_index_of{$id};
166
167	16					27	return;
168							}
169
170
171							sub _nC2 {
172	29			29		30	my ($n) = @_;
173
174	29	50				64	if ($n < 0) { return; }
	0					0
175
176	29					82	return $n * ($n - 1) / 2;
177							}
178
179							sub _pairs_in_classification {
180	6			6		9	my ($class_ref) = @_;
181
182	6					7	my %pairs;
183	6					6	foreach my $cluster (values %{ $class_ref }) {
	6					15
184	15					18	my @comb = combine(2, keys %{ $cluster });
	15					61
185	15					3612	foreach my $pair (@comb) {
186	30					37	$pairs{join(',', sort @{$pair})} = 1;
	30					122
187							}
188							}
189
190	6					16	return \%pairs;
191							}
192
193
194							# the sum of the cells of a matrix - represented by a hash of hashes.
195							sub _cell_sum {
196	7			7		11	my ($matrix) = @_;
197
198	7					9	return sum map {values %{ $_ } } values %{ $matrix };
	18					18
	18					75
	7					16
199							}
200
201							############ Methods ###################################################################
202
203							=head1 FUNCTIONS
204
205							=head2 Providing the Data
206
207							=over
208
209							=item load_data(\@classification_1, \@classification_2)
210
211							=item load_data(\%classification_1, \%classification_2)
212
213							=cut
214
215							sub load_data {
216	16			16	1	8855	my ($self, $class1_ref, $class2_ref) = @_;
217
218	16					58	my $id = ident $self;
219
220	16					53	$classification1_of{$id} = _check_dataset($class1_ref);
221	16					58	$classification2_of{$id} = _check_dataset($class2_ref);
222
223	16					73	_reset_dependant_datastructures($id);
224
225	16					46	return;
226							}
227
228
229							=item set_classification_1(\@classification_1), set_classification1(\@classification_2)
230
231							=cut
232
233
234
235							sub set_classification_1 {
236	0			0	1	0	my ($self, $class_ref) = @_;
237
238	0					0	my $id = ident $self;
239
240	0					0	$classification1_of{$id} = _check_dataset($class_ref);
241
242	0					0	return;
243							}
244
245
246							=item set_classification_2(\%classification_1), set_classification1(\%classification_2)
247
248
249							=back
250
251							When calling these methods, the contingency tables and all previously computed similarity values are reset.
252
253							=cut
254
255							sub set_classification_2 {
256	0			0	1	0	my ($self, $class_ref) = @_;
257
258	0					0	my $id = ident $self;
259
260	0					0	$classification2_of{$id} = _check_dataset($class_ref);
261
262	0					0	return;
263							}
264
265							=head2 objects, object_number
266
267							Return (number of) objects in either classification
268
269							=cut
270
271							sub objects {
272	3			3	1	9	my ($self) = @_;
273
274	3					9	my $id = ident $self;
275
276	3	50	33			22	croak "Please set/load classifications before calling objects method\n" unless ($classification1_of{$id} and $classification2_of{$id});
277
278	3	50				11	if ($objects_of{$id}) {
279	0					0	return $objects_of{$id};
280							}
281
282	3					4	my $objects;
283	3					4	foreach my $cluster_ref (values %{ $classification1_of{$id} }, values %{ $classification2_of{$id} }) {
	3					10
	3					11
284	14					16	foreach my $obj (keys %{ $cluster_ref }) {
	14					32
285	36					77	$objects->{$obj}++;
286							}
287							}
288
289	3					8	$objects_of{$id} = $objects;
290	3					6	$object_nbr_of{$id} = scalar(keys %{ $objects });
	3					7
291
292	3					20	return $objects;
293							}
294
295
296							sub object_number {
297	1			1	1	8913	my ($self) = @_;
298
299	1					6	my $id = ident $self;
300
301	1	50	33			14	croak "Please set/load classifications before calling object_number method\n" unless ($classification1_of{$id} and $classification2_of{$id});
302
303	1	50				5	if ($object_nbr_of{$id}) {
304	1					4	return $object_nbr_of{$id};
305							}
306
307	0					0	my $objects = $self->objects();
308
309	0					0	$object_nbr_of{$id} = scalar(keys %{ $objects });
	0					0
310
311	0					0	return $object_nbr_of{$id};
312							}
313
314
315							=head2 contingency
316
317							Compute the contingency table for two classifications. The contingency table is a matrix with a cell for each pair of classes (one class from each classification). Each cell contains the number of objects present in both classes.
318
319							Eg. For the classifications
320
321							=over
322
323							=item
324
325							{ {a, b, c}, {d, e, f} }
326
327							=item
328
329							{ {a, b}, {c, d, e}, {f} }
330
331							=back
332
333							the returned contingency table is:
334
335							{
336							'c_1' => {
337							'c_1' => 2,
338							'c_2' => 0
339							},
340							'c_2' => {
341							'c_1' => 1,
342							'c_2' => 2
343							},
344							'c_3' => {
345							'c_1' => 0,
346							'c_2' => 1
347							}
348							}
349
350							Which is a hash representation of the matrix:
351
352							2 0
353							1 2
354							0 1
355
356
357							with the columns indexed by the classes of the first classification and the rows by the classes of the second classification.
358
359
360							=cut
361
362							sub contingency {
363	14			14	1	27	my ($self) = @_;
364
365	14					32	my $id = ident $self;
366
367	14	50	33			92	croak "Please set/load classifications before computing contingency table\n" unless ($classification1_of{$id} and $classification2_of{$id});
368
369	14	100	66			59	if (exists $contingency_of{$id} and $contingency_of{$id}) {
370	3					7	return $contingency_of{$id};
371							}
372
373	11					15	my $contingency;
374
375	11					15	foreach my $row_cl (keys %{ $classification2_of{$id} }) {
	11					41
376	29					31	foreach my $col_cl (keys %{ $classification1_of{$id} }) {
	29					115
377	64					73	my %common;
378	64					81	foreach my $cl_el (keys %{ $classification2_of{$id}->{$row_cl} }, keys %{ $classification1_of{$id}->{$col_cl} }) {
	64					147
	64					174
379	318					473	$common{$cl_el}++;
380							}
381	64					163	$contingency->{$row_cl}->{$col_cl} = grep { $_ > 1 } values %common;
	252					560
382							}
383							}
384
385	11					35	$contingency_of{$id} = $contingency;
386
387	11					31	return $contingency;
388							}
389
390							=head2 pairs_contingency
391
392							Compute the contingency table for the number of common element pairs in the two classifications.
393
394							For the example above this would be:
395
396							1 0
397							0 0
398							0 1
399
400
401							=cut
402
403							sub pairs_contingency {
404	3			3	1	21024	my ($self) = @_;
405
406	3					10	my $id = ident $self;
407
408	3	50	33			23	croak "Please set/load classifications before computing contingency table\n" unless ($classification1_of{$id} and $classification2_of{$id});
409
410	3	50	33			13	if (exists $pairs_contingency_of{$id} and $pairs_contingency_of{$id}) {
411	0					0	return $pairs_contingency_of{$id};
412							}
413
414
415	3					11	my $contingency = $self->contingency();
416
417	3					5	my $pairs_contingency;
418
419	3					12	foreach my $row_cl (keys %{ $contingency }) {
	3					8
420	8					9	foreach my $col_cl (keys %{ $contingency->{$row_cl} }) {
	8					17
421	16					23	my $n = $contingency->{$row_cl}->{$col_cl};
422	16					28	$pairs_contingency->{$row_cl}->{$col_cl} = _nC2($n);
423							}
424							}
425
426	3					9	$pairs_contingency_of{$id} = $pairs_contingency;
427
428	3					7	return $pairs_contingency;
429							}
430
431							=head2 true_positives
432
433							True positives are the number of object pairs which occur together in both classifications.
434
435							=cut
436
437							sub true_positives {
438	7			7	1	12	my ($self) = @_;
439
440	7					17	my $id = ident $self;
441
442	7	50	33			43	croak "Please set/load classifications before true positives\n" unless ($classification1_of{$id} and $classification2_of{$id});
443
444	7	100				20	if (exists $tp_of{$id}) {
445	4					10	return $tp_of{$id};
446							}
447
448	3					5	my %pairs_1;
449	3					5	foreach my $cluster (values %{ $classification1_of{$id} }) {
	3					11
450	7					9	my @comb = combine(2, keys %{ $cluster });
	7					32
451	7					2015	foreach my $pair (@comb) {
452	16					20	$pairs_1{join(',', sort @{$pair})} = 1;
	16					73
453							}
454							}
455
456
457	3					6	my $tp = 0;
458	3					85	foreach my $pair (keys %pairs_1) {
459	16					39	my ($val1, $val2) = split(/,/, $pair);
460	16	100				22	my $is_in_2 = grep { exists $_->{$val1} and exists $_->{$val2}} values %{ $classification2_of{$id} };
	42					157
	16					34
461	16	100				33	if ($is_in_2) {
462	12					23	$tp++;
463							}
464							}
465
466	3					8	$tp_of{$id} = $tp;
467
468	3					13	return $tp;
469
470							}
471
472
473							=head2 pairs_classification_1, pairs_classification_2
474
475							Number of pairs in classification.
476
477							=cut
478
479							sub pairs_classification_1 {
480	4			4	1	1467	my ($self) = @_;
481
482	4					10	my $id = ident $self;
483
484	4	50				14	croak ("Need data for classification 1\n") unless ($classification1_of{$id});
485
486	4	100				10	if ($pairs_classification_1_of{$id}) {
487	1					3	return $pairs_classification_1_of{$id};
488							}
489
490	3					11	my $pairs_ref = _pairs_in_classification($classification1_of{$id});
491
492
493	3					5	my $pairs_nbr = scalar(keys %{ $pairs_ref });
	3					7
494	3					7	$pairs_classification_1_of{$id} = $pairs_nbr;
495
496	3					11	return $pairs_nbr;
497							}
498
499							sub pairs_classification_2 {
500	4			4	1	413	my ($self) = @_;
501
502	4					11	my $id = ident $self;
503
504	4	50				11	croak ("Need data for classification 2\n") unless ($classification2_of{$id});
505
506	4	100				13	if ($pairs_classification_2_of{$id}) {
507	1					2	return $pairs_classification_2_of{$id};
508							}
509
510	3					8	my $pairs_ref = _pairs_in_classification($classification2_of{$id});
511
512
513	3					5	my $pairs_nbr = scalar(keys %{ $pairs_ref });
	3					6
514	3					6	$pairs_classification_2_of{$id} = $pairs_nbr;
515
516	3					11	return $pairs_nbr;
517							}
518
519
520
521							=head2 pair_wise_precision, pair_wise_recall, pair_wise_fscore
522
523							Pair-wise recall is the number of true positives divided by the number of pairs in classification 1
524
525							Pair-wise precision is the number of true positives divided by the number of pairs in classification 2
526
527							Pair-wise F-score is the harmonic mean of precision and recall, i.e. 2precisionrecall / (precision + recall)
528
529							=cut
530
531							sub pair_wise_recall {
532	6			6	1	415	my ($self) = @_;
533
534	6					15	my $id = ident $self;
535
536	6	100				23	if ($pair_wise_recall_of{$id}) {
537	3					15	return $pair_wise_recall_of{$id};
538							}
539
540	3					4	my $tp = 0;
541	3					9	$tp = $self->true_positives();
542	3					7	my $pairs = $self->pairs_classification_1();
543
544	3	50	33			20	if (not defined $pairs or $pairs == 0) {
545	0					0	$pairs = 1;
546							}
547
548	3					5	my $recall = $tp/$pairs;
549
550	3					6	$pair_wise_recall_of{$id} = $recall;
551
552	3					39	return $recall;
553							}
554
555							sub pair_wise_precision {
556	6			6	1	1522	my ($self) = @_;
557
558	6					13	my $id = ident $self;
559
560	6	100				18	if ($pair_wise_precision_of{$id}) {
561	3					7	return $pair_wise_precision_of{$id};
562							}
563
564	3					8	my $tp = $self->true_positives();
565	3					7	my $pairs = $self->pairs_classification_2();
566
567	3	50	33			17	if (not defined $pairs or $pairs == 0) {
568	0					0	$pairs = 1;
569							}
570
571	3					6	my $precision = $tp/$pairs;
572
573	3					6	$pair_wise_precision_of{$id} = $precision;
574
575	3					24	return $precision;
576							}
577
578							sub pair_wise_fscore {
579	3			3	1	1343	my ($self) = @_;
580
581	3					16	my $id = ident $self;
582
583	3	50				10	if ($pair_wise_fscore_of{$id}) {
584	0					0	return $pair_wise_fscore_of{$id};
585							}
586
587	3					9	my $prec = $self->pair_wise_precision();
588	3					8	my $recall = $self->pair_wise_recall();
589
590	3					4	my $fscore = 0;
591
592	3	50	33			24	if ($prec and $recall) {
593	3					12	$fscore = 2$prec$recall/($prec+$recall);
594							}
595
596	3					7	$pair_wise_fscore_of{$id} = $fscore;
597
598	3					23	return $fscore;
599							}
600
601							=head2 mutual_information
602
603							Mutual information is a symmetric measure for the degree of dependency between two classifications used here as introduced by Strehl et. al. (2000).
604
605							=cut
606
607							sub mutual_information {
608	3			3	1	13	my ($self) = @_;
609
610	3					7	my $id = ident $self;
611
612	3	50	33			22	croak "Please set/load classifications before computing mutual information\n" unless ($classification1_of{$id} and $classification2_of{$id});
613
614	3	50				9	if ($mutual_information_of{$id}) {
615	0					0	return $mutual_information_of{$id};
616							}
617
618	3					9	my $contingency = $self->contingency();
619
620	3					5	my $mi = 0;
621
622	3					4	my @cluster1_names = keys %{ $classification1_of{$id} };
	3					11
623	3					4	my @cluster2_names = keys %{ $classification2_of{$id} };
	3					10
624
625	3					5	my %cluster1_sum;
626							my %cluster2_sum;
627
628	3					6	foreach my $cluster (@cluster2_names) {
629	8	50				17	if (exists $contingency->{$cluster}) {
630	8					15	$cluster2_sum{$cluster} = sum values %{ $contingency->{$cluster} };
	8					50
631							}
632							}
633	3					7	foreach my $cluster (@cluster1_names) {
634	7	50				9	$cluster1_sum{$cluster} = sum map { $contingency->{$_}->{$cluster} } grep { exists $contingency->{$_} and exists $contingency->{$_}->{$cluster} } @cluster2_names;
	19					55
	19					87
635							}
636
637	3					8	my $n = _cell_sum($contingency);
638	3					6	my $k = scalar(@cluster1_names);
639	3					4	my $l = scalar(@cluster2_names);
640	3					12	my $log_kl = log($k*$l);
641
642							# print STDERR "n: $n, k: $k, l: $l\n";
643
644	3					4	foreach my $i (keys %{ $contingency }) {
	3					9
645	8					11	foreach my $j (keys %{ $contingency->{$i} }) {
	8					18
646
647	19	100				52	next unless ($contingency->{$i}->{$j});
648	9					13	my $tij = $contingency->{$i}->{$j};
649							# print STDERR "t($i, $j): $tij\n";
650							# print STDERR "t($i, .): $cluster2_sum{$i}\n";
651							# print STDERR "t(., $j): $cluster1_sum{$j}\n";
652	9					41	$mi += $tij * (log(($tij * $n) / ($cluster2_sum{$i} * $cluster1_sum{$j} )) / $log_kl);
653							}
654							}
655
656	3					4	$mi = $mi / $n;
657
658	3					8	$mutual_information_of{$id} = $mi;
659	3					41	return $mi;
660							}
661
662							=head2 rand_index
663
664							The Rand index (defined by Rand, 1971) is based on the agreement vs. disagreement between object pairs in clusterings.
665
666
667
668							=cut
669
670							sub rand_index {
671	2			2	1	10	my ($self) = @_;
672
673	2					7	my $id = ident $self;
674
675	2	50	33			20	croak "Please set/load classifications before computing rand index\n" unless ($classification1_of{$id} and $classification2_of{$id});
676
677	2	50				7	if ($rand_index_of{$id}) {
678	0					0	return $rand_index_of{$id};
679							}
680
681	2					8	my $objects = $self->objects();
682	2					4	my @pairs = combine(2, keys %{ $objects });
	2					13
683	2					3600	my $class1 = $classification1_of{$id};
684	2					5	my $class2 = $classification2_of{$id};
685
686
687	2					4	my %objects_by_class;
688
689	2					2	foreach my $cluster (keys %{ $class1 }) {
	2					7
690	4					5	foreach my $object (keys %{ $class1->{$cluster} }) {
	4					11
691	12					54	$objects_by_class{$object}->[0]->{$cluster}++;
692							}
693							}
694	2					4	foreach my $cluster (keys %{ $class2 }) {
	2					5
695	5					8	foreach my $object (keys %{ $class2->{$cluster} }) {
	5					12
696	12					30	$objects_by_class{$object}->[1]->{$cluster}++;
697							}
698							}
699
700	2					4	my $rand = 0;
701
702							PAIR:
703	2					4	foreach my $pair (@pairs) {
704
705	30					36	my $o1 = $pair->[0];
706	30					39	my $o2 = $pair->[1];
707
708							# classes Ca of Class_1 and Cb of Class_2 st o1 and o2 are both in Ca and Cb
709
710							# classes of Class_1 containing o1 and o2:
711	30					32	my %pair_in_class1;
712	30	50	33			147	if (exists $objects_by_class{$o1}->[0] and exists $objects_by_class{$o2}->[0]) {
713	30					29	foreach my $cluster (keys %{ $objects_by_class{$o1}->[0] }, keys %{ $objects_by_class{$o2}->[0] }) {
	30					58
	30					152
714	60					131	$pair_in_class1{$cluster}++;
715							}
716							}
717
718	30					70	%pair_in_class1 = map { $_ => 1 } grep { $pair_in_class1{$_} > 1 } keys %pair_in_class1;
	12					37
	48					112
719
720							# classes of Class_2 containing o1 and o2:
721	30					42	my %pair_in_class2;
722	30	50	33			132	if (exists $objects_by_class{$o1}->[1] and exists $objects_by_class{$o2}->[1]) {
723	30					30	foreach my $cluster (keys %{ $objects_by_class{$o1}->[1] }, keys %{ $objects_by_class{$o2}->[1] }) {
	30					59
	30					60
724	60					104	$pair_in_class2{$cluster}++;
725							}
726							}
727
728	30	50				67	%pair_in_class2 = map { $_ => 1 } grep { $pair_in_class2{$_} and $pair_in_class2{$_} > 1 } keys %pair_in_class1;
	8					24
	12					62
729
730	30					57	foreach my $cluster (keys %pair_in_class1) {
731	12	100				30	if (exists $pair_in_class2{$cluster}) {
732	8					8	$rand++;
733	8					33	next PAIR;
734							}
735							}
736
737							# classes Ca of Class_1 and Cb of Class_2 st. o1 is in Ca and Cb and o2 is in neither Ca nor Cb
738
739	22	50	33			101	if (exists $objects_by_class{$o1}->[0] and exists $objects_by_class{$o1}->[1]) {
740	22					23	foreach my $cluster1 (keys %{ $objects_by_class{$o1}->[0] }) {
	22					46
741	22					22	foreach my $cluster2 (keys %{ $objects_by_class{$o1}->[1] }) {
	22					41
742							# o2 is neither in cluster1 nor in cluster2
743
744	22	100	66			187	if (not( exists $objects_by_class{$o2}->[0] and exists $objects_by_class{$o2}->[0]->{$cluster1} ) and
			66
			66
745							not( exists $objects_by_class{$o2}->[1] and exists $objects_by_class{$o2}->[1]->{$cluster2} ) ) {
746	16					16	$rand ++;
747	16					52	next PAIR;
748							}
749							}
750							}
751							}
752
753	6	50	33			30	if (exists $objects_by_class{$o2}->[0] and exists $objects_by_class{$o2}->[1]) {
754	6					37	foreach my $cluster1 (keys %{ $objects_by_class{$o2}->[0] }) {
	6					14
755	6					7	foreach my $cluster2 (keys %{ $objects_by_class{$o2}->[1] }) {
	6					12
756							# o1 is neither in cluster1 nor in cluster2
757	6	50				17	my $o1_in_1 = exists $objects_by_class{$o1}->[0] and exists $objects_by_class{$o1}->[0]->{$cluster1};
758	6	50				16	my $o1_in_2 = exists $objects_by_class{$o1}->[1] and exists $objects_by_class{$o1}->[1]->{$cluster2};
759	6	0	33			32	if (not $o1_in_1 and not $o1_in_2) {
760	0					0	$rand ++;
761	0					0	next PAIR;
762							}
763							}
764							}
765							}
766							}
767
768
769	2					11	my $n = _cell_sum($self->contingency());
770
771	2	50				16	if ($n > 1) {
772	2					7	$rand = $rand / _nC2($n);
773							} else {
774	0					0	$rand = -1;
775							}
776
777	2					5	$rand_index_of{$id} = $rand;
778
779	2					55	return $rand;
780							}
781
782							=head2 rand_adjusted
783
784							Rand index adjusted by chance (Hubert and Arabie 1985). The adopted
785							model for randomness assumes that the two classifications are picked
786							at random, given the original number of classes and objects - the
787							contingency table is constructed from the hyper-geometric
788							distribution. The general form of an index corrected for chance is:
789
790							Index_adj = (Index - Expected Index) / (Maximum Index - Expected Index)
791
792							As maximum index I use the minimum of possible pairs in either classifications.
793
794							=cut
795
796							sub rand_adjusted {
797	2			2	1	9	my ($self) = @_;
798
799	2					4	my $id = ident $self;
800
801	2	50	33			15	croak "Please set/load classifications before computing rand index (adjusted)\n" unless ($classification1_of{$id} and $classification2_of{$id});
802
803	2	50				5	if ($rand_index_adj_of{$id}) {
804	0					0	return $rand_index_adj_of{$id};
805							}
806
807
808	2					6	my $pairs_contingency = $self->pairs_contingency();
809	2					4	my $contingency = $self->contingency();
810
811	2					6	my $n = _cell_sum($contingency);
812
813	2					4	my $index = 0;
814
815	2					2	my %col_clusters;
816
817	2					3	foreach my $row_cl (keys %{ $pairs_contingency }) {
	2					8
818
819	5					5	foreach my $col_cl (keys %{ $pairs_contingency->{$row_cl} }) {
	5					10
820	10					11	$col_clusters{$col_cl}++;
821	10					17	$index += $pairs_contingency->{$row_cl}->{$col_cl};
822							}
823							}
824
825	2					3	my $cont_row_sum = 0;
826	2					3	foreach my $row_cl (keys %{ $contingency }) {
	2					4
827	5					5	$cont_row_sum += _nC2(sum values %{ $contingency->{$row_cl} });
	5					14
828							}
829
830	2					3	my $cont_col_sum = 0;
831	2					4	foreach my $col_cl (keys %col_clusters) {
832	4					5	$cont_col_sum += _nC2(sum map { $contingency->{$_}->{$col_cl} } grep { exists $contingency->{$_}->{$col_cl} } keys %{ $contingency });
	10					17
	10					17
	4					10
833							}
834
835
836
837	2					3	my $exp_index = 0;
838	2	50				6	if ($n > 1 ) {
839	2					3	$exp_index = $cont_row_sum * $cont_col_sum / _nC2($n);
840							};
841
842	2					5	my $max_index = min ($cont_row_sum, $cont_col_sum);
843
844	2					3	my $rand_adj = -1;
845
846	2	50				5	if ($max_index != $exp_index) {
847	2					2	$rand_adj = ($index - $exp_index) / ($max_index - $exp_index);
848							}
849
850	2					3	$rand_index_adj_of{$id} = $rand_adj;
851
852	2					21	return $rand_adj;
853							}
854
855							=head2 matching_index
856
857							Matching index (Fowlkes and Mallows, 1983).
858
859							=cut
860
861							sub matching_index {
862	3			3	1	12	my ($self) = @_;
863
864	3					8	my $id = ident $self;
865
866	3	50	33			22	croak "Please set/load classifications before computing matching index\n" unless ($classification1_of{$id} and $classification2_of{$id});
867
868	3	50				8	if ($matching_index_of{$id}) {
869	0					0	return $matching_index_of{$id};
870							}
871
872	3					8	my $contingency = $self->contingency();
873
874	3					4	my $n = 0;
875
876	3					7	my ($Tk, $Pk, $Qk) = (0, 0, 0);
877
878
879	3					4	my %col_sums;
880	3					4	foreach my $row_cl (keys %{ $contingency }) {
	3					8
881
882	8					10	my $row_sum = 0;
883
884	8					9	foreach my $col_cl (keys %{ $contingency->{$row_cl} }) {
	8					19
885
886	19					21	$n++;
887
888	19					104	my $cell = $contingency->{$row_cl}->{$col_cl};
889
890	19					20	$row_sum += $cell;
891
892	19	100				32	if (exists $col_sums{$col_cl}) {
893	12					15	$col_sums{$col_cl} += $cell;
894							} else {
895	7					10	$col_sums{$col_cl} = $cell;
896							}
897
898	19					37	$Tk += $cell*$cell;
899							}
900
901	8					18	$Pk += $row_sum * $row_sum;
902							}
903
904	3					8	$Qk = sum map { $_ * $_ } values %col_sums;
	7					27
905
906	3					4	$Tk = $Tk - $n;
907	3					4	$Pk = $Pk - $n;
908	3					4	$Qk = $Qk - $n;
909
910	3					4	my $index = 0;
911
912	3					4	my $PkQk = $Pk*$Qk;
913
914	3	50				9	if ($PkQk > 0 ) {
915	3					7	$index = $Tk / sqrt($Pk * $Qk);
916							}
917
918	3					6	$matching_index_of{$id} = $index;
919
920	3					36	return $index;
921
922							}
923
924
925
926							1;
927
928							=head1 DIAGNOSTICS
929
930							=over
931
932							=item C<>
933
934							When a L method is called without enough arguments.
935
936							=item C<>
937
938							Argument of wrong type.
939
940							=item C<>
941
942							Method was called without providing classification data first, by calling one of the L methods.
943
944							=item C<>
945
946							Data for classification 1 (2 resp.) is missing.
947
948							=back
949
950							=head1 CONFIGURATION AND ENVIRONMENT
951
952							Cluster::Similarity requires no configuration files or environment variables.
953
954							=head1 DEPENDENCIES
955
956							=over
957
958							=item Carp
959
960							=item Class::Std
961
962							=item List::Util qw(sum min)
963
964							=item Math::Combinatorics
965
966							=back
967
968
969							=head1 INCOMPATIBILITIES
970
971							None reported.
972
973
974							=head1 BUGS AND LIMITATIONS
975
976							No bugs have been reported.
977
978							Please report any bugs or feature requests to
979							C, or through the web interface at
980							L.
981
982
983							=head1 TO DO
984
985							=over
986
987							=item
988
989							find more suitable return values for when a given similarity measure is not applicable.
990
991							=item
992
993							for the B measure make the maximum index configurable.
994
995							=back
996
997							=head1 AUTHOR
998
999							Ingrid Falk, C<< >>
1000
1001							=head1 BUGS
1002
1003							Please report any bugs or feature requests to C, or through
1004							the web interface at L. I will be notified, and then you'll
1005							automatically be notified of progress on your bug as I make changes.
1006
1007							=head1 SUPPORT
1008
1009							You can find documentation for this module with the perldoc command.
1010
1011							perldoc Cluster::Similarity
1012
1013
1014							You can also look for information at:
1015
1016							=over 4
1017
1018							=item * RT: CPAN's request tracker
1019
1020							L
1021
1022							=item * AnnoCPAN: Annotated CPAN documentation
1023
1024							L
1025
1026							=item * CPAN Ratings
1027
1028							L
1029
1030							=item * Search CPAN
1031
1032							L
1033
1034							=back
1035
1036
1037							=head1 SEE ALSO
1038
1039							=over
1040
1041							=item
1042
1043							For the description of the implemented clustering similarity measures:
1044
1045							Sabine Schulte im Walde. Experiments on the Automatic Induction of
1046							German Semantic Verb Classes. PhD thesis, Institut für Maschinelle
1047							Sprachverarbeitung, Universität Stuttgart, 2003. Published as AIMS
1048							Report 9(2), L
1049
1050							=item * For building clusterings or classifications:
1051
1052							=over 2
1053
1054							=item L
1055
1056							a I.
1057
1058							=item L
1059
1060							I
1061
1062							=back
1063
1064							=back
1065
1066							=head1 COPYRIGHT & LICENSE
1067
1068							Copyright 2008 Ingrid Falk, all rights reserved.
1069
1070							This program is free software; you can redistribute it and/or modify it
1071							under the same terms as Perl itself.
1072
1073
1074							=cut
1075
1076							}
1077
1078							1; # End of Cluster::Similarity