File Coverage

blib/lib/Set/Partitions/Similarity.pm

Criterion	Covered	Total	%
statement	91	104	87.5
branch	12	28	42.8
condition	0	3	0.0
subroutine	12	14	85.7
pod	5	9	55.5
total	120	158	75.9

line	stmt	bran	cond	sub	pod	time	code
1							package Set::Partitions::Similarity;
2
3	1			1		28173	use strict;
	1					2
	1					37
4	1			1		5	use warnings;
	1					2
	1					40
5							#use Data::Dump qw(dump);
6
7							BEGIN
8							{
9	1			1		5	use Exporter ();
	1					5
	1					25
10	1			1		5	use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
	1					9
	1					123
11	1			1		1	$VERSION = '0.54';
12	1					15	@ISA = qw(Exporter);
13	1					2	@EXPORT = qw();
14	1					3	@EXPORT_OK = qw(getAccuracy getAccuracyAndPrecision getDistance getPrecision areSubsetsDisjoint);
15	1					1283	%EXPORT_TAGS = ();
16							}
17
18							#Routines to measure similarity of partitions.
19							#012345678901234567890123456789012345678901234
20
21							=head1 NAME
22
23							C - Routines to measure similarity of partitions.
24
25							=head1 SYNOPSIS
26
27							use Set::Partitions::Similarity qw(getAccuracyAndPrecision);
28							use Data::Dump qw(dump);
29
30							# set elements are Perl strings, sets are array references
31							# partitions are nested arrays.
32							dump getAccuracyAndPrecision ([[qw(a b)],[1,2]], [[qw(a b 1)],[2]]);
33							# dumps:
34							# ("0.5", "0.25")
35
36							# a partition is equivalent to itself, even the empty partition.
37							dump getAccuracyAndPrecision ([[1,2], [3,4]], [[2,1], [4,3]]);
38							dump getAccuracyAndPrecision ([], []);
39							# dumps:
40							# (1, 1)
41							# (1, 1)
42
43							# accuracy and precision are symmetric functions.
44							my ($p, $q) = ([[1,2,3], [4]], [[1], [2,3,4]]);
45							dump getAccuracyAndPrecision ($p, $q);
46							dump getAccuracyAndPrecision ($q, $p);
47							# dumps:
48							# ("0.333333333333333", "0.2")
49							# ("0.333333333333333", "0.2")
50
51							# checks partitions and throws an exception.
52							eval { getAccuracyAndPrecision ([[1]], [[1,2]], 1); };
53							warn $@ if $@;
54							# dumps:
55							# partitions are invalid, they have different set elements.
56
57							=head1 DESCRIPTION
58
59							A partition of a set is a collection of mutually disjoint subsets of the set
60							whose union is the set. C provides routines
61							that measure the I and I between two partitions of a set. The measures can
62							assess the performance of a binary clustering algorithm by comparing
63							the clusters the algorithm creates against the correct clusters of test data.
64
65							=head2 Accuracy and Precision
66
67							Let C be a set of C elements and let C be a partition of C~~. Let C~~
68							be the set of all sets of two distinct elements of C~~; so C has C sets.~~
69							The partition C uniquely defines a partitioning of C into two sets, C and C where
70							C is the set of all pairs in C such that both elements of a pair
71							occur in the same set in C , and define C as C, the complement.
72
73							Given two partitions C and C of the set C~~, the I is defined as~~
74							C<(\|C(P) ^ C(Q)\| + \|D(P) ^ D(Q)\|) / (n*(n-1)/2)>, where \| \| gives the size of a set and
75							^ represents the intersection operator. The I is defined as
76							C<\|C(P) ^ C(Q)\| / (\|C(P) ^ C(Q)\| + \|C(P) ^ D(Q)\| + \|D(P) ^ C(Q)\|)>. The I and
77							I return values ranging from zero (no similarity) to one (equivalent partitions).
78							The I between two partitions is defined as I<1-accuracy>, and in mathematics is a metric.
79							The I returns values ranging from zero (equivalent partitions) to one (no similarity).
80
81							All the methods implemented that compute the I, I, and I run in time linear in the
82							number of elements of the set partitioned.
83
84							=head1 ROUTINES
85
86							=head2 C
87
88							The routine C returns true if the subsets of the partition are disjoint,
89							false otherwise. It can be used to check the validity of a partition.
90
91							=over
92
93							=item C<$Partition>
94
95							The partition is stored as a nested array reference of the form
96							C<[[],...[]]>. For example, the set partition C<{{a,b}, {1,2}}> of the set
97							C<{a,b,1,2}> should be stored as the nested array reference
98							C<[['a','b']],[1,2]]>. Note the elements
99							of a set are represented as Perl strings.
100
101							=back
102
103							An example:
104
105							use Set::Partitions::Similarity qw(areSubsetsDisjoint);
106							use Data::Dump qw(dump);
107							dump areSubsetsDisjoint ([[1,2,3], [4]]);
108							dump areSubsetsDisjoint ([[1,2,3], [4,1]]);
109							# dumps:
110							# "1"
111							# "0"
112
113							=cut
114
115							# a valid partition has all the subsets mutually disjoint. this routine
116							# returns 0 if it finds two distinct subsets have an element in common.
117							# this is done in time linear in the number of elements by computing the
118							# prefix union of the sets in the partition using a hash.
119							sub areSubsetsDisjoint
120							{
121							# the hash %prefixUnionOfSubsets holds the union of elements in each subset
122							# as they are checked for elements that occur in more than one subset.
123	60			60	1	127	my %prefixUnionOfSubsets;
124	60					105	foreach my $subset (@{$_[0]})
	60					304
125							{
126							# since it is possible that a subset could contain a repeating element,
127							# first each element is checked without adding it to the hash.
128	28468					49086	foreach my $element (@$subset)
129							{
130	943332	50				2169148	if (exists $prefixUnionOfSubsets{$element})
131							{
132							# if the second parameter is true, throw and exception, otherwise return false.
133	0	0	0			0	if (defined ($_[1]) && $_[1])
134							{
135	0					0	die "element '$element' occurs in two of the subsets.\n";
136							}
137							else
138							{
139	0					0	return 0;
140							}
141							}
142							}
143
144							# now we can add all the elements to the hash.
145	28468					44660	foreach my $element (@$subset)
146							{
147	943332					1829294	$prefixUnionOfSubsets{$element} = 1;
148							}
149							}
150
151							# the subsets are disjoint, return true.
152	60					427242	return 1;
153							}
154
155							=head2 C
156
157							The routine C returns the I of the
158							two partitions.
159
160							=over
161
162							=item C<$PartitionP, $PartitionQ>
163
164							The partitions are stored as nested array references of the form
165							C<[[],...[]]>. For example, the set partition C<{{a,b}, {1,2}}> of the set
166							C<{a,b,1,2}> should be stored as the nested array references
167							C<[['a','b']],[1,2]]>. Note the elements
168							of a set are represented as Perl strings.
169
170							=item C<$CheckValidity>
171
172							If C<$CheckValidity> evaluates to true, then checks are performed to
173							ensure both partitions are valid and an exception is thrown if they
174							are not. The default is false.
175
176							=back
177
178							An example:
179
180							use Set::Partitions::Similarity qw(getAccuracy);
181							use Data::Dump qw(dump);
182							dump getAccuracy ([[qw(a b)], [qw(c d)]], [[qw(a b c d)]]);
183							dump getAccuracy ([[qw(a b c d)]], [[qw(a b)], [qw(c d)]]);
184							# dumps:
185							# "0.333333333333333"
186							# "0.333333333333333"
187
188							=cut
189
190							sub getAccuracy
191							{
192	30			30	1	237977	my ($ReferencePartition, $ModelPartition, $CheckValidity) = @_;
193
194							# get both similarities.
195	30					225	my ($accuracy, $precision) = getAccuracyAndPrecision ($ReferencePartition, $ModelPartition, $CheckValidity);
196
197							# return just the accuracy.
198	30					242	return $accuracy;
199							}
200
201
202							=head2 C
203
204							The routine C returns the I and I of the
205							two partitions as an array C<(accuracy, precision)>.
206
207							=over
208
209							=item C<$PartitionP, $PartitionQ>
210
211							The partitions are stored as nested array references of the form
212							C<[[],...[]]>. For example, the set partition C<{{a,b}, {1,2}}> of the set
213							C<{a,b,1,2}> should be stored as the nested array references
214							C<[['a','b']],[1,2]]>. Note the elements
215							of a set are represented as Perl strings.
216
217							=item C<$CheckValidity>
218
219							If C<$CheckValidity> evaluates to true, then checks are performed to
220							ensure both partitions are valid and an exception is thrown if they
221							are not. The default is false.
222
223							=back
224
225							An example:
226
227							use Set::Partitions::Similarity qw(getAccuracyAndPrecision);
228							use Data::Dump qw(dump);
229							dump getAccuracyAndPrecision ([[1,2], [3,4]], [[1], [2], [3], [4]]);
230							dump getAccuracyAndPrecision ([[1], [2], [3], [4]], [[1,2], [3,4]]);
231							# dumps:
232							# ("0.666666666666667", 0)
233							# ("0.666666666666667", 0)
234
235							=cut
236
237							sub getAccuracyAndPrecision
238							{
239	30			30	1	75	my ($ReferencePartition, $ModelPartition, $CheckValidity) = @_;
240
241							# get the base count of edge types.
242	30					101	my ($sameRefSameModel, $sameRefDiffModel, $diffRefSameModel, $diffRefDiffModel) = getBaseEdgeCounts ($ReferencePartition, $ModelPartition, $CheckValidity);
243
244							# get the total number of bases edges.
245	30					124	my $baseEdges = $sameRefSameModel + $sameRefDiffModel + $diffRefSameModel;
246
247							# if there are no base edges, the precision is one.
248	30					62	my $precision = 1;
249
250							# get the precision.
251	30	50				200	$precision = $sameRefSameModel / $baseEdges if $baseEdges;
252
253							# get the total number of edges.
254	30					83	my $totalEdges = $sameRefSameModel + $sameRefDiffModel + $diffRefSameModel + $diffRefDiffModel;
255
256							# if there are no edges, the accuracy is one.
257	30					65	my $accuracy = 1;
258
259							# get the accuracy.
260	30	50				141	$accuracy = ($sameRefSameModel + $diffRefDiffModel) / $totalEdges if $totalEdges;
261
262	30					157	return ($accuracy, $precision);
263							}
264
265
266							=head2 C
267
268							The routine C returns I<1-accuracy> of the
269							two partitions, or C<1-getAccuracy($PartitionP, $PartitionQ, $CheckValidity)>.
270
271							=over
272
273							=item C<$PartitionP, $PartitionQ>
274
275							The partitions are stored as nested array references of the form
276							C<[[],...[]]>. For example, the set partition C<{{a,b}, {1,2}}> of the set
277							C<{a,b,1,2}> should be stored as the nested array references
278							C<[['a','b']],[1,2]]>. Note the elements
279							of a set are represented as Perl strings.
280
281							=item C<$CheckValidity>
282
283							If C<$CheckValidity> evaluates to true, then checks are performed to
284							ensure both partitions are valid and an exception is thrown if they
285							are not. The default is false.
286
287							=back
288
289							An example:
290
291							use Set::Partitions::Similarity qw(getDistance);
292							use Data::Dump qw(dump);
293							dump getDistance ([[1,2,3], [4]], [[1], [2,3,4]]);
294							# dumps:
295							# "0.666666666666667"
296
297							=cut
298
299							sub getDistance
300							{
301	0			0	1	0	my $accuracy = getAccuracy (@_);
302	0	0				0	return 1 - $accuracy if defined $accuracy;
303	0					0	return undef;
304							}
305
306
307							=head2 C
308
309							The routine C returns the I of the
310							two partitions.
311
312							=over
313
314							=item C<$PartitionP, $PartitionQ>
315
316							The partitions are stored as nested array references of the form
317							C<[[],...[]]>. For example, the set partition C<{{a,b}, {1,2}}> of the set
318							C<{a,b,1,2}> should be stored as the nested array references
319							C<[['a','b']],[1,2]]>. Note the elements
320							of a set are represented as Perl strings.
321
322							=item C<$CheckValidity>
323
324							If C<$CheckValidity> evaluates to true, then checks are performed to
325							ensure both partitions are valid and an exception is thrown if they
326							are not. The default is false.
327
328							=back
329
330							An example:
331
332							use Set::Partitions::Similarity qw(getPrecision);
333							use Data::Dump qw(dump);
334							dump getPrecision ([[1,2,3], [4]], [[1], [2,3,4]]);
335							# dumps:
336							# "0.2"
337
338							=cut
339
340							sub getPrecision
341							{
342	0			0	1	0	my ($ReferencePartition, $ModelPartition, $CheckValidity) = @_;
343
344							# get both Similarity.
345	0					0	my ($accuracy, $precision) = getAccuracyAndPrecision ($ReferencePartition, $ModelPartition, $CheckValidity);
346
347							# return just the precision.
348	0					0	return $precision;
349							}
350
351
352							sub getBaseEdgeCounts
353							{
354	30			30	0	63	my ($ReferencePartition, $ModelPartition, $CheckValidity) = @_;
355
356							# validates the partitions or throws an exception.
357	30	50				201	validatePartitionsOrDie ($ReferencePartition, $ModelPartition) if $CheckValidity;
358
359	30					199	my ($sameRefSameModel, $sameRefDiffModel) = getPartitionsEdgeCounts ($ReferencePartition, $ModelPartition);
360	30					257	my ($sameModelSameRef, $diffRefSameModel) = getPartitionsEdgeCounts ($ModelPartition, $ReferencePartition);
361
362							# make sure the number of edges for
363	30	50				261	if ($sameRefSameModel != $sameModelSameRef)
364							{
365	0					0	die "programming error: computed different values for number of edges in same partitions.\n";
366							}
367
368							# get the number of elements in the universe of the sets.
369	30					66	my $totalElements;
370							{
371	30					104	my %universe = map { ($_, 1) } map { @$_ } @$ReferencePartition;
	30					222
	471666					1547952
	13946					327477
372	30					554477	$totalElements = scalar keys %universe;
373							}
374
375							# get the total edges.
376	30					254	my $totalEdges = ($totalElements * ($totalElements - 1)) / 2;
377
378	30					354	return ($sameRefSameModel, $sameRefDiffModel, $diffRefSameModel, $totalEdges - $sameRefSameModel - $sameRefDiffModel - $diffRefSameModel);
379							}
380
381
382							sub getPartitionsEdgeCounts
383							{
384	60			60	0	162	my ($ReferencePartition, $ModelPartition) = @_;
385
386	60					120	my %modelId;
387	60					427	for (my $id = 0; $id < @$ModelPartition; $id++)
388							{
389	28468					41099	my $subset = $ModelPartition->[$id];
390	28468					43905	foreach my $element (@$subset)
391							{
392	943332					2213372	$modelId{$element} = $id;
393							}
394							}
395
396							# count the number of edges in the same partitions and the number in the
397							# same reference partitions but difference model partitions.
398	60					169	my $samePartitions = 0;
399	60					119	my $sameRefDiffModel = 0;
400	60					144	foreach my $subset (@$ReferencePartition)
401							{
402	28468					31039	my %subsetModelPartitionCounts;
403
404							# need to ensure the subset elements are unique.
405							{
406	28468					28206	my %elements;
	28468					28361
407	28468					69247	for (my $i = 0; $i < @$subset; $i++)
408							{
409	943332	50				2232888	unless (exists ($elements{$subset->[$i]}))
410							{
411	943332					1740494	$elements{$subset->[$i]} = 1;
412	943332					3693942	++$subsetModelPartitionCounts{$modelId{$subset->[$i]}};
413							}
414							}
415							}
416
417							# get the sizes of the model partitions of the subset.
418	28468					63875	my @subsetPartitionSizes = values %subsetModelPartitionCounts;
419
420							# count the number of edges having nodes in the same partitions.
421	28468					41403	foreach my $size (@subsetPartitionSizes)
422							{
423	29074					76346	$samePartitions += ($size * ($size - 1)) / 2;
424							}
425
426							# count the number of edges having nodes in the same reference partitions
427							# but different model partitions.
428	28468					36292	my $prefixSumOfSizes;
429	28468	50				59053	$prefixSumOfSizes = $subsetPartitionSizes[0] if @subsetPartitionSizes;
430	28468					118266	for (my $i = 1; $i < @subsetPartitionSizes; $i++)
431							{
432	606					708	$sameRefDiffModel += $prefixSumOfSizes * $subsetPartitionSizes[$i];
433	606					1291	$prefixSumOfSizes += $subsetPartitionSizes[$i];
434							}
435							}
436
437	60					730766	return ($samePartitions, $sameRefDiffModel);
438							}
439
440
441							# for the set partitions to be valid, the union of sets of each partition
442							# must be equal. the routine returns true of they are, false if not.
443							sub doPartitionsHaveSameUnion
444							{
445	30			30	0	109	my ($ReferencePartition, $ModelPartition) = @_;
446
447							# add all the reference elements to the hash.
448	30					249	my %universe = map { ($_, 1) } map { @$_ } @$ReferencePartition;
	471666					1347251
	13946					253827
449
450							# now check each subset of the model partition.
451	30					211991	foreach my $subset (@$ModelPartition)
452							{
453							# return 0 if an element from the subset is missing.
454	14522					23685	foreach my $element (@$subset)
455							{
456	471666	50				1232215	return 0 unless exists $universe{$element};
457							}
458
459							# delete all the elements in the hash from the subset.
460	14522					22088	foreach my $element (@$subset)
461							{
462	471666					774656	delete $universe{$element};
463							}
464							}
465
466							# if there are any elements remaining return 0.
467	30	50				145	return 0 if %universe;
468
469	30					274	return 1;
470							}
471
472
473							# this routine checks that the two partitions have the same union and
474							# each partition is composed for sets that a mutually disjoint. the
475							# routine throws and exception is the partitions are invalid.
476							sub validatePartitionsOrDie
477							{
478	30			30	0	102	my ($ReferencePartition, $ModelPartition) = @_;
479
480							# make sure the reference partition is valid.
481	30	50				99	unless (areSubsetsDisjoint ($ReferencePartition))
482							{
483	0					0	die "first partition is an invalid partition, an element occurs in two or more subsets.\n";
484							}
485
486							# make sure the model partition is valid.
487	30	50				228	unless (areSubsetsDisjoint ($ModelPartition))
488							{
489	0					0	die "second partition is an invalid partition, an element occurs in two or more subsets.\n";
490							}
491
492							# make sure the partitions have the same universe.
493	30	50				372	unless (doPartitionsHaveSameUnion ($ReferencePartition, $ModelPartition))
494							{
495	0					0	die "partitions are invalid, they have different set elements.\n";
496							}
497
498	30					83	return 1;
499							}
500
501							=head1 EXAMPLE
502
503							The code following measures the I of a set of 512 elements partitioned
504							equally into subsets of size C<$s> to the entire set.
505
506							use Set::Partitions::Similarity qw(getDistance);
507							my @p = ([0..511]);
508							for (my $s = 1; $s <= 512; $s += $s)
509							{
510							my @q = map { [$s$_..($s$_+$s-1)] } (0..(512/$s-1));
511							print join (', ', $s, getDistance (\@p, \@q, 1)) . "\n";
512							}
513							# dumps:
514							# 1, 1
515							# 2, 0.998043052837573
516							# 4, 0.99412915851272
517							# 8, 0.986301369863014
518							# 16, 0.970645792563601
519							# 32, 0.939334637964775
520							# 64, 0.876712328767123
521							# 128, 0.75146771037182
522							# 256, 0.500978473581213
523							# 512, 0
524
525							=head1 INSTALLATION
526
527							To install the module run the following commands:
528
529							perl Makefile.PL
530							make
531							make test
532							make install
533
534							If you are on a windows box you should use 'nmake' rather than 'make'.
535
536							=head1 BUGS
537
538							Please email bugs reports or feature requests to C, or through
539							the web interface at L. The author
540							will be notified and you can be automatically notified of progress on the bug fix or feature request.
541
542							=head1 AUTHOR
543
544							Jeff Kubina
545
546							=head1 COPYRIGHT
547
548							Copyright (c) 2009 Jeff Kubina. All rights reserved.
549							This program is free software; you can redistribute
550							it and/or modify it under the same terms as Perl itself.
551
552							The full text of the license can be found in the
553							LICENSE file included with this module.
554
555							=head1 KEYWORDS
556
557							accuracy, clustering, measure, metric, partitions, precision, set, similarity
558
559							=head1 SEE ALSO
560
561							=begin html
562
563							Concise explainations of many cluster validity measures (including set partition measures) are available on
564							the Cluster validity algorithms page
565							of the Machaon Clustering and Validation Environment web site
566							by Nadia Bolshakova.
567
568							The Wikipedia article Accuracy and precision has a good explaination
569							of the accuracy and precision measures when applied to
570							binary classifications.
571
572							The report Objective Criteria for the Evaluation of Clustering Methods (1971)
573							by W.M. Rand in the Journal of the American Statistical Association provides an excellent analysis of the accuracy
574							measure of partitions.
575
576							=end html
577
578							=cut
579
580							1;
581							# The preceding line will help the module return a true value
582