| blib/lib/Set/Partitions/Similarity.pm | |||
|---|---|---|---|
| Criterion | Covered | Total | % |
| statement | 91 | 104 | 87.5 |
| branch | 12 | 28 | 42.8 |
| condition | 0 | 3 | 0.0 |
| subroutine | 12 | 14 | 85.7 |
| pod | 5 | 9 | 55.5 |
| total | 120 | 158 | 75.9 |
| line | stmt | bran | cond | sub | pod | time | code |
|---|---|---|---|---|---|---|---|
| 1 | package Set::Partitions::Similarity; | ||||||
| 2 | |||||||
| 3 | 1 | 1 | 28173 | use strict; | |||
| 1 | 2 | ||||||
| 1 | 37 | ||||||
| 4 | 1 | 1 | 5 | use warnings; | |||
| 1 | 2 | ||||||
| 1 | 40 | ||||||
| 5 | #use Data::Dump qw(dump); | ||||||
| 6 | |||||||
| 7 | BEGIN | ||||||
| 8 | { | ||||||
| 9 | 1 | 1 | 5 | use Exporter (); | |||
| 1 | 5 | ||||||
| 1 | 25 | ||||||
| 10 | 1 | 1 | 5 | use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS); | |||
| 1 | 9 | ||||||
| 1 | 123 | ||||||
| 11 | 1 | 1 | 1 | $VERSION = '0.54'; | |||
| 12 | 1 | 15 | @ISA = qw(Exporter); | ||||
| 13 | 1 | 2 | @EXPORT = qw(); | ||||
| 14 | 1 | 3 | @EXPORT_OK = qw(getAccuracy getAccuracyAndPrecision getDistance getPrecision areSubsetsDisjoint); | ||||
| 15 | 1 | 1283 | %EXPORT_TAGS = (); | ||||
| 16 | } | ||||||
| 17 | |||||||
| 18 | #Routines to measure similarity of partitions. | ||||||
| 19 | #012345678901234567890123456789012345678901234 | ||||||
| 20 | |||||||
| 21 | =head1 NAME | ||||||
| 22 | |||||||
| 23 | C |
||||||
| 24 | |||||||
| 25 | =head1 SYNOPSIS | ||||||
| 26 | |||||||
| 27 | use Set::Partitions::Similarity qw(getAccuracyAndPrecision); | ||||||
| 28 | use Data::Dump qw(dump); | ||||||
| 29 | |||||||
| 30 | # set elements are Perl strings, sets are array references | ||||||
| 31 | # partitions are nested arrays. | ||||||
| 32 | dump getAccuracyAndPrecision ([[qw(a b)],[1,2]], [[qw(a b 1)],[2]]); | ||||||
| 33 | # dumps: | ||||||
| 34 | # ("0.5", "0.25") | ||||||
| 35 | |||||||
| 36 | # a partition is equivalent to itself, even the empty partition. | ||||||
| 37 | dump getAccuracyAndPrecision ([[1,2], [3,4]], [[2,1], [4,3]]); | ||||||
| 38 | dump getAccuracyAndPrecision ([], []); | ||||||
| 39 | # dumps: | ||||||
| 40 | # (1, 1) | ||||||
| 41 | # (1, 1) | ||||||
| 42 | |||||||
| 43 | # accuracy and precision are symmetric functions. | ||||||
| 44 | my ($p, $q) = ([[1,2,3], [4]], [[1], [2,3,4]]); | ||||||
| 45 | dump getAccuracyAndPrecision ($p, $q); | ||||||
| 46 | dump getAccuracyAndPrecision ($q, $p); | ||||||
| 47 | # dumps: | ||||||
| 48 | # ("0.333333333333333", "0.2") | ||||||
| 49 | # ("0.333333333333333", "0.2") | ||||||
| 50 | |||||||
| 51 | # checks partitions and throws an exception. | ||||||
| 52 | eval { getAccuracyAndPrecision ([[1]], [[1,2]], 1); }; | ||||||
| 53 | warn $@ if $@; | ||||||
| 54 | # dumps: | ||||||
| 55 | # partitions are invalid, they have different set elements. | ||||||
| 56 | |||||||
| 57 | =head1 DESCRIPTION | ||||||
| 58 | |||||||
| 59 | A partition of a set is a collection of mutually disjoint subsets of the set | ||||||
| 60 | whose union is the set. C |
||||||
| 61 | that measure the I |
||||||
| 62 | assess the performance of a binary clustering algorithm by comparing | ||||||
| 63 | the clusters the algorithm creates against the correct clusters of test data. | ||||||
| 64 | |||||||
| 65 | =head2 Accuracy and Precision | ||||||
| 66 | |||||||
| 67 | Let C be a partition of C |
||||||
| 68 | be the set of all sets of two distinct elements of C |
||||||
| 69 | The partition C uniquely defines a partitioning of C |
||||||
| 70 | C |
||||||
| 71 | occur in the same set in C , and define C |
||||||
| 72 | |||||||
| 73 | Given two partitions C and C |
||||||
| 74 | C<(|C(P) ^ C(Q)| + |D(P) ^ D(Q)|) / (n*(n-1)/2)>, where | | gives the size of a set and | ||||||
| 75 | ^ represents the intersection operator. The I |
||||||
| 76 | C<|C(P) ^ C(Q)| / (|C(P) ^ C(Q)| + |C(P) ^ D(Q)| + |D(P) ^ C(Q)|)>. The I |
||||||
| 77 | I |
||||||
| 78 | The I |
||||||
| 79 | The I |
||||||
| 80 | |||||||
| 81 | All the methods implemented that compute the I |
||||||
| 82 | number of elements of the set partitioned. | ||||||
| 83 | |||||||
| 84 | =head1 ROUTINES | ||||||
| 85 | |||||||
| 86 | =head2 C |
||||||
| 87 | |||||||
| 88 | The routine C |
||||||
| 89 | false otherwise. It can be used to check the validity of a partition. | ||||||
| 90 | |||||||
| 91 | =over | ||||||
| 92 | |||||||
| 93 | =item C<$Partition> | ||||||
| 94 | |||||||
| 95 | The partition is stored as a nested array reference of the form | ||||||
| 96 | C<[[],...[]]>. For example, the set partition C<{{a,b}, {1,2}}> of the set | ||||||
| 97 | C<{a,b,1,2}> should be stored as the nested array reference | ||||||
| 98 | C<[['a','b']],[1,2]]>. Note the elements | ||||||
| 99 | of a set are represented as Perl strings. | ||||||
| 100 | |||||||
| 101 | =back | ||||||
| 102 | |||||||
| 103 | An example: | ||||||
| 104 | |||||||
| 105 | use Set::Partitions::Similarity qw(areSubsetsDisjoint); | ||||||
| 106 | use Data::Dump qw(dump); | ||||||
| 107 | dump areSubsetsDisjoint ([[1,2,3], [4]]); | ||||||
| 108 | dump areSubsetsDisjoint ([[1,2,3], [4,1]]); | ||||||
| 109 | # dumps: | ||||||
| 110 | # "1" | ||||||
| 111 | # "0" | ||||||
| 112 | |||||||
| 113 | =cut | ||||||
| 114 | |||||||
| 115 | # a valid partition has all the subsets mutually disjoint. this routine | ||||||
| 116 | # returns 0 if it finds two distinct subsets have an element in common. | ||||||
| 117 | # this is done in time linear in the number of elements by computing the | ||||||
| 118 | # prefix union of the sets in the partition using a hash. | ||||||
| 119 | sub areSubsetsDisjoint | ||||||
| 120 | { | ||||||
| 121 | # the hash %prefixUnionOfSubsets holds the union of elements in each subset | ||||||
| 122 | # as they are checked for elements that occur in more than one subset. | ||||||
| 123 | 60 | 60 | 1 | 127 | my %prefixUnionOfSubsets; | ||
| 124 | 60 | 105 | foreach my $subset (@{$_[0]}) | ||||
| 60 | 304 | ||||||
| 125 | { | ||||||
| 126 | # since it is possible that a subset could contain a repeating element, | ||||||
| 127 | # first each element is checked without adding it to the hash. | ||||||
| 128 | 28468 | 49086 | foreach my $element (@$subset) | ||||
| 129 | { | ||||||
| 130 | 943332 | 50 | 2169148 | if (exists $prefixUnionOfSubsets{$element}) | |||
| 131 | { | ||||||
| 132 | # if the second parameter is true, throw and exception, otherwise return false. | ||||||
| 133 | 0 | 0 | 0 | 0 | if (defined ($_[1]) && $_[1]) | ||
| 134 | { | ||||||
| 135 | 0 | 0 | die "element '$element' occurs in two of the subsets.\n"; | ||||
| 136 | } | ||||||
| 137 | else | ||||||
| 138 | { | ||||||
| 139 | 0 | 0 | return 0; | ||||
| 140 | } | ||||||
| 141 | } | ||||||
| 142 | } | ||||||
| 143 | |||||||
| 144 | # now we can add all the elements to the hash. | ||||||
| 145 | 28468 | 44660 | foreach my $element (@$subset) | ||||
| 146 | { | ||||||
| 147 | 943332 | 1829294 | $prefixUnionOfSubsets{$element} = 1; | ||||
| 148 | } | ||||||
| 149 | } | ||||||
| 150 | |||||||
| 151 | # the subsets are disjoint, return true. | ||||||
| 152 | 60 | 427242 | return 1; | ||||
| 153 | } | ||||||
| 154 | |||||||
| 155 | =head2 C |
||||||
| 156 | |||||||
| 157 | The routine C |
||||||
| 158 | two partitions. | ||||||
| 159 | |||||||
| 160 | =over | ||||||
| 161 | |||||||
| 162 | =item C<$PartitionP, $PartitionQ> | ||||||
| 163 | |||||||
| 164 | The partitions are stored as nested array references of the form | ||||||
| 165 | C<[[],...[]]>. For example, the set partition C<{{a,b}, {1,2}}> of the set | ||||||
| 166 | C<{a,b,1,2}> should be stored as the nested array references | ||||||
| 167 | C<[['a','b']],[1,2]]>. Note the elements | ||||||
| 168 | of a set are represented as Perl strings. | ||||||
| 169 | |||||||
| 170 | =item C<$CheckValidity> | ||||||
| 171 | |||||||
| 172 | If C<$CheckValidity> evaluates to true, then checks are performed to | ||||||
| 173 | ensure both partitions are valid and an exception is thrown if they | ||||||
| 174 | are not. The default is false. | ||||||
| 175 | |||||||
| 176 | =back | ||||||
| 177 | |||||||
| 178 | An example: | ||||||
| 179 | |||||||
| 180 | use Set::Partitions::Similarity qw(getAccuracy); | ||||||
| 181 | use Data::Dump qw(dump); | ||||||
| 182 | dump getAccuracy ([[qw(a b)], [qw(c d)]], [[qw(a b c d)]]); | ||||||
| 183 | dump getAccuracy ([[qw(a b c d)]], [[qw(a b)], [qw(c d)]]); | ||||||
| 184 | # dumps: | ||||||
| 185 | # "0.333333333333333" | ||||||
| 186 | # "0.333333333333333" | ||||||
| 187 | |||||||
| 188 | =cut | ||||||
| 189 | |||||||
| 190 | sub getAccuracy | ||||||
| 191 | { | ||||||
| 192 | 30 | 30 | 1 | 237977 | my ($ReferencePartition, $ModelPartition, $CheckValidity) = @_; | ||
| 193 | |||||||
| 194 | # get both similarities. | ||||||
| 195 | 30 | 225 | my ($accuracy, $precision) = getAccuracyAndPrecision ($ReferencePartition, $ModelPartition, $CheckValidity); | ||||
| 196 | |||||||
| 197 | # return just the accuracy. | ||||||
| 198 | 30 | 242 | return $accuracy; | ||||
| 199 | } | ||||||
| 200 | |||||||
| 201 | |||||||
| 202 | =head2 C |
||||||
| 203 | |||||||
| 204 | The routine C |
||||||
| 205 | two partitions as an array C<(accuracy, precision)>. | ||||||
| 206 | |||||||
| 207 | =over | ||||||
| 208 | |||||||
| 209 | =item C<$PartitionP, $PartitionQ> | ||||||
| 210 | |||||||
| 211 | The partitions are stored as nested array references of the form | ||||||
| 212 | C<[[],...[]]>. For example, the set partition C<{{a,b}, {1,2}}> of the set | ||||||
| 213 | C<{a,b,1,2}> should be stored as the nested array references | ||||||
| 214 | C<[['a','b']],[1,2]]>. Note the elements | ||||||
| 215 | of a set are represented as Perl strings. | ||||||
| 216 | |||||||
| 217 | =item C<$CheckValidity> | ||||||
| 218 | |||||||
| 219 | If C<$CheckValidity> evaluates to true, then checks are performed to | ||||||
| 220 | ensure both partitions are valid and an exception is thrown if they | ||||||
| 221 | are not. The default is false. | ||||||
| 222 | |||||||
| 223 | =back | ||||||
| 224 | |||||||
| 225 | An example: | ||||||
| 226 | |||||||
| 227 | use Set::Partitions::Similarity qw(getAccuracyAndPrecision); | ||||||
| 228 | use Data::Dump qw(dump); | ||||||
| 229 | dump getAccuracyAndPrecision ([[1,2], [3,4]], [[1], [2], [3], [4]]); | ||||||
| 230 | dump getAccuracyAndPrecision ([[1], [2], [3], [4]], [[1,2], [3,4]]); | ||||||
| 231 | # dumps: | ||||||
| 232 | # ("0.666666666666667", 0) | ||||||
| 233 | # ("0.666666666666667", 0) | ||||||
| 234 | |||||||
| 235 | =cut | ||||||
| 236 | |||||||
| 237 | sub getAccuracyAndPrecision | ||||||
| 238 | { | ||||||
| 239 | 30 | 30 | 1 | 75 | my ($ReferencePartition, $ModelPartition, $CheckValidity) = @_; | ||
| 240 | |||||||
| 241 | # get the base count of edge types. | ||||||
| 242 | 30 | 101 | my ($sameRefSameModel, $sameRefDiffModel, $diffRefSameModel, $diffRefDiffModel) = getBaseEdgeCounts ($ReferencePartition, $ModelPartition, $CheckValidity); | ||||
| 243 | |||||||
| 244 | # get the total number of bases edges. | ||||||
| 245 | 30 | 124 | my $baseEdges = $sameRefSameModel + $sameRefDiffModel + $diffRefSameModel; | ||||
| 246 | |||||||
| 247 | # if there are no base edges, the precision is one. | ||||||
| 248 | 30 | 62 | my $precision = 1; | ||||
| 249 | |||||||
| 250 | # get the precision. | ||||||
| 251 | 30 | 50 | 200 | $precision = $sameRefSameModel / $baseEdges if $baseEdges; | |||
| 252 | |||||||
| 253 | # get the total number of edges. | ||||||
| 254 | 30 | 83 | my $totalEdges = $sameRefSameModel + $sameRefDiffModel + $diffRefSameModel + $diffRefDiffModel; | ||||
| 255 | |||||||
| 256 | # if there are no edges, the accuracy is one. | ||||||
| 257 | 30 | 65 | my $accuracy = 1; | ||||
| 258 | |||||||
| 259 | # get the accuracy. | ||||||
| 260 | 30 | 50 | 141 | $accuracy = ($sameRefSameModel + $diffRefDiffModel) / $totalEdges if $totalEdges; | |||
| 261 | |||||||
| 262 | 30 | 157 | return ($accuracy, $precision); | ||||
| 263 | } | ||||||
| 264 | |||||||
| 265 | |||||||
| 266 | =head2 C |
||||||
| 267 | |||||||
| 268 | The routine C |
||||||
| 269 | two partitions, or C<1-getAccuracy($PartitionP, $PartitionQ, $CheckValidity)>. | ||||||
| 270 | |||||||
| 271 | =over | ||||||
| 272 | |||||||
| 273 | =item C<$PartitionP, $PartitionQ> | ||||||
| 274 | |||||||
| 275 | The partitions are stored as nested array references of the form | ||||||
| 276 | C<[[],...[]]>. For example, the set partition C<{{a,b}, {1,2}}> of the set | ||||||
| 277 | C<{a,b,1,2}> should be stored as the nested array references | ||||||
| 278 | C<[['a','b']],[1,2]]>. Note the elements | ||||||
| 279 | of a set are represented as Perl strings. | ||||||
| 280 | |||||||
| 281 | =item C<$CheckValidity> | ||||||
| 282 | |||||||
| 283 | If C<$CheckValidity> evaluates to true, then checks are performed to | ||||||
| 284 | ensure both partitions are valid and an exception is thrown if they | ||||||
| 285 | are not. The default is false. | ||||||
| 286 | |||||||
| 287 | =back | ||||||
| 288 | |||||||
| 289 | An example: | ||||||
| 290 | |||||||
| 291 | use Set::Partitions::Similarity qw(getDistance); | ||||||
| 292 | use Data::Dump qw(dump); | ||||||
| 293 | dump getDistance ([[1,2,3], [4]], [[1], [2,3,4]]); | ||||||
| 294 | # dumps: | ||||||
| 295 | # "0.666666666666667" | ||||||
| 296 | |||||||
| 297 | =cut | ||||||
| 298 | |||||||
| 299 | sub getDistance | ||||||
| 300 | { | ||||||
| 301 | 0 | 0 | 1 | 0 | my $accuracy = getAccuracy (@_); | ||
| 302 | 0 | 0 | 0 | return 1 - $accuracy if defined $accuracy; | |||
| 303 | 0 | 0 | return undef; | ||||
| 304 | } | ||||||
| 305 | |||||||
| 306 | |||||||
| 307 | =head2 C |
||||||
| 308 | |||||||
| 309 | The routine C |
||||||
| 310 | two partitions. | ||||||
| 311 | |||||||
| 312 | =over | ||||||
| 313 | |||||||
| 314 | =item C<$PartitionP, $PartitionQ> | ||||||
| 315 | |||||||
| 316 | The partitions are stored as nested array references of the form | ||||||
| 317 | C<[[],...[]]>. For example, the set partition C<{{a,b}, {1,2}}> of the set | ||||||
| 318 | C<{a,b,1,2}> should be stored as the nested array references | ||||||
| 319 | C<[['a','b']],[1,2]]>. Note the elements | ||||||
| 320 | of a set are represented as Perl strings. | ||||||
| 321 | |||||||
| 322 | =item C<$CheckValidity> | ||||||
| 323 | |||||||
| 324 | If C<$CheckValidity> evaluates to true, then checks are performed to | ||||||
| 325 | ensure both partitions are valid and an exception is thrown if they | ||||||
| 326 | are not. The default is false. | ||||||
| 327 | |||||||
| 328 | =back | ||||||
| 329 | |||||||
| 330 | An example: | ||||||
| 331 | |||||||
| 332 | use Set::Partitions::Similarity qw(getPrecision); | ||||||
| 333 | use Data::Dump qw(dump); | ||||||
| 334 | dump getPrecision ([[1,2,3], [4]], [[1], [2,3,4]]); | ||||||
| 335 | # dumps: | ||||||
| 336 | # "0.2" | ||||||
| 337 | |||||||
| 338 | =cut | ||||||
| 339 | |||||||
| 340 | sub getPrecision | ||||||
| 341 | { | ||||||
| 342 | 0 | 0 | 1 | 0 | my ($ReferencePartition, $ModelPartition, $CheckValidity) = @_; | ||
| 343 | |||||||
| 344 | # get both Similarity. | ||||||
| 345 | 0 | 0 | my ($accuracy, $precision) = getAccuracyAndPrecision ($ReferencePartition, $ModelPartition, $CheckValidity); | ||||
| 346 | |||||||
| 347 | # return just the precision. | ||||||
| 348 | 0 | 0 | return $precision; | ||||
| 349 | } | ||||||
| 350 | |||||||
| 351 | |||||||
| 352 | sub getBaseEdgeCounts | ||||||
| 353 | { | ||||||
| 354 | 30 | 30 | 0 | 63 | my ($ReferencePartition, $ModelPartition, $CheckValidity) = @_; | ||
| 355 | |||||||
| 356 | # validates the partitions or throws an exception. | ||||||
| 357 | 30 | 50 | 201 | validatePartitionsOrDie ($ReferencePartition, $ModelPartition) if $CheckValidity; | |||
| 358 | |||||||
| 359 | 30 | 199 | my ($sameRefSameModel, $sameRefDiffModel) = getPartitionsEdgeCounts ($ReferencePartition, $ModelPartition); | ||||
| 360 | 30 | 257 | my ($sameModelSameRef, $diffRefSameModel) = getPartitionsEdgeCounts ($ModelPartition, $ReferencePartition); | ||||
| 361 | |||||||
| 362 | # make sure the number of edges for | ||||||
| 363 | 30 | 50 | 261 | if ($sameRefSameModel != $sameModelSameRef) | |||
| 364 | { | ||||||
| 365 | 0 | 0 | die "programming error: computed different values for number of edges in same partitions.\n"; | ||||
| 366 | } | ||||||
| 367 | |||||||
| 368 | # get the number of elements in the universe of the sets. | ||||||
| 369 | 30 | 66 | my $totalElements; | ||||
| 370 | { | ||||||
| 371 | 30 | 104 | my %universe = map { ($_, 1) } map { @$_ } @$ReferencePartition; | ||||
| 30 | 222 | ||||||
| 471666 | 1547952 | ||||||
| 13946 | 327477 | ||||||
| 372 | 30 | 554477 | $totalElements = scalar keys %universe; | ||||
| 373 | } | ||||||
| 374 | |||||||
| 375 | # get the total edges. | ||||||
| 376 | 30 | 254 | my $totalEdges = ($totalElements * ($totalElements - 1)) / 2; | ||||
| 377 | |||||||
| 378 | 30 | 354 | return ($sameRefSameModel, $sameRefDiffModel, $diffRefSameModel, $totalEdges - $sameRefSameModel - $sameRefDiffModel - $diffRefSameModel); | ||||
| 379 | } | ||||||
| 380 | |||||||
| 381 | |||||||
| 382 | sub getPartitionsEdgeCounts | ||||||
| 383 | { | ||||||
| 384 | 60 | 60 | 0 | 162 | my ($ReferencePartition, $ModelPartition) = @_; | ||
| 385 | |||||||
| 386 | 60 | 120 | my %modelId; | ||||
| 387 | 60 | 427 | for (my $id = 0; $id < @$ModelPartition; $id++) | ||||
| 388 | { | ||||||
| 389 | 28468 | 41099 | my $subset = $ModelPartition->[$id]; | ||||
| 390 | 28468 | 43905 | foreach my $element (@$subset) | ||||
| 391 | { | ||||||
| 392 | 943332 | 2213372 | $modelId{$element} = $id; | ||||
| 393 | } | ||||||
| 394 | } | ||||||
| 395 | |||||||
| 396 | # count the number of edges in the same partitions and the number in the | ||||||
| 397 | # same reference partitions but difference model partitions. | ||||||
| 398 | 60 | 169 | my $samePartitions = 0; | ||||
| 399 | 60 | 119 | my $sameRefDiffModel = 0; | ||||
| 400 | 60 | 144 | foreach my $subset (@$ReferencePartition) | ||||
| 401 | { | ||||||
| 402 | 28468 | 31039 | my %subsetModelPartitionCounts; | ||||
| 403 | |||||||
| 404 | # need to ensure the subset elements are unique. | ||||||
| 405 | { | ||||||
| 406 | 28468 | 28206 | my %elements; | ||||
| 28468 | 28361 | ||||||
| 407 | 28468 | 69247 | for (my $i = 0; $i < @$subset; $i++) | ||||
| 408 | { | ||||||
| 409 | 943332 | 50 | 2232888 | unless (exists ($elements{$subset->[$i]})) | |||
| 410 | { | ||||||
| 411 | 943332 | 1740494 | $elements{$subset->[$i]} = 1; | ||||
| 412 | 943332 | 3693942 | ++$subsetModelPartitionCounts{$modelId{$subset->[$i]}}; | ||||
| 413 | } | ||||||
| 414 | } | ||||||
| 415 | } | ||||||
| 416 | |||||||
| 417 | # get the sizes of the model partitions of the subset. | ||||||
| 418 | 28468 | 63875 | my @subsetPartitionSizes = values %subsetModelPartitionCounts; | ||||
| 419 | |||||||
| 420 | # count the number of edges having nodes in the same partitions. | ||||||
| 421 | 28468 | 41403 | foreach my $size (@subsetPartitionSizes) | ||||
| 422 | { | ||||||
| 423 | 29074 | 76346 | $samePartitions += ($size * ($size - 1)) / 2; | ||||
| 424 | } | ||||||
| 425 | |||||||
| 426 | # count the number of edges having nodes in the same reference partitions | ||||||
| 427 | # but different model partitions. | ||||||
| 428 | 28468 | 36292 | my $prefixSumOfSizes; | ||||
| 429 | 28468 | 50 | 59053 | $prefixSumOfSizes = $subsetPartitionSizes[0] if @subsetPartitionSizes; | |||
| 430 | 28468 | 118266 | for (my $i = 1; $i < @subsetPartitionSizes; $i++) | ||||
| 431 | { | ||||||
| 432 | 606 | 708 | $sameRefDiffModel += $prefixSumOfSizes * $subsetPartitionSizes[$i]; | ||||
| 433 | 606 | 1291 | $prefixSumOfSizes += $subsetPartitionSizes[$i]; | ||||
| 434 | } | ||||||
| 435 | } | ||||||
| 436 | |||||||
| 437 | 60 | 730766 | return ($samePartitions, $sameRefDiffModel); | ||||
| 438 | } | ||||||
| 439 | |||||||
| 440 | |||||||
| 441 | # for the set partitions to be valid, the union of sets of each partition | ||||||
| 442 | # must be equal. the routine returns true of they are, false if not. | ||||||
| 443 | sub doPartitionsHaveSameUnion | ||||||
| 444 | { | ||||||
| 445 | 30 | 30 | 0 | 109 | my ($ReferencePartition, $ModelPartition) = @_; | ||
| 446 | |||||||
| 447 | # add all the reference elements to the hash. | ||||||
| 448 | 30 | 249 | my %universe = map { ($_, 1) } map { @$_ } @$ReferencePartition; | ||||
| 471666 | 1347251 | ||||||
| 13946 | 253827 | ||||||
| 449 | |||||||
| 450 | # now check each subset of the model partition. | ||||||
| 451 | 30 | 211991 | foreach my $subset (@$ModelPartition) | ||||
| 452 | { | ||||||
| 453 | # return 0 if an element from the subset is missing. | ||||||
| 454 | 14522 | 23685 | foreach my $element (@$subset) | ||||
| 455 | { | ||||||
| 456 | 471666 | 50 | 1232215 | return 0 unless exists $universe{$element}; | |||
| 457 | } | ||||||
| 458 | |||||||
| 459 | # delete all the elements in the hash from the subset. | ||||||
| 460 | 14522 | 22088 | foreach my $element (@$subset) | ||||
| 461 | { | ||||||
| 462 | 471666 | 774656 | delete $universe{$element}; | ||||
| 463 | } | ||||||
| 464 | } | ||||||
| 465 | |||||||
| 466 | # if there are any elements remaining return 0. | ||||||
| 467 | 30 | 50 | 145 | return 0 if %universe; | |||
| 468 | |||||||
| 469 | 30 | 274 | return 1; | ||||
| 470 | } | ||||||
| 471 | |||||||
| 472 | |||||||
| 473 | # this routine checks that the two partitions have the same union and | ||||||
| 474 | # each partition is composed for sets that a mutually disjoint. the | ||||||
| 475 | # routine throws and exception is the partitions are invalid. | ||||||
| 476 | sub validatePartitionsOrDie | ||||||
| 477 | { | ||||||
| 478 | 30 | 30 | 0 | 102 | my ($ReferencePartition, $ModelPartition) = @_; | ||
| 479 | |||||||
| 480 | # make sure the reference partition is valid. | ||||||
| 481 | 30 | 50 | 99 | unless (areSubsetsDisjoint ($ReferencePartition)) | |||
| 482 | { | ||||||
| 483 | 0 | 0 | die "first partition is an invalid partition, an element occurs in two or more subsets.\n"; | ||||
| 484 | } | ||||||
| 485 | |||||||
| 486 | # make sure the model partition is valid. | ||||||
| 487 | 30 | 50 | 228 | unless (areSubsetsDisjoint ($ModelPartition)) | |||
| 488 | { | ||||||
| 489 | 0 | 0 | die "second partition is an invalid partition, an element occurs in two or more subsets.\n"; | ||||
| 490 | } | ||||||
| 491 | |||||||
| 492 | # make sure the partitions have the same universe. | ||||||
| 493 | 30 | 50 | 372 | unless (doPartitionsHaveSameUnion ($ReferencePartition, $ModelPartition)) | |||
| 494 | { | ||||||
| 495 | 0 | 0 | die "partitions are invalid, they have different set elements.\n"; | ||||
| 496 | } | ||||||
| 497 | |||||||
| 498 | 30 | 83 | return 1; | ||||
| 499 | } | ||||||
| 500 | |||||||
| 501 | =head1 EXAMPLE | ||||||
| 502 | |||||||
| 503 | The code following measures the I |
||||||
| 504 | equally into subsets of size C<$s> to the entire set. | ||||||
| 505 | |||||||
| 506 | use Set::Partitions::Similarity qw(getDistance); | ||||||
| 507 | my @p = ([0..511]); | ||||||
| 508 | for (my $s = 1; $s <= 512; $s += $s) | ||||||
| 509 | { | ||||||
| 510 | my @q = map { [$s*$_..($s*$_+$s-1)] } (0..(512/$s-1)); | ||||||
| 511 | print join (', ', $s, getDistance (\@p, \@q, 1)) . "\n"; | ||||||
| 512 | } | ||||||
| 513 | # dumps: | ||||||
| 514 | # 1, 1 | ||||||
| 515 | # 2, 0.998043052837573 | ||||||
| 516 | # 4, 0.99412915851272 | ||||||
| 517 | # 8, 0.986301369863014 | ||||||
| 518 | # 16, 0.970645792563601 | ||||||
| 519 | # 32, 0.939334637964775 | ||||||
| 520 | # 64, 0.876712328767123 | ||||||
| 521 | # 128, 0.75146771037182 | ||||||
| 522 | # 256, 0.500978473581213 | ||||||
| 523 | # 512, 0 | ||||||
| 524 | |||||||
| 525 | =head1 INSTALLATION | ||||||
| 526 | |||||||
| 527 | To install the module run the following commands: | ||||||
| 528 | |||||||
| 529 | perl Makefile.PL | ||||||
| 530 | make | ||||||
| 531 | make test | ||||||
| 532 | make install | ||||||
| 533 | |||||||
| 534 | If you are on a windows box you should use 'nmake' rather than 'make'. | ||||||
| 535 | |||||||
| 536 | =head1 BUGS | ||||||
| 537 | |||||||
| 538 | Please email bugs reports or feature requests to C |
||||||
| 539 | the web interface at L |
||||||
| 540 | will be notified and you can be automatically notified of progress on the bug fix or feature request. | ||||||
| 541 | |||||||
| 542 | =head1 AUTHOR | ||||||
| 543 | |||||||
| 544 | Jeff Kubina |
||||||
| 545 | |||||||
| 546 | =head1 COPYRIGHT | ||||||
| 547 | |||||||
| 548 | Copyright (c) 2009 Jeff Kubina. All rights reserved. | ||||||
| 549 | This program is free software; you can redistribute | ||||||
| 550 | it and/or modify it under the same terms as Perl itself. | ||||||
| 551 | |||||||
| 552 | The full text of the license can be found in the | ||||||
| 553 | LICENSE file included with this module. | ||||||
| 554 | |||||||
| 555 | =head1 KEYWORDS | ||||||
| 556 | |||||||
| 557 | accuracy, clustering, measure, metric, partitions, precision, set, similarity | ||||||
| 558 | |||||||
| 559 | =head1 SEE ALSO | ||||||
| 560 | |||||||
| 561 | =begin html | ||||||
| 562 | |||||||
| 563 | Concise explainations of many cluster validity measures (including set partition measures) are available on |
||||||
| 564 | the Cluster validity algorithms page | ||||||
| 565 | of the Machaon Clustering and Validation Environment web site | ||||||
| 566 | by Nadia Bolshakova. | ||||||
| 567 | |||||||
| 568 | The Wikipedia article Accuracy and precision has a good explaination |
||||||
| 569 | of the accuracy and precision measures when applied to | ||||||
| 570 | binary classifications. | ||||||
| 571 | |||||||
| 572 | The report Objective Criteria for the Evaluation of Clustering Methods (1971) |
||||||
| 573 | by W.M. Rand in the Journal of the American Statistical Association provides an excellent analysis of the accuracy | ||||||
| 574 | measure of partitions. | ||||||
| 575 | |||||||
| 576 | =end html | ||||||
| 577 | |||||||
| 578 | =cut | ||||||
| 579 | |||||||
| 580 | 1; | ||||||
| 581 | # The preceding line will help the module return a true value | ||||||
| 582 |