File Coverage

blib/lib/Text/SenseClusters/LabelEvaluation/LabelEvaluation.pm
Criterion Covered Total %
statement 114 143 79.7
branch 21 42 50.0
condition n/a
subroutine 11 11 100.0
pod 0 3 0.0
total 146 199 73.3


line stmt bran cond sub pod time code
1             #!/usr/bin/perl -w
2              
3             # Defining the Package for the modules.
4             package Text::SenseClusters::LabelEvaluation::LabelEvaluation;
5              
6 1     1   1841 use strict;
  1         2  
  1         43  
7 1     1   8 use encoding "utf-8";
  1         3  
  1         9  
8              
9             # Defining the version for the Program.
10             our $VERSION = '0.06';
11              
12             # Including the FileHandle module.
13 1     1   1476 use FileHandle;
  1         14425  
  1         7  
14              
15             # Including the other dependent Modules.
16 1     1   560 use Text::SenseClusters::LabelEvaluation::ReadingFilesData;
  1         3  
  1         52  
17 1     1   7 use Text::SenseClusters::LabelEvaluation::PrintingHashData;
  1         2  
  1         43  
18 1     1   6 use Text::SenseClusters::LabelEvaluation::Wikipedia::GetWikiData;
  1         3  
  1         47  
19 1     1   7 use Text::SenseClusters::LabelEvaluation::SimilarityScore;
  1         2  
  1         46  
20 1     1   6 use Text::SenseClusters::LabelEvaluation::ConfusionMatrixTotalCalc;
  1         2  
  1         143  
21              
22             #######################################################################################################################
23              
24             =head1 Name
25              
26             Text::SenseClusters::LabelEvaluation - Module for evaluation of labels of the clusters.
27              
28             =head1 SYNOPSIS
29              
30             The following code snippet will evaluate the labels by comparing
31             them with text data for a gold-standard key from Wikipedia .
32              
33             # Including the LabelEvaluation Module.
34             use Text::SenseClusters::LabelEvaluation::LabelEvaluation;
35             # Including the FileHandle module.
36             use FileHandle;
37              
38             # File that will contain the label information.
39             my $labelFileName = "temp_label.txt";
40              
41             # Defining the file handle for the label file.
42             our $labelFileHandle = FileHandle->new(">$labelFileName");
43              
44             # Writing into the label file.
45             print $labelFileHandle "Cluster 0 (Descriptive): George Bush, Al Gore, White House,".
46             " COMMENTARY k, Cox News, George W, BRITAIN London, U S, ".
47             "Prime Minister, New York \n\n";
48             print $labelFileHandle "Cluster 0 (Discriminating): George Bush, COMMENTARY k, Cox ".
49             "News, BRITAIN London \n\n";
50             print $labelFileHandle "Cluster 1 (Descriptive): U S, Al Gore, White House, more than,".
51             "George W, York Times, New York, Prime Minister, President ".
52             "B_T, the the \n\n";
53             print $labelFileHandle "Cluster 1 (Discriminating): more than, York Times, President ".
54             "B_T, the the \n";
55            
56             # File that will contain the topic information.
57             my $topicFileName = "temp_topic.txt";
58              
59             # Defining the file handle for the topic file.
60             our $topicFileHandle = FileHandle->new(">$topicFileName");
61              
62             # Writing into the Topic file.
63             # Bill Clinton , Tony Blair
64             print $topicFileHandle "Bill Clinton , Tony Blair \n";
65              
66             # Closing the handles.
67             close($labelFileHandle);
68             close($topicFileHandle);
69              
70             # Calling the LabelEvaluation modules by passing the following options
71              
72             %inputOptions = (
73              
74             labelFile => $labelFileName,
75             labelKeyFile => $topicFileName
76             );
77              
78              
79             # Calling the LabelEvaluation modules by passing the name of the
80             # label and topic files.
81             my $score = Text::SenseClusters::LabelEvaluation::LabelEvaluation->
82             new (\%inputOptions);
83            
84              
85             # Printing the score.
86             print "\nScore of label evaluation is :: $score \n";
87              
88             # Deleting the temporary label and topic files.
89             unlink $labelFileName or warn "Could not unlink $labelFileName: $!";
90             unlink $topicFileName or warn "Could not unlink $topicFileName: $!";
91              
92              
93             =head1 DESCRIPTION
94              
95             This program compares the results obtained from SenseClusters with those
96             of Gold Standards. Gold Standards will be obtained from two independent and
97             reliable sources:
98             1. Wikipedia
99             2. Wordnet
100            
101             For fetching the Wikipedia data it uses the WWW::Wikipedia module from CPAN,
102             and for comparison of labels with Gold Standards it uses the Text::Similarity
103             module. The comparison result is then further processed to obtain the final
104             result and its score.
105            
106              
107              
108             Result:
109              
110             a) Decision Matrix:
111             Based on the similarity comparison of Labels with the gold standards,
112             the decision matrix is calculated as below:
113              
114             For eg:
115             ===========================================================================
116             | Cluster0 | Cluster1 | Row Total
117             ---------------------------------------------------------------------------
118             Topic#1 | 271 | 2713 | 2984
119             ---------------------------------------------------------------------------
120             Topic#2 | 2396 | 306 | 2702
121             ---------------------------------------------------------------------------
122             Col Total | 2667 | 3019 | 5686
123             ===========================================================================
124              
125             b) Calculated decision Matrix:
126             Now, based on the decision matrix, a new calculated matrix is printed.
127             Each cell in the matrix will contain the probability value:
128            
129             CELL_VALUE_IN_DECISION_MATRIX / TOTAL_SCORE_OF_DECISION_MATRIX
130            
131            
132             For eg:
133             For cell : Cluster0 - Topic#1
134             i) First -Value = 271 / 5686 = 0.048
135              
136              
137             Now based on above decision matrix, new calculated matrix is:
138             ========================================================================
139             | Cluster0 | Cluster1
140             ------------------------------------------------------------------------
141             Topic#1 | 0.048 | 0.477
142             ------------------------------------------------------------------------
143             Topic#2 | 0.421 | 0.054
144             ------------------------------------------------------------------------
145              
146              
147             c) Interpreting Calculated decision Matrix:
148            
149             1. Row-Wise Comparison
150             For each topic, "row score" will be compared and cluster with maximum
151             value will be assigned to that topic.
152             for eg:
153             a) Topic#1 Cluster1 (max-row-score = 0.477 )
154             b) Topic#2 Cluster0 (max-row-score = 0.421 )
155            
156             2. Col-Wise Comparison
157             For each Cluster, "col score" will be compared and topic with maximum
158             value will be assigned to that Cluster.
159             for eg:
160             a) Cluster0 Topic#2 (max-col-score = 0.421 )
161             b) Cluster1 Topic#1 (max-col-score = 0.477 )
162              
163             d) Deriving the final conclusion from the above two comparisons:
164            
165             The results of the row-wise and column-wise comparisons are matched
166             against each other. Only the matching results are then printed.
167              
168             For eg:
169             1. Row-Wise Comparison
170             a) Topic#1 Cluster1
171             b) Topic#2 Cluster0
172             2. Col-Wise Comparison
173             a) Cluster0 Topic#2
174             b) Cluster1 Topic#1
175              
176             Matching Result:
177             Cluster0 Topic#2
178             Cluster1 Topic#1
179              
180             e) Overall score:
181             This is the product of the probability scores of all
182             matching cluster-topic pairs.
183            
184             For eg:
185             The score for above example will be: 0.201
186            
187            
188              
189              
190             =cut
191              
192              
193             #######################################################################################################################
194              
195             # Declaring the global variables for the LabelEvaluation.
196              
197             # 1. labelFile:
198             # Name of the file containing the labels from sense cluster.
199             our $senseClusterLabelFileName;
200              
201             # 2. labelKeyFile:
202             # Name of the file containing the comma separated actual topics (keys)
203             # for the clusters.
204             our $topicsFileName;
205            
206             # 3. This variable contains the length of the data to be fetched from the
207             # gold standard source.
208             our $labelKeyLength = 0;
209              
210             # 4. This variable will tell the ratio of weightage of Discriminating labels
211             # over weightage of descriptive labels. Default value is set to 10.
212             our $weightRatio = 10;
213              
214             # 5. This variable will tell user supplied location of file that contains
215             # the stop list.
216             our $stopListFileLocation = "";
217              
218             # 6. This variable will decide whether to keep or delete temporary files.
219             our $isClean = 0;
220              
221             # 7. Variable used for the deciding whether to show detailed results
222             # to user or not.
223             # Default value = Off, to make it 'On' change value to 1.
224             our $isDecisionMatrixDebugOn = 0;
225              
226             # 8. This variable will decide whether to display help to user or not.
227             our $help = "";
228              
229              
230              
231             # Defining the name of the Source from where we are getting the text, for
232             # finding the label.
233             our $standardReferenceName_Global = "Wikipedia";
234              
235             # Defining the file handle for the output file.
236             our $outFileHandle;
237              
238             sub new{
239              
240             # Global variable for storing the labels from the sense cluster.
241 1     1 0 433 our %labelSenseClustersHash_Global =();
242              
243             # Opening the output file handle in write mode as a duplicate of STDERR.
244 1 50       25 open ($outFileHandle, ">&", \*STDERR) or die "Can't duped STDERR: $!";
245            
246             # This variable is never used, so can be ignored.
247 1         4 our $programName = shift;
248            
249            
250             # Add here the options code:
251            
252             # Getting the options-hash from the command line argument.
253 1         2 our $optionHashRef = shift;
254            
255             # Getting the options hash from its reference.
256 1         6 our %optionsHash = %$optionHashRef;
257            
258             # OptionsHash supports the following options:
259             # 1. labelFile:
260             # Name of the file containing the labels from sense cluster. The syntax of file
261             # must be similar to label file from SenseClusters. This is the mandatory option.
262             #
263             # 2. labelKeyFile:
264             # Name of the file containing the comma separated actual topics (keys) for the
265             # clusters. This is the mandatory option.
266             #
267             # 3. labelKeyLength:
268             # This parameter specifies the length of data to be fetched from Wikipedia
269             # which will be used as reference data. Default is the first section of the
270             # Wikipedia page.
271             #
272             # 4. weightRatio:
273             # This ratio tells us about how much the weight we should provide to Discriminating
274             # label to that of the descriptive label. Default value is set to 10.
275             #
276             # 5. stopList:
277             # This is the name of file which contains the list of all stop words. This is the
278             # optional parameter.
279             #
280             # 6. isClean:
281             # This option tells us whether to keep temporary files or not. The variable
282             # $isClean is initialized to 0, so the effective default is false.
283             #
284             # 7. verbose:
285             # This option will let you see detailed output. Default value is false.
286             #
287             # 8. help :
288             # This option will show the details about running this module. This is the
289             # optional parameter.
290             #
291            
292            
293             # 1. labelFile
294 1 50       5 if($optionsHash{"labelFile"}){
295 1         4 $senseClusterLabelFileName = $optionsHash{"labelFile"};
296             }else{
297             # display here the help .TODO, write here properly.
298 0         0 print STDERR "Please type help to see how to run the program!";
299             }
300            
301             # 2. labelKeyFile
302 1 50       4 if($optionsHash{"labelKeyFile"}){
303 1         3 $topicsFileName = $optionsHash{"labelKeyFile"};
304             }else{
305             # display here the help .TODO, write here properly.
306 0         0 print STDERR "Please type help to see how to run the program!";
307             }
308              
309             # 3. Weight ratio.
310 1 50       5 if($optionsHash{"weightRatio"}){
311 0         0 $weightRatio = $optionsHash{"weightRatio"};
312             }
313            
314             # 4. Label key length.
315 1 50       7 if($optionsHash{"labelKeyLength"}){
316 0         0 $labelKeyLength = $optionsHash{"labelKeyLength"};
317             }
318            
319             # 5. Setting the option which contains the location for file that contains the stop
320             # words list.
321 1 50       5 if($optionsHash{"stopList"}){
322 0         0 $stopListFileLocation = $optionsHash{"stopList"};
323             }
324            
325            
326             # 6. Setting the option whether to delete or keep the temporary files.
327 1 50       5 if($optionsHash{"isClean"}){
328 0         0 $isClean = $optionsHash{"isClean"};
329             }
330            
331             # 7. Setting the detailed debug option using the user input.
332 1 50       4 if($optionsHash{"verbose"}){
333 0         0 $isDecisionMatrixDebugOn = $optionsHash{"verbose"};
334             }
335            
336             # 8. Setting the option whether to display help or not using the user input.
337 1 50       6 if($optionsHash{"help"}){
338 0         0 $help = $optionsHash{"help"};
339             }
340            
341             # Checking if the Label file's name is provided by user.
342 1 50       4 if(!defined $senseClusterLabelFileName){
343             # Close the file handle.
344 0         0 close ($outFileHandle);
345            
346             # If no argument is passed then return from here. This is the place
347             # where we can ask user to print help.
348 0         0 print "Type 'LabelEvaluation help' for usage.";
349            
350             # Return the error code which indicates insufficient argument.
351 0         0 return 2;
352             }
353            
354             # Checking if the topic (label key) file's name is provided by user.
355 1 50       3 if(!defined $topicsFileName){
356             # Close the file handle.
357 0         0 close ($outFileHandle);
358            
359             # If no argument is passed then return from here. This is the place
360             # where we can ask user to print help.
361 0         0 print "Type 'LabelEvaluation help' for usage.";
362            
363             # Return the error code which indicates insufficient argument.
364 0         0 return 2;
365             }
366            
367              
368             =pod
369              
370             =head1 Help
371             --------------------
372              
373             The LabelEvaluation module expects the 'OptionsHash' as the required argument.
374             The 'OptionsHash' has the following elements:
375            
376             1. labelFile:
377             Name of the file containing the labels from sense cluster. The syntax of file
378             must be similar to label file from SenseClusters. This is the mandatory option.
379            
380             2. labelKeyFile:
381             Name of the file containing the comma separated actual topics (keys) for the
382             clusters. This is the mandatory option.
383            
384             3. labelKeyLength:
385             This parameter specifies the length of data to be fetched from Wikipedia
386             which will be used as reference data. Default is the first section of the
387             Wikipedia page.
388            
389             4. weightRatio:
390             This ratio tells us about how much the weight we should provide to Discriminating
391             label to that of the descriptive label. Default value is set to 10.
392            
393             5. stopList:
394             This is the name of file which contains the list of all stop words. This is the
395             optional parameter.
396            
397             6. isClean:
398             This option tells us whether to keep temporary files or not. The module
399             initializes it to 0 (false).
400            
401             7. verbose:
402             This option will let you see detailed output. Default value is false.
403            
404             8. help :
405             This option will show the details about running this module. This is the
406             optional parameter.
407            
408             %inputOptions = (
409            
410             labelFile => '/',
411             labelKeyFile => '/',
412             labelKeyLength=> '',
413             weightRatio=> '',
414             stopList=> '/',
415             isClean=> 1,
416             verbose=> 1,
417             help=> 'help'
418             );
419            
420             =cut
421            
422 1 50       5 if($help){
423 0         0 print "\nPlease pass the options-hash in following format:
424             %inputOptions = (
425              
426             labelFile => '/',
427             labelKeyFile => '/',
428             labelKeyLength=> '',
429             weightRatio=> '',
430             stopList=> '/',
431             isClean=> 1,
432             verbose=> 1,
433             help=> 'help'
434             );
435              
436             Note that only 'labelFile' and 'labelKeyFile' are mandatory options.
437             For example, please refer the SYNOPSIS section of the LabelEvaluation Module.\n";
438              
439 0         0 return 3;
440             }
441            
442            
443             # Calling the function "readLinesFromClusterFile"
444 1         10 our $labelSenseClustersHashRef_Global =
445             Text::SenseClusters::LabelEvaluation::ReadingFilesData::readLinesFromClusterFile(
446             $senseClusterLabelFileName,\%labelSenseClustersHash_Global);
447              
448             # Getting the Hash from its reference.
449 1         5 %labelSenseClustersHash_Global = %$labelSenseClustersHashRef_Global;
450              
451             # Calling readLinesFromTopicFile function to get the list of all the topics.
452 1         8 our $standardTermsGlobal =
453             Text::SenseClusters::LabelEvaluation::ReadingFilesData::readLinesFromTopicFile($topicsFileName);
454              
455             # Calling makeDecisionOfSense() function to get the final decision.
456 1         7 my $score = makeDecisionOfSense(\%labelSenseClustersHash_Global,
457             $standardReferenceName_Global, $standardTermsGlobal, $stopListFileLocation);
458              
459             #print $outFileHandle "\nScore:: $score";
460              
461             # Returning the overall score given by this module for labels.
462 1         10 return "$score";
463             }
464              
465              
466              
467             #########################################################################################################
468             =head1 function: makeDecisionOfSense
469              
470             This function will do the evaluation of labels.
471            
472             @argument1 : LabelSenseClusters DataType(Reference to HashOfHash)
473              
474             @argument2 : StandardReferenceName: DataType(String)
475             Name of the external application.
476             Currently, its two possible values are:
477             1. Wikipedia
478             2. WordNet
479            
480             @argument3 : StandardTerms: DataType(String)
481             Terms(comma separated) to be sent to Wikipedia or Wordnet for
482             getting the Gold Standard Labels.
483            
484             @return : Score : DataType(Float)
485             Indicates the measure of overlap of current label mechanisms
486             with the Gold Standard Labels.
487              
488            
489             @description :
490             1). It will go through the Hash which contains the clusters and label terms.
491             2). Each cluster's label terms will be written to a file whose name will be
492             same as of cluster name(or number).
493             3). Then, this will go through the Standard terms against which we have to
494             compare the cluster labels.
495             4). We will then create the files with name of the terms and content of the
496             file will be data fetched from the Wikipedia against a topic.
497             5). Then, cluster's data and topic's data are compared using the method
498             from Text::Similarity::Overlaps.
499             6). Finally the calculated scores are used further for decision matrix and
500             getting the final score value.
501              
502             =cut
503             #######################################################################################################
504             sub makeDecisionOfSense{
505            
506             # Reference of Hash containing the clusters and their corresponding labels.
507 1     1 0 2 my $labelSenseClustersHashRef = shift;
508              
509             # Getting the Hash from its reference.
510 1         4 my %labelSenseClustersHash = %$labelSenseClustersHashRef;
511            
512             # Getting the Name of the external application to lookup from the Argument.
513 1         3 my $standardReferenceName = shift;
514            
515             # Terms to be sent to the external application for getting the Gold
516             # Standard Labels.
517 1         1 my $standardTerms = shift;
518            
519             # Getting the Stop List file location from the argument.
520 1         3 my $stopListFileLocation = shift;
521              
522             # Array to hold the file names for all the clusters.
523 1         3 my @fileNameForClustersArray = ();
524            
525             # Array for holding the name of all the clusters.
526 1         2 my @clusterNameArray = ();
527              
528             # Hash which will hold the score of Topic against a Cluster and its scoring value.
529 1         2 my %hashForClusterTopicScore = ();
530              
531             # Iterating through the Hash which contains the clusters name and its labels
532             # as assigned by sense cluster.
533 1         6 foreach my $sortedOuterKey (sort keys %labelSenseClustersHash){
534            
535             # Open the file handle with Write mode.
536 2         130 open (CLUSTERFILE, ">temp_$sortedOuterKey.txt");
537            
538             # Storing the name of the file (for a cluster data) in the array,
539 2         9 push(@fileNameForClustersArray, "temp_$sortedOuterKey.txt");
540            
541             # Storing the cluster name in the clusterNameArray.
542 2         5 push(@clusterNameArray, $sortedOuterKey);
543            
544             # Iterating through the type-of-Labels to fetch the value Of the Hash.
545 2         3 foreach my $sortedInnerKey (sort keys %{$labelSenseClustersHash{$sortedOuterKey}}){
  2         10  
546              
547             # Writing the label terms into the cluster's temporary file.
548 4         42 print CLUSTERFILE "\n$labelSenseClustersHash{$sortedOuterKey}{$sortedInnerKey}";
549             }
550             # Close the file handle.
551 2         76 close (CLUSTERFILE);
552             }
553              
554              
555             # Splitting the standard terms on "," to get the topic names.
556             # For e.g: "Bill Clinton , Tony Blair"
557 1         7 my @standardTermsArray = split(/[\,]/, $standardTerms);
558            
559             # Defining the array for holding the name of the files.
560 1         4 my @standardTermsFileArray = ();
561            
562             # 1. Going through the terms against which we have to compare the cluster labels.
563             # 2. We will create the files with name of the terms.
564             # 3. Content of the files will be data fetched from the Wikipedia against a topic.
565             # 4. Finally, storing the name of the newly created files into Array for further
566             # similarity comparison.
567 1         2 foreach my $sortedKey (@standardTermsArray){
568 2         15 push(@standardTermsFileArray,
569             Text::SenseClusters::LabelEvaluation::Wikipedia::GetWikiData::getWikiDataForTopic($sortedKey));
570             }
571              
572             # Iterating through the ClusterFiles against TopicFiles to get the similarity value.
573 1         4 foreach my $clusterFileName (@fileNameForClustersArray){
574 2         27 foreach my $topicFileName (@standardTermsFileArray){
575            
576             # Calling the "computeOverlappingScores" to get the similarity score and
577             # store it into hash.
578 4         77 $hashForClusterTopicScore{$clusterFileName}{$topicFileName}
579             = Text::SenseClusters::LabelEvaluation::SimilarityScore::computeOverlappingScores(
580             $clusterFileName,$topicFileName, $stopListFileLocation);
581             }
582             }
583              
584             # Defining the Reference for the hash, %topicTotalSumHash.
585 1         26 my $topicTotalSumHashRef;
586            
587             # Defining the Reference for the hash, %clusterTotalSumHash.
588             my $clusterTotalSumHashRef;
589              
590             # Calling the function to print the decision matrix, based on the above similarity score.
591 1         8 ($topicTotalSumHashRef,$clusterTotalSumHashRef)=
592             printDecisionMatrix(\@clusterNameArray, \@standardTermsArray, \%hashForClusterTopicScore);
593              
594             # Getting the Hash from its references.
595 1         5 my %topicTotalSumHash = %$topicTotalSumHashRef;
596 1         4 my %clusterTotalSumHash = %$clusterTotalSumHashRef;
597              
598              
599             # Calling the function to print the newly calculated decision matrix, based on the
600             # above decision matrix.
601 1         9 my $score =
602             Text::SenseClusters::LabelEvaluation::ConfusionMatrixTotalCalc::printCalculatedScoreMatrix(
603             $outFileHandle, \@clusterNameArray, \@standardTermsArray,
604             \%hashForClusterTopicScore,\%topicTotalSumHash ,\%clusterTotalSumHash,
605             $isDecisionMatrixDebugOn);
606              
607              
608            
609             # Deleting all the temporary topic files at end of operation.
610 1         3 foreach my $topicFileName(@standardTermsFileArray){
611 2 50       282 unlink $topicFileName or warn "Could not unlink $topicFileName: $!";
612             }
613              
614             # Deleting all the temporary clusters files at end of operation.
615 1         3 foreach my $clusterFileName(@fileNameForClustersArray){
616 2 50       122 unlink $clusterFileName or warn "Could not unlink $clusterFileName: $!";
617             }
618              
619              
620 1         11 return $score;
621             }
622              
623              
624              
625             #########################################################################################################
626             =pod
627              
628             =head1 function: printDecisionMatrix
629              
630             This function is responsible for printing the decision matrix.
631            
632             @argument1 : clusterNameArrayRef: DataType(Reference_Of_Array)
633             Reference to Array containing Cluster Name.
634            
635             @argument2 : standardTermsArrayRef: DataType(Reference_Of_Array)
636             Reference to Array containing Standard terms.
637            
638             @argument3 : hashForClusterTopicScoreRef: DataType(Reference_Of_Hash)
639             Reference to hash containing Cluster Name, corresponding
640             StandardTopic and its score.
641            
642              
643             @return1 : topicTotalSumHash: DataType(Reference_Of_Hash)
644             Hash which will contain the total score for a topic
645             against each cluster.
646            
647             @return2 : clusterTotalSumHash: DataType(Reference_Of_Hash)
648             Hash which will contain the total score for a cluster
649             against each topic.
650            
651              
652              
653             @description :
654             1). It will go through the Hash which contains the similarity score for
655             each clusters against standard label terms.
656             2). This uses the above hash to print the decision matrix. Below has the
657             example of the decision matrix.
658             3). It will also use the ScoringHash to get new hashes which will store
659             a) total score for a cluster against each topics.
660             b) total score for a topic against each cluster.
661              
662              
663             Example of decision Matrix
664              
665             ==============================================================================
666             | Cluster0 | Cluster1
667             ------------------------------------------------------------------------------
668             Bill Clinton: | 11 | 12 | 23(ROW TOTAL)
669             ------------------------------------------------------------------------------
670             ------------------------------------------------------------------------------
671             Tony Blair: | 15 | 9 | 24 (ROW TOTAL)
672             ------------------------------------------------------------------------------
673             Total | 26 | 21 | 47
674             | (COL TOTAL) | (COL TOTAL) | (Total Matrix Sum)
675              
676              
677             Where, 1) Cluster0, Cluster1 are Cluster Names.
678             2) Bill Clinton, Tony Blair are Standard Topics.
679             3) 23, 24 are Row Total of the Topic score. (ROW TOTAL)
680             4) 26, 21 are Col Total of the ClusterName Score. (COL TOTAL)
681             5) 47 is Total sum of the scores of all clusters against all topics.
682             (Total Matrix Sum)
683              
684              
685             =cut
686             #######################################################################################################
687              
# printDecisionMatrix(\@clusterNames, \@standardTerms, \%clusterTopicScore)
#
# Computes the row (per-topic) and column (per-cluster) totals of the
# cluster/topic similarity scores and, when $isDecisionMatrixDebugOn is 1,
# prints the decision matrix to $outFileHandle.
#
# Arguments:
#   1. Reference to an array of cluster names (used only for the printed
#      column headers).
#   2. Reference to an array of standard terms (topics). The array is read
#      only; it is not modified.
#   3. Reference to a hash of hashes: {outerKey}{"temp_<topic>.txt"} => score.
#
# Returns a two-element list:
#   1. Reference to a hash mapping "temp_<topic>.txt" to the topic's row total.
#   2. Reference to a hash mapping each outer key of the score hash to its
#      column total.
#
# $isDecisionMatrixDebugOn and $outFileHandle are file-level variables
# declared outside this subroutine.
sub printDecisionMatrix{

    my ($clusterNameArrayRef, $standardTermsArrayRef, $hashForClusterTopicScoreRef) = @_;

    # Shallow copy, so that any autovivification caused by the look-ups
    # below never touches the caller's top-level hash.
    my %scoreFor = %$hashForClusterTopicScoreRef;

    # Columns are always visited in sorted outer-key order.
    my @sortedClusterKeys = sort keys %scoreFor;

    # Evaluate the debug switch once; it does not change during the call.
    my $debugOn = (defined $isDecisionMatrixDebugOn && $isDecisionMatrixDebugOn == 1);

    # Hash which will contain the topics and their total score.
    my %topicTotalSumHash = ();

    # Hash which will contain the clusters and their total score.
    my %clusterTotalSumHash = ();

    # Sum of every cell of the decision matrix (only printed, never returned).
    my $totalDecisionMatrixSum = 0;

    # Title and column headers of the decision matrix table.
    if ($debugOn) {
        print $outFileHandle "\nDECISION MATRIX(Count)::";
        print $outFileHandle "\n==============================================================".
            "=================================================================================\n\t\t";

        # NOTE(review): headers follow the order of the cluster-name array,
        # while the cells below follow the sorted hash keys - confirm that
        # both orders agree for all callers.
        foreach my $clusterName (@$clusterNameArrayRef) {
            print $outFileHandle "\t|\t $clusterName";
        }
    }

    # One matrix row per standard term.
    foreach my $topicName (@$standardTermsArrayRef) {

        # Human-readable label: collapse runs of white space and trim both ends.
        my $topicNameLabel = $topicName;
        $topicNameLabel =~ s/\s+/ /g;
        $topicNameLabel =~ s/^\s+|\s+$//g;

        # Row header of the decision matrix table.
        if ($debugOn) {
            print $outFileHandle "\n--------------------------------------------------------------".
                "----------------------------------------------------------------------------------";
            print $outFileHandle "\n\t$topicNameLabel:";
        }

        # The score hash is keyed by the topic's file name, not by its label.
        # A separate variable is used so the loop variable (an alias of the
        # caller's array element) is never overwritten.
        my $topicKey = $topicNameLabel;
        $topicKey =~ s/\s+/_/g;
        $topicKey = "temp_$topicKey.txt";

        # Total of this row.
        my $rowSum = 0;

        foreach my $clusterKey (@sortedClusterKeys) {

            # A cluster without a score for this topic counts as 0 instead
            # of producing "uninitialized value" warnings and an empty cell.
            my $score = $scoreFor{$clusterKey}{$topicKey};
            $score = 0 unless defined $score;

            # Similarity value between cluster and topic.
            if ($debugOn) {
                print $outFileHandle "\t|\t\t"
                    ."$score ";
            }

            $rowSum += $score;
        }

        # Storing the row total under the topic's file-name key.
        $topicTotalSumHash{$topicKey} = $rowSum;

        if ($debugOn) {
            print $outFileHandle "\t|\t\t $rowSum";
            print $outFileHandle "\n--------------------------------------------------------"
                ."----------------------------------------------------------------------------------------";
        }
    }

    if ($debugOn) {
        print $outFileHandle "\n\tTotal\t";
    }

    # Column totals: every score stored for a cluster is added up, in sorted
    # inner-key order so the summation order stays deterministic.
    foreach my $clusterKey (@sortedClusterKeys) {

        my $colSum = 0;

        foreach my $innerKey (sort keys %{$scoreFor{$clusterKey}}) {
            my $score = $scoreFor{$clusterKey}{$innerKey};
            $colSum += defined $score ? $score : 0;
        }

        if ($debugOn) {
            print $outFileHandle "\t|\t\t$colSum";
        }

        # Storing the column total for this cluster.
        $clusterTotalSumHash{$clusterKey} = $colSum;

        # Totaling the score of all the clusters against all the topics.
        $totalDecisionMatrixSum += $colSum;
    }

    if ($debugOn) {
        print $outFileHandle "\t|\t\t$totalDecisionMatrixSum";
        print $outFileHandle "\n============================================================="
            ."==================================================================================\n";
    }

    # Returning the per-topic totals and the per-cluster totals.
    return (\%topicTotalSumHash,\%clusterTotalSumHash);
}
834              
835              
836              
837              
838              
839             #######################################################################################################
840             =pod
841              
842             =head1 BUGS
843              
844             =over
845              
846             =item * Currently supports input of label and topic values only through files; it should also accept them as string values.
847              
848             =item * Currently not supporting the WordNet gold standards comparison.
849              
850             =back
851              
852             =head1 SEE ALSO
853              
854             http://senseclusters.cvs.sourceforge.net/viewvc/senseclusters/LabelEvaluation/
855            
856            
857             @Last modified by : Anand Jha
858             @Last_Modified_Date : 24th Dec. 2012
859             @Modified Version : 1.15
860              
861              
862             =head1 AUTHORS
863              
864             Ted Pedersen, University of Minnesota, Duluth
865             tpederse at d.umn.edu
866              
867             Anand Jha, University of Minnesota, Duluth
868             jhaxx030 at d.umn.edu
869              
870              
871              
872             =head1 COPYRIGHT AND LICENSE
873              
874             Copyright (C) 2012 Ted Pedersen, Anand Jha
875              
876             See http://dev.perl.org/licenses/ for more information.
877              
878             This program is free software; you can redistribute it and/or modify
879             it under the terms of the GNU General Public License as published by
880             the Free Software Foundation; either version 2 of the License, or
881             (at your option) any later version.
882              
883             This program is distributed in the hope that it will be useful,
884             but WITHOUT ANY WARRANTY; without even the implied warranty of
885             MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
886             GNU General Public License for more details.
887              
888             You should have received a copy of the GNU General Public License
889             along with this program; if not, write to:
890            
891            
892             The Free Software Foundation, Inc., 59 Temple Place, Suite 330,
893             Boston, MA 02111-1307 USA
894            
895            
896             =cut
897             #######################################################################################################
898              
899             1;
900              
901             __END__