File Coverage

blib/lib/Text/SenseClusters/LabelEvaluation/Driver.pm
Criterion Covered Total %
statement 162 224 72.3
branch 41 76 53.9
condition n/a
subroutine 11 16 68.7
pod 2 9 22.2
total 216 325 66.4


line stmt bran cond sub pod time code
1              
2             # Defining the Package for the modules.
3             package Text::SenseClusters::LabelEvaluation::Driver;
4              
5 4     4   7898 use strict;
  4         9  
  4         155  
6 4     4   24 use encoding "utf-8";
  4         7  
  4         36  
7              
8             # Defining the version for the Program.
9             our $VERSION = '0.09';
10              
11             # Including the FileHandle module.
12 4     4   13780 use FileHandle;
  4         53895  
  4         25  
13              
14             # Including the other dependent Modules.
15 4     4   4217 use Text::SenseClusters::LabelEvaluation::ReadingFilesData;
  4         10  
  4         1017  
16 4     4   185 use Text::SenseClusters::LabelEvaluation::SimilarityScore;
  4         10  
  4         1560  
17 4     4   189 use Text::SenseClusters::LabelEvaluation::Wikipedia::GetWikiData;
  4         5  
  4         856  
18 4     4   39 use Text::SenseClusters::LabelEvaluation::AssigningLabelUsingHungarianAlgo;
  4         9  
  4         661  
19              
20              
21              
22             #######################################################################################################################
23              
24             =head1 Name
25              
26             Text::SenseClusters::LabelEvaluation::Driver - Module for evaluation of labels of the clusters.
27              
28             =head1 SYNOPSIS
29              
30              
31             The following code snippet will evaluate the labels by comparing
32             them with text data for a gold-standard key from Wikipedia.
33            
34             In order to test this module, please copy 'TestData' folder in current directory
35             or adjust directory location while mentioning the label and GoldKeys files.
36              
37             # Including the LabelEvaluation Module.
38             use Text::SenseClusters::LabelEvaluation::Driver;
39            
40             my $labelFileName = 'TestData/TVS/TVS.label';
41             my $topicFileName = 'TestData/TVS/TVSTopic.txt';
42            
43             # Calling the LabelEvaluation modules by passing the following options
44             my %inputOptions = (
45             senseClusterLabelFileName => $labelFileName,
46             labelComparisonMethod => 'automate',
47             goldKeyFileName => $topicFileName,
48             goldKeyDataSource => 'wikipedia',
49             weightRatio => 10,
50             isClean => 1,
51             );
52            
53            
54             # Calling the LabelEvaluation modules by passing the name of the
55             # label and topic files.
56             my $driverObject = Text::SenseClusters::LabelEvaluation::Driver->
57             new (\%inputOptions);
58            
59             if($driverObject->{"errorCode"}){
60             print "Please correct the error before proceeding.\n\n";
61             exit();
62             }
63             my $accuracyScore = $driverObject->evaluateLabels();
64              
65             # Printing the score.
66             print "\n\nScore of label evaluation is :: $accuracyScore \n";
67            
68            
69             Note: For more usage, please refer to test-cases in "t" folder of this package.
70            
71             =head1 DESCRIPTION
72              
73             This Program will compare the result obtained from the SenseClusters with that
74             of Gold Standards. Gold Standards can be obtained from:
75             1. Wikipedia
76             2. Wordnet
77             3. User Provided
78            
79             For fetching the Wikipedia data it uses the WWW::Wikipedia module from the CPAN
80             and for comparison of Labels with Gold Standards it uses the Text::Similarity
81             Module. The comparison result is then further processed to obtain the result
82             and score of result.
83              
84              
85              
86            
87             =head1 FILE FORMATS:
88              
89             =head2 senseClusterLabelFileName:
90            
91             This names the file that contains the labels for the clusters generated by SenseClusters.
92             The format of this file should be the same as that generated by SenseClusters.
93            
94             For e.g:
95            
96             Cluster 0 (Descriptive): George Bush, Russian President, British Prime, British Minister, India Pakistan, US George, Prime Minister,
97             Cluster 0 (Discriminating): Russian President, British Minister, India Pakistan, US George,
98             Cluster 1 (Descriptive): George Bush, British Prime, weapons mass, United Nations, September 11, mass destruction, United States,
99             Prime Minister, military action
100             Cluster 1 (Discriminating): United Nations, September 11, United States
101             Cluster 2 (Descriptive): George Bush, weapons destruction, prime minister, axis evil, Saddam Hussein, weapons mass, mass destruction,
102             Gulf War, military action, Iraqi leader
103             Cluster 2 (Discriminating): weapons destruction, prime minister, axis evil, Saddam Hussein, Gulf War, Iraqi leader
104            
105              
106             =head2 goldKeyFileName:
107              
108             This parameter contains the name of the file that contains the gold standard keys for the labels of clusters generated by
109             SenseClusters.
110            
111             The format of the user-provided Gold-Standard key file depends on the following
112             two parameters that the user passes when calling this module:
113            
114             =head3 labelComparisonMethod
115            
116             This parameter tells whether the user is passing the mapping information between
117             goldkeys and clusters or not.
118            
119             Two options available are: 1. 'direct' - this says user will provide the mapping info.
120             2. 'automate' - this says module should find the best possible
121             mapping between cluster's label and goldkeys.
122            
123             =head3 goldKeyDataSource
124            
125             This parameter tells this module from where it can read more information about
126             the goldkeys
127            
128             Options for this parameter are: 1. 'wikipedia' - this tells to fetch data from wikipedia.
129             2. 'wordnet' - this tells to fetch data from wordnet.
130             3. 'userData' - this tells user will give the data along
131             with mapping.
132            
133            
134            
135             Combinations of the various values for the above two parameters will give the following six cases:
136              
137             (Please note that the separator between cluster name and Goldkeys is ":::".
138             Also, the separator between Goldkeys and their data is ":::")
139            
140             =head4 Case 1. labelComparisonMethod => 'direct', goldKeyDataSource => 'userData'
141            
142              
143             a) In this case user should provide the mapping between the clusters and Goldkeys
144             b) User should also provide the data about these goldstandard keys.
145            
146             for e.g:
147            
148             Cluster0:::Tony Blair
149             Cluster1:::Vladimir Putin
150             Cluster2:::Saddam Hussein
151              
152             Tony Blair::: Anthony Charles Lynton Blair (born 6 May 1953)[1] is a British Labour Party politician who served
153             as the Prime Minister of the United Kingdom from 1997 to 2007. He was the Member of Parliament (MP) for Sedgefield
154             from 1983 to 2007 and Leader of the Labour Party from 1994 to 2007. He resigned from all of these positions in
155             June 2007.
156            
157             Vladimir Putin::: Vladimir Vladimirovich Putin (Russian: ( listen); born 7 October 1952) is a Russian politician
158             who has been the President of Russia since 7 May 2012. Putin previously served as President from 2000 to 2008, and
159             as Prime Minister of Russia from 1999 to 2000 and again from 2008 to 2012. Putin was also previously the Chairman
160             of United Russia.
161            
162             Saddam Hussein::: Saddam Hussein Abd al-Majid al-Tikriti 28 April 1937[2] – 30 December 2006)[3] was the fifth
163             President of Iraq, serving in this capacity from 16 July 1979 until 9 April 2003.[4][5] A leading member of the
164             revolutionary Arab Socialist Ba'ath Party.
165              
166             =head4 Case 2. labelComparisonMethod => 'direct', goldKeyDataSource => 'wikipedia'
167            
168             a) In this case the user just needs to provide the mapping between the clusters and Goldkeys.
169             b) The user does not need to provide the data about these goldstandard keys. Even if the user provides the
170             data about these topics, it will be ignored.
171            
172            
173             for e.g:
174             Cluster0:::Tony Blair
175             Cluster1:::Vladimir Putin
176             Cluster2:::Saddam Hussein
177            
178            
179             =head4 Case 3. labelComparisonMethod => 'direct', goldKeyDataSource => 'wordnet'
180              
181             a) In this case also the user just needs to provide the mapping between the clusters and Goldkeys.
182             b) The user does not need to provide the data about these goldstandard keys.
183            
184             for e.g:
185             Cluster0:::Tony Blair
186             Cluster1:::Vladimir Putin
187             Cluster2:::Saddam Hussein
188            
189            
190             =head4 Case 4. labelComparisonMethod => 'automate', goldKeyDataSource => 'userData'
191            
192             a) No Mapping between the clusters and Goldkeys.
193             b) User will just need to provide the data about these goldstandard keys.
194            
195            
196             for e.g:
197             Tony Blair::: Anthony Charles Lynton Blair (born 6 May 1953)[1] is a British Labour Party politician who served
198             as the Prime Minister of the United Kingdom from 1997 to 2007. He was the Member of Parliament (MP) for Sedgefield
199             from 1983 to 2007 and Leader of the Labour Party from 1994 to 2007. He resigned from all of these positions in
200             June 2007.
201            
202             Vladimir Putin::: Vladimir Vladimirovich Putin (Russian: ( listen); born 7 October 1952) is a Russian politician
203             who has been the President of Russia since 7 May 2012. Putin previously served as President from 2000 to 2008, and
204             as Prime Minister of Russia from 1999 to 2000 and again from 2008 to 2012. Putin was also previously the Chairman
205             of United Russia.
206            
207             Saddam Hussein::: Saddam Hussein Abd al-Majid al-Tikriti 28 April 1937[2] – 30 December 2006)[3] was the fifth
208             President of Iraq, serving in this capacity from 16 July 1979 until 9 April 2003.[4][5] A leading member of the
209             revolutionary Arab Socialist Ba'ath Party.
210            
211            
212             =head4 Case 5. labelComparisonMethod => 'automate', goldKeyDataSource => 'wikipedia'
213            
214             a) No Mapping between the clusters and Goldkeys.
215             b) User will just need to provide the comma separated goldstandard keys.
216            
217             for e.g:
218             Tony Blair , Vladimir Putin, Saddam Hussein
219            
220            
221            
222             =head4 Case 6. labelComparisonMethod => 'automate', goldKeyDataSource => 'wordnet'
223              
224             a) No Mapping between the clusters and Goldkeys.
225             b) User will just need to provide the comma separated goldstandard keys.
226              
227            
228             for e.g:
229             Tony Blair , Vladimir Putin, Saddam Hussein
230              
231              
232             Sample files for all the cases are included in 'TestData' of the modules.
233            
234             1. TestData/TVS/TVS.label- Files containing the Labels generated by SenseClusters.
235              
236             2. TestData/TVS/TVSMappingUserData.txt - File containing GoldKeys, their mapping with clusters and detailed data about the GoldKeys.
237              
238             3. TestData/TVS/TVSMapping.txt - File containing GoldKeys, their mapping with clusters.
239              
240             4. TestData/TVS/TVSTopic.txt - File containing only the comma separated GoldKeys (no mapping with clusters).
241              
242             5. TestData/TVS/TVSUserData.txt - File containing the GoldKeys and user provided detailed data about these gold keys.
243              
244             6. TestData/TVS/testTVS.pl - Perl test file which tells us, how to use these files in various scenarios.
245            
246            
247             =head1 RESULT
248              
249              
250             =head4 a) Contingency Matrix:
251             Based on the similarity comparison of Labels with the gold standards,
252             the Contingency Matrix is generated. Following shows an example of
253             contingency matrix for the example mentioned in synopsis:
254              
255              
256             Original Contingency Matrix:
257            
258             Bill Clinton Tony Blair
259             -------------------------------------------------
260             Cluster0 54 48
261             -------------------------------------------------
262             Cluster1 31 16
263             -------------------------------------------------
264            
265             =head4 b) Using Hungarian algorithm to display the new contingency matrix,
266             whose diagonal elements indicates the assigned similarity-score
267             between a cluster and a gold-standard key. This format of matrix
268             has the maximum possible diagonal's total.
269            
270             Example:
271            
272             Contigency Matrix after Hungarian Algorithm:
273            
274             Tony Blair Bill Clinton
275             -------------------------------------------------
276             Cluster0 48 54
277             -------------------------------------------------
278             Cluster1 16 31
279             -------------------------------------------------
280            
281              
282             =head4 c) Conclusion: Displays the conclusion of the Hungarian algorithm:
283            
284             Example:
285            
286             Final Conclusion using Hungarian Algorithm::
287             Cluster0 <--> Tony Blair
288             Cluster1 <--> Bill Clinton
289            
290            
291             =head4 d) Displaying the overall accuracy for the label assignment:
292            
293             Sum (Diagonal Scores)
294             Accuracy = -------------------------------------------
295             Sum (All the Scores of contingency table)
296            
297             Example:
298             Accuracy of labels is 53.02%
299             =cut
300              
301             ################################################################################################################
302              
303             =pod
304              
305             =head1 Help
306              
307             The LabelEvaluation module expects the 'OptionsHash' as the required argument.
308              
309             The 'optionHash' has the following elements:
310            
311             =head2 labelFile:
312              
313             Name of the file containing the labels from SenseClusters. The syntax of file
314             must be similar to label file from SenseClusters. This is mandatory parameter.
315            
316             =head2 labelComparisonMethod:
317              
318             Name of the method for comparing the labels with GoldKey. This method tells
319             the program whether the keyFile provided by the User will have the mapping
320             between the assigned labels and expected topics of the clusters.
321              
322             Possible options are :
323             A) 'DirectAssignment' and
324             B) 'AutomateAssignment'.
325            
326             This is mandatory parameter.
327              
328             =head2 goldKeyFile:
329              
330             Name of the file containing the actual topics (keys) and their data for the
331             clusters. This is mandatory parameter.
332            
333             =head2 goldKeyLength:
334              
335             This parameter tells about the length of data to be fetched from the external
336             resource such as Wikipedia. The data will be used as reference data.
337             Default value for this parameter is the first section of the Wikipedia page.
338              
339             =head2 goldKeyDataSource:
340              
341             This parameter tells the name of external application or user supplied file
342             name from where we will get the key's data.
343              
344             Options are:
345             1. 'Wikipedia'
346             2. 'User'
347             3. 'Wordnet' (Will be supported in future).
348            
349             This is the mandatory parameter.
350            
351            
352             =head2 weightRatio:
353            
354             This ratio tells us about the weightage we should provide to Discriminating
355             label over the descriptive label. Default value is set to 10.
356            
357             =head2 stopList:
358              
359             This is the name of file which contains the list of all stop words. This is the
360             optional parameter and its formatting should match the requirement of
361             Text::Similarity, i.e. a single stop word in a single line.
362              
363             for e.g:
364             Content of stoplist.txt should look like:
365             the
366             of
367             in
368             :
369             :
370             to
371            
372             =head2 isClean:
373              
374             This variable will decide whether to keep or delete temporary files. Default
375             value is 'true'.
376            
377             =head2 verbose:
378            
379             Variable used for deciding whether to show detailed results to user or
380             not. Default value = Off (0), to make it 'On' change value to 1.
381              
382             =head2 help :
383            
384             This variable will decide whether to display help to user or not. Default
385             value for this parameter is 0.
386            
387             %inputOptions = (
388             senseClusterLabelFileName => '/',
389             labelComparisonMethod => 'DirectAssignmentOrAutomateAssignment',
390             goldKeyFileName => '/',
391             goldKeyLength => '',
392             goldKeyDataSource => '',
393             weightRatio => '',
394             stopListFileLocation => '/',
395             isClean => 1,
396             verbose => 0,
397             help => 0
398             );
399              
400              
401             =head3 Examples
402              
403             =head4 With minimum parameters:
404              
405             %inputOptions = (
406             senseClusterLabelFileName => 'labelFile.txt',
407             labelComparisonMethod => 'DirectAssignment',
408             goldKeyFileName => 'goldKeyFile.txt',
409             goldKeyDataSource => 'UserData'
410             );
411            
412             The above mentioned four parameters are mandatory.
413            
414             =head4 For Help:
415              
416             %inputOptions = (
417             help => 1
418             );
419            
420             =head4 With all parameters:
421              
422             %inputOptions = (
423             senseClusterLabelFileName => 'labelFile.txt',
424             labelComparisonMethod => 'AutomateAssignment',
425             goldKeyFileName => 'goldKeyFile.txt',
426             goldKeyLength => 2000,
427             goldKeyDataSource => 'Wikipedia',
428             weightRatio => 10,
429             stopListFileLocation => 'stoplist.txt',
430             isClean => 1,
431             verbose => 1,
432             help => 0
433             );
434            
435             =cut
436              
437             # Following blocks declare the global variables for the LabelEvaluation module.
438             our $senseClusterLabelFileName = "SenseClusterLabelFileName";
439             our $labelComparisonMethod = "labelComparisonMethod";
440             our $goldKeyFileName = "goldKeyFileName";
441             our $goldKeyLength = "goldKeyLength";
442             our $goldKeyDataSource = "goldKeyDataSource";
443             our $weightRatio = "weightRatio";
444             our $stopListFileLocation = "stopListFileLocation";
445             our $isClean = "isClean";
446             our $verbose = "verbose";
447             our $help = "help";
448              
449             # These three parameters are used for error handling.
450             our $errorCode = "errorCode";
451             our $errorMessage = "errorMessage";
452             our $exitCode = "exitCode";
453              
454             # Defining all the possible values for the label-comparison-method.
455             our $labelComparisonMethod_Direct = "direct";
456             our $labelComparisonMethod_Automate = "automate";
457              
458             # Defining the name of all possible sources from where we can get the information about
459             # the topics. These are the possible values for the parameter "goldKeyDataSource":
460             our $standardReferenceName_Wikipedia = "wikipedia";
461             our $standardReferenceName_WordNet = "wordnet";
462             our $standardReferenceName_UserData = "userdata";
463              
464             our $labelType_Descriptive = "descriptive";
465             our $labelType_Discriminating = "discriminating";
466              
467             # The following define the exit-code for this program in different situation.
468             our $helpExitCode = 400;
469             our $requiredErrorExitCode = 404;
470             our $unknownErrorExitCode = 502;
471             our $missingMappingErrorExitCode = 401;
472             our $missingKeyDataErrorExitCode = 402;
473              
474              
475             # Defining the file handle for the output file.
476             our $outFileHandle;
477              
478             # Defining the exit code for the module with default value 1.
479             # "1" indicates that program exited with proper execution.
480             our $exitCodeValue = 1;
481              
482              
483             ##########################################################################################
484              
485             =head1 Constructor: new()
486              
487             This is the constructor which will create object for this class.
488             Reference : http://perldoc.perl.org/perlobj.html
489              
490             This constructor takes the hash argument and initializes the object from it.
491              
492             %inputOptions = (
493             senseClusterLabelFileName => 'value1',
494             labelComparisonMethod => 'value2',
495             goldKeyFileName => 'value3',
496             goldKeyLength => value4,
497             goldKeyDataSource => 'value5',
498             weightRatio => value6,
499             stopListFileLocation => 'value7',
500             isClean => value8,
501             verbose => value9,
502             help => value10
503             );
504            
505             Please refer to section "help" for the detailed discussion on this hash.
506             =cut
507              
508             ##########################################################################################
509              
510             sub new {
511              
512             # Creating the object.
513 4     4 1 125 my $class = shift;
514 4         14 my $driverObject = {};
515              
516             # Explicit association is created by the built-in bless function.
517 4         15 bless $driverObject, $class;
518              
519             # Getting the Hash as the argument.
520 4         12 my $argHash = shift;
521              
522             # If the argument is defined then, read its contents and populate the class member
523             # values.
524 4 50       22 if ( defined $argHash ) {
525              
526             # Reading the Key and Value from the argument-hash.
527 4         30 while (my ($key, $val ) = each %$argHash ) {
528              
529             # Setting the class variables.
530 24 100       236 if ( lc($key) eq lc($senseClusterLabelFileName)) {
    100          
    100          
    50          
    100          
    100          
    50          
    50          
    0          
    0          
531 4 50       16 if($val){
532 4         42 $driverObject->{$senseClusterLabelFileName} = $val;
533             }else{
534             # Raise Error: Missing mandatory parameter.
535 0         0 $driverObject->{$errorCode} = $requiredErrorExitCode;
536 0         0 $driverObject->{$errorMessage}= "Label file from the SenseClusters is missing!";
537 0         0 error($driverObject->{$errorCode}, $driverObject->{$errorMessage});
538             }
539            
540             } elsif (lc($key) eq lc($labelComparisonMethod)) {
541 4 50       16 if($val){
542 4         24 $driverObject->{$labelComparisonMethod} = lc($val);
543             }else{
544             # Raise Error: Missing mandatory parameter.
545 0         0 $driverObject->{$errorCode} = $requiredErrorExitCode;
546 0         0 $driverObject->{$errorMessage}= "Comparison method for labels and keys is not mentioned!";
547 0         0 error($driverObject->{$errorCode}, $driverObject->{$errorMessage});
548             }
549            
550             } elsif (lc($key) eq lc($goldKeyFileName)) {
551 4 50       19 if($val){
552 4         26 $driverObject->{$goldKeyFileName} = $val;
553             }else{
554             # Raise Error: Missing mandatory parameter.
555 0         0 $driverObject->{$errorCode} = $requiredErrorExitCode;
556 0         0 $driverObject->{$errorMessage}= "Please specify the file name for the GoldKey!";
557 0         0 error($driverObject->{$errorCode}, $driverObject->{$errorMessage});
558             }
559             } elsif ( lc($key) eq lc($goldKeyLength)) {
560 0 0       0 if($val){
561 0         0 $driverObject->{$goldKeyLength} = $val;
562             }
563             } elsif ( lc($key) eq lc($goldKeyDataSource)) {
564 4 50       17 if($val){
565 4         27 $driverObject->{$goldKeyDataSource} = $val;
566             }else{
567             # Raise Error: Missing mandatory parameter.
568 0         0 $driverObject->{$errorCode} = $requiredErrorExitCode;
569 0         0 $driverObject->{$errorMessage}= "Please specify the name of the source from which information about the topic will be feteched!";
570 0         0 error($driverObject->{$errorCode}, $driverObject->{$errorMessage});
571             }
572             } elsif ( lc($key) eq lc($weightRatio)) {
573 4 50       15 if($val){
574 4         153 $driverObject->{$weightRatio} = $val;
575             }else{
576 0         0 $driverObject->{$weightRatio} = 10;
577             }
578             } elsif ( lc($key) eq lc($stopListFileLocation)) {
579 0 0       0 if($val){
580 0         0 $driverObject->{$stopListFileLocation} = $val;
581             }else{
582 0         0 $driverObject->{$stopListFileLocation} = "";
583             }
584             } elsif ( lc($key) eq lc($isClean)) {
585 4 50       15 if($val){
586 4         22 $driverObject->{$isClean} = $val;
587             }else{
588 0         0 $driverObject->{$isClean} = 0;
589             }
590             } elsif ( lc($key) eq lc($verbose)) {
591 0 0       0 if($val){
592 0         0 $driverObject->{$verbose} = $val;
593             }else{
594 0         0 $driverObject->{$verbose} = 0;
595             }
596             } elsif ( lc($key) eq lc($help)) {
597 0 0       0 if($val == 1){
598 0         0 $driverObject->{$exitCode} = help();
599             }else{
600 0         0 $driverObject->{$help} = 0;
601             }
602             }
603             }
604             }
605             # Returning the blessed hash referred to by $driverObject.
606 4         14 return $driverObject;
607             }
608              
609              
610             # Function to print the input parameters of the program.
611             sub printInputParameter {
612 0     0 0 0 my $driverObject = shift;
613 0         0 print "SenseClusterLabelFileName:: $driverObject->{$senseClusterLabelFileName} \n";
614 0         0 print "labelComparisonMethod:: $driverObject->{$labelComparisonMethod} \n";
615 0         0 print "goldKeyFileName:: $driverObject->{$goldKeyFileName} \n";
616 0         0 print "goldKeyLength:: $driverObject->{$goldKeyLength} \n";
617 0         0 print "goldKeyDataSource:: $driverObject->{$goldKeyDataSource} \n";
618 0         0 print "weightRatio:: $driverObject->{$weightRatio} \n";
619 0         0 print "stopListFileLocation:: $driverObject->{$stopListFileLocation} \n";
620 0         0 print "isClean:: $driverObject->{$isClean} \n";
621 0         0 print "verbose:: $driverObject->{$verbose} \n";
622 0         0 print "help:: $driverObject->{$help} \n";
623 0         0 print "ExitCode:: $driverObject->{$exitCode} \n";
624 0         0 print "ErrorCode:: $driverObject->{$errorCode} \n";
625 0         0 print "ErrorMessage:: $driverObject->{$errorMessage} \n";
626             }
627              
628              
629             # Method for printing the help to end user.
630             sub help{
631 0     0 1 0 print "\nPlease pass values of the parameters of the option-hash in the following format:
632             %inputOptions = (
633             senseClusterLabelFileName => 'labelFile.txt',
634             labelComparisonMethod => 'AutomateAssignment',
635             goldKeyFileName => 'goldKeyFile.txt',
636             goldKeyLength => 2000,
637             goldKeyDataSource => 'Wikipedia',
638             weightRatio => 10,
639             stopListFileLocation => 'stoplist.txt',
640             isClean => 0,
641             verbose => 1,
642             help => 0
643             );
644             \nNote that only 'senseClusterLabelFileName', 'labelComparisonMethod', 'goldKeyFileName'".
645             " and 'goldKeyDataSource' are mandatory parameters.\n".
646             "For detailed explanation and more examples, please refer the HELP and SYNOPSIS section of this module.\n\n" ;
647              
648             # Returning the exit code for the "help".
649 0         0 return $helpExitCode;
650             }
651              
652              
653             # Method for printing the error code and error message to STDERR.
654             sub error{
655 0     0 0 0 my $errorCode = shift;
656 0         0 my $errorMessage = shift;
657            
658 0         0 print STDERR "Program exiting with the error. ";
659 0         0 print STDERR "\nError Code=$errorCode. \n\t$errorMessage \n\n";
660             }
661              
662              
663              
664             # Method for generating the error for "missing mapping".
665             sub errorMissingMapping{
666             # Getting the object from the argument.
667 0     0 0 0 my $driverObject = shift;
668            
669             # Raise Error: Missing Cluster's label and GoldStandard Key mapping.
670 0         0 $driverObject->{$errorCode} = $missingMappingErrorExitCode;
671 0         0 $driverObject->{$errorMessage}= "Missing the mapping between Clusters and GoldStandard Keys.".
672             "\n\tPlease specify the mapping in File containing information about GoldStandard Keys!".
673             "\n\tRefer this module's cpan documentation on \"FILE FORMATS\" - Case1 or Case2 or Case3".
674             "(labelComparisonMethod => 'direct') \nabout how to specify the mapping inside a GoldKey file.";
675            
676             # Calling method for printing the error message.
677 0         0 error($driverObject->{$errorCode}, $driverObject->{$errorMessage});
678            
679 0         0 exit $driverObject->{$errorCode};
680             }
681              
682              
683             # Method for generating the error for "missing user data in the GoldKeyFile".
684             sub errorMissingUserData{
685              
686             # Getting the object from the argument.
687 0     0 0 0 my $driverObject = shift;
688            
689             # Raise Error: Missing user data for the GoldStandard Key.
690 0         0 $driverObject->{$errorCode} = $missingKeyDataErrorExitCode;
691 0         0 $driverObject->{$errorMessage}= "Missing the data for GoldStandard Keys.".
692             "\n\tPlease specify the data for the GoldStandard Keys!".
693             "\n\tRefer this module's cpan documentation on \"FILE FORMATS\" - Case1 or Case4".
694             "(goldKeyDataSource => 'userData') \nabout how to specify the data for the gold stadndard key.";
695 0         0 error($driverObject->{$errorCode}, $driverObject->{$errorMessage});
696            
697 0         0 exit $driverObject->{$errorCode};
698             }
699              
700              
701              
702             ########################################################################################
703             =head1 Function: evaluateLabels
704              
705             Function which is responsible for evaluating the labels of the clusters. This
706             function will call the other modules for completing the process.
707              
708             @argument : $driverObject : Object of the current file.
709            
710             @return : $accuracy : DataType(Float)
711             Indicates the overall accuracy of the assignments.
712            
713             @description :
714            
715             Overall algorithm for calculating the accuracy of the labels assignment with the help of gold
716             standard keys are:
717            
718             Step 1: Read the clusters and their labels information from the ClusterLabel file.
719            
720             =head3 Case A: User has provided the mapping information about the cluster and gold standard key.
721              
722             Step 2:Read Clusters-Topics mapping information.
723            
724             =head4 Subcase1: User provides data for gold standard keys.
725            
726             Step 3:Read the gold standard keys and their data from the file provided by user.
727             Step 4: continue to next step :).
728            
729             =head4 Subcase2: User provides the gold standard keys. We will fetch data from Wikipedia.
730              
731             User will provide the names of the topics along with the mapping, but not their data.
732            
733             Step 3:Read gold standard keys from the file provided by user.
734             Step 4:Read data about the gold standard keys from the Wikipedia.
735            
736             =head4 Subcase3: User provides the gold standard keys. We will fetch data from Wordnet.
737            
738             Step 3:Read gold standard keys from the file provided by user.
739             Step 4:Read data about the gold standard keys from the Wordnet.
740            
741             Step 5: Create contingency matrix with similarity-scores of cluster's label against each
742             gold standard key's data (obtained from steps 3 and 4.)
743             Step 6: Using the mapping provided by user(step 2) to calculate the diagonal score for the
744             contingency matrix.
745             Step 7: Overall Accuracy for the current cluster's label assignment can be calculated as :
746            
747             Sum (Diagonal Scores)
748             Accuracy =--------------------------------------------------
749             Sum (All the Scores of contingency table)
750            
751             =head3 Case B: User has not provided the mapping information about the cluster and gold standard key.
752              
753             We will use the Hungarian algorithm to compute the mapping.
754            
755             =head4 Subcase1: User provides data for gold standard keys.
756            
757             Step 2: Read the gold standard keys and their data from the file provided by user.
758              
759             Step 3: Continue to next step :).
760            
761             =head4 Subcase2: User provides the gold standard keys. We will fetch data from Wikipedia.
762             User will just provide the names of the topics, but no data and no mapping.
763            
764             Step 2: Read gold standard keys from the file provided by user.
765              
766             Step 3: Read data about the gold standard keys from the Wikipedia.
767            
768             =head4 Subcase3: User provides the gold standard keys. We will fetch data from Wordnet.
769            
770             Step 2: Read gold standard keys from the file provided by user.
771              
772             Step 3: Read data about the gold standard keys from the Wordnet.
773              
774              
775             =head3 Common Steps for the all three subcases.
776              
777             Step 4: Create contingency matrix with similarity-scores of cluster's label against each
778             gold standard key's data (obtained from steps 2 and 3).
779            
780             Step 5: Use Hungarian algorithm to determine the mapping of Clusters with gold standard keys.
781            
782             Step 6: Use the above mapping to calculate the total diagonal score for the new contingency matrix.
783            
784             Step 7: Overall Accuracy for the current cluster's label assignment can be calculated as :
785            
786              
787             Sum (Diagonal Scores)
788             Accuracy = --------------------------------------------------
789             Sum (All the Scores of contingency table)
790              
791             =cut
792              
793              
794             #########################################################################################
795             # Method for evaluating the labels.
796             # Steps:
797             # Step 1. Get the mapping.
798             sub evaluateLabels{
799             # Getting the current class object as the argument.
800 4     4 0 37 my $driverObject = shift;
801            
802             # Getting the clusters file name, from the $driverObject.
803 4         14 my $clusterFileName = $driverObject->{$senseClusterLabelFileName};
804              
805             # Getting the "isClean" parameter from the class variable.
806 4         12 my $isCleaned = $driverObject->{$isClean};
807            
808             # Getting the "verbose" option from the class variable.
809 4         13 my $verboseOption = $driverObject->{$verbose};
810            
811             # Creating the read-file object for reading the cluster's label.
812 4         53 my $readClusterFileObject =
813             Text::SenseClusters::LabelEvaluation::ReadingFilesData->new ($clusterFileName);
814            
815             # Defining hash which will hold the cluster and its labels.
816 4         11 my %labelSenseClustersHash = ();
817             # Calling the function to read the cluster and its labels data in the hash.
818 4         38 my $labelSenseClustersHashRef =
819             $readClusterFileObject->readLinesFromClusterFile(\%labelSenseClustersHash);
820 4         24 %labelSenseClustersHash = %$labelSenseClustersHashRef;
821              
822             # Getting the topics file name.
823 4         16 my $topicsFileName = $driverObject->{$goldKeyFileName};
824            
825             # Defining the variable which will hold the accuracy score for the labels to be evaluated.
826 4         10 my $accuracyScore = 0;
827            
828             # Creating the read-file object for standard-gold-keys.
829 4         21 my $readTopicFileObject =
830             Text::SenseClusters::LabelEvaluation::ReadingFilesData->new ($topicsFileName);
831            
832              
833             # CASE A: User has provided the mapping information about the cluster and gold standard key.
834 4 100       38 if(lc($driverObject->{$labelComparisonMethod}) eq $labelComparisonMethod_Direct){
    50          
835            
836             # Read Cluster-Topic mapping information and store it in hash.
837 2         11 my ($hashRef, $topicArrayRef) = $readTopicFileObject->readMappingFromTopicFile();
838              
839             # Reading the hash from its reference.
840 2         11 my %mappingHash = %$hashRef;
841 2         6 my @topicArray = @$topicArrayRef;
842            
843            
844             # If there is no mapping, then generate error here.....
845 2 50       19 if(!%mappingHash){
846 0         0 errorMissingMapping($driverObject);
847             }
848            
849            
850             # Subcase1: User provides data for gold standard keys.
851 2 100       14 if(lc($driverObject->{$goldKeyDataSource}) eq $standardReferenceName_UserData){
    50          
    0          
852            
853             # Call user comparison method.
854            
855             # Reading the topic-data from the user file.
856             # User will provide the name and data of the topics along with mapping.
857 1         6 my $topicDataHashRef = $readTopicFileObject->readTopicDataFromTopicFile(\@topicArray);
858            
859             # Reading the hash from its reference.
860 1         6 my %topicDataHash = %$topicDataHashRef;
861              
862             # If there is no user's data for the topics, generate error here.....
863 1 50       7 if(!%topicDataHash){
864 0         0 errorMissingUserData($driverObject);
865             }
866            
867              
868            
869             # Calling the function 'makeContigencyMatrix' to get the contingency matrix of similarity-scores.
870 1         9 my ($matrixScoreRef, $colHeaderRef, $rowHeaderRef, $totalMatrixScore) =
871             makeContigencyMatrix(\%labelSenseClustersHash, \%topicDataHash, $driverObject->{$weightRatio},
872             $driverObject->{$stopListFileLocation}, $verboseOption);
873            
874             # Calling the function 'printMatrix' to print the contingency matrix.
875 1         12 Text::SenseClusters::LabelEvaluation::AssigningLabelUsingHungarianAlgo::printMatrix
876             ($matrixScoreRef, $colHeaderRef,$rowHeaderRef);
877            
878             # Calling function to calculate the overall accuracy for the label assignment.
879 1         7 $accuracyScore = calculateAccuracy
880             (\%mappingHash, $matrixScoreRef, $colHeaderRef, $rowHeaderRef, $totalMatrixScore);
881            
882             }elsif (lc($driverObject->{$goldKeyDataSource}) eq $standardReferenceName_Wikipedia){
883            
884             #
885             # Subcase2: User provides the gold standard keys. We will fetch data from Wikipedia.
886             # User will provide the names of the topics along with the mapping, but not their data.
887             #
888            
889            
890 1         3 my %topicDataHash = ();
891 1         2 foreach my $topic (@topicArray){
892             # Call wikipedia function.
893 3         16 my $topicData =
894             Text::SenseClusters::LabelEvaluation::Wikipedia::GetWikiData::getWikiDataForTopic(
895             $topic, $isCleaned);
896 3         16 $topicDataHash{$topic} = $topicData;
897             #print "$topic $topicData\n";
898             }
899              
900             # Calling the function 'makeContigencyMatrix' to get the contingency matrix of similarity-scores.
901 1         11 my ($matrixScoreRef, $colHeaderRef, $rowHeaderRef, $totalMatrixScore) =
902             makeContigencyMatrix(\%labelSenseClustersHash, \%topicDataHash, $driverObject->{$weightRatio},
903             $driverObject->{$stopListFileLocation}, $verboseOption);
904 1         129 print STDERR "\nContigency Matrix based on user input::\n";
905            
906             # Calling the function 'printMatrix' to print the contingency matrix.
907 1         10 Text::SenseClusters::LabelEvaluation::AssigningLabelUsingHungarianAlgo::printMatrix
908             ($matrixScoreRef, $colHeaderRef,$rowHeaderRef);
909            
910             # Calling function to calculate the overall accuracy for the label assignment.
911 1         6 $accuracyScore = calculateAccuracy
912             (\%mappingHash, $matrixScoreRef, $colHeaderRef, $rowHeaderRef, $totalMatrixScore);
913            
914             }elsif (lc($driverObject->{$goldKeyDataSource}) eq $standardReferenceName_WordNet){
915            
916             # Subcase3: User provides the gold standard keys. We will fetch data from Wordnet.
917            
918             # Call wordnet comparison method. User will just provide the topic name.
919             # TODO: Left for future implementation.
920             }
921            
922             # CASE B: User has not provided the mapping information about the cluster and gold standard key.
923             # We will use the Hungarian algorithm to compute the mapping.
924             }elsif(lc($driverObject->{$labelComparisonMethod}) eq $labelComparisonMethod_Automate){
925            
926             # Subcase1: User provides data for gold standard keys.
927             # User will just provide the data about the topics, but no mapping.
928 2 100       16 if(lc($driverObject->{$goldKeyDataSource}) eq $standardReferenceName_UserData){
    50          
    0          
929            
930             # Empty array for holding the topics.
931 1         2 my @tempTopicNameArray = ();
932            
933             # Reading the topic-data from the user file.
934 1         10 my $topicDataHashRef = $readTopicFileObject->readTopicDataFromTopicFile(\@tempTopicNameArray);
935             # Reading the hash from its reference.
936 1         7 my %topicDataHash = %$topicDataHashRef;
937            
938             # If there is no user's data for the topics, generate error here.....
939 1 50       8 if(!%topicDataHash){
940 0         0 errorMissingUserData($driverObject);
941             }
942            
943             # Calling the function which will create the contingency matrix for given set of inputs.
944 1         11 my ($matrixScoreRef, $colHeaderRef, $rowHeaderRef,$totalMatrixScore) =
945             makeContigencyMatrix(\%labelSenseClustersHash, \%topicDataHash, $driverObject->{$weightRatio},
946             $driverObject->{$stopListFileLocation}, $verboseOption);
947            
948             # Reading the array from its reference.
949 1         7 my @matrixScore = @$matrixScoreRef;
950 1         4 my @colHeader = @$colHeaderRef;
951 1         3 my @rowHeader = @$rowHeaderRef;
952            
953             # Creating the Hungarian object.
954 1         19 my $hungarainObject = Text::SenseClusters::LabelEvaluation::AssigningLabelUsingHungarianAlgo
955             ->new(\@matrixScore, \@colHeader, \@rowHeader);
956            
957             # Reading the Mapping with help of function.
958 1         7 my ($accuracy,$finalMatrixRef,$newColumnHeaderRef) = $hungarainObject->reAssigningWithHungarianAlgo();
959            
960            
961             # Converting the accuracy to a percentage and rounding it off to 2 decimal places.
962 1         23 $accuracyScore = sprintf("%.2f", ($accuracy*100));
963 1         57 print STDERR "\n\nAccuracy of labels is $accuracyScore\% \n\n";
964              
965 1 50       29 if($accuracy == 0){
966 0         0 print STDERR "\n\n Accuracy score \"zero\" indicates either of the following two facts::\n";
967 0         0 print STDERR " 1. Labels assigned to Cluster is completely wronged. OR\n";
968 0         0 print STDERR " 2. Gold-Keys provided by you are not correct.... \n";
969             }
970              
971             # Subcase2: User provides the gold standard keys. We will fetch data from Wikipedia.
972             # User will just provide the data about the topics, but no mapping.
973             }elsif (lc($driverObject->{$goldKeyDataSource}) eq $standardReferenceName_Wikipedia){
974            
975             # Calling readLinesFromTopicFile function to get the list of all the topics.
976 1         6 our $standardTerms = $readTopicFileObject->readLinesFromTopicFile();
977            
978             # Spliting the standard terms on "," to get the Topic name.
979             # For e.g: "Bill Clinton , Tony Blair"
980 1         7 my @topicArray = split(/[\,]/, $standardTerms);
981            
982             # Call wikipedia function. User will just provide the topic name.
983 1         4 my %topicDataHash = ();
984 1         4 foreach my $topic (@topicArray){
985             # Call wikipedia function.
986 3         18 my $topicData =
987             Text::SenseClusters::LabelEvaluation::Wikipedia::GetWikiData::getWikiDataForTopic($topic, $isCleaned);
988            
989             # Setting the data about the topic into hash.
990 3         19 $topicDataHash{$topic} = $topicData;
991             }
992            
993             # Calling the function which will create the contingency matrix for given set of inputs.
994 1         12 my ($matrixScoreRef, $colHeaderRef, $rowHeaderRef, $totalMatrixScore) =
995             makeContigencyMatrix(\%labelSenseClustersHash, \%topicDataHash, $driverObject->{$weightRatio},
996             $driverObject->{$stopListFileLocation}, $verboseOption);
997            
998             # Reading the array from its reference.
999 1         9 my @matrixScore = @$matrixScoreRef;
1000 1         4 my @colHeader = @$colHeaderRef;
1001 1         3 my @rowHeader = @$rowHeaderRef;
1002            
1003             # Creating the object of the class AssigningLabelUsingHungarianAlgo.
1004 1         23 my $hungarainObject = Text::SenseClusters::LabelEvaluation::AssigningLabelUsingHungarianAlgo
1005             ->new(\@matrixScore, \@colHeader, \@rowHeader);
1006            
1007             # Reading the Mapping with help of function.
1008 1         5 my ($accuracy,$finalMatrixRef,$newColumnHeaderRef) = $hungarainObject->reAssigningWithHungarianAlgo();
1009            
1010             # Converting the accuracy to a percentage and rounding it off to 2 decimal places.
1011 1         28 $accuracyScore = sprintf("%.2f", ($accuracy*100));
1012 1         99 print STDERR "\n\nAccuracy of labels is $accuracyScore\% \n\n";
1013            
1014             }elsif (lc($driverObject->{$goldKeyDataSource}) eq $standardReferenceName_WordNet){
1015            
1016             # Subcase3: User provides the gold standard keys. We will fetch data from Wordnet.
1017            
1018             # Call wordnet comparison method. User will just provide the topic name.
1019             # TODO. Left for future implementation.
1020             }
1021             }
1022            
1023             # Returning the accuracy of the labels of the clusters.
1024 4         132 return $accuracyScore;
1025             }
1026              
1027              
1028             ##########################################################################################
1029             =head1 Function: makeContigencyMatrix
1030              
1031             This method is responsible for making the Contigency Matrix containing the similarity-scores of the labels with the data of the gold standard keys.
1032            
1033             @argument : $labelSenseClustersHashRef (Hash containing the labels generated by the SenseClusters)
1034              
1035             @argument : $topicDataHashRef (Hash containing the data of the gold standard keys)
1036              
1037             @argument : $weightageRatio (Parameter which tells the weightage to be given to discriminating labels over descriptive labels of the SenseClusters)
1038            
1039             @return : 1. @matrixScore - Contingency matrix containing the similarity-scores.
1040            
1041             @return : 2. @colHeader - Array containing the column header for the contingency matrix.
1042            
1043             @return : 3. @rowHeader - Array containing the row header for the contingency matrix.
1044            
1045             @return : 4. $totalMatrixScore - Total similarity scores of the contingency matrix.
1046            
1047            
1048             @description :
1049              
1050             1). It will iterate through the hash (%labelSenseClustersHash) and extracts the descriptive and discriminating labels for each clusters.
1051              
1052             2). It will read the data about each gold standard key from the hash (%topicDataHash).
1053              
1054             3). It then uses the module, Text::SenseClusters::LabelEvaluation::SimilarityScore to get various similarity score.
1055              
1056             4). Finally, it uses the raw-lesk scores to prepare the contingency matrix.
1057            
1058             =cut
1059             ##########################################################################################
1060              
1061             sub makeContigencyMatrix{
1062             # Getting the reference of the Hash containing the cluster's label.
1063 4     4 0 11 my $labelSenseClustersHashRef = shift;
1064             # Reading the hash from its reference.
1065 4         22 my %labelSenseClustersHash = %$labelSenseClustersHashRef;
1066            
1067             # Getting the reference of the hash containing the topic and its information.
1068 4         10 my $topicDataHashRef = shift;
1069             # Reading the hash from its reference.
1070 4         20 my %topicDataHash = %$topicDataHashRef;
1071              
1072             # Getting the weightage for discriminating and descriptive labels.
1073 4         67 my $weightageRatio = shift;
1074              
1075             # Getting the stop list file location.
1076 4         13 my $stopListFileLoc = shift;
1077            
1078             # Getting the verbose option.
1079 4         7 my $verboseOpt = shift;
1080            
1081             # Defining the matrix which contains the score.
1082 4         12 my @matrixScore = ();
1083             # Defining the internal Index for the matrix score.
1084 4         9 my $firstDimIndex = 0;
1085             # Variable which will hold TotalMatrixScore.
1086 4         9 my $totalMatrixScore = 0;
1087            
1088             # Array that will contain Row Header (Cluster name).
1089 4         28 my @rowHeader = sort keys %labelSenseClustersHash;
1090             # Array that will contain Column Header (Topic name).
1091 4         20 my @colHeader = sort keys %topicDataHash;
1092            
1093             # Iterating through each cluster entry .
1094 4         20 foreach my $key (sort keys %labelSenseClustersHash){
1095             # Variable to store the two type of labels for the cluster.
1096 12         31 my $clusterDescriptiveLabel ="";
1097 12         28 my $clusterDiscriminatingLabel ="";
1098            
1099             # Reading the labels for a cluster from the hash.
1100 12         28 for my $innerkey (keys %{$labelSenseClustersHash{$key}}){
  12         68  
1101 24 100       135 if(lc($innerkey) eq $labelType_Descriptive){
    50          
1102 12         49 $clusterDescriptiveLabel = $labelSenseClustersHash{$key}{$innerkey};
1103             }elsif(lc($innerkey) eq $labelType_Discriminating){
1104 12         49 $clusterDiscriminatingLabel = $labelSenseClustersHash{$key}{$innerkey};
1105             }
1106             }
1107            
1108             # Defining Index for the second dimension.
1109 12         28 my $secondDimIndex = 0;
1110              
1111             # Iterating through the topics.
1112 12         57 for my $topicKey (sort keys %topicDataHash){
1113            
1114             # Calling the SimilarityScore module to get the Similarity Score between
1115             # Descriptive labels and Gold Key Data.
1116 36         341 my $similarityObject = Text::SenseClusters::LabelEvaluation::SimilarityScore
1117             ->new($clusterDescriptiveLabel,$topicDataHash{$topicKey},
1118             $stopListFileLoc,$verboseOpt );
1119            
1120             # Calling the SimilarityScore module to get the overlapping score.
1121 36         161 my ($score, %allScores) = $similarityObject->computeOverlappingScores();
1122 36         980 my $descriptiveScore = $allScores{'raw_lesk'};
1123              
1124             # Calling the SimilarityScore module to get the Similarity Score between
1125             # Discriminating labels and Gold Key Data.
1126 36         339 $similarityObject = Text::SenseClusters::LabelEvaluation::SimilarityScore
1127             ->new($clusterDiscriminatingLabel,$topicDataHash{$topicKey},
1128             $stopListFileLoc, $verboseOpt);
1129            
1130             # Calling the SimilarityScore module to get the overlapping score.
1131 36         163 ($score, %allScores) = $similarityObject->computeOverlappingScores();
1132 36         1183 my $discriminatingScore = $allScores{'raw_lesk'};
1133              
1134              
1135             # Calculating Total-Similarity-Score for the labels and gold-key.
1136 36         112 my $totalScore = $descriptiveScore + $weightageRatio * $discriminatingScore;
1137             # Storing the similarity score into the 2D array @matrixScore.
1138 36         136 $matrixScore[$firstDimIndex][$secondDimIndex++] = $totalScore;
1139              
1140             # Adding the current similarity-score to overall total similarity score.
1141 36         268 $totalMatrixScore = $totalMatrixScore + $totalScore;
1142             }
1143 12         54 $firstDimIndex++;
1144             }
1145             # Returning the array containing similarity scores, the column and row headers, and the total score.
1146 4         42 return (\@matrixScore, \@colHeader, \@rowHeader, $totalMatrixScore);
1147             }
1148              
1149              
1150             ########################################################################################
1151             =head1 Function: calculateAccuracy
1152              
1153             Method used for calculating the Accuracy score for the labels generated by the
1154             SenseClusters or others.
1155              
1156             @argument1 : $mappingHashRef (Reference to Hash which contains the mapping information about the cluster and gold standard)
1157            
1158             @argument2 : $matrixScoreRef (2-D Array/Matrix which contains the similarity-scores of each labels)
1159              
1160             @argument3 : $colHeaderRef (Reference of array which contains the column header)
1161              
1162             @argument4 : $rowHeaderRef (Reference of array which contains the row header)
1163              
1164             @argument5 : $totalMatrixScore (Total similarity score of the labels with gold standard)
1165            
1166             @return : Return the overall accuracy of the labels assigned by the SenseClusters.
1167            
1168             @description :
1169              
1170             1). With the help of ($mappingHashRef, $matrixScoreRef, $colHeaderRef, $rowHeaderRef),
1171             this function calculates the sum of all diagonal elements.
1172              
1173             2). It will then calculate the accuracy for the assignment as
1174            
1175             Sum (Diagonal Scores)
1176             Accuracy = -----------------------------------
1177             Sum (All the Scores)
1178            
1179             =cut
1180              
1181             #########################################################################################
1182             sub calculateAccuracy{
1183 2     2 0 5 my $mappingHashRef = shift;
1184 2         5 my $matrixScoreRef = shift;
1185 2         3 my $colHeaderRef = shift;
1186 2         3 my $rowHeaderRef = shift;
1187 2         5 my $totalMatrixScore = shift;
1188            
1189 2         13 my %mappingHash = %$mappingHashRef;
1190 2         8 my @matrixScore = @$matrixScoreRef;
1191             # Array that will contain Row Header (Cluster name).
1192 2         5 my @rowHeader = @$rowHeaderRef;
1193             # Array that will contain Column Header (Topic name).
1194 2         5 my @colHeader = @$colHeaderRef;
1195              
1196             # Defining the internal Index for the matrix score.
1197 2         4 my $firstDimIndex = 0;
1198             # Variable which will hold the sum of the diagonal scores.
1199 2         4 my $diagonalScore = 0;
1200            
1201 2         94 print STDERR "\n\n Mapping provided by user\n";
1202 2         9 for my $key (keys %mappingHash){
1203 6         8 my $rowIndex = 0;
1204 6         7 my $colIndex = 0;
1205            
1206             #print STDERR "\n$key $mappingHash{$key} \t @rowHeader \t @colHeader \n\n\n";
1207 6         13 for my $index(0..@rowHeader-1){
1208 18 100       48 if($key eq $rowHeader[$index]){
1209 6         12 $rowIndex = $index;
1210             }
1211             }
1212 6         11 for my $index(0..@colHeader-1){
1213 18 100       40 if($mappingHash{$key} eq $colHeader[$index]){
1214 6         11 $colIndex = $index;
1215             }
1216             }
1217             # Getting the diagonal.
1218 6         11 $diagonalScore = $diagonalScore + $matrixScore[$rowIndex][$colIndex];
1219 6         277 print STDERR "\t$key\t<-->\t$mappingHash{$key} \n";
1220             }
1221              
1222             # Defining the accuracy.
1223 2         6 my $accuracy = 0;
1224            
1225 2 50       8 if($totalMatrixScore == 0){
1226 0         0 $accuracy = 0;
1227             }else{
1228             # Making the accuracy in percentage and rounding off it to 2 decimal place.
1229 2         42 $accuracy = sprintf("%.2f", ($diagonalScore *100 /$totalMatrixScore));
1230             }
1231            
1232 2         108 print STDERR "\nAccuracy of assigned labels =". $accuracy ."\%\n\n";
1233            
1234             # Returning the accuracy.
1235 2         41 return $accuracy;
1236             }
1237              
1238              
1239              
1240             #######################################################################################################
1241             =pod
1242              
1243             =head1 BUGS
1244              
1245             =over
1246              
1247             =item * Currently not supporting the WordNet gold standards comparison.
1248              
1249             =back
1250              
1251             =head1 SEE ALSO
1252              
1253             http://senseclusters.cvs.sourceforge.net/viewvc/senseclusters/LabelEvaluation/
1254            
1255             Last modified by :
1256             $Id: Driver.pm,v 1.6 2013/03/18 02:59:42 jhaxx030 Exp $
1257              
1258             =head1 AUTHORS
1259              
1260             Anand Jha, University of Minnesota, Duluth
1261             jhaxx030 at d.umn.edu
1262              
1263             Ted Pedersen, University of Minnesota, Duluth
1264             tpederse at d.umn.edu
1265              
1266              
1267             =head1 COPYRIGHT AND LICENSE
1268              
1269             Copyright (C) 2012-2013 Ted Pedersen, Anand Jha
1270              
1271             See http://dev.perl.org/licenses/ for more information.
1272              
1273             This program is free software; you can redistribute it and/or modify
1274             it under the terms of the GNU General Public License as published by
1275             the Free Software Foundation; either version 2 of the License, or
1276             (at your option) any later version.
1277              
1278             This program is distributed in the hope that it will be useful,
1279             but WITHOUT ANY WARRANTY; without even the implied warranty of
1280             MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
1281             GNU General Public License for more details.
1282              
1283             You should have received a copy of the GNU General Public License
1284             along with this program; if not, write to:
1285            
1286            
1287             The Free Software Foundation, Inc., 59 Temple Place, Suite 330,
1288             Boston, MA 02111-1307 USA
1289            
1290            
1291             =cut
1292             #######################################################################################################
1293              
1294             1;