File Coverage

blib/lib/Text/SenseClusters/LabelEvaluation/ReadingFilesData.pm
Criterion Covered Total %
statement 96 115 83.4
branch 22 36 61.1
condition 3 3 100.0
subroutine 7 8 87.5
pod 0 6 0.0
total 128 168 76.1


line stmt bran cond sub pod time code
1             #!/usr/bin/perl -w
2              
3             package Text::SenseClusters::LabelEvaluation::ReadingFilesData;
4              
5 5     5   196100 use strict;
  5         12  
  5         216  
6 5     5   9106 use encoding "utf-8";
  5         108659  
  5         34  
7              
8             # The following two lines will make this module inherit from the Exporter Class.
9             require Exporter;
10             our @ISA = qw(Exporter);
11              
12             #######################################################################################################################
13              
14             =head1 Name
15              
16             Text::SenseClusters::LabelEvaluation::ReadingFilesData - Module for reading the data from a file as single string object.
17              
18             =head1 SYNOPSIS
19              
20             The following code snippet will show how to use this module.
21              
22             Example 1: Reading the label file generated by sense cluster.
23              
24             use Text::SenseClusters::LabelEvaluation::ReadingFilesData;
25            
26             # Reading the cluster's labels file.
27             my $clusterFileName = "TVS.label";
28            
29             # Getting the clusters file name.
30             my $clusterFileName = $driverObject->{$senseClusterLabelFileName};
31            
32             # Creating the read file object and reading the label examples.
33             my $readClusterFileObject =
34             Text::SenseClusters::LabelEvaluation::ReadingFilesData->new ($clusterFileName);
35             my %labelSenseClustersHash = ();
36             my $labelSenseClustersHashRef =
37             $readClusterFileObject->readLinesFromClusterFile(\%labelSenseClustersHash);
38             %labelSenseClustersHash = %$labelSenseClustersHashRef;
39            
40             # Iterating the Hash to print the value.
41             foreach my $key (sort keys %labelSenseClustersHash){
42             foreach my $innerkey (sort keys %{$labelSenseClustersHash{$key}}){
43             print "$key :: $innerkey :: $labelSenseClustersHash{$key}{$innerkey} \n";
44             }
45             }
46            
47            
48             Example 2: Reading the user provided Gold Standard keys and their data.
49            
50             use Text::SenseClusters::LabelEvaluation::ReadingFilesData;
51             # Reading the topic file name.
52             my $topicsFileName = "TVS.txt";
53            
54             # Creating the read object, which will read the gold-standard keys and data provided by user.
55             my $readFileObject =
56             Text::SenseClusters::LabelEvaluation::ReadingFilesData->new($topicsFileName);
57            
58             # Reading the Mapping with help of function.
59             my ( $hashRef, $topicArrayRef ) = $readFileObject->readMappingFromTopicFile();
60            
61             # Reading the hash from its reference.
62             my %mappingHash = %$hashRef;
63             my @topicArray = @$topicArrayRef;
64             # Iterating the Hash to print the value.
65             foreach my $key ( sort keys %mappingHash ) {
66             print "$key=$mappingHash{$key}\n";
67             }
68             # Iterating the Hash to print the value.
69             foreach my $key (@topicArray) {
70             print "$key\n";
71             }
72              
73              
74             =head1 DESCRIPTION
75              
76             This module provides the various functions to read the labels and topic files.
77            
78             The first function reads the labelled data generated by the SenseClusters and
79             create hash from it. The data-format of the input file must match the format
80             of label-file generated by SenseClusters.
81            
82             The second function reads a file into a string variable by removing all the
83             newline characters from it.
84            
85             The remaining functions read the user provided file that contains the mapping
86             of clusters labels with gold standard keys, and/or data about the gold standard
87             key or list of topics.
88            
89             =cut
90              
91              
92             # Parameter for reading the file.
93             our $fileName = "fileName";
94              
95              
96             ##########################################################################################
97              
98             =head1 Constructor: new()
99              
100             This is the constructor which will create object for this class.
101             Reference : http://perldoc.perl.org/perlobj.html
102              
103             This constructor takes the following argument:
104             1. $fileNameArg :
105             The name of the file whose data has to be read.
106              
107             =cut
108              
109             ##########################################################################################
110             sub new {
111             # Creating the object.
112 9     9 0 35 my $class = shift;
113 9         18 my $readFileObject = {};
114              
115             # Explicit association is created by the built-in bless function.
116 9         27 bless $readFileObject, $class;
117              
118             # Getting the Hash as the argument.
119 9         20 my $fileNameArg = shift;
120 9         58 $readFileObject->{$fileName} = $fileNameArg;
121              
122             # Returning the blessed hash refered by $self.
123 9         32 return $readFileObject;
124             }
125            
126              
127             ###########################################################################################
128              
129             =head1 Function: readLinesFromClusterFile
130              
131             This function will read lines from the file containing the Labels of the
132             Clusters and make the hash file.
133              
134             @argument1 : Name of the cluster file name.
135              
136             @argument2 : Reference of Hash ($labelSenseClustersHash) which will hold
137             the information in the following format:
138            
139             For e.g.:\tCluster0{
140             Descriptive => George Bush, Al Gore, White House, New York
141             Discriminating => George Bush, York Times
142             }
143             Cluster1{
144             Descriptive => George Bush, BRITAIN London, Prime Minister
145             Discriminating => BRITAIN London, Prime Minister
146             }
147              
148              
149             @return : It will return the reference of the Hash mentioned above:
150             $labelSenseClustersHashRef.
151            
152             @description :
153              
154             1. Read the file line by line.
155             2. Ignore the lines which do not follow one of the following format:
156             Cluster 0 (Descriptive): George Bush, Al Gore, White House, New York
157             Cluster 0 (Discriminating): George Bush, BRITAIN London
158             3. Create Key from the "Cluster # (Descriptive)" or "Cluster # (Discrim
159             - inating)" as "OuterKey: Cluster#" "InnerKey: Descriptive".
160             4. Store the value of hash as the keywords similar to above example:
161             for e.g:
162             $labelSenseClustersGlobalRef{Cluster0}{Discriminating}
163             = "BRITAIN London, Prime Minister";
164            
165            
166             =cut
167              
168             ###########################################################################################
169              
170             sub readLinesFromClusterFile {
171             # Reading the object as the argument.
172 5     5 0 17 my $readFileObject = shift;
173            
174             # Getting the fileName Contains the Cluster and topic mapping..
175 5         17 my $clusterFileName = $readFileObject->{$fileName};
176            
177             # Reading the reference from the argument.
178 5         11 my $labelSenseClustersHashRef = shift;
179             # Getting the hash from the reference.
180 5         33 my %labelSenseClustersHash = %$labelSenseClustersHashRef;
181              
182             # Opening the File passed by user as the first command line argument.
183             # It should be the name of the cluster file containing the labels.
184 5 50       293 open clusterFile, $clusterFileName or die $!;
185              
186             # Reading all the lines of the clusterslabel file.
187 5         162 while () {
188             # Removing the new line character.
189 60         81 chomp;
190              
191             # Removing the white space from the front and end of the word.
192 60         799 $_ =~ s/^\s+|\s+$//g;
193              
194             # If the line is empty then ignore that line and go to next line.
195 60 100       129 if ( $_ eq '' ) {
196 30         110 next;
197             }
198              
199             # Contents of LabelFile.
200             # Cluster 0 (Descriptive): George Bush, Al Gore, White House, New York
201             # Cluster 0 (Discriminating): George Bush, BRITAIN London
202              
203             # Spiliting each line by ":".
204 30         136 my @lineArray = split( /:/, $_ );
205              
206             # If the given do not have Two elements after split. (It means no data for the
207             # given cluster.) Then ignore that cluster.
208 30 50       80 if ( scalar(@lineArray) != 2 ) {
209 0         0 next;
210             }
211              
212             # Following Code are for making the Key (which will be Cluster Number and Type of
213             # Labels) Typical Key Structure --> "Cluster 0 (Descriptive)"
214              
215             # Spiliting the elements contianing the information about the key with whitespace
216 30         85 my @keyArray = split( /\s+/, $lineArray[0] );
217              
218             # If something wrong with the structure than ignore the key and carry on with
219             # next line.
220 30 50       80 if ( scalar(@keyArray) != 3 ) {
221 0         0 next;
222             }
223              
224             # Making of the Outer key, which is "cluster#"
225 30         63 my $outerKey = $keyArray[0] . $keyArray[1];
226              
227             # The inner key indicates the type of label i.e. Descriptive or Discriminating.
228 30         55 my $innerKey = $keyArray[2];
229              
230             # Removing the start parenthesis '(' and closing ')' parenthesis from the inner
231             # key.
232 30         153 $innerKey =~ s/[(,)]+//g;
233              
234             # Setting the keywords associated with this keys as the value.
235             # For e.g.: Cluster0{
236             # Descriptive => George Bush, Al Gore, White House, New York
237             # Discriminating => George Bush, BRITAIN London
238             # }
239 30         248 $labelSenseClustersHash{$outerKey}{$innerKey} = $lineArray[1];
240             }
241              
242             # Close the file handle.
243 5         55 close(clusterFile);
244              
245             # Returning the reference of the Hash containg the Labels information from
246             # the cluster.
247 5         25 return \%labelSenseClustersHash;
248             }
249              
250             ##########################################################################################
251              
252             =head1 Function: readLinesFromTopicFile
253              
254             This function will read lines from the topic file and list of all the topics.
255              
256             @argument1 : Name of the topicFile.
257            
258             @return : String containing the list of all the topics(labels) for
259             the clusters.
260            
261             @description :
262             1. Read the file line by line.
263             2. Remove the new line characters and making string variable which contains the
264             list of all the topics.
265            
266             =cut
267              
268             ##########################################################################################
269              
270             sub readLinesFromTopicFile {
271             # Reading the object as the argument.
272 1     1 0 2 my $readFileObject = shift;
273            
274             # Getting the topic file name from argument.
275 1         3 my $topicFileName = $readFileObject->{$fileName};
276              
277             # Opening the File, whose name is passed as the second command-line-argument.
278             # It is the name of the file which contains the list of the topics for clusters.
279 1 50       40 open topicFile, $topicFileName or die $!;
280              
281             # Defining the variable which will hold all the topics.
282 1         3 my $topicData = "";
283              
284             # Reading the file line by line till end of file.
285 1         24 while () {
286             # Removing the new line character.
287 2         5 chomp;
288              
289             # Concatenating it to previous line.
290 2         13 $topicData = $topicData . $_;
291             }
292             # Close the file handle.
293 1         11 close(topicFile);
294              
295             # Returning the topic list.
296 1         6 return $topicData;
297             }
298              
299              
300              
301              
302             ##########################################################################################
303              
304             =head1 Function: readMappingFromTopicFile
305              
306             This function will read mapping provided by the user for the Cluster's label (Cluster#)
307             and gold standard key(topic-name).
308              
309             Syntax of the file:
310             <#>
311             Example:
312             Cluster0:::topic1
313             Cluster1:::topic2
314             Cluster2:::topic0
315              
316             @argument : $readFileObject : Object of the current file.
317            
318             @return1 : \%clusterTopicMappingHash : DataType : (Reference to Hash)
319             Reference of Hash containing the mapping between the Cluster's
320             label and gold standard key.
321              
322             @return2 : \@topicArray : DataType : (Reference to array)
323             Reference of array containg the gold standard keys.
324            
325             @description :
326             1. Read the file line by line.
327             2. Check the line, if it contains the "Cluster#:::".
328             3. Spliting these line with Seprator":::".
329             4. A WordArray do not have 2 elements, ignore it.
330             3. Otherwise ignore the remaining lines.
331              
332            
333             Reason for selecting the separtor as ":::"
334             1. It will ensure that it is unique and it has very rare chance of occuring
335             in a documents or text.
336            
337             =cut
338              
339             ##########################################################################################
340             sub readMappingFromTopicFile {
341             # Reading the object as the argument.
342 2     2 0 4 my $readFileObject = shift;
343            
344             # Getting the fileName Contains the Cluster and topic mapping..
345 2         4 my $topicFileName = $readFileObject->{$fileName};
346              
347             # Opening the topicFile.
348 2 50       67 open topicFile, $topicFileName or die $!;
349              
350             # Defining the hash which will store the hash information.
351 2         6 my %clusterTopicMappingHash = ();
352            
353             # Defining the array which will hold the topic name.
354 2         3 my @topicArray = ();
355            
356             # Defining the index for the array.
357 2         5 my $index = 0;
358            
359             # Reading the file line by line till end of file.
360 2         50 while ( my $lineData = ) {
361              
362             # Removing the new line character.
363 76         64 chomp($lineData);
364              
365             # Removing space from the front and back.
366 76         1435 $lineData =~ s/^\s+|\s+$//g;
367              
368             # If the line start with "Cluster".
369 76 100       333 if ( $lineData =~ m/^cluster/i ) {
370              
371             # Spliting with Seprator":::".
372 6         20 my @wordsOfSentenceArray = split( /:::/, $lineData );
373            
374             # If the WordArray do not have 2 elements, ignore it.
375 6 50       16 if ( scalar(@wordsOfSentenceArray) != 2 ) {
376 0         0 next;
377             }
378            
379             # Removing the front and last whitespace from cluster name and topic name.
380 6         19 $wordsOfSentenceArray[0]=~ s/^\s+|\s+$//g;
381 6         21 $wordsOfSentenceArray[1]=~ s/^\s+|\s+$//g;
382            
383             # Storing the mapping into the hash.
384 6         9 $clusterTopicMappingHash{$wordsOfSentenceArray[0]} = $wordsOfSentenceArray[1];
385            
386             # Also storing the list of all the topics name in the array.
387 6         28 $topicArray[$index++] = $wordsOfSentenceArray[1];
388             }
389             # If the line do not start with Cluster, ignore it.
390             }
391              
392             # Close the file handle.
393 2         18 close(topicFile);
394              
395             # Returning the topic list.
396 2         12 return (\%clusterTopicMappingHash, \@topicArray);
397             }
398              
399              
400             ###########################################################################################
401              
402             =head1 Function: readTopicDataFromTopicFile
403              
404             This function will read data about the gold standard key(topic-name).
405              
406             Syntax of the file:
407            
408             Example:
409              
410             topic1:::data1, data1 data1 data1 data1 data1 data1 data1 data1 data1 data1 data1 data1
411             data1 data1 data1 data1 data1 data1 data1 data1 data1 data1 data1 data1 data1 data1
412             topic2:::data2, data2 data2 data2 data2 data2 data2 data2 data2 data2 data2 data2 data2
413             data2 data2 data2 data2 data2 data2 data2 data2 data2 data1 data1 data1 data1 data1
414            
415             @argument : $readFileObject : Object of the current file.
416            
417             @return : \%topicDataHash : DataType : (Reference to Hash)
418             Reference of Hash containing the topics and their corresponding
419             data.
420            
421             @description :
422             1. Read the file line by line.
423             2. Check the line, if it contains the ":::" and starts with one of the topic:
424             a. This indicates the start of the topic's data.
425             b. Read the line till we encounter another "topic-name:::" or "cluster#:::"
426             4. Finally, make hash containing the topic as the key and topic's data as the
427             value.
428             3. Return the reference of this hash.
429            
430             =cut
431              
432             ##########################################################################################
433             sub readTopicDataFromTopicFile {
434             # Reading the object as the argument.
435 2     2 0 4 my $readFileObject = shift;
436            
437             # Getting the fileName Contains the Cluster and topic mapping..
438 2         8 my $topicFileName = $readFileObject->{$fileName};
439              
440             # Opening the topicFile.
441 2 50       66 open topicFile, $topicFileName or die $!;
442              
443             # Getting the reference of the array containing the topic name.
444 2         6 my $topicNameArrayRef = shift;
445            
446             # Getting the array from the reference.
447 2         6 my @topicNameArray = @$topicNameArrayRef;
448             # print "@topicNameArray \n\n\n";
449            
450             # Defining the hash containing the topics and their corresponding data.
451 2         5 my %topicDataHash = ();
452            
453             # Defining the array which will hold all the data for the file.
454 2         4 my @fileData =();
455              
456             # Reading the file line by line till end of file.
457 2         82 while ( my $lineData = ) {
458             # Adding all the line data in a array.
459 143         408 push (@fileData, $lineData);
460             }
461            
462             # If the topic is not present than read the files and populate the value.
463 2 100       10 if(@topicNameArray ==0){
464              
465             # Iterating through the array which contains all the data.
466 1         3 foreach my $lineData (@fileData) {
467            
468             # Removing the new line character.
469 70         79 chomp($lineData);
470            
471             # Reading the topic value.
472 70 100       156 if($lineData =~ m/^(.+):::/){
473             # Removing the white spaces around the topic.
474 3         17 $1 =~ s/^\s+|\s+$//;
475 3         8 push(@topicNameArray,$1);
476             }
477             }
478             }
479            
480             # Iterating through the array which contains all the data.
481 2         6 foreach my $topicName (@topicNameArray){
482            
483 6         10 my $topicKey = "";
484 6         7 my $topicData = "";
485            
486             # This variable is set to 1, only when we are reading the "topic:::" for first time.
487 6         8 my $startOfTopicData = 1;
488            
489             # Reading the file line by line till end of file.
490 6         8 foreach my $lineData (@fileData) {
491            
492             # Removing the new line character.
493 305         393 chomp($lineData);
494            
495             # If this is already set 0 and we encounter "topicName:::" or "Cluster#:::" then
496             # reading of data about the topic is over.
497 305 100 100     905 if($startOfTopicData == 0 && ($lineData =~ m/^.+:::/i) ){
498 4         9 last;
499             }
500            
501             # Making the variable metadata for Regular expression search.
502 301         351 my $searchString = quotemeta $topicName;
503            
504             # If the line start with any topicName.
505 301 100       1129 if ( $lineData =~ m/^$searchString\:::/i) {
506            
507             # ReSetting the start counter to 0.
508 6         11 $startOfTopicData = 0;
509            
510             # Setting the topic key.
511 6         10 $topicKey = $topicName;
512            
513             # The removing the topic from the line and remaining terms will be part of
514             # the data about the topics.
515 6         86 $lineData =~ s/(^$searchString\:::)//i;
516 6         50 $topicData = $lineData;
517 6         15 next;
518             }
519            
520             # Each subsequent line will be keep on adding to the topic data, till we encounter
521             # "Cluster#:::" or "topic#:::" or "end of file"
522 295 100       543 if($startOfTopicData == 0){
523 132         574 $topicData = $topicData." ".$lineData;
524             }
525             }
526             # Removing space from the front and back.
527 6         2734 $topicData =~ s/^\s+|\s+$//g;
528            
529 6 50       19 if($topicKey){
530 6         22 $topicDataHash{$topicKey} = $topicData;
531             }
532            
533             }
534              
535             # Close the file handle.
536 2         31 close(topicFile);
537              
538             # Returning the topic list.
539 2         29 return \%topicDataHash;
540             }
541              
542             ###########################################################################################
543              
544             =head1 Function: readTopicNamesFromTopicFile
545              
546             This function will list all the topics from the file provided by user.
547              
548             Syntax of the file:
549            
550            
551            
552            
553            
554            
555            
556             Example:
557              
558             topic1:::data1, data1 data1 data1 data1 data1 data1 data1 data1 data1 data1 data1 data1
559             data1 data1 data1 data1 data1 data1 data1 data1 data1 data1 data1 data1 data1 data1
560             topic2:::data2, data2 data2 data2 data2 data2 data2 data2 data2 data2 data2 data2 data2
561             data2 data2 data2 data2 data2 data2 data2 data2 data2 data1 data1 data1 data1 data1
562             cluster0:::topic1
563             cluster1:::topic2
564             cluster2:::topic0
565            
566             @argument : $readFileObject : Object of the current file.
567            
568             @return : \@topicNameArray : DataType : (Reference to array)
569             Reference of array containing the list of topics.
570            
571             @description :
572             1. Read the file line by line.
573             2. Check the line, if it contains the ":::"
574             a. if starts with "cluster" ignore it.
575             b. otherwise, split that line with separator, ":::" and store the results in array.
576             c. The first element of the array is the topic-name.
577             d. Push, this topic-name into the array.
578             3. Return the reference of this array.
579              
580              
581             Reason for selecting the separtor as ":::"
582             1. It will ensure that it is unique and it has very rare chance of occuring
583             in a documents or text.
584             =cut
585              
586             ##########################################################################################
587             sub readTopicNamesFromTopicFile {
588            
589             # Reading the object as the argument.
590 0     0 0   my $readFileObject = shift;
591            
592             # Getting the fileName Contains the Cluster and topic mapping..
593 0           my $topicFileName = $readFileObject->{$fileName};
594              
595             # Opening the topicFile.
596 0 0         open topicFile, $topicFileName or die $!;
597              
598             # Variable which will contains the topics.
599 0           my @topicNameArray = ();
600            
601             # Defining the index for the array.
602 0           my $index = 0;
603            
604             # Reading the file line by line till end of file.
605 0           while ( my $lineData = ) {
606             # Removing the new line character.
607 0           chomp($lineData);
608              
609             # Removing space from the front and back.
610 0           $lineData =~ s/^\s+|\s+$//;
611              
612             # If the line start with Cluster, ignore it.
613 0 0         if ( $lineData =~ m/^cluster/i ) {
614 0           next;
615             }
616              
617             # If the line contains ":::".
618 0 0         if ( $lineData =~ m/:::/ ) {
619             # Spliting with Seprator":::".
620 0           my @wordsOfSentenceArray = split( /:::/, $lineData );
621              
622             # Removing the white spaces around the topic.
623 0           $wordsOfSentenceArray[0] =~ s/^\s+|\s+$//;
624              
625             # Adding the terms to topic-array.
626 0           $topicNameArray[$index++] = $wordsOfSentenceArray[0];
627             }
628             }
629             # Close the file handle.
630 0           close(topicFile);
631              
632             # Returning the topic list.
633 0           return (\@topicNameArray);
634             }
635              
636             #######################################################################################################
637              
638             =pod
639              
640             =head1 SEE ALSO
641              
642             http://senseclusters.cvs.sourceforge.net/viewvc/senseclusters/LabelEvaluation/
643              
644             Last modified by :
645             $Id: ReadingFilesData.pm,v 1.5 2013/03/07 23:15:49 jhaxx030 Exp $
646            
647             =head1 AUTHORS
648              
649             Anand Jha, University of Minnesota, Duluth
650             jhaxx030 at d.umn.edu
651              
652             Ted Pedersen, University of Minnesota, Duluth
653             tpederse at d.umn.edu
654              
655             =head1 COPYRIGHT AND LICENSE
656              
657             Copyright (C) 2012-2013 Ted Pedersen, Anand Jha
658              
659             See http://dev.perl.org/licenses/ for more information.
660              
661             This program is free software; you can redistribute it and/or modify
662             it under the terms of the GNU General Public License as published by
663             the Free Software Foundation; either version 2 of the License, or
664             (at your option) any later version.
665              
666             This program is distributed in the hope that it will be useful,
667             but WITHOUT ANY WARRANTY; without even the implied warranty of
668             MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
669             GNU General Public License for more details.
670              
671             You should have received a copy of the GNU General Public License
672             along with this program; if not, write to:
673            
674            
675             The Free Software Foundation, Inc., 59 Temple Place, Suite 330,
676             Boston, MA 02111-1307 USA
677            
678            
679             =cut
680              
681             #######################################################################################################
682              
683             # Making the default return statement as 1;
684             # Reference : http://lists.netisland.net/archives/phlpm/phlpm-2001/msg00426.html
685              
686             1;