File Coverage

blib/lib/Text/SenseClusters/LabelEvaluation/ReadingFilesData.pm
Criterion Covered Total %
statement 33 35 94.2
branch 6 10 60.0
condition n/a
subroutine 4 4 100.0
pod 0 2 0.0
total 43 51 84.3


line stmt bran cond sub pod time code
1             #!/usr/bin/perl -w
2              
3             package Text::SenseClusters::LabelEvaluation::ReadingFilesData;
4              
5 2     2   65546 use strict;
  2         5  
  2         81  
6 2     2   2086 use encoding "utf-8";
  2         48041  
  2         15  
7              
8             # The following two lines will make this module inherit from the Exporter Class.
9             require Exporter;
10             our @ISA = qw(Exporter);
11              
12             #######################################################################################################################
13              
14             =head1 NAME
15              
16             Text::SenseClusters::LabelEvaluation::ReadingFilesData - Module for reading the data from a file as a single string object.
17              
18             =head1 SYNOPSIS
19              
20             The following code snippet will show how to use this module.
21              
22             package Text::SenseClusters::LabelEvaluation::Test_ReadingFilesData;
23              
24             # Including the LabelEvaluation Module.
25             use Text::SenseClusters::LabelEvaluation::ReadingFilesData;
26              
27             # Including the FileHandle module.
28             use FileHandle;
29              
30              
31             # The following block-of-code, create a file and write the data into it.
32             # At the end of this test program, we will delete that file.
33            
34             # File that will contain the topic information.
35             my $topicFileName = "temp_TopicData.txt";
36              
37             # Defining the file handle for the topic file.
38             our $topicFileHandle = FileHandle->new(">$topicFileName");
39              
40             # Writing into the Topic file.
41             # Bill Clinton , Tony Blair
42             print $topicFileHandle "Bill Clinton is an American politician who served as the 42nd President of ".
43             "the United States from 1993 to 2001. Inaugurated at age 46, he was the third-youngest president. ".
44             "He took office at the end of the Cold War, and was the first president of the baby boomer generation. ".
45             "Clinton has been described as a New Democrat. Many of his policies have been attributed to a centrist ".
46             "Third Way philosophy of governance. He is married to Hillary Rodham Clinton, who has served as the ".
47             "United States Secretary of State since 2009 and was a Senator from New York from 2001 to 2009. ".
48             "As Governor of Arkansas, Clinton overhauled the state's education system, and served as Chair ".
49             "of the National Governors Association. Clinton was elected president in 1992, defeating incumbent ".
50             "president George H. W. Bush. The Congressional Budget Office reported a budget surplus between ".
51             "the years 1998 and 2000, the last three years of Clinton's presidency. Since leaving office, ".
52             "Clinton has been rated highly in public opinion polls of U.S. presidents. \n";
53              
54             # Closing file handle.
55             close($topicFileHandle);
56              
57             # END OF file creation block.
58              
59              
60             # The following code will call the readLinesFromTopicFile() function from the
61             # ReadingFilesData module. It will return the content of the file in a string.
62             my $fileData = Text::SenseClusters::LabelEvaluation::ReadingFilesData::readLinesFromTopicFile(
63             $topicFileName);
64              
65             # Printing the content of the file.
66             print "\n Data of the input file is $fileData \n";
67              
68              
69             # Deleting the temporary topic file.
70             unlink $topicFileName or warn "Could not unlink $topicFileName: $!";
71              
72              
73             =head1 DESCRIPTION
74              
75             This module provides two functions. The first function reads the label
76             data generated by SenseClusters and creates a hash from it. The data of the
77             input file must match the format of the label file generated by SenseClusters.
78            
79            
80             The second function reads a file into a string variable, removing all the
81             newline characters from it.
82            
83             =cut
84              
85             ###########################################################################################
86              
87             =head1 Function: readLinesFromClusterFile
88             ------------------------------------------------
89              
90             This function will read lines from the file containing the labels of the
91             clusters and build a hash from them.
92              
93             @argument1 : Name of the cluster file.
94              
95             @argument2 : Reference of Hash ($labelSenseClustersHash) which will hold
96             the information in the following format:
97            
98             For e.g.: Cluster0{
99             Descriptive => George Bush, Al Gore, White
100             House, New York
101             Discriminating => George Bush, York Times
102             }
103             Cluster1{
104             Descriptive => George Bush, BRITAIN London,
105             Prime Minister
106             Discriminating => BRITAIN London, Prime Minister
107             }
108              
109              
110             @return : It will return the reference of the Hash mentioned above:
111             $labelSenseClustersHashRef.
112            
113             @description :
114              
115             1. Read the file line by line.
116            
117             2. Ignore the lines which do not follow one of the following formats:
118            
119             Cluster 0 (Descriptive): George Bush, Al Gore, White House, New York
120            
121             Cluster 0 (Discriminating): George Bush, BRITAIN London
122            
123             3. Create the keys from "Cluster # (Descriptive)" or "Cluster #
124             (Discriminating)" as "OuterKey: Cluster#" and "InnerKey: Descriptive".
125            
126             4. Store the value of hash as the keywords similar to above example:
127             for e.g:
128            
129             $labelSenseClustersGlobalRef{Cluster0}{Discriminating}
130             = "BRITAIN London, Prime Minister";
131            
132            
133             =cut
134             ###########################################################################################
135              
136             sub readLinesFromClusterFile{
137             # Parses a cluster label file into a hash keyed by cluster and label type, and returns a reference to that hash.
138             # First argument: name of the cluster label file.
139 1     1 0 2 my $clusterFileName = shift;
140            
141             # Second argument: reference to a hash that seeds the result.
142 1         3 my $labelSenseClustersHashRef = shift;
143            
144             # Dereferencing into a local hash. This is a shallow copy, so new outer keys are added to the copy, not to the caller's hash.
145 1         3 my %labelSenseClustersHash = %$labelSenseClustersHashRef;
146              
147             # Opening the cluster label file named by the first argument; dies with the OS error if it cannot be opened.
148             # This is the two-argument form of open with a bareword filehandle.
149 1 50       2824 open clusterFile, $clusterFileName or die $!;
150             # NOTE(review): the loop condition below appears empty in this listing; presumably the original reads <clusterFile> and the operand was lost in rendering -- confirm against the module source.
151 1         22 while (){
152             # Removing the trailing newline from the current line.
153 7         11 chomp;
154            
155             # Removing the white space from the front and end of the line.
156 7         93 $_ =~ s/^\s+|\s+$//g;
157            
158             # If the line is empty then ignore that line and go to next line.
159 7 100       18 if($_ eq ''){
160 3         9 next;
161             }
162              
163             # Expected line format in the label file:
164             # Cluster 0 (Descriptive): George Bush, Al Gore, White House, New York
165             # Cluster 0 (Discriminating): George Bush, BRITAIN London
166            
167             # Splitting each line by ":".
168 4         20 my @lineArray = split(/:/, $_);
169            
170             # If the line does not have exactly two elements after the split (for example, no
171             # data for the given cluster, or more than one ":"), then ignore that line.
172 4 50       12 if(scalar(@lineArray)!=2){
173 0         0 next;
174             }
175              
176             # The following code makes the keys (the cluster number and the type of
177             # label). Typical key structure --> "Cluster 0 (Descriptive)"
178            
179             # Splitting the element containing the key information on whitespace.
180 4         21 my @keyArray = split(/\s+/, $lineArray[0]);
181            
182             # If something is wrong with the structure, then ignore the key and carry on with
183             # the next line.
184 4 50       11 if(scalar(@keyArray)!=3){
185 0         0 next;
186             }
187            
188             # Making the outer key, which is "Cluster#" (e.g. "Cluster0").
189 4         9 my $outerKey = $keyArray[0].$keyArray[1];
190            
191             # The inner key indicates the type of label i.e. Descriptive or Discriminating.
192 4         6 my $innerKey = $keyArray[2];
193            
194             # Removing the opening '(' and closing ')' parentheses (and any commas) from the inner
195             # key.
196 4         18 $innerKey =~s/[(,)]+//g;
197            
198             # Setting the text that followed the ':' (not trimmed again) as the value for these keys.
199             # For e.g.: Cluster0{
200             # Descriptive => George Bush, Al Gore, White House, New York
201             # Discriminating => George Bush, BRITAIN London
202             # }
203 4         31 $labelSenseClustersHash{$outerKey}{$innerKey} = $lineArray[1];
204             }
205            
206             # Close the file handle.
207 1         12 close (clusterFile);
208            
209             # Returning the reference to the hash containing the label information from
210             # the cluster file.
211 1         7 return \%labelSenseClustersHash;
212             }
213              
214              
215              
216             ##########################################################################################
217             =head1 Function: readLinesFromTopicFile
218              
219             ------------------------------------------------
220              
221             This function will read lines from the topic file and return the list of all the
222             topics as a single string.
223              
224             @argument1 : Name of the topicFile.
225            
226             @return : String containing the list of all the topics(labels) for
227             the clusters.
228            
229             @description :
230              
231             1. Read the file line by line.
232            
233             2. Remove the new line characters and making string variable which
234             contains the list of all the topics.
235            
236             =cut
237             ##########################################################################################
238              
239             sub readLinesFromTopicFile{
240             # Reads the named file and returns its lines joined into one string with the line endings removed.
241             # Getting the topic file name from the first argument.
242 2     2 0 368 my $topicFileName = shift;
243              
244             # Opening the file whose name was passed as the argument; dies with the OS error if it cannot be opened.
245             # It is the name of the file which contains the list of the topics for clusters.
246 2 50       73 open topicFile, $topicFileName or die $!;
247            
248             # Defining the variable which will hold all the topics.
249 2         7 my $topicData = "";
250             # NOTE(review): the loop condition below appears empty in this listing; presumably the original reads <topicFile> and the operand was lost in rendering -- confirm against the module source.
251             # Reading the file line by line till end of file.
252 2         43 while (){
253            
254             # Removing the new line character.
255 2         7 chomp;
256            
257             # Appending it to the text read so far (no separator is inserted between lines).
258 2         22 $topicData = $topicData.$_;
259             }
260            
261             # Close the file handle.
262 2         22 close (topicFile);
263            
264             # Returning the topic list.
265 2         10 return $topicData;
266             }
267              
268              
269              
270              
271             #######################################################################################################
272             =pod
273              
274              
275             =head1 SEE ALSO
276              
277             http://senseclusters.cvs.sourceforge.net/viewvc/senseclusters/LabelEvaluation/
278            
279            
280             @Last modified by : Anand Jha
281             @Last_Modified_Date : 24th Dec. 2012
282             @Modified Version : 1.6
283            
284             =head1 AUTHORS
285              
286             Ted Pedersen, University of Minnesota, Duluth
287             tpederse at d.umn.edu
288              
289             Anand Jha, University of Minnesota, Duluth
290             jhaxx030 at d.umn.edu
291              
292              
293              
294             =head1 COPYRIGHT AND LICENSE
295              
296             Copyright (C) 2012 Ted Pedersen, Anand Jha
297              
298             See http://dev.perl.org/licenses/ for more information.
299              
300             This program is free software; you can redistribute it and/or modify
301             it under the terms of the GNU General Public License as published by
302             the Free Software Foundation; either version 2 of the License, or
303             (at your option) any later version.
304              
305             This program is distributed in the hope that it will be useful,
306             but WITHOUT ANY WARRANTY; without even the implied warranty of
307             MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
308             GNU General Public License for more details.
309              
310             You should have received a copy of the GNU General Public License
311             along with this program; if not, write to:
312            
313            
314             The Free Software Foundation, Inc., 59 Temple Place, Suite 330,
315             Boston, MA 02111-1307 USA
316            
317            
318             =cut
319             #######################################################################################################
320              
321             # Making the default return statement as 1;
322             # Reference : http://lists.netisland.net/archives/phlpm/phlpm-2001/msg00426.html
323              
324             1;