File Coverage

blib/lib/Text/SenseClusters/LabelEvaluation/ReadingFilesData.pm

Criterion	Covered	Total	%
statement	96	115	83.4
branch	22	36	61.1
condition	3	3	100.0
subroutine	7	8	87.5
pod	0	6	0.0
total	128	168	76.1

line	stmt	bran	cond	sub	pod	time	code
1							#!/usr/bin/perl -w
2
3							package Text::SenseClusters::LabelEvaluation::ReadingFilesData;
4
5	5			5		196100	use strict;
	5					12
	5					216
6	5			5		9106	use encoding "utf-8";
	5					108659
	5					34
7
8							# The following two lines will make this module inherit from the Exporter Class.
9							require Exporter;
10							our @ISA = qw(Exporter);
11
12							#######################################################################################################################
13
14							=head1 Name
15
16							Text::SenseClusters::LabelEvaluation::ReadingFilesData - Module for reading the data from a file as single string object.
17
18							=head1 SYNOPSIS
19
20							The following code snippet will show how to use this module.
21
22							Example 1: Reading the label file generated by sense cluster.
23
24							use Text::SenseClusters::LabelEvaluation::ReadingFilesData;
25
26							# Reading the cluster's labels file.
27							my $clusterFileName = "TVS.label";
28
29							# Getting the clusters file name.
30							my $clusterFileName = $driverObject->{$senseClusterLabelFileName};
31
32							# Creating the read file object and reading the label examples.
33							my $readClusterFileObject =
34							Text::SenseClusters::LabelEvaluation::ReadingFilesData->new ($clusterFileName);
35							my %labelSenseClustersHash = ();
36							my $labelSenseClustersHashRef =
37							$readClusterFileObject->readLinesFromClusterFile(\%labelSenseClustersHash);
38							%labelSenseClustersHash = %$labelSenseClustersHashRef;
39
40							# Iterating the Hash to print the value.
41							foreach my $key (sort keys %labelSenseClustersHash){
42							foreach my $innerkey (sort keys %{$labelSenseClustersHash{$key}}){
43							print "$key :: $innerkey :: $labelSenseClustersHash{$key}{$innerkey} \n";
44							}
45							}
46
47
48							Example 2: Reading the user provided Gold Standard keys and their data.
49
50							use Text::SenseClusters::LabelEvaluation::ReadingFilesData;
51							# Reading the topic file name.
52							my $topicsFileName = "TVS.txt";
53
54							# Creating the read object, which will read the gold-standard keys and data provided by user.
55							my $readFileObject =
56							Text::SenseClusters::LabelEvaluation::ReadingFilesData->new($topicsFileName);
57
58							# Reading the Mapping with help of function.
59							my ( $hashRef, $topicArrayRef ) = $readFileObject->readMappingFromTopicFile();
60
61							# Reading the hash from its reference.
62							my %mappingHash = %$hashRef;
63							my @topicArray = @$topicArrayRef;
64							# Iterating the Hash to print the value.
65							foreach my $key ( sort keys %mappingHash ) {
66							print "$key=$mappingHash{$key}\n";
67							}
68							# Iterating the Hash to print the value.
69							foreach my $key (@topicArray) {
70							print "$key\n";
71							}
72
73
74							=head1 DESCRIPTION
75
76							This module provides the various functions to read the labels and topic files.
77
78							The first function reads the labelled data generated by the SenseClusters and
79							create hash from it. The data-format of the input file must match the format
80							of label-file generated by SenseClusters.
81
82							The second function reads a file into a string variable by removing all the
83							newline characters from it.
84
85							The remaining functions read the user provided file that contains the mapping
86							of clusters labels with gold standard keys, and/or data about the gold standard
87							key or list of topics.
88
89							=cut
90
91
92							# Parameter for reading the file.
93							our $fileName = "fileName";
94
95
96							##########################################################################################
97
98							=head1 Constructor: new()
99
100							This is the constructor which will create object for this class.
101							Reference : http://perldoc.perl.org/perlobj.html
102
103							This constructor takes the following argument:
104							1. $fileNameArg :
105							The name of the file whose data has to be read.
106
107							=cut
108
109							##########################################################################################
110							sub new {
111							# Creating the object.
112	9			9	0	35	my $class = shift;
113	9					18	my $readFileObject = {};
114
115							# Explicit association is created by the built-in bless function.
116	9					27	bless $readFileObject, $class;
117
118							# Getting the Hash as the argument.
119	9					20	my $fileNameArg = shift;
120	9					58	$readFileObject->{$fileName} = $fileNameArg;
121
122							# Returning the blessed hash refered by $self.
123	9					32	return $readFileObject;
124							}
125
126
127							###########################################################################################
128
129							=head1 Function: readLinesFromClusterFile
130
131							This function will read lines from the file containing the Labels of the
132							Clusters and make the hash file.
133
134							@argument1 : Name of the cluster file name.
135
136							@argument2 : Reference of Hash ($labelSenseClustersHash) which will hold
137							the information in the following format:
138
139							For e.g.:\tCluster0{
140							Descriptive => George Bush, Al Gore, White House, New York
141							Discriminating => George Bush, York Times
142							}
143							Cluster1{
144							Descriptive => George Bush, BRITAIN London, Prime Minister
145							Discriminating => BRITAIN London, Prime Minister
146							}
147
148
149							@return : It will return the reference of the Hash mentioned above:
150							$labelSenseClustersHashRef.
151
152							@description :
153
154							1. Read the file line by line.
155							2. Ignore the lines which do not follow one of the following format:
156							Cluster 0 (Descriptive): George Bush, Al Gore, White House, New York
157							Cluster 0 (Discriminating): George Bush, BRITAIN London
158							3. Create Key from the "Cluster # (Descriptive)" or "Cluster # (Discrim
159							- inating)" as "OuterKey: Cluster#" "InnerKey: Descriptive".
160							4. Store the value of hash as the keywords similar to above example:
161							for e.g:
162							$labelSenseClustersGlobalRef{Cluster0}{Discriminating}
163							= "BRITAIN London, Prime Minister";
164
165
166							=cut
167
168							###########################################################################################
169
170							sub readLinesFromClusterFile {
171							# Reading the object as the argument.
172	5			5	0	17	my $readFileObject = shift;
173
174							# Getting the fileName Contains the Cluster and topic mapping..
175	5					17	my $clusterFileName = $readFileObject->{$fileName};
176
177							# Reading the reference from the argument.
178	5					11	my $labelSenseClustersHashRef = shift;
179							# Getting the hash from the reference.
180	5					33	my %labelSenseClustersHash = %$labelSenseClustersHashRef;
181
182							# Opening the File passed by user as the first command line argument.
183							# It should be the name of the cluster file containing the labels.
184	5	50				293	open clusterFile, $clusterFileName or die $!;
185
186							# Reading all the lines of the clusterslabel file.
187	5					162	while (<clusterFile>) {
188							# Removing the new line character.
189	60					81	chomp;
190
191							# Removing the white space from the front and end of the word.
192	60					799	$_ =~ s/^\s+|\s+$//g;
193
194							# If the line is empty then ignore that line and go to next line.
195	60	100				129	if ( $_ eq '' ) {
196	30					110	next;
197							}
198
199							# Contents of LabelFile.
200							# Cluster 0 (Descriptive): George Bush, Al Gore, White House, New York
201							# Cluster 0 (Discriminating): George Bush, BRITAIN London
202
203							# Spiliting each line by ":".
204	30					136	my @lineArray = split( /:/, $_ );
205
206							# If the given do not have Two elements after split. (It means no data for the
207							# given cluster.) Then ignore that cluster.
208	30	50				80	if ( scalar(@lineArray) != 2 ) {
209	0					0	next;
210							}
211
212							# Following Code are for making the Key (which will be Cluster Number and Type of
213							# Labels) Typical Key Structure --> "Cluster 0 (Descriptive)"
214
215							# Spiliting the elements contianing the information about the key with whitespace
216	30					85	my @keyArray = split( /\s+/, $lineArray[0] );
217
218							# If something wrong with the structure than ignore the key and carry on with
219							# next line.
220	30	50				80	if ( scalar(@keyArray) != 3 ) {
221	0					0	next;
222							}
223
224							# Making of the Outer key, which is "cluster#"
225	30					63	my $outerKey = $keyArray[0] . $keyArray[1];
226
227							# The inner key indicates the type of label i.e. Descriptive or Discriminating.
228	30					55	my $innerKey = $keyArray[2];
229
230							# Removing the start parenthesis '(' and closing ')' parenthesis from the inner
231							# key.
232	30					153	$innerKey =~ s/[(,)]+//g;
233
234							# Setting the keywords associated with this keys as the value.
235							# For e.g.: Cluster0{
236							# Descriptive => George Bush, Al Gore, White House, New York
237							# Discriminating => George Bush, BRITAIN London
238							# }
239	30					248	$labelSenseClustersHash{$outerKey}{$innerKey} = $lineArray[1];
240							}
241
242							# Close the file handle.
243	5					55	close(clusterFile);
244
245							# Returning the reference of the Hash containg the Labels information from
246							# the cluster.
247	5					25	return \%labelSenseClustersHash;
248							}
249
250							##########################################################################################
251
252							=head1 Function: readLinesFromTopicFile
253
254							This function will read lines from the topic file and list of all the topics.
255
256							@argument1 : Name of the topicFile.
257
258							@return : String containing the list of all the topics(labels) for
259							the clusters.
260
261							@description :
262							1. Read the file line by line.
263							2. Remove the new line characters and making string variable which contains the
264							list of all the topics.
265
266							=cut
267
268							##########################################################################################
269
270							sub readLinesFromTopicFile {
271							# Reading the object as the argument.
272	1			1	0	2	my $readFileObject = shift;
273
274							# Getting the topic file name from argument.
275	1					3	my $topicFileName = $readFileObject->{$fileName};
276
277							# Opening the File, whose name is passed as the second command-line-argument.
278							# It is the name of the file which contains the list of the topics for clusters.
279	1	50				40	open topicFile, $topicFileName or die $!;
280
281							# Defining the variable which will hold all the topics.
282	1					3	my $topicData = "";
283
284							# Reading the file line by line till end of file.
285	1					24	while (<topicFile>) {
286							# Removing the new line character.
287	2					5	chomp;
288
289							# Concatenating it to previous line.
290	2					13	$topicData = $topicData . $_;
291							}
292							# Close the file handle.
293	1					11	close(topicFile);
294
295							# Returning the topic list.
296	1					6	return $topicData;
297							}
298
299
300
301
302							##########################################################################################
303
304							=head1 Function: readMappingFromTopicFile
305
306							This function will read mapping provided by the user for the Cluster's label (Cluster#)
307							and gold standard key(topic-name).
308
309							Syntax of the file:
310							<#>
311							Example:
312							Cluster0:::topic1
313							Cluster1:::topic2
314							Cluster2:::topic0
315
316							@argument : $readFileObject : Object of the current file.
317
318							@return1 : \%clusterTopicMappingHash : DataType : (Reference to Hash)
319							Reference of Hash containing the mapping between the Cluster's
320							label and gold standard key.
321
322							@return2 : \@topicArray : DataType : (Reference to array)
323							Reference of array containg the gold standard keys.
324
325							@description :
326							1. Read the file line by line.
327							2. Check the line, if it contains the "Cluster#:::".
328							3. Spliting these line with Seprator":::".
329							4. A WordArray do not have 2 elements, ignore it.
330							3. Otherwise ignore the remaining lines.
331
332
333							Reason for selecting the separtor as ":::"
334							1. It will ensure that it is unique and it has very rare chance of occuring
335							in a documents or text.
336
337							=cut
338
339							##########################################################################################
340							sub readMappingFromTopicFile {
341							# Reading the object as the argument.
342	2			2	0	4	my $readFileObject = shift;
343
344							# Getting the fileName Contains the Cluster and topic mapping..
345	2					4	my $topicFileName = $readFileObject->{$fileName};
346
347							# Opening the topicFile.
348	2	50				67	open topicFile, $topicFileName or die $!;
349
350							# Defining the hash which will store the hash information.
351	2					6	my %clusterTopicMappingHash = ();
352
353							# Defining the array which will hold the topic name.
354	2					3	my @topicArray = ();
355
356							# Defining the index for the array.
357	2					5	my $index = 0;
358
359							# Reading the file line by line till end of file.
360	2					50	while ( my $lineData = <topicFile> ) {
361
362							# Removing the new line character.
363	76					64	chomp($lineData);
364
365							# Removing space from the front and back.
366	76					1435	$lineData =~ s/^\s+|\s+$//g;
367
368							# If the line start with "Cluster".
369	76	100				333	if ( $lineData =~ m/^cluster/i ) {
370
371							# Spliting with Seprator":::".
372	6					20	my @wordsOfSentenceArray = split( /:::/, $lineData );
373
374							# If the WordArray do not have 2 elements, ignore it.
375	6	50				16	if ( scalar(@wordsOfSentenceArray) != 2 ) {
376	0					0	next;
377							}
378
379							# Removing the front and last whitespace from cluster name and topic name.
380	6					19	$wordsOfSentenceArray[0]=~ s/^\s+|\s+$//g;
381	6					21	$wordsOfSentenceArray[1]=~ s/^\s+|\s+$//g;
382
383							# Storing the mapping into the hash.
384	6					9	$clusterTopicMappingHash{$wordsOfSentenceArray[0]} = $wordsOfSentenceArray[1];
385
386							# Also storing the list of all the topics name in the array.
387	6					28	$topicArray[$index++] = $wordsOfSentenceArray[1];
388							}
389							# If the line do not start with Cluster, ignore it.
390							}
391
392							# Close the file handle.
393	2					18	close(topicFile);
394
395							# Returning the topic list.
396	2					12	return (\%clusterTopicMappingHash, \@topicArray);
397							}
398
399
400							###########################################################################################
401
402							=head1 Function: readTopicDataFromTopicFile
403
404							This function will read data about the gold standard key(topic-name).
405
406							Syntax of the file:
407
408							Example:
409
410							topic1:::data1, data1 data1 data1 data1 data1 data1 data1 data1 data1 data1 data1 data1
411							data1 data1 data1 data1 data1 data1 data1 data1 data1 data1 data1 data1 data1 data1
412							topic2:::data2, data2 data2 data2 data2 data2 data2 data2 data2 data2 data2 data2 data2
413							data2 data2 data2 data2 data2 data2 data2 data2 data2 data1 data1 data1 data1 data1
414
415							@argument : $readFileObject : Object of the current file.
416
417							@return : \%topicDataHash : DataType : (Reference to Hash)
418							Reference of Hash containing the topics and their corresponding
419							data.
420
421							@description :
422							1. Read the file line by line.
423							2. Check the line, if it contains the ":::" and starts with one of the topic:
424							a. This indicates the start of the topic's data.
425							b. Read the line till we encounter another "topic-name:::" or "cluster#:::"
426							4. Finally, make hash containing the topic as the key and topic's data as the
427							value.
428							3. Return the reference of this hash.
429
430							=cut
431
432							##########################################################################################
433							sub readTopicDataFromTopicFile {
434							# Reading the object as the argument.
435	2			2	0	4	my $readFileObject = shift;
436
437							# Getting the fileName Contains the Cluster and topic mapping..
438	2					8	my $topicFileName = $readFileObject->{$fileName};
439
440							# Opening the topicFile.
441	2	50				66	open topicFile, $topicFileName or die $!;
442
443							# Getting the reference of the array containing the topic name.
444	2					6	my $topicNameArrayRef = shift;
445
446							# Getting the array from the reference.
447	2					6	my @topicNameArray = @$topicNameArrayRef;
448							# print "@topicNameArray \n\n\n";
449
450							# Defining the hash containing the topics and their corresponding data.
451	2					5	my %topicDataHash = ();
452
453							# Defining the array which will hold all the data for the file.
454	2					4	my @fileData =();
455
456							# Reading the file line by line till end of file.
457	2					82	while ( my $lineData = <topicFile> ) {
458							# Adding all the line data in a array.
459	143					408	push (@fileData, $lineData);
460							}
461
462							# If the topic is not present than read the files and populate the value.
463	2	100				10	if(@topicNameArray ==0){
464
465							# Iterating through the array which contains all the data.
466	1					3	foreach my $lineData (@fileData) {
467
468							# Removing the new line character.
469	70					79	chomp($lineData);
470
471							# Reading the topic value.
472	70	100				156	if($lineData =~ m/^(.+):::/){
473							# Removing the white spaces around the topic.
474	3					17	$1 =~ s/^\s+|\s+$//;
475	3					8	push(@topicNameArray,$1);
476							}
477							}
478							}
479
480							# Iterating through the array which contains all the data.
481	2					6	foreach my $topicName (@topicNameArray){
482
483	6					10	my $topicKey = "";
484	6					7	my $topicData = "";
485
486							# This variable is set to 1, only when we are reading the "topic:::" for first time.
487	6					8	my $startOfTopicData = 1;
488
489							# Reading the file line by line till end of file.
490	6					8	foreach my $lineData (@fileData) {
491
492							# Removing the new line character.
493	305					393	chomp($lineData);
494
495							# If this is already set 0 and we encounter "topicName:::" or "Cluster#:::" then
496							# reading of data about the topic is over.
497	305	100	100			905	if($startOfTopicData == 0 && ($lineData =~ m/^.+:::/i) ){
498	4					9	last;
499							}
500
501							# Making the variable metadata for Regular expression search.
502	301					351	my $searchString = quotemeta $topicName;
503
504							# If the line start with any topicName.
505	301	100				1129	if ( $lineData =~ m/^$searchString\:::/i) {
506
507							# ReSetting the start counter to 0.
508	6					11	$startOfTopicData = 0;
509
510							# Setting the topic key.
511	6					10	$topicKey = $topicName;
512
513							# The removing the topic from the line and remaining terms will be part of
514							# the data about the topics.
515	6					86	$lineData =~ s/(^$searchString\:::)//i;
516	6					50	$topicData = $lineData;
517	6					15	next;
518							}
519
520							# Each subsequent line will be keep on adding to the topic data, till we encounter
521							# "Cluster#:::" or "topic#:::" or "end of file"
522	295	100				543	if($startOfTopicData == 0){
523	132					574	$topicData = $topicData." ".$lineData;
524							}
525							}
526							# Removing space from the front and back.
527	6					2734	$topicData =~ s/^\s+|\s+$//g;
528
529	6	50				19	if($topicKey){
530	6					22	$topicDataHash{$topicKey} = $topicData;
531							}
532
533							}
534
535							# Close the file handle.
536	2					31	close(topicFile);
537
538							# Returning the topic list.
539	2					29	return \%topicDataHash;
540							}
541
542							###########################################################################################
543
544							=head1 Function: readTopicNamesFromTopicFile
545
546							This function will list all the topics from the file provided by user.
547
548							Syntax of the file:
549
550
551
552
553
554
555
556							Example:
557
558							topic1:::data1, data1 data1 data1 data1 data1 data1 data1 data1 data1 data1 data1 data1
559							data1 data1 data1 data1 data1 data1 data1 data1 data1 data1 data1 data1 data1 data1
560							topic2:::data2, data2 data2 data2 data2 data2 data2 data2 data2 data2 data2 data2 data2
561							data2 data2 data2 data2 data2 data2 data2 data2 data2 data1 data1 data1 data1 data1
562							cluster0:::topic1
563							cluster1:::topic2
564							cluster2:::topic0
565
566							@argument : $readFileObject : Object of the current file.
567
568							@return : \@topicNameArray : DataType : (Reference to array)
569							Reference of array containing the list of topics.
570
571							@description :
572							1. Read the file line by line.
573							2. Check the line, if it contains the ":::"
574							a. if starts with "cluster" ignore it.
575							b. otherwise, split that line with separator, ":::" and store the results in array.
576							c. The first element of the array is the topic-name.
577							d. Push, this topic-name into the array.
578							3. Return the reference of this array.
579
580
581							Reason for selecting the separtor as ":::"
582							1. It will ensure that it is unique and it has very rare chance of occuring
583							in a documents or text.
584							=cut
585
586							##########################################################################################
587							sub readTopicNamesFromTopicFile {
588
589							# Reading the object as the argument.
590	0			0	0		my $readFileObject = shift;
591
592							# Getting the fileName Contains the Cluster and topic mapping..
593	0						my $topicFileName = $readFileObject->{$fileName};
594
595							# Opening the topicFile.
596	0	0					open topicFile, $topicFileName or die $!;
597
598							# Variable which will contains the topics.
599	0						my @topicNameArray = ();
600
601							# Defining the index for the array.
602	0						my $index = 0;
603
604							# Reading the file line by line till end of file.
605	0						while ( my $lineData = <topicFile> ) {
606							# Removing the new line character.
607	0						chomp($lineData);
608
609							# Removing space from the front and back.
610	0						$lineData =~ s/^\s+|\s+$//;
611
612							# If the line start with Cluster, ignore it.
613	0	0					if ( $lineData =~ m/^cluster/i ) {
614	0						next;
615							}
616
617							# If the line contains ":::".
618	0	0					if ( $lineData =~ m/:::/ ) {
619							# Spliting with Seprator":::".
620	0						my @wordsOfSentenceArray = split( /:::/, $lineData );
621
622							# Removing the white spaces around the topic.
623	0						$wordsOfSentenceArray[0] =~ s/^\s+|\s+$//;
624
625							# Adding the terms to topic-array.
626	0						$topicNameArray[$index++] = $wordsOfSentenceArray[0];
627							}
628							}
629							# Close the file handle.
630	0						close(topicFile);
631
632							# Returning the topic list.
633	0						return (\@topicNameArray);
634							}
635
636							#######################################################################################################
637
638							=pod
639
640							=head1 SEE ALSO
641
642							http://senseclusters.cvs.sourceforge.net/viewvc/senseclusters/LabelEvaluation/
643
644							Last modified by :
645							$Id: ReadingFilesData.pm,v 1.5 2013/03/07 23:15:49 jhaxx030 Exp $
646
647							=head1 AUTHORS
648
649							Anand Jha, University of Minnesota, Duluth
650							jhaxx030 at d.umn.edu
651
652							Ted Pedersen, University of Minnesota, Duluth
653							tpederse at d.umn.edu
654
655							=head1 COPYRIGHT AND LICENSE
656
657							Copyright (C) 2012-2013 Ted Pedersen, Anand Jha
658
659							See http://dev.perl.org/licenses/ for more information.
660
661							This program is free software; you can redistribute it and/or modify
662							it under the terms of the GNU General Public License as published by
663							the Free Software Foundation; either version 2 of the License, or
664							(at your option) any later version.
665
666							This program is distributed in the hope that it will be useful,
667							but WITHOUT ANY WARRANTY; without even the implied warranty of
668							MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
669							GNU General Public License for more details.
670
671							You should have received a copy of the GNU General Public License
672							along with this program; if not, write to:
673
674
675							The Free Software Foundation, Inc., 59 Temple Place, Suite 330,
676							Boston, MA 02111-1307 USA
677
678
679							=cut
680
681							#######################################################################################################
682
683							# Making the default return statement as 1;
684							# Reference : http://lists.netisland.net/archives/phlpm/phlpm-2001/msg00426.html
685
686							1;