line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
# WordNet::Extend::Locate.pm version 0.030 |
2
|
|
|
|
|
|
|
# Updated: 01/16/17 |
3
|
|
|
|
|
|
|
# |
4
|
|
|
|
|
|
|
# Ted Pedersen, University of Minnesota Duluth |
5
|
|
|
|
|
|
|
# tpederse at d.umn.edu |
6
|
|
|
|
|
|
|
# |
7
|
|
|
|
|
|
|
# Jon Rusert, University of Minnesota Duluth |
8
|
|
|
|
|
|
|
# ruse0008 at d.umn.edu |
9
|
|
|
|
|
|
|
# |
10
|
|
|
|
|
|
|
# This program is free software: you can redistribute it and/or modify |
11
|
|
|
|
|
|
|
# it under the terms of the GNU General Public License as published by |
12
|
|
|
|
|
|
|
# the Free Software Foundation, either version 3 of the License, or |
13
|
|
|
|
|
|
|
# (at your option) any later version. |
14
|
|
|
|
|
|
|
# |
15
|
|
|
|
|
|
|
# This program is distributed in the hope that it will be useful, |
16
|
|
|
|
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of |
17
|
|
|
|
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
18
|
|
|
|
|
|
|
# GNU General Public License for more details. |
19
|
|
|
|
|
|
|
# |
20
|
|
|
|
|
|
|
# You should have received a copy of the GNU General Public License |
21
|
|
|
|
|
|
|
# along with this program. If not, see . |
22
|
|
|
|
|
|
|
# |
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
package WordNet::Extend::Locate; |
25
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
=head1 NAME |
27
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
WordNet::Extend::Locate - Perl modules for locating where in WordNet a |
29
|
|
|
|
|
|
|
lemma should be inserted. |
30
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
=head1 SYNOPSIS |
32
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
=head2 Basic Usage Example |
34
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
use WordNet::Extend::Locate; |
36
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
my $locate = WordNet::Extend::Locate->new(); |
38
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
$locate->stopList('s/\b(the|is|at)\b//'); |
40
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
$locate->setCleanUp(1); |
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
$locate->preProcessing(); |
44
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
$locate->toggleCompareGlosses(1,1,0); |
46
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
$locate->setBonus(25); |
48
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
$locate->toggleRefineSense(0); |
50
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
print "Finding location for 'dog noun withdef.1 man's best friend'\n"; |
52
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
@location = @{$locate->locate("dog\tnoun\twithdef.1\tman\'s best friend")}; |
54
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
print "Location found: @location\n"; |
56
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
=head1 DESCRIPTION |
58
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
=head2 Introduction |
60
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
WordNet is a widely used tool in NLP and other research areas. A drawback of WordNet is the amount of time between updates. WordNet was last updated and released in December, 2006, and no further updates are planned. WordNet::Extend::Locate aims to help users decide where a good place to insert new lemmas into WordNet is by presenting several different methods to run. Users can then take the suggestion from Locate and use that with WordNet::Extend::Insert or simply use it as a guiding point and choose their own location. |
62
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
=over |
64
|
|
|
|
|
|
|
=cut |
65
|
|
|
|
|
|
|
|
66
|
1
|
|
|
1
|
|
13174
|
use WordNet::QueryData; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
67
|
|
|
|
|
|
|
#use Wiktionary::Parser; |
68
|
|
|
|
|
|
|
use Getopt::Long; |
69
|
|
|
|
|
|
|
use File::Spec; |
70
|
|
|
|
|
|
|
use Lingua::Stem; |
71
|
|
|
|
|
|
|
use Lingua::EN::Tagger; |
72
|
|
|
|
|
|
|
use WordNet::Similarity::vector; |
73
|
|
|
|
|
|
|
#use List::Util; |
74
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
our ($VERSION, @ISA, @EXPORT, @EXPORT_OK, %EXPORT_TAGS); |
76
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
@ISA = qw(Exporter); |
78
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
%EXPORT_TAGS = (); |
80
|
|
|
|
|
|
|
|
81
|
|
|
|
|
|
|
@EXPORT_OK = (); |
82
|
|
|
|
|
|
|
|
83
|
|
|
|
|
|
|
@EXPORT = (); |
84
|
|
|
|
|
|
|
|
85
|
|
|
|
|
|
|
$VERSION = '0.030'; |
86
|
|
|
|
|
|
|
|
87
|
|
|
|
|
|
|
#**************Variables********************** |
88
|
|
|
|
|
|
|
$wn = WordNet::QueryData->new; #to be used to access data from wordnet |
89
|
|
|
|
|
|
|
$stemmer = Lingua::Stem->new; #used to stem words for better overlaps etc. |
90
|
|
|
|
|
|
|
$tagger = Lingua::EN::Tagger->new; #used to tag words' pos for similarity measure |
91
|
|
|
|
|
|
|
$measure = WordNet::Similarity::vector->new ($wn); #used to measure similarity for Similarity |
92
|
|
|
|
|
|
|
@wordNetNouns; #stores all words for noun sense from wordnet |
93
|
|
|
|
|
|
|
@wordNetVerbs; #stores all words for verb sense from wordnet |
94
|
|
|
|
|
|
|
%wnGlosses = (); |
95
|
|
|
|
|
|
|
@wnNounSenses; |
96
|
|
|
|
|
|
|
@wnVerbSenses; |
97
|
|
|
|
|
|
|
%wnHypes = (); |
98
|
|
|
|
|
|
|
%wnHypos = (); |
99
|
|
|
|
|
|
|
%wnSyns = (); |
100
|
|
|
|
|
|
|
%wnFreq = (); |
101
|
|
|
|
|
|
|
#our $wikParser = Wiktionary::Parser->new(); #Parses data from wiktionary pages. |
102
|
|
|
|
|
|
|
$stopList = "s/\b(the|is|at|which|on|a|an|and|or|up)\b//g"; #default stop list. |
103
|
|
|
|
|
|
|
$preProcessed = 0; #Flag to determine if preProcessing() has been called. |
104
|
|
|
|
|
|
|
$cleanUp = 1; #If cleanUp is on, glosses will be cleanedUp, can be toggled with setCleanUp(); |
105
|
|
|
|
|
|
|
$userCleanUp = ""; #Cleanup step specified by user in addCleanUp(); |
106
|
|
|
|
|
|
|
$useHypeGlosses = 1; #Toggle for use of hypernym glosses in comparisons. |
107
|
|
|
|
|
|
|
$useHypoGlosses = 1; #Toggle for use of hyponym glosses in comparisons. |
108
|
|
|
|
|
|
|
$useSynsGlosses = 1; #Toggle for use of synset glosses in comparisons. |
109
|
|
|
|
|
|
|
$bonus = 10; #Bonus to be used for lemmas that contain the new lemma. Can be set with setBonus(); |
110
|
|
|
|
|
|
|
$refineSense = 0; #Toggle for use of refineSense() method, default on. |
111
|
|
|
|
|
|
|
$help = 0; |
112
|
|
|
|
|
|
|
$scoringMethod = 'baseline'; |
113
|
|
|
|
|
|
|
@scoringMethods = ('baseline', 'BwS', 'Similarity'); |
114
|
|
|
|
|
|
|
$stemming = 0; #Toggle for stemming on or off. |
115
|
|
|
|
|
|
|
$stemmed = 0; #flag for use in BwS |
116
|
|
|
|
|
|
|
#********************************************* |
117
|
|
|
|
|
|
|
|
118
|
|
|
|
|
|
|
GetOptions('help' => \$help); |
119
|
|
|
|
|
|
|
if($help == 1) |
120
|
|
|
|
|
|
|
{ |
121
|
|
|
|
|
|
|
printHelp(); |
122
|
|
|
|
|
|
|
exit(0); |
123
|
|
|
|
|
|
|
} |
124
|
|
|
|
|
|
|
|
125
|
|
|
|
|
|
|
=head2 Methods |
126
|
|
|
|
|
|
|
|
127
|
|
|
|
|
|
|
The following methods are defined in this package: |
128
|
|
|
|
|
|
|
|
129
|
|
|
|
|
|
|
=head3 Public methods |
130
|
|
|
|
|
|
|
|
131
|
|
|
|
|
|
|
=over |
132
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
=item $obj->new() |
134
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
The constructor for WordNet::Extend::Locate objects. |
136
|
|
|
|
|
|
|
|
137
|
|
|
|
|
|
|
Parameters: none. |
138
|
|
|
|
|
|
|
|
139
|
|
|
|
|
|
|
Return value: the new blessed object |
140
|
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
=cut |
142
|
|
|
|
|
|
|
|
143
|
|
|
|
|
|
|
sub new |
144
|
|
|
|
|
|
|
{ |
145
|
|
|
|
|
|
|
my $class = shift; |
146
|
|
|
|
|
|
|
my $self = {}; |
147
|
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
$self->{errorString} = ''; |
149
|
|
|
|
|
|
|
$self->{error}=0; |
150
|
|
|
|
|
|
|
|
151
|
|
|
|
|
|
|
bless $self, $class; |
152
|
|
|
|
|
|
|
|
153
|
|
|
|
|
|
|
return $self; |
154
|
|
|
|
|
|
|
} |
155
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
=item $obj->getError() |
157
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
Allows the object to check if any errors have occurred. |
159
|
|
|
|
|
|
|
Returns an array ($error, $errorString), where $error |
160
|
|
|
|
|
|
|
value equal to 1 represents a warning and 2 represents |
161
|
|
|
|
|
|
|
an error and $errString contains the possible error. |
162
|
|
|
|
|
|
|
For example, if a user forgets to run preProcessing() before |
163
|
|
|
|
|
|
|
a method that relies on it, the error would be 2 and errorString |
164
|
|
|
|
|
|
|
would mention that preProcessing had not been run. |
165
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
Parameter: None |
167
|
|
|
|
|
|
|
|
168
|
|
|
|
|
|
|
Returns: array of the form ($error, $errorString). |
169
|
|
|
|
|
|
|
|
170
|
|
|
|
|
|
|
=cut |
171
|
|
|
|
|
|
|
sub getError() |
172
|
|
|
|
|
|
|
{ |
173
|
|
|
|
|
|
|
my $self = shift; |
174
|
|
|
|
|
|
|
my $error = $self->{error}; |
175
|
|
|
|
|
|
|
my $errString = $self->{errorString}; |
176
|
|
|
|
|
|
|
$self->{error}=0; |
177
|
|
|
|
|
|
|
$self->{errorString} = ""; |
178
|
|
|
|
|
|
|
$errString =~ s/^[\r\n\t ]+//; |
179
|
|
|
|
|
|
|
return ($error, $errString); |
180
|
|
|
|
|
|
|
} |
181
|
|
|
|
|
|
|
|
182
|
|
|
|
|
|
|
=item $obj->locateFile($input_file, $output_file) |
183
|
|
|
|
|
|
|
|
184
|
|
|
|
|
|
|
Attempts to locate best WordNet position for each word |
185
|
|
|
|
|
|
|
from input file into WordNet, outputs results to output file. |
186
|
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
Parameter: location of input file and output file respectively |
188
|
|
|
|
|
|
|
|
189
|
|
|
|
|
|
|
Returns: nothing |
190
|
|
|
|
|
|
|
|
191
|
|
|
|
|
|
|
=cut |
192
|
|
|
|
|
|
|
|
193
|
|
|
|
|
|
|
sub locateFile() |
194
|
|
|
|
|
|
|
{ |
195
|
|
|
|
|
|
|
my $input = File::Spec->canonpath($_[1]); |
196
|
|
|
|
|
|
|
my $output = File::Spec->canonpath($_[2]); |
197
|
|
|
|
|
|
|
|
198
|
|
|
|
|
|
|
#Attempts to open input data |
199
|
|
|
|
|
|
|
open DATA, "$input" or die $!; |
200
|
|
|
|
|
|
|
open (OUTDATA, '>', "$output") or die $!; |
201
|
|
|
|
|
|
|
|
202
|
|
|
|
|
|
|
#if preProcessing() hasn't been called, call it. |
203
|
|
|
|
|
|
|
if($preProcessed == 0) |
204
|
|
|
|
|
|
|
{ |
205
|
|
|
|
|
|
|
preProcessing(); |
206
|
|
|
|
|
|
|
} |
207
|
|
|
|
|
|
|
|
208
|
|
|
|
|
|
|
my @outLemma = ("","",""); |
209
|
|
|
|
|
|
|
|
210
|
|
|
|
|
|
|
while() #While lemmas are left in the input data |
211
|
|
|
|
|
|
|
{ |
212
|
|
|
|
|
|
|
for $tempIn (split("\n")) #processes data line by line. |
213
|
|
|
|
|
|
|
{ |
214
|
|
|
|
|
|
|
@outLemma = @{locate($tempIn)}; |
215
|
|
|
|
|
|
|
|
216
|
|
|
|
|
|
|
$tempOut = "$outLemma[0]\t$outLemma[1]\t$outLemma[2]\n"; |
217
|
|
|
|
|
|
|
|
218
|
|
|
|
|
|
|
print OUTDATA "$tempOut"; |
219
|
|
|
|
|
|
|
} |
220
|
|
|
|
|
|
|
} |
221
|
|
|
|
|
|
|
close DATA; |
222
|
|
|
|
|
|
|
close OUTDATA; |
223
|
|
|
|
|
|
|
} |
224
|
|
|
|
|
|
|
|
225
|
|
|
|
|
|
|
=item $obj->locate($wordPosGloss) |
226
|
|
|
|
|
|
|
|
227
|
|
|
|
|
|
|
Takes in single lemma with gloss and returns location of best |
228
|
|
|
|
|
|
|
insertion point in WordNet. |
229
|
|
|
|
|
|
|
|
230
|
|
|
|
|
|
|
Parameter: Lemma string in format of 'word\tpos\titem-id\tdef' |
231
|
|
|
|
|
|
|
NOTE: String must only be separated by \t no space. |
232
|
|
|
|
|
|
|
|
233
|
|
|
|
|
|
|
Returns: Array in format of (item-id, WordNet sense, operation) |
234
|
|
|
|
|
|
|
|
235
|
|
|
|
|
|
|
=cut |
236
|
|
|
|
|
|
|
sub locate() |
237
|
|
|
|
|
|
|
{ |
238
|
|
|
|
|
|
|
my $base = 0; |
239
|
|
|
|
|
|
|
if(scalar @_ == 2)#checks if method entered by object. |
240
|
|
|
|
|
|
|
{ |
241
|
|
|
|
|
|
|
$base = 1; |
242
|
|
|
|
|
|
|
} |
243
|
|
|
|
|
|
|
|
244
|
|
|
|
|
|
|
#if preProcessing() hasn't been called, call it. |
245
|
|
|
|
|
|
|
if($preProcessed == 0) |
246
|
|
|
|
|
|
|
{ |
247
|
|
|
|
|
|
|
preProcessing(); |
248
|
|
|
|
|
|
|
} |
249
|
|
|
|
|
|
|
|
250
|
|
|
|
|
|
|
my @inLemma = split("\t", $_[$base]); #stores lemma as formatted above |
251
|
|
|
|
|
|
|
my @outLemma = @{processLemma(\@inLemma)}; |
252
|
|
|
|
|
|
|
return \@outLemma; |
253
|
|
|
|
|
|
|
|
254
|
|
|
|
|
|
|
} |
255
|
|
|
|
|
|
|
|
256
|
|
|
|
|
|
|
=item $obj->stopList($newStopList) |
257
|
|
|
|
|
|
|
|
258
|
|
|
|
|
|
|
Takes in new stop list, in regex form |
259
|
|
|
|
|
|
|
|
260
|
|
|
|
|
|
|
Parameter:the new stop list in regex substitution form s/.../g? |
261
|
|
|
|
|
|
|
|
262
|
|
|
|
|
|
|
Returns: nothing |
263
|
|
|
|
|
|
|
|
264
|
|
|
|
|
|
|
=cut |
265
|
|
|
|
|
|
|
|
266
|
|
|
|
|
|
|
sub stopList() |
267
|
|
|
|
|
|
|
{ |
268
|
|
|
|
|
|
|
my $base = 0; |
269
|
|
|
|
|
|
|
if(scalar @_ == 2)#checks if method entered by object. |
270
|
|
|
|
|
|
|
{ |
271
|
|
|
|
|
|
|
$base = 1; |
272
|
|
|
|
|
|
|
} |
273
|
|
|
|
|
|
|
my $tempStopList = $_[$base]; |
274
|
|
|
|
|
|
|
if($tempStopList =~ /s\/.*?\/g?/g) |
275
|
|
|
|
|
|
|
{ |
276
|
|
|
|
|
|
|
$stopList = $tempStopList; |
277
|
|
|
|
|
|
|
} |
278
|
|
|
|
|
|
|
else |
279
|
|
|
|
|
|
|
{ |
280
|
|
|
|
|
|
|
my $self = shift; |
281
|
|
|
|
|
|
|
$self->{error} = 1; |
282
|
|
|
|
|
|
|
$self->{errorString} = "Proposed stop list not in regex substition form s/.../g?, default remains"; |
283
|
|
|
|
|
|
|
} |
284
|
|
|
|
|
|
|
} |
285
|
|
|
|
|
|
|
|
286
|
|
|
|
|
|
|
=item $obj->setCleanUp($switch) |
287
|
|
|
|
|
|
|
|
288
|
|
|
|
|
|
|
Allows the user to toggle whether or not |
289
|
|
|
|
|
|
|
glosses should be cleaned up. |
290
|
|
|
|
|
|
|
|
291
|
|
|
|
|
|
|
Parameter: 0 or 1 to turn clean up off or on respectively |
292
|
|
|
|
|
|
|
|
293
|
|
|
|
|
|
|
Returns: nothing |
294
|
|
|
|
|
|
|
|
295
|
|
|
|
|
|
|
=cut |
296
|
|
|
|
|
|
|
|
297
|
|
|
|
|
|
|
sub setCleanUp() |
298
|
|
|
|
|
|
|
{ |
299
|
|
|
|
|
|
|
my $base = 0; |
300
|
|
|
|
|
|
|
if(scalar @_ == 2)#checks if method entered by object. |
301
|
|
|
|
|
|
|
{ |
302
|
|
|
|
|
|
|
$base = 1; |
303
|
|
|
|
|
|
|
} |
304
|
|
|
|
|
|
|
|
305
|
|
|
|
|
|
|
if($_[$base] == 0) #turns cleanUp off. |
306
|
|
|
|
|
|
|
{ |
307
|
|
|
|
|
|
|
$cleanUp = 0; |
308
|
|
|
|
|
|
|
} |
309
|
|
|
|
|
|
|
else #turns cleanUp on. |
310
|
|
|
|
|
|
|
{ |
311
|
|
|
|
|
|
|
$cleanUp = 1; |
312
|
|
|
|
|
|
|
} |
313
|
|
|
|
|
|
|
} |
314
|
|
|
|
|
|
|
|
315
|
|
|
|
|
|
|
=item $obj->addCleanUp($cleanUp) |
316
|
|
|
|
|
|
|
|
317
|
|
|
|
|
|
|
Allows the user to add their own |
318
|
|
|
|
|
|
|
regex for cleaning up the glosses. |
319
|
|
|
|
|
|
|
|
320
|
|
|
|
|
|
|
Parameter: Regex representing the cleanup |
321
|
|
|
|
|
|
|
the user wants performed. |
322
|
|
|
|
|
|
|
|
323
|
|
|
|
|
|
|
Returns: Nothing |
324
|
|
|
|
|
|
|
|
325
|
|
|
|
|
|
|
=cut |
326
|
|
|
|
|
|
|
|
327
|
|
|
|
|
|
|
sub addCleanUp() |
328
|
|
|
|
|
|
|
{ |
329
|
|
|
|
|
|
|
my $base = 0; |
330
|
|
|
|
|
|
|
if(scalar @_ == 2)#checks if method entered by object. |
331
|
|
|
|
|
|
|
{ |
332
|
|
|
|
|
|
|
$base = 1; |
333
|
|
|
|
|
|
|
} |
334
|
|
|
|
|
|
|
|
335
|
|
|
|
|
|
|
my $tempCleanUp = $_[$base]; |
336
|
|
|
|
|
|
|
if($tempCleanUp =~ /(s|t)\/.*\/g?/g) |
337
|
|
|
|
|
|
|
{ |
338
|
|
|
|
|
|
|
$userCleanUp = $tempCleanUp; |
339
|
|
|
|
|
|
|
} |
340
|
|
|
|
|
|
|
else |
341
|
|
|
|
|
|
|
{ |
342
|
|
|
|
|
|
|
my $self = shift; |
343
|
|
|
|
|
|
|
$self->{error} = 1; |
344
|
|
|
|
|
|
|
$self->{errorString} = "Clean Up not in regex format '/.../', default remains on"; |
345
|
|
|
|
|
|
|
} |
346
|
|
|
|
|
|
|
} |
347
|
|
|
|
|
|
|
|
348
|
|
|
|
|
|
|
=item $obj->preProcessing() |
349
|
|
|
|
|
|
|
|
350
|
|
|
|
|
|
|
Highly increases speed of program by making |
351
|
|
|
|
|
|
|
as many outside calls as possible and storing |
352
|
|
|
|
|
|
|
outside info to be used later. |
353
|
|
|
|
|
|
|
|
354
|
|
|
|
|
|
|
Parameter: none |
355
|
|
|
|
|
|
|
|
356
|
|
|
|
|
|
|
Returns: nothing |
357
|
|
|
|
|
|
|
|
358
|
|
|
|
|
|
|
=cut |
359
|
|
|
|
|
|
|
|
360
|
|
|
|
|
|
|
sub preProcessing() |
361
|
|
|
|
|
|
|
{ |
362
|
|
|
|
|
|
|
$preProcessed = 1; #Flag that preProcessing has been called. |
363
|
|
|
|
|
|
|
@wordNetNouns = $wn->listAllWords('noun'); #Stores all nouns from wordNet for multiple uses. |
364
|
|
|
|
|
|
|
@wordNetVerbs = $wn->listAllWords('verb'); #Stores all verbs from wordNet for multiple uses. |
365
|
|
|
|
|
|
|
#reset all glosses, senses, etc. |
366
|
|
|
|
|
|
|
%wnGlosses = (); |
367
|
|
|
|
|
|
|
@wnNounSenses; |
368
|
|
|
|
|
|
|
@wnVerbSenses; |
369
|
|
|
|
|
|
|
%wnHypes = (); |
370
|
|
|
|
|
|
|
%wnHypos = (); |
371
|
|
|
|
|
|
|
%wnSyns = (); |
372
|
|
|
|
|
|
|
%wnFreq = (); |
373
|
|
|
|
|
|
|
|
374
|
|
|
|
|
|
|
|
375
|
|
|
|
|
|
|
#Preemptively retrieves glosses, hypes, hypos, and syns for all senses as they will be used every iteration. |
376
|
|
|
|
|
|
|
foreach my $noun (@wordNetNouns) |
377
|
|
|
|
|
|
|
{ |
378
|
|
|
|
|
|
|
my @nSenses = $wn->querySense("$noun\#n"); #gets all senses for that word |
379
|
|
|
|
|
|
|
foreach my $curNSense (@nSenses) |
380
|
|
|
|
|
|
|
{ |
381
|
|
|
|
|
|
|
#stores in noun senses to differentiate from verbs. |
382
|
|
|
|
|
|
|
push(@wnNounSenses, $curNSense); |
383
|
|
|
|
|
|
|
|
384
|
|
|
|
|
|
|
#obtain each gloss and clean up before inserting into hash. |
385
|
|
|
|
|
|
|
my @nGlosses = $wn->querySense($curNSense, "glos"); |
386
|
|
|
|
|
|
|
my $tempSenseGloss = $nGlosses[0]; |
387
|
|
|
|
|
|
|
|
388
|
|
|
|
|
|
|
if($cleanUp == 1) |
389
|
|
|
|
|
|
|
{ |
390
|
|
|
|
|
|
|
#Clean up the words in the temporary sense gloss. |
391
|
|
|
|
|
|
|
$tempSenseGloss =~ s/(\(|\)|\.)//g; |
392
|
|
|
|
|
|
|
$tempSenseGloss =~ s/^a-zA-Z//g; |
393
|
|
|
|
|
|
|
$tempSenseGloss = lc $tempSenseGloss; #converts all words to lowercase. |
394
|
|
|
|
|
|
|
$tempSenseGloss =~ $stopList; #remove stop words |
395
|
|
|
|
|
|
|
} |
396
|
|
|
|
|
|
|
if($userCleanUp ne "\"\"") |
397
|
|
|
|
|
|
|
{ |
398
|
|
|
|
|
|
|
$tempSenseGloss =~ $userCleanUp; |
399
|
|
|
|
|
|
|
} |
400
|
|
|
|
|
|
|
|
401
|
|
|
|
|
|
|
#if stemming is on, stem each word in each gloss |
402
|
|
|
|
|
|
|
if($stemming == 1) |
403
|
|
|
|
|
|
|
{ |
404
|
|
|
|
|
|
|
my @tempStem = split(' ', $tempSenseGloss); |
405
|
|
|
|
|
|
|
my @stemmedGloss = @{$stemmer->stem(@tempStem)}; |
406
|
|
|
|
|
|
|
$tempSenseGloss = join(' ', @stemmedGloss); |
407
|
|
|
|
|
|
|
} |
408
|
|
|
|
|
|
|
|
409
|
|
|
|
|
|
|
#maps each sense to its gloss |
410
|
|
|
|
|
|
|
$wnGlosses{$curNSense} = $tempSenseGloss; |
411
|
|
|
|
|
|
|
|
412
|
|
|
|
|
|
|
#obtains and stores, hypes, hypos, and syns |
413
|
|
|
|
|
|
|
my @hypes = $wn->querySense($curNSense, "hype"); |
414
|
|
|
|
|
|
|
$wnHypes{$curNSense} = \@hypes; |
415
|
|
|
|
|
|
|
my @hypos = $wn->querySense($curNSense, "hypo"); |
416
|
|
|
|
|
|
|
$wnHypos{$curNSense} = \@hypos; |
417
|
|
|
|
|
|
|
my @syns = $wn->querySense($curNSense, "syns"); |
418
|
|
|
|
|
|
|
$wnSyns{$curNSense} = \@syns; |
419
|
|
|
|
|
|
|
$wnFreq{$curNSense} = $wn->frequency($curNSense); |
420
|
|
|
|
|
|
|
} |
421
|
|
|
|
|
|
|
} |
422
|
|
|
|
|
|
|
|
423
|
|
|
|
|
|
|
#stores verbs' senses' glosses, hypes, hypos, and syns. |
424
|
|
|
|
|
|
|
foreach my $verb (@wordNetVerbs) |
425
|
|
|
|
|
|
|
{ |
426
|
|
|
|
|
|
|
my @vSenses = $wn->querySense("$verb\#v"); #gets all senses for that word |
427
|
|
|
|
|
|
|
foreach my $curVSense (@vSenses) |
428
|
|
|
|
|
|
|
{ |
429
|
|
|
|
|
|
|
#stores in verb senses to differentiate later. |
430
|
|
|
|
|
|
|
push(@wnVerbSenses, $curVSense); |
431
|
|
|
|
|
|
|
|
432
|
|
|
|
|
|
|
#obtain each gloss and clean up before inserting into hash. |
433
|
|
|
|
|
|
|
my @vGlosses = $wn->querySense($curVSense, "glos"); |
434
|
|
|
|
|
|
|
my $tempSenseGloss = $vGlosses[0]; |
435
|
|
|
|
|
|
|
|
436
|
|
|
|
|
|
|
if($cleanUp == 1) |
437
|
|
|
|
|
|
|
{ |
438
|
|
|
|
|
|
|
#Clean up the words in the temporary sense gloss. |
439
|
|
|
|
|
|
|
$tempSenseGloss =~ s/(\(|\)|\.)//g; |
440
|
|
|
|
|
|
|
$tempSenseGloss =~ s/^a-zA-Z//g; |
441
|
|
|
|
|
|
|
$tempSenseGloss = lc $tempSenseGloss; #converts all words to lowercase. |
442
|
|
|
|
|
|
|
$tempSenseGloss =~ s/\b$stopList\b//g; #remove stop words |
443
|
|
|
|
|
|
|
} |
444
|
|
|
|
|
|
|
if($userCleanUp ne "\"\"") |
445
|
|
|
|
|
|
|
{ |
446
|
|
|
|
|
|
|
$tempSenseGloss =~ $userCleanUp; |
447
|
|
|
|
|
|
|
} |
448
|
|
|
|
|
|
|
|
449
|
|
|
|
|
|
|
#if stemming is on, stem each word in each gloss |
450
|
|
|
|
|
|
|
if($stemming == 1) |
451
|
|
|
|
|
|
|
{ |
452
|
|
|
|
|
|
|
my @tempStem = split(' ', $tempSenseGloss); |
453
|
|
|
|
|
|
|
my @stemmedGloss = @{$stemmer->stem(@tempStem)}; |
454
|
|
|
|
|
|
|
$tempSenseGloss = join(' ', @stemmedGloss); |
455
|
|
|
|
|
|
|
} |
456
|
|
|
|
|
|
|
|
457
|
|
|
|
|
|
|
#maps each sense to its gloss |
458
|
|
|
|
|
|
|
$wnGlosses{$curVSense} = $tempSenseGloss; |
459
|
|
|
|
|
|
|
|
460
|
|
|
|
|
|
|
#obtains and stores, hypes, hypos, and syns |
461
|
|
|
|
|
|
|
my @hypes = $wn->querySense($curVSense, "hype"); |
462
|
|
|
|
|
|
|
$wnHypes{$curVSense} = \@hypes; |
463
|
|
|
|
|
|
|
my @hypos = $wn->querySense($curVSense, "hypo"); |
464
|
|
|
|
|
|
|
$wnHypos{$curVSense} = \@hypos; |
465
|
|
|
|
|
|
|
my @syns = $wn->querySense($curVSense, "syns"); |
466
|
|
|
|
|
|
|
$wnSyns{$curVSense} = \@syns; |
467
|
|
|
|
|
|
|
$wnFreq{$curVSense} = $wn->frequency($curVSense); |
468
|
|
|
|
|
|
|
} |
469
|
|
|
|
|
|
|
} |
470
|
|
|
|
|
|
|
|
471
|
|
|
|
|
|
|
|
472
|
|
|
|
|
|
|
} |
473
|
|
|
|
|
|
|
|
474
|
|
|
|
|
|
|
=item $obj->processLemma(@inLemma) |
475
|
|
|
|
|
|
|
|
476
|
|
|
|
|
|
|
Determines where the OOV Lemma should be |
477
|
|
|
|
|
|
|
inserted into WordNet, returns the output. |
478
|
|
|
|
|
|
|
|
479
|
|
|
|
|
|
|
Parameter: the lemma to be inserted in array form |
480
|
|
|
|
|
|
|
(lemma, part-of-speech, item-id, definition, def source) |
481
|
|
|
|
|
|
|
|
482
|
|
|
|
|
|
|
Returns: chosen lemma in array form |
483
|
|
|
|
|
|
|
(item-id, WordNet sense, operation) |
484
|
|
|
|
|
|
|
|
485
|
|
|
|
|
|
|
=cut |
486
|
|
|
|
|
|
|
|
487
|
|
|
|
|
|
|
sub processLemma() |
488
|
|
|
|
|
|
|
{ |
489
|
|
|
|
|
|
|
my $base = 0; |
490
|
|
|
|
|
|
|
if(scalar @_ == 2)#checks if method entered by object. |
491
|
|
|
|
|
|
|
{ |
492
|
|
|
|
|
|
|
$base = 1; |
493
|
|
|
|
|
|
|
} |
494
|
|
|
|
|
|
|
|
495
|
|
|
|
|
|
|
my %senseScores = (); |
496
|
|
|
|
|
|
|
my $highSenseScore = 0; |
497
|
|
|
|
|
|
|
my $highSense = ""; |
498
|
|
|
|
|
|
|
my @inLemma = @{$_[$base]}; |
499
|
|
|
|
|
|
|
my @outLemma = ("","",""); |
500
|
|
|
|
|
|
|
my $attachMerge = ""; |
501
|
|
|
|
|
|
|
my @senses = (); |
502
|
|
|
|
|
|
|
|
503
|
|
|
|
|
|
|
if($preProcessed == 1) |
504
|
|
|
|
|
|
|
{ |
505
|
|
|
|
|
|
|
if($inLemma[1] =~ /noun/) |
506
|
|
|
|
|
|
|
{ |
507
|
|
|
|
|
|
|
@senses = @wnNounSenses; |
508
|
|
|
|
|
|
|
} |
509
|
|
|
|
|
|
|
else |
510
|
|
|
|
|
|
|
{ |
511
|
|
|
|
|
|
|
@senses = @wnVerbSenses; |
512
|
|
|
|
|
|
|
} |
513
|
|
|
|
|
|
|
|
514
|
|
|
|
|
|
|
foreach $curSense (@senses) #runs through each sense of current word |
515
|
|
|
|
|
|
|
{ |
516
|
|
|
|
|
|
|
my $score = scoreSense(\@inLemma, $curSense); |
517
|
|
|
|
|
|
|
|
518
|
|
|
|
|
|
|
if($score >= $highSenseScore) |
519
|
|
|
|
|
|
|
{ |
520
|
|
|
|
|
|
|
$highSenseScore = $score; |
521
|
|
|
|
|
|
|
$highSense = $curSense; |
522
|
|
|
|
|
|
|
} |
523
|
|
|
|
|
|
|
|
524
|
|
|
|
|
|
|
$senseScores{$curSense} = $score; |
525
|
|
|
|
|
|
|
} |
526
|
|
|
|
|
|
|
|
527
|
|
|
|
|
|
|
if($refineSense == 1) |
528
|
|
|
|
|
|
|
{ |
529
|
|
|
|
|
|
|
$highSense = refineSense(\@inLemma, $highSense); |
530
|
|
|
|
|
|
|
} |
531
|
|
|
|
|
|
|
|
532
|
|
|
|
|
|
|
if($wnFreq{$highSense} == 0) |
533
|
|
|
|
|
|
|
{ |
534
|
|
|
|
|
|
|
$attachMerge = "attach"; |
535
|
|
|
|
|
|
|
} |
536
|
|
|
|
|
|
|
else |
537
|
|
|
|
|
|
|
{ |
538
|
|
|
|
|
|
|
$attachMerge = "merge"; |
539
|
|
|
|
|
|
|
} |
540
|
|
|
|
|
|
|
|
541
|
|
|
|
|
|
|
$outLemma[0] = $inLemma[2]; |
542
|
|
|
|
|
|
|
$outLemma[1] = $highSense; |
543
|
|
|
|
|
|
|
$outLemma[2] = $attachMerge; |
544
|
|
|
|
|
|
|
return \@outLemma; |
545
|
|
|
|
|
|
|
} |
546
|
|
|
|
|
|
|
else |
547
|
|
|
|
|
|
|
{ |
548
|
|
|
|
|
|
|
my $self = shift; |
549
|
|
|
|
|
|
|
$self->{error} = 2; |
550
|
|
|
|
|
|
|
$self->{errorString} = "PreProcessing must be run before processLemma() is called."; |
551
|
|
|
|
|
|
|
} |
552
|
|
|
|
|
|
|
} |
553
|
|
|
|
|
|
|
|
554
|
|
|
|
|
|
|
=item $obj->toggleCompareGlosses($hype,$hypo,$syns) |
555
|
|
|
|
|
|
|
|
556
|
|
|
|
|
|
|
Toggles which glosses are used in score sense. |
557
|
|
|
|
|
|
|
by default, the sense, the sense's hypernyms' |
558
|
|
|
|
|
|
|
glosses,hyponyms' glosses, and synsets' glosses |
559
|
|
|
|
|
|
|
are turned on. This method allows for toggling |
560
|
|
|
|
|
|
|
of hypes,hypos,synsets, by passing in three |
561
|
|
|
|
|
|
|
parameters, 1 for on and 0 for off. |
562
|
|
|
|
|
|
|
Example: toggleCompareGlosses(0,0,0) toggles |
563
|
|
|
|
|
|
|
all three off. |
564
|
|
|
|
|
|
|
|
565
|
|
|
|
|
|
|
Parameters: 0 or 1 for toggling hypernyms, hyponyms, |
566
|
|
|
|
|
|
|
and synset comparisons. |
567
|
|
|
|
|
|
|
|
568
|
|
|
|
|
|
|
Returns: nothing |
569
|
|
|
|
|
|
|
|
570
|
|
|
|
|
|
|
=cut |
571
|
|
|
|
|
|
|
|
572
|
|
|
|
|
|
|
sub toggleCompareGlosses() |
573
|
|
|
|
|
|
|
{ |
574
|
|
|
|
|
|
|
my $base = 0; |
575
|
|
|
|
|
|
|
if(scalar @_ == 4)#checks if method entered by object. |
576
|
|
|
|
|
|
|
{ |
577
|
|
|
|
|
|
|
$base = 1; |
578
|
|
|
|
|
|
|
} |
579
|
|
|
|
|
|
|
|
580
|
|
|
|
|
|
|
if($_[$base] == 0) |
581
|
|
|
|
|
|
|
{ |
582
|
|
|
|
|
|
|
$useHypeGlosses = 0; |
583
|
|
|
|
|
|
|
} |
584
|
|
|
|
|
|
|
else |
585
|
|
|
|
|
|
|
{ |
586
|
|
|
|
|
|
|
$useHypeGlosses = 1; |
587
|
|
|
|
|
|
|
} |
588
|
|
|
|
|
|
|
|
589
|
|
|
|
|
|
|
$base++; |
590
|
|
|
|
|
|
|
|
591
|
|
|
|
|
|
|
if($_[$base] == 0) |
592
|
|
|
|
|
|
|
{ |
593
|
|
|
|
|
|
|
$useHypoGlosses = 0; |
594
|
|
|
|
|
|
|
} |
595
|
|
|
|
|
|
|
else |
596
|
|
|
|
|
|
|
{ |
597
|
|
|
|
|
|
|
$useHypoGlosess = 1; |
598
|
|
|
|
|
|
|
} |
599
|
|
|
|
|
|
|
|
600
|
|
|
|
|
|
|
$base++; |
601
|
|
|
|
|
|
|
|
602
|
|
|
|
|
|
|
if($_[$base] == 0) |
603
|
|
|
|
|
|
|
{ |
604
|
|
|
|
|
|
|
$useSynsGlosses = 0; |
605
|
|
|
|
|
|
|
} |
606
|
|
|
|
|
|
|
else |
607
|
|
|
|
|
|
|
{ |
608
|
|
|
|
|
|
|
$useSynsGlosses = 1; |
609
|
|
|
|
|
|
|
} |
610
|
|
|
|
|
|
|
} |
611
|
|
|
|
|
|
|
|
612
|
|
|
|
|
|
|
=item $obj->setBonus($bonus) |
613
|
|
|
|
|
|
|
|
614
|
|
|
|
|
|
|
Allows the user to set the bonus that will be |
615
|
|
|
|
|
|
|
used when scoring lemmas that contain the |
616
|
|
|
|
|
|
|
new lemma. |
617
|
|
|
|
|
|
|
|
618
|
|
|
|
|
|
|
Parameter: the multiplier that should be used in |
619
|
|
|
|
|
|
|
calculating the bonus. |
620
|
|
|
|
|
|
|
|
621
|
|
|
|
|
|
|
Returns: nothing |
622
|
|
|
|
|
|
|
|
623
|
|
|
|
|
|
|
=cut |
624
|
|
|
|
|
|
|
|
625
|
|
|
|
|
|
|
sub setBonus() |
626
|
|
|
|
|
|
|
{ |
627
|
|
|
|
|
|
|
my $base = 0; |
628
|
|
|
|
|
|
|
if(scalar @_ == 2)#checks if method entered by object. |
629
|
|
|
|
|
|
|
{ |
630
|
|
|
|
|
|
|
$base = 1; |
631
|
|
|
|
|
|
|
} |
632
|
|
|
|
|
|
|
|
633
|
|
|
|
|
|
|
$bonus = $_[$base]; |
634
|
|
|
|
|
|
|
} |
635
|
|
|
|
|
|
|
|
636
|
|
|
|
|
|
|
=item $obj->scoreSense(@inLemma, $compareSense) |
637
|
|
|
|
|
|
|
|
638
|
|
|
|
|
|
|
Serves as a wrapper method to facilitate the |
639
|
|
|
|
|
|
|
main program by directing it to the currently |
640
|
|
|
|
|
|
|
chosen scoring method. By default the average |
641
|
|
|
|
|
|
|
highest scoring method is chosen. This can be |
642
|
|
|
|
|
|
|
changed with setScoreMethod(). |
643
|
|
|
|
|
|
|
|
644
|
|
|
|
|
|
|
Parameters: the in lemma in array form |
645
|
|
|
|
|
|
|
(lemma, part-of-speech, item-id, definition, def source) |
646
|
|
|
|
|
|
|
and the sense that the lemma is being compared to. |
647
|
|
|
|
|
|
|
|
648
|
|
|
|
|
|
|
Returns: a score of how related the in lemma is to the |
649
|
|
|
|
|
|
|
compareSense. |
650
|
|
|
|
|
|
|
|
651
|
|
|
|
|
|
|
=cut |
652
|
|
|
|
|
|
|
|
653
|
|
|
|
|
|
|
sub scoreSense() |
654
|
|
|
|
|
|
|
{ |
655
|
|
|
|
|
|
|
my $base = 0; |
656
|
|
|
|
|
|
|
if(scalar @_ == 3)#checks if method entered by object. |
657
|
|
|
|
|
|
|
{ |
658
|
|
|
|
|
|
|
$base = 1; |
659
|
|
|
|
|
|
|
} |
660
|
|
|
|
|
|
|
|
661
|
|
|
|
|
|
|
my @inLemma = @{$_[$base]}; |
662
|
|
|
|
|
|
|
$base++; |
663
|
|
|
|
|
|
|
my $curSense = $_[$base]; |
664
|
|
|
|
|
|
|
|
665
|
|
|
|
|
|
|
my $score = 0; |
666
|
|
|
|
|
|
|
if($scoringMethod eq "baseline") |
667
|
|
|
|
|
|
|
{ |
668
|
|
|
|
|
|
|
$score = baseline(\@inLemma, $curSense); |
669
|
|
|
|
|
|
|
} |
670
|
|
|
|
|
|
|
if($scoringMethod eq "BwS") |
671
|
|
|
|
|
|
|
{ |
672
|
|
|
|
|
|
|
$score = BwS(\@inLemma, $curSense); |
673
|
|
|
|
|
|
|
} |
674
|
|
|
|
|
|
|
if($scoringMethod eq "Similarity") |
675
|
|
|
|
|
|
|
{ |
676
|
|
|
|
|
|
|
$score = Similarity(\@inLemma, $curSense); |
677
|
|
|
|
|
|
|
} |
678
|
|
|
|
|
|
|
|
679
|
|
|
|
|
|
|
return $score; |
680
|
|
|
|
|
|
|
} |
681
|
|
|
|
|
|
|
|
682
|
|
|
|
|
|
|
=item $obj->setScoreMethod($scoreMethod) |
683
|
|
|
|
|
|
|
|
684
|
|
|
|
|
|
|
Allows the user to choose which scoring method |
685
|
|
|
|
|
|
|
should be used by default when running the |
686
|
|
|
|
|
|
|
program from the top. Options are: |
687
|
|
|
|
|
|
|
'baseline' |
688
|
|
|
|
|
|
|
'BwS' - baseline system with stemming and lemmitization |
689
|
|
|
|
|
|
|
--as more are added they will appear here. |
690
|
|
|
|
|
|
|
|
691
|
|
|
|
|
|
|
Parameter: the chosen scoring method |
692
|
|
|
|
|
|
|
|
693
|
|
|
|
|
|
|
Returns: nothing. |
694
|
|
|
|
|
|
|
|
695
|
|
|
|
|
|
|
=cut |
696
|
|
|
|
|
|
|
|
697
|
|
|
|
|
|
|
sub setScoreMethod() |
698
|
|
|
|
|
|
|
{ |
699
|
|
|
|
|
|
|
my $base = 0; |
700
|
|
|
|
|
|
|
|
701
|
|
|
|
|
|
|
if(scalar @_ == 2)#checks if method entered by object. |
702
|
|
|
|
|
|
|
{ |
703
|
|
|
|
|
|
|
$base = 1; |
704
|
|
|
|
|
|
|
} |
705
|
|
|
|
|
|
|
|
706
|
|
|
|
|
|
|
my $scoreMethod = $_[$base]; |
707
|
|
|
|
|
|
|
|
708
|
|
|
|
|
|
|
#check if the score method is in scoring methods. |
709
|
|
|
|
|
|
|
my @matches = grep(/$scoreMethod/, @scoringMethods); |
710
|
|
|
|
|
|
|
if(scalar @matches > 0) |
711
|
|
|
|
|
|
|
{ |
712
|
|
|
|
|
|
|
$scoringMethod = $scoreMethod; |
713
|
|
|
|
|
|
|
} |
714
|
|
|
|
|
|
|
|
715
|
|
|
|
|
|
|
} |
716
|
|
|
|
|
|
|
|
717
|
|
|
|
|
|
|
=item $obj->Similarity(@inLemma, $compareSense) |
718
|
|
|
|
|
|
|
|
719
|
|
|
|
|
|
|
Calculates a score for the passed sense and returns |
720
|
|
|
|
|
|
|
that score. |
721
|
|
|
|
|
|
|
|
722
|
|
|
|
|
|
|
Parameters: the in lemma in array form |
723
|
|
|
|
|
|
|
(lemma, part-of-speech, item-id, definition, def source) |
724
|
|
|
|
|
|
|
and the sense that the lemma is being compared to. |
725
|
|
|
|
|
|
|
|
726
|
|
|
|
|
|
|
Returns: a score of how related the im lemma is to the |
727
|
|
|
|
|
|
|
compareSense. |
728
|
|
|
|
|
|
|
|
729
|
|
|
|
|
|
|
=cut |
730
|
|
|
|
|
|
|
|
731
|
|
|
|
|
|
|
sub Similarity() |
732
|
|
|
|
|
|
|
{ |
733
|
|
|
|
|
|
|
my $base = 0; |
734
|
|
|
|
|
|
|
if(scalar @_ == 3)#checks if method entered by object. |
735
|
|
|
|
|
|
|
{ |
736
|
|
|
|
|
|
|
$base = 1; |
737
|
|
|
|
|
|
|
} |
738
|
|
|
|
|
|
|
|
739
|
|
|
|
|
|
|
my @inLemma = @{$_[$base]}; |
740
|
|
|
|
|
|
|
$base++; |
741
|
|
|
|
|
|
|
my $curSense = $_[$base]; |
742
|
|
|
|
|
|
|
|
743
|
|
|
|
|
|
|
my $def = @inLemma[3]; |
744
|
|
|
|
|
|
|
|
745
|
|
|
|
|
|
|
#split definition and stem the words |
746
|
|
|
|
|
|
|
my @listDef =split(' ', $def); |
747
|
|
|
|
|
|
|
my @defStemmed = @{$stemmer->stem(@listDef)}; |
748
|
|
|
|
|
|
|
|
749
|
|
|
|
|
|
|
#join definition back together and tag with pos |
750
|
|
|
|
|
|
|
$def = join(' ', @defStemmed); |
751
|
|
|
|
|
|
|
my $tagged = $tagger->add_tags($def); |
752
|
|
|
|
|
|
|
|
753
|
|
|
|
|
|
|
#split the tagged definition for individual word processing |
754
|
|
|
|
|
|
|
@tagArray = split(' ', $tagged); |
755
|
|
|
|
|
|
|
my @similar = (); |
756
|
|
|
|
|
|
|
|
757
|
|
|
|
|
|
|
#step through each tagged word and find the first sense in wordnet, then add that to the @similar list |
758
|
|
|
|
|
|
|
foreach my $cur (@tagArray) |
759
|
|
|
|
|
|
|
{ |
760
|
|
|
|
|
|
|
my $pos = ''; |
761
|
|
|
|
|
|
|
if($cur =~ /.*/) |
762
|
|
|
|
|
|
|
{ |
763
|
|
|
|
|
|
|
$pos = 'n'; |
764
|
|
|
|
|
|
|
} |
765
|
|
|
|
|
|
|
else |
766
|
|
|
|
|
|
|
{ |
767
|
|
|
|
|
|
|
if($cur =~ /.*/) |
768
|
|
|
|
|
|
|
{ |
769
|
|
|
|
|
|
|
$pos = 'v'; |
770
|
|
|
|
|
|
|
} |
771
|
|
|
|
|
|
|
else |
772
|
|
|
|
|
|
|
{ |
773
|
|
|
|
|
|
|
if($cur =~ /.*/) |
774
|
|
|
|
|
|
|
{ |
775
|
|
|
|
|
|
|
$pos = 'a'; |
776
|
|
|
|
|
|
|
} |
777
|
|
|
|
|
|
|
} |
778
|
|
|
|
|
|
|
} |
779
|
|
|
|
|
|
|
|
780
|
|
|
|
|
|
|
if(length $pos == 1) |
781
|
|
|
|
|
|
|
{ |
782
|
|
|
|
|
|
|
$cur =~ s/<[nvj\/].{1,3}>//g; |
783
|
|
|
|
|
|
|
@wnQuery = $wn->querySense("$cur#$pos"); |
784
|
|
|
|
|
|
|
push @similar, $wnQuery[0]; |
785
|
|
|
|
|
|
|
} |
786
|
|
|
|
|
|
|
} |
787
|
|
|
|
|
|
|
|
788
|
|
|
|
|
|
|
my $score = 0; |
789
|
|
|
|
|
|
|
|
790
|
|
|
|
|
|
|
foreach my $curSim (@similar) |
791
|
|
|
|
|
|
|
{ |
792
|
|
|
|
|
|
|
my $value = $measure->getRelatedness("$curSense", "$curSim"); |
793
|
|
|
|
|
|
|
$score = $score + $value; |
794
|
|
|
|
|
|
|
} |
795
|
|
|
|
|
|
|
|
796
|
|
|
|
|
|
|
return $score; |
797
|
|
|
|
|
|
|
} |
798
|
|
|
|
|
|
|
|
799
|
|
|
|
|
|
|
=item $obj->BwS(@inLemma, $compareSense) |
800
|
|
|
|
|
|
|
|
801
|
|
|
|
|
|
|
Calculates a score for the passed sense and returns |
802
|
|
|
|
|
|
|
that score. This is a modified baseline() method |
803
|
|
|
|
|
|
|
which adds stemming to the data. |
804
|
|
|
|
|
|
|
|
805
|
|
|
|
|
|
|
Parameters: the in lemma in array form |
806
|
|
|
|
|
|
|
(lemma, part-of-speech, item-id, definition, def source) |
807
|
|
|
|
|
|
|
and the sense that the lemma is being compared to. |
808
|
|
|
|
|
|
|
|
809
|
|
|
|
|
|
|
Returns: a score of how related the in lemma is to the |
810
|
|
|
|
|
|
|
compareSense. |
811
|
|
|
|
|
|
|
|
812
|
|
|
|
|
|
|
=cut |
813
|
|
|
|
|
|
|
|
814
|
|
|
|
|
|
|
sub BwS() |
815
|
|
|
|
|
|
|
{ |
816
|
|
|
|
|
|
|
my $base = 0; |
817
|
|
|
|
|
|
|
if(scalar @_ == 3)#checks if method entered by object. |
818
|
|
|
|
|
|
|
{ |
819
|
|
|
|
|
|
|
$base = 1; |
820
|
|
|
|
|
|
|
} |
821
|
|
|
|
|
|
|
|
822
|
|
|
|
|
|
|
my @inLemma = @{$_[$base]}; |
823
|
|
|
|
|
|
|
$base++; |
824
|
|
|
|
|
|
|
my $curSense = $_[$base]; |
825
|
|
|
|
|
|
|
|
826
|
|
|
|
|
|
|
if($stemmed == 0) |
827
|
|
|
|
|
|
|
{ |
828
|
|
|
|
|
|
|
$stemming = 1; |
829
|
|
|
|
|
|
|
preProcessing(); |
830
|
|
|
|
|
|
|
$stemmed = 1; |
831
|
|
|
|
|
|
|
} |
832
|
|
|
|
|
|
|
|
833
|
|
|
|
|
|
|
return simpleScoreSense(\@inLemma, $curSense); |
834
|
|
|
|
|
|
|
|
835
|
|
|
|
|
|
|
} |
836
|
|
|
|
|
|
|
|
837
|
|
|
|
|
|
|
=item $obj->baseline(@inLemma, $compareSense) |
838
|
|
|
|
|
|
|
|
839
|
|
|
|
|
|
|
Calculates a score for the passed sense then returns |
840
|
|
|
|
|
|
|
that score. This class is a wrapper for the |
841
|
|
|
|
|
|
|
simpleScoreSense() method as it makes sure no stemming |
842
|
|
|
|
|
|
|
or lemmatization is present in the preProcessing(). |
843
|
|
|
|
|
|
|
|
844
|
|
|
|
|
|
|
Parameters: the in lemma in array form |
845
|
|
|
|
|
|
|
(lemma, part-of-speech, item-id, definition, def source) |
846
|
|
|
|
|
|
|
and the sense that the lemma is being compared to. |
847
|
|
|
|
|
|
|
|
848
|
|
|
|
|
|
|
Returns: a score of how related the in lemma is to the |
849
|
|
|
|
|
|
|
compareSense. |
850
|
|
|
|
|
|
|
|
851
|
|
|
|
|
|
|
=cut |
852
|
|
|
|
|
|
|
|
853
|
|
|
|
|
|
|
sub baseline() |
854
|
|
|
|
|
|
|
{ |
855
|
|
|
|
|
|
|
my $base = 0; |
856
|
|
|
|
|
|
|
if(scalar @_ == 3)#checks if method entered by object. |
857
|
|
|
|
|
|
|
{ |
858
|
|
|
|
|
|
|
$base = 1; |
859
|
|
|
|
|
|
|
} |
860
|
|
|
|
|
|
|
|
861
|
|
|
|
|
|
|
my @inLemma = @{$_[$base]}; |
862
|
|
|
|
|
|
|
$base++; |
863
|
|
|
|
|
|
|
my $curSense = $_[$base]; |
864
|
|
|
|
|
|
|
|
865
|
|
|
|
|
|
|
if($stemmed == 1) |
866
|
|
|
|
|
|
|
{ |
867
|
|
|
|
|
|
|
$stemming = 0; |
868
|
|
|
|
|
|
|
preProcessing(); |
869
|
|
|
|
|
|
|
$stemmed = 0; |
870
|
|
|
|
|
|
|
} |
871
|
|
|
|
|
|
|
|
872
|
|
|
|
|
|
|
return simpleScoreSense(\@inLemma, $curSense); |
873
|
|
|
|
|
|
|
|
874
|
|
|
|
|
|
|
} |
875
|
|
|
|
|
|
|
|
876
|
|
|
|
|
|
|
|
877
|
|
|
|
|
|
|
=item $obj->simpleScoreSense(@inLemma, $compareSense) |
878
|
|
|
|
|
|
|
|
879
|
|
|
|
|
|
|
Calculates a score for the passed sense then |
880
|
|
|
|
|
|
|
returns that score. This is the baseline system which |
881
|
|
|
|
|
|
|
was submitted for SemEval16 task 14. This algorithm |
882
|
|
|
|
|
|
|
scores by overlapping words found in the lemma's gloss |
883
|
|
|
|
|
|
|
and also with the lemma's hypernym and hyponyms' glosses. |
884
|
|
|
|
|
|
|
|
885
|
|
|
|
|
|
|
Parameters: the in lemma in array form |
886
|
|
|
|
|
|
|
(lemma, part-of-speech, item-id, definition, def source) |
887
|
|
|
|
|
|
|
and the sense that the lemma is being compared to. |
888
|
|
|
|
|
|
|
|
889
|
|
|
|
|
|
|
Returns: a score of how related the in lemma is to the |
890
|
|
|
|
|
|
|
compareSense. |
891
|
|
|
|
|
|
|
|
892
|
|
|
|
|
|
|
=cut |
893
|
|
|
|
|
|
|
|
894
|
|
|
|
|
|
|
sub simpleScoreSense() |
895
|
|
|
|
|
|
|
{ |
896
|
|
|
|
|
|
|
my $base = 0; |
897
|
|
|
|
|
|
|
if(scalar @_ == 3)#checks if method entered by object. |
898
|
|
|
|
|
|
|
{ |
899
|
|
|
|
|
|
|
$base = 1; |
900
|
|
|
|
|
|
|
} |
901
|
|
|
|
|
|
|
|
902
|
|
|
|
|
|
|
my @inLemma = @{$_[$base]}; |
903
|
|
|
|
|
|
|
$base++; |
904
|
|
|
|
|
|
|
my $curSense = $_[$base]; |
905
|
|
|
|
|
|
|
my $word = substr($curSense, 0, index($curSense, '#')); #extracts base word. |
906
|
|
|
|
|
|
|
|
907
|
|
|
|
|
|
|
#_________________Sense Gloss_________________________________ |
908
|
|
|
|
|
|
|
my @curSenseGloss = split (' ', $wnGlosses{$curSense}); #initialize current sense gloss. |
909
|
|
|
|
|
|
|
|
910
|
|
|
|
|
|
|
my @extendedGloss = getExtendedGloss($curSense); |
911
|
|
|
|
|
|
|
|
912
|
|
|
|
|
|
|
#________________Lemma Gloss_________________________________ |
913
|
|
|
|
|
|
|
my $tempLemmaGloss = $inLemma[3]; |
914
|
|
|
|
|
|
|
|
915
|
|
|
|
|
|
|
|
916
|
|
|
|
|
|
|
if($cleanUp == 1) |
917
|
|
|
|
|
|
|
{ |
918
|
|
|
|
|
|
|
#Clean up the words in the temp lemma gloss. |
919
|
|
|
|
|
|
|
$tempLemmaGloss =~ s/(\(|\)|\.)//g; |
920
|
|
|
|
|
|
|
$tempLemmaGloss =~ s/^a-zA-Z//g; |
921
|
|
|
|
|
|
|
$tempLemmaGloss = lc $tempLemmaGloss; |
922
|
|
|
|
|
|
|
$tempLemmaGloss =~ s/\b$stopList\b//g; #remove stop words |
923
|
|
|
|
|
|
|
} |
924
|
|
|
|
|
|
|
if($userCleanUp ne "\"\"") |
925
|
|
|
|
|
|
|
{ |
926
|
|
|
|
|
|
|
$tempLemmaGloss =~ $userCleanUp; |
927
|
|
|
|
|
|
|
} |
928
|
|
|
|
|
|
|
|
929
|
|
|
|
|
|
|
my @curLemmaGloss = split(' ', $tempLemmaGloss); |
930
|
|
|
|
|
|
|
|
931
|
|
|
|
|
|
|
|
932
|
|
|
|
|
|
|
#__________________Overlaps__________________________________ |
933
|
|
|
|
|
|
|
my $glossLength = 0; |
934
|
|
|
|
|
|
|
my $overlaps = 0.0; #number of overlapped words. |
935
|
|
|
|
|
|
|
|
936
|
|
|
|
|
|
|
#scan through each word from the sense gloss and see if any overlap on the lemma gloss. |
937
|
|
|
|
|
|
|
for my $lWord (0..$#curLemmaGloss) |
938
|
|
|
|
|
|
|
{ |
939
|
|
|
|
|
|
|
$glossLength = $glossLength + length $curLemmaGloss[$lWord]; |
940
|
|
|
|
|
|
|
if($curLemmaGloss[$lWord] =~ /\b$word\b/) #if lemma contains current word from sense itself |
941
|
|
|
|
|
|
|
{ |
942
|
|
|
|
|
|
|
$overlaps = $overlaps + $bonus*(length $word); |
943
|
|
|
|
|
|
|
} |
944
|
|
|
|
|
|
|
|
945
|
|
|
|
|
|
|
$spaceWord = $word; |
946
|
|
|
|
|
|
|
$spaceWord =~ s/_/ /g; #substitute underscores for spaces for comparison below |
947
|
|
|
|
|
|
|
if($spaceWord =~ /(^\w+\s\b$curLemmaGloss[$lWord]\b$)|(^\b$curLemmaGloss[$lWord]\b\s\w+$)/) |
948
|
|
|
|
|
|
|
{ |
949
|
|
|
|
|
|
|
$overlaps = $overlaps + $bonus*(length $curLemmaGloss[$lWord]); |
950
|
|
|
|
|
|
|
} |
951
|
|
|
|
|
|
|
|
952
|
|
|
|
|
|
|
for my $sWord (0..$#curSenseGloss) |
953
|
|
|
|
|
|
|
{ |
954
|
|
|
|
|
|
|
if($curLemmaGloss[$lWord] =~ /\b\Q$curSenseGloss[$sWord]\E\b?/) |
955
|
|
|
|
|
|
|
{ |
956
|
|
|
|
|
|
|
$overlaps = $overlaps + length $curSenseGloss[$sWord]; |
957
|
|
|
|
|
|
|
} |
958
|
|
|
|
|
|
|
} |
959
|
|
|
|
|
|
|
for my $extWord (0..$#extendedGloss) |
960
|
|
|
|
|
|
|
{ |
961
|
|
|
|
|
|
|
if($curLemmaGloss[$lWord] =~ /\b\Q$extendedGloss[$extWord]\E\b?/) |
962
|
|
|
|
|
|
|
{ |
963
|
|
|
|
|
|
|
$overlaps = $overlaps + length $extendedGloss[$extWord]; |
964
|
|
|
|
|
|
|
} |
965
|
|
|
|
|
|
|
} |
966
|
|
|
|
|
|
|
|
967
|
|
|
|
|
|
|
} |
968
|
|
|
|
|
|
|
|
969
|
|
|
|
|
|
|
|
970
|
|
|
|
|
|
|
$score = $overlaps/$glossLength; |
971
|
|
|
|
|
|
|
return $score; |
972
|
|
|
|
|
|
|
} |
973
|
|
|
|
|
|
|
|
974
|
|
|
|
|
|
|
=item $obj->getExtendedGloss($compareSense) |
975
|
|
|
|
|
|
|
|
976
|
|
|
|
|
|
|
Calculates the extended gloss based on which |
977
|
|
|
|
|
|
|
glosses are toggled and returns an array |
978
|
|
|
|
|
|
|
which contains the full glosses. |
979
|
|
|
|
|
|
|
|
980
|
|
|
|
|
|
|
Parameter: the sense which the extended gloss is |
981
|
|
|
|
|
|
|
based on |
982
|
|
|
|
|
|
|
|
983
|
|
|
|
|
|
|
Returns: an array which contains the extended gloss |
984
|
|
|
|
|
|
|
|
985
|
|
|
|
|
|
|
=cut |
986
|
|
|
|
|
|
|
|
987
|
|
|
|
|
|
|
sub getExtendedGloss() |
988
|
|
|
|
|
|
|
{ |
989
|
|
|
|
|
|
|
my $base = 0; |
990
|
|
|
|
|
|
|
if(scalar @_ == 2)#checks if method entered by object. |
991
|
|
|
|
|
|
|
{ |
992
|
|
|
|
|
|
|
$base = 1; |
993
|
|
|
|
|
|
|
} |
994
|
|
|
|
|
|
|
|
995
|
|
|
|
|
|
|
my $curSense = $_[$base]; |
996
|
|
|
|
|
|
|
my @extendedGloss = (); |
997
|
|
|
|
|
|
|
|
998
|
|
|
|
|
|
|
#__________________Hype Gloss_________________________________ |
999
|
|
|
|
|
|
|
if($useHypeGlosses == 1) |
1000
|
|
|
|
|
|
|
{ |
1001
|
|
|
|
|
|
|
#Now expands to hypernyms glosses in overlaps |
1002
|
|
|
|
|
|
|
my @senseHypes = @{$wnHypes{$curSense}}; |
1003
|
|
|
|
|
|
|
my @senseHypeGloss = (); |
1004
|
|
|
|
|
|
|
my $tempAllHypeGloss = ""; |
1005
|
|
|
|
|
|
|
|
1006
|
|
|
|
|
|
|
for my $hype (0..$#senseHypes) |
1007
|
|
|
|
|
|
|
{ |
1008
|
|
|
|
|
|
|
my $tempHypeGloss = $wnGlosses{$hype}; |
1009
|
|
|
|
|
|
|
|
1010
|
|
|
|
|
|
|
$tempAllHypeGloss = $tempAllHypeGloss . " " . $tempHypeGloss; |
1011
|
|
|
|
|
|
|
} |
1012
|
|
|
|
|
|
|
|
1013
|
|
|
|
|
|
|
@senseHypeGloss = split(' ', $tempAllHypeGloss); |
1014
|
|
|
|
|
|
|
|
1015
|
|
|
|
|
|
|
push(@extendedGloss, @senseHypeGloss); |
1016
|
|
|
|
|
|
|
} |
1017
|
|
|
|
|
|
|
|
1018
|
|
|
|
|
|
|
#________________Hypo Gloss__________________________________ |
1019
|
|
|
|
|
|
|
if($useHypoGlosses == 1) |
1020
|
|
|
|
|
|
|
{ |
1021
|
|
|
|
|
|
|
#adds in hyponyms' glosses in overlaps |
1022
|
|
|
|
|
|
|
my @senseHypos = @{$wnHypos{$curSense}}; |
1023
|
|
|
|
|
|
|
my @senseHypoGloss = (); |
1024
|
|
|
|
|
|
|
my $tempAllHypoGloss = ""; |
1025
|
|
|
|
|
|
|
|
1026
|
|
|
|
|
|
|
for my $hypo (0..$#senseHypos) |
1027
|
|
|
|
|
|
|
{ |
1028
|
|
|
|
|
|
|
my $tempHypoGloss = $wnGlosses{$hypo}; |
1029
|
|
|
|
|
|
|
|
1030
|
|
|
|
|
|
|
$tempAllHypoGloss = $tempAllHypoGloss . " " . $tempHypoGloss; |
1031
|
|
|
|
|
|
|
} |
1032
|
|
|
|
|
|
|
|
1033
|
|
|
|
|
|
|
@senseHypoGloss = split(' ', $tempAllHypoGloss); |
1034
|
|
|
|
|
|
|
push(@extendedGloss, @senseHypoGloss); |
1035
|
|
|
|
|
|
|
} |
1036
|
|
|
|
|
|
|
|
1037
|
|
|
|
|
|
|
#_________________Syns Gloss_________________________________ |
1038
|
|
|
|
|
|
|
if($useSynsGlosses == 1) |
1039
|
|
|
|
|
|
|
{ |
1040
|
|
|
|
|
|
|
#adds in synsets' glosses in overlaps |
1041
|
|
|
|
|
|
|
my @senseSyns = @{$wnSyns{$curSense}}; |
1042
|
|
|
|
|
|
|
my @senseSynsGloss = (); |
1043
|
|
|
|
|
|
|
my $tempAllSynsGloss = ""; |
1044
|
|
|
|
|
|
|
|
1045
|
|
|
|
|
|
|
for my $syns (0..$#senseSyns) |
1046
|
|
|
|
|
|
|
{ |
1047
|
|
|
|
|
|
|
if(!($syns =~ /\b$word\b/)) #do not repeat sense |
1048
|
|
|
|
|
|
|
{ |
1049
|
|
|
|
|
|
|
my $tempSynsGloss = $wnGlosses{$syns}; |
1050
|
|
|
|
|
|
|
|
1051
|
|
|
|
|
|
|
$tempAllSynsGloss = $tempAllSynsGloss . " " . $tempSynsGloss; |
1052
|
|
|
|
|
|
|
} |
1053
|
|
|
|
|
|
|
} |
1054
|
|
|
|
|
|
|
|
1055
|
|
|
|
|
|
|
@senseSynsGloss = split(' ', $tempAllSynsGloss); |
1056
|
|
|
|
|
|
|
push(@extendedGloss, @senseSynsGloss); |
1057
|
|
|
|
|
|
|
} |
1058
|
|
|
|
|
|
|
|
1059
|
|
|
|
|
|
|
return \@extendedGloss; |
1060
|
|
|
|
|
|
|
} |
1061
|
|
|
|
|
|
|
|
1062
|
|
|
|
|
|
|
=item $obj->toggleRefineSense($toggle) |
1063
|
|
|
|
|
|
|
|
1064
|
|
|
|
|
|
|
Allows user to toggle refineSense() on/off. |
1065
|
|
|
|
|
|
|
|
1066
|
|
|
|
|
|
|
Parameter: 0 or 1 to toggle the refine sense method |
1067
|
|
|
|
|
|
|
on or off respectively in the processLemma method. |
1068
|
|
|
|
|
|
|
|
1069
|
|
|
|
|
|
|
Returns: nothing |
1070
|
|
|
|
|
|
|
|
1071
|
|
|
|
|
|
|
=cut |
1072
|
|
|
|
|
|
|
|
1073
|
|
|
|
|
|
|
sub toggleRefineSense() |
1074
|
|
|
|
|
|
|
{ |
1075
|
|
|
|
|
|
|
if($_[0] == 0) |
1076
|
|
|
|
|
|
|
{ |
1077
|
|
|
|
|
|
|
$refineSense = 0; |
1078
|
|
|
|
|
|
|
} |
1079
|
|
|
|
|
|
|
else |
1080
|
|
|
|
|
|
|
{ |
1081
|
|
|
|
|
|
|
$refineSense = 1; |
1082
|
|
|
|
|
|
|
} |
1083
|
|
|
|
|
|
|
} |
1084
|
|
|
|
|
|
|
|
1085
|
|
|
|
|
|
|
=item $obj->refineSense(@inLemma, $highSense) |
1086
|
|
|
|
|
|
|
|
1087
|
|
|
|
|
|
|
Refines chosen sense, by determing which |
1088
|
|
|
|
|
|
|
numbered sense should be chosen. |
1089
|
|
|
|
|
|
|
|
1090
|
|
|
|
|
|
|
Parameters: the in lemma in form of |
1091
|
|
|
|
|
|
|
(lemma, part-of-speech, item-id, definition, def source) |
1092
|
|
|
|
|
|
|
and the sense which currently bests matches the inlemma. |
1093
|
|
|
|
|
|
|
|
1094
|
|
|
|
|
|
|
Returns:the new highest scoring sense |
1095
|
|
|
|
|
|
|
|
1096
|
|
|
|
|
|
|
=cut |
1097
|
|
|
|
|
|
|
|
1098
|
|
|
|
|
|
|
sub refineSense() |
1099
|
|
|
|
|
|
|
{ |
1100
|
|
|
|
|
|
|
my $base = 0; |
1101
|
|
|
|
|
|
|
if(scalar @_ == 3)#checks if method entered by object. |
1102
|
|
|
|
|
|
|
{ |
1103
|
|
|
|
|
|
|
$base = 1; |
1104
|
|
|
|
|
|
|
} |
1105
|
|
|
|
|
|
|
|
1106
|
|
|
|
|
|
|
my @inLemma = @{$_[$base]}; |
1107
|
|
|
|
|
|
|
|
1108
|
|
|
|
|
|
|
$base++; |
1109
|
|
|
|
|
|
|
my $highSense = $_[$base]; |
1110
|
|
|
|
|
|
|
my $word = substr($highSense, 0, index($highSense, '#')); #extracts base word. |
1111
|
|
|
|
|
|
|
my $shortSense = substr($inLemma[1], 0, 1); |
1112
|
|
|
|
|
|
|
my $sense = $word . "#" . $shortSense; |
1113
|
|
|
|
|
|
|
my $highSenseScore = 0; |
1114
|
|
|
|
|
|
|
my $rSenseScore = 0; |
1115
|
|
|
|
|
|
|
my $refineHigh = "$sense#1"; #assume first sense. |
1116
|
|
|
|
|
|
|
my $tempLemmaGloss = $inLemma[3]; |
1117
|
|
|
|
|
|
|
|
1118
|
|
|
|
|
|
|
if($cleanUp == 1) |
1119
|
|
|
|
|
|
|
{ |
1120
|
|
|
|
|
|
|
#Clean up the words in the temp lemma gloss. |
1121
|
|
|
|
|
|
|
$tempLemmaGloss =~ s/(\(|\)|\.)//g; |
1122
|
|
|
|
|
|
|
$tempLemmaGloss =~ s/^a-zA-Z//g; |
1123
|
|
|
|
|
|
|
$tempLemmaGloss = lc $tempLemmaGloss; |
1124
|
|
|
|
|
|
|
$tempLemmaGloss =~ $stopList; #remove stop words |
1125
|
|
|
|
|
|
|
} |
1126
|
|
|
|
|
|
|
if($userCleanUp ne "\"\"") |
1127
|
|
|
|
|
|
|
{ |
1128
|
|
|
|
|
|
|
$tempLemmaGloss =~ $userCleanUp; |
1129
|
|
|
|
|
|
|
} |
1130
|
|
|
|
|
|
|
|
1131
|
|
|
|
|
|
|
my @refineLemmaGloss = split(' ', $tempLemmaGloss); |
1132
|
|
|
|
|
|
|
|
1133
|
|
|
|
|
|
|
my $rGlossLength = 0.0; |
1134
|
|
|
|
|
|
|
my $rOverlaps = 0.0; |
1135
|
|
|
|
|
|
|
my @refineSenses = $wn->querySense($sense); #obtains the other senses for the same word. |
1136
|
|
|
|
|
|
|
for my $rSense (0..$#refineSenses) |
1137
|
|
|
|
|
|
|
{ |
1138
|
|
|
|
|
|
|
my $tempSenseGloss = $wnGlosses{$rSense}; |
1139
|
|
|
|
|
|
|
|
1140
|
|
|
|
|
|
|
for my $rLemma (0..$#refineLemmaGloss) |
1141
|
|
|
|
|
|
|
{ |
1142
|
|
|
|
|
|
|
$rGlossLength = $rGlossLength + length $refineLemmaGloss[$rLemma]; |
1143
|
|
|
|
|
|
|
if($refineLemmaGlos[$rLemma] ne $word) |
1144
|
|
|
|
|
|
|
{ |
1145
|
|
|
|
|
|
|
if($tempSenseGloss =~ /$refineLemmaGloss[$rLemma]/) |
1146
|
|
|
|
|
|
|
{ |
1147
|
|
|
|
|
|
|
$rOverlaps = $rOverlaps + length $refineLemmaGloss[$rLemma]; |
1148
|
|
|
|
|
|
|
} |
1149
|
|
|
|
|
|
|
} |
1150
|
|
|
|
|
|
|
|
1151
|
|
|
|
|
|
|
} |
1152
|
|
|
|
|
|
|
|
1153
|
|
|
|
|
|
|
$rSenseScore = $rOverlaps/$rGlossLength; |
1154
|
|
|
|
|
|
|
if($rSenseScore > $highSenseScore) |
1155
|
|
|
|
|
|
|
{ |
1156
|
|
|
|
|
|
|
$highSenseScore = $rSenseScore; |
1157
|
|
|
|
|
|
|
$refineHigh = $rHypo; |
1158
|
|
|
|
|
|
|
} |
1159
|
|
|
|
|
|
|
} |
1160
|
|
|
|
|
|
|
|
1161
|
|
|
|
|
|
|
$highSense = $refineHigh; |
1162
|
|
|
|
|
|
|
|
1163
|
|
|
|
|
|
|
return $highSense; |
1164
|
|
|
|
|
|
|
|
1165
|
|
|
|
|
|
|
} |
1166
|
|
|
|
|
|
|
|
1167
|
|
|
|
|
|
|
|
1168
|
|
|
|
|
|
|
#**************printHelp()********************** |
1169
|
|
|
|
|
|
|
# Prints indepth help guide to screen. |
1170
|
|
|
|
|
|
|
#*********************************************** |
1171
|
|
|
|
|
|
|
sub printHelp() |
1172
|
|
|
|
|
|
|
{ |
1173
|
|
|
|
|
|
|
printUsage(); |
1174
|
|
|
|
|
|
|
print "Takes in lemmas from file and attempts to\n"; |
1175
|
|
|
|
|
|
|
print "insert them into WordNet by first finding\n"; |
1176
|
|
|
|
|
|
|
print "a hypernym, then either a) merging the \n"; |
1177
|
|
|
|
|
|
|
print "lemma with the hypernym or b) attaching \n"; |
1178
|
|
|
|
|
|
|
print "the lemma to the hypernym.\n"; |
1179
|
|
|
|
|
|
|
} |
1180
|
|
|
|
|
|
|
|
1181
|
|
|
|
|
|
|
1; |