File Coverage

blib/lib/WordNet/Extend/Locate.pm

Criterion	Covered	Total	%
statement	1	3	33.3
branch			n/a
condition			n/a
subroutine	1	1	100.0
pod			n/a
total	2	4	50.0

line	stmt	sub	time	code
1				# WordNet::Extend::Locate.pm version 0.041
2				# Updated: 08/06/17
3				#
4				# Jon Rusert, University of Minnesota Duluth
5				# ruse0008 at d.umn.edu
6				#
7				# Ted Pedersen, University of Minnesota Duluth
8				# tpederse at d.umn.edu
9				#
10				# This program is free software: you can redistribute it and/or modify
11				# it under the terms of the GNU General Public License as published by
12				# the Free Software Foundation, either version 3 of the License, or
13				# (at your option) any later version.
14				#
15				# This program is distributed in the hope that it will be useful,
16				# but WITHOUT ANY WARRANTY; without even the implied warranty of
17				# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18				# GNU General Public License for more details.
19				#
20				# You should have received a copy of the GNU General Public License
21				# along with this program. If not, see <http://www.gnu.org/licenses/>.
22				#
23
24				package WordNet::Extend::Locate;
25
26				=head1 NAME
27
28				WordNet::Extend::Locate - Perl modules for locating where in WordNet a
29				lemma should be inserted.
30
31				=head1 SYNOPSIS
32
33				=head2 Basic Usage Example
34
35				use WordNet::Extend::Locate;
36
37				my $locate = WordNet::Extend::Locate->new();
38
39				$locate->stopList('(the\|is\|at)');
40
41				$locate->setCleanUp(1);
42
43				$locate->preProcessing();
44
45				$locate->toggleCompareGlosses(1,1,0);
46
47				$locate->setBonus(25);
48
49				$locate->toggleRefineSense(0);
50
51				print "Finding location for 'dog noun withdef.1 man's best friend'\n";
52
53				@location = @{$locate->locate("dog\tnoun\twithdef.1\tman\'s best friend")};
54
55				print "Location found: @location\n";
56
57				=head1 DESCRIPTION
58
59				=head2 Introduction
60
61				WordNet is a widely used tool in NLP and other research areas. A drawback of WordNet is the amount of time between updates. WordNet was last updated and released in December, 2006, and no further updates are planned. WordNet::Extend::Locate aims to help users decide where a good place to insert new lemmas into WordNet is by presenting several different methods to run. Users can then take the suggestion from Locate and use that with WordNet::Extend::Insert or simply use it as a guiding point and choose their own location.
62
63				=over
64				=cut
65
66	1	1	80426	use WordNet::QueryData;
	0
	0
67				#use Wiktionary::Parser;
68				use Getopt::Long;
69				use File::Spec;
70				use Lingua::Stem;
71				use Lingua::EN::Tagger;
72				use WordNet::Similarity::vector;
73				#use List::Util;
74
# Package metadata. Nothing is exported: the module is used through its
# object interface, so every export list is left empty.
our $VERSION     = '0.041';
our @ISA         = qw(Exporter);
our @EXPORT      = ();
our @EXPORT_OK   = ();
our %EXPORT_TAGS = ();
86
87				#************Variables********************
# Helper objects shared by every sub in the package; built once at load time.
$wn      = WordNet::QueryData->new;                 # access to the WordNet data
$stemmer = Lingua::Stem->new;                       # stems words for better overlaps
$tagger  = Lingua::EN::Tagger->new;                 # part-of-speech tagger used by Similarity()
$measure = WordNet::Similarity::vector->new($wn);   # relatedness measure used by Similarity()

# Caches filled by preProcessing(); all of them start out empty.
@wordNetNouns = ();    # every noun listed by WordNet
@wordNetVerbs = ();    # every verb listed by WordNet
@wnNounSenses = ();    # every noun sense
@wnVerbSenses = ();    # every verb sense
%wnGlosses    = ();    # sense => prepared gloss
%wnHypes      = ();    # sense => reference to its hypernyms
%wnHypos      = ();    # sense => reference to its hyponyms
%wnSyns       = ();    # sense => reference to its synset members
%wnFreq       = ();    # sense => frequency reported by WordNet
101				#our $wikParser = Wiktionary::Parser->new(); #Parses data from wiktionary pages.
102				#$stopList = "(the\|is\|at\|which\|on\|a\|an\|and\|or\|up\|in\|so)"; #default stop list.
103				$stopList = "(a\|about\|above\|after\|again\|against\|all\|am\|an\|and\|any\|are\|aren't\|as\|at\|be\|because\|been\|before\|being\|below\|between\|both\|but\|by\|can't\|cannot\|could\|couldn't\|did\|didn't\|do\|does\|doesn't\|doing\|don't\|down\|during\|each\|few\|for\|from\|further\|had\|hadn't\|has\|hasn't\|have\|haven't\|having\|he\|he'd\|he'll\|he's\|her\|here\|here's\|hers\|herself\|him\|himself\|his\|how\|how's\|i\|i'd\|i'll\|i'm\|i've\|if\|in\|into\|is\|isn't\|it\|it's\|its\|itself\|let's\|me\|more\|most\|mustn't\|my\|myself\|no\|nor\|not\|of\|off\|on\|once\|only\|or\|other\|ought\|our\|ours\|ourselves\|out\|over\|own\|same\|shan't\|she\|she'd\|she'll\|she's\|should\|shouldn't\|so\|some\|such\|than\|that\|that's\|the\|their\|theirs\|them\|themselves\|then\|there\|there's\|these\|they\|they'd\|they'll\|they're\|they've\|this\|those\|through\|to\|too\|under\|until\|up\|very\|was\|wasn't\|we\|we'd\|we'll\|we're\|we've\|were\|weren't\|what\|what's\|when\|when's\|where\|where's\|which\|while\|who\|who's\|whom\|why\|why's\|with\|won't\|would\|wouldn't\|you\|you'd\|you'll\|you're\|you've\|your\|yours\|yourself\|yourselves)"; #default stop list: one parenthesised group of common English function words; it is interpolated into the stop-word removal regexes of preProcessing(), word2VecCompare() and simpleScoreSense(), and stopList() can replace it.
# ---- Run-time switches (package globals changed through the setter subs) ----
$preProcessed   = 0;     # becomes 1 once preProcessing() has run
$cleanUp        = 1;     # clean glosses before comparing them; see setCleanUp()
$userCleanUp    = "";    # extra clean-up expression supplied through addCleanUp()
$useHypeGlosses = 1;     # include hypernym glosses in comparisons
$useHypoGlosses = 1;     # include hyponym glosses in comparisons
$useSynsGlosses = 1;     # include synset glosses in comparisons
$bonus          = 10;    # multiplier for senses whose word occurs in the new gloss; see setBonus()
$refineSense    = 0;     # run refineSense() on the winning sense; off by default
$help           = 0;     # set by the --help command line option

# ---- Scoring configuration ----
$scoringMethod  = 'baseline';
@scoringMethods = ('baseline', 'BwS', 'Similarity', 'Word2Vec');
$stemming       = 0;     # stem glosses during preProcessing()
$stemmed        = 0;     # whether the cached glosses are currently stemmed (used by BwS)
$cValue         = 0;     # confidence cut-off written for word2VecCompare(); see setConfidenceValue()

# The module inspects the command line itself: --help prints usage and exits.
GetOptions('help' => \$help);
if ($help == 1)
{
    printHelp();
    exit(0);
}
126
127				=head2 Methods
128
129				The following methods are defined in this package:
130
131				=head3 Public methods
132
133				=over
134
135				=item $obj->new()
136
137				The constructor for WordNet::Extend::Locate objects.
138
139				Parameters: none.
140
141				Return value: the new blessed object
142
143				=cut
144
# Constructor. The object carries only the pending error code and message;
# all other configuration lives in package globals.
#
# Parameters: the class name (supplied by the method call).
# Returns:    the new blessed object.
sub new
{
    my ($class) = @_;

    my %state = (
        errorString => '',
        error       => 0,
    );

    return bless \%state, $class;
}
157
158				=item $obj->getError()
159
160				Allows the object to check if any errors have occurred.
161				Returns an array ($error, $errorString), where $error
162				value equal to 1 represents a warning and 2 represents
163				an error and $errString contains the possible error.
164				For example, if a user forgets to run preProcessing() before
165				a method that relies on it, the error would be 2 and errorString
166				would mention that preProcessing had not been run.
167
168				Parameter: None
169
170				Returns: array of the form ($error, $errorString).
171
172				=cut
# Reports the pending error status and clears it, so each error is seen once.
#
# Parameters: none besides the object.
# Returns:    the list ($error, $errorString), where the message has its
#             leading whitespace removed.
sub getError()
{
    my ($self) = @_;

    # Snapshot the current status before resetting it on the object.
    my ($code, $message) = @{$self}{qw(error errorString)};
    @{$self}{qw(error errorString)} = (0, "");

    $message =~ s/^[\r\n\t ]+//;
    return ($code, $message);
}
183
184				=item $obj->locateFile($input_file, $output_file)
185
186				Attempts to locate best WordNet position for each word
187				from input file into WordNet, outputs results to output file.
188
189				Parameter: location of input file and output file respectively
190
191				Returns: nothing
192
193				=cut
194
# Locates every lemma listed in an input file and writes the results to an
# output file, one "item-id<TAB>WordNet sense<TAB>operation" line per lemma
# for which locate() returned a non-empty result.
#
# Parameters: ($self, $input_file, $output_file). The file names are read from
#             the second and third arguments, so this must be called as a method.
# Returns:    nothing.
# Dies:       if either file cannot be opened, or the output cannot be closed.
sub locateFile
{
    my ($self, $inputFile, $outputFile) = @_;
    my $input  = File::Spec->canonpath($inputFile);
    my $output = File::Spec->canonpath($outputFile);

    # Three-argument open with lexical handles: the file name can no longer be
    # mistaken for an open mode, and the handles cannot clash with other code.
    open(my $inFh, '<', $input) or die "Cannot open input file '$input': $!";
    open(my $outFh, '>', $output) or die "Cannot open output file '$output': $!";

    #if preProcessing() hasn't been called, call it.
    preProcessing() if $preProcessed == 0;

    # Read the input line by line; the previous loop condition never read
    # from the input handle at all.
    while (my $line = <$inFh>)
    {
        # split removes the trailing newline and yields nothing for blank lines
        for my $entry (split("\n", $line))
        {
            my @outLemma = @{locate($entry)};

            #only print if ideal lemma found
            if (scalar @outLemma > 0)
            {
                print {$outFh} "$outLemma[0]\t$outLemma[1]\t$outLemma[2]\n";
            }
        }
    }

    close $inFh;
    # Buffered write errors only surface when the handle is closed.
    close $outFh or die "Cannot close output file '$output': $!";
    return;
}
230
231				=item $obj->locate($wordPosGloss)
232
233				Takes in single lemma with gloss and returns location of best
234				insertion point in WordNet.
235
236				Parameter: Lemma string in format of 'word\tpos\titem-id\tdef'
237				NOTE: String must only be separated by \t no space.
238
239				Returns: Array in format of (item-id, WordNet sense, operation)
240
241				=cut
# Finds the best WordNet insertion point for one lemma.
#
# Parameter: the lemma, either as a 'word\tpos\titem-id\tdef' string or as a
#            reference to an array holding those fields. May be called as a
#            method or as a plain function.
# Returns:   reference to an array (item-id, WordNet sense, operation).
sub locate()
{
    # Two arguments means an object precedes the lemma.
    my $lemmaIndex = (scalar @_ == 2) ? 1 : 0;

    #if preProcessing() hasn't been called, call it.
    preProcessing() if $preProcessed == 0;

    # Accept the lemma as an array reference or as a tab-separated string.
    my $rawLemma = $_[$lemmaIndex];
    my @inLemma  = (ref($rawLemma) eq 'ARRAY') ? @{$rawLemma} : split("\t", $rawLemma);

    # Word2Vec ranks all WordNet words in one go; the other methods score
    # the senses one at a time inside processLemma().
    my $located = ($scoringMethod eq 'Word2Vec')
        ? word2VecCompare(\@inLemma)
        : processLemma(\@inLemma);

    my @outLemma = @{$located};
    return \@outLemma;
}
280
281				=item $obj->stopList($newStopList)
282
283				Takes in new stop list, in regex form
284
285				Parameter:the new stop list in regex substitution form (w1\|w2\|...\|wn)
286
287				Returns: nothing
288
289				=cut
290
# Replaces the stop list used when glosses are cleaned.
#
# Parameter: the new stop list as a parenthesised regex alternation,
#            e.g. '(the|is|at)'. May be called as a method or a function.
# Returns:   nothing. If the argument is rejected the current stop list is
#            kept and, when called on an object, a warning (error code 1) is
#            recorded for getError().
sub stopList
{
    my $base = (scalar @_ == 2) ? 1 : 0;    # skip the object when called as a method
    my $tempStopList = $_[$base];

    # Accept anything containing a parenthesised group. The listing's pattern
    # here was garbled; its optional pipe sub-group added no constraint, so
    # this shorter form accepts exactly the same strings.
    if (defined $tempStopList && $tempStopList =~ /\(.*\)/)
    {
        $stopList = $tempStopList;
        return;
    }

    # Only record the warning when there really is an object to hold it; a
    # function-style call has no object to dereference.
    my $self = $_[0];
    if ($base == 1 && UNIVERSAL::isa($self, 'HASH'))
    {
        $self->{error} = 1;
        $self->{errorString} = "Proposed stop list not in regex substition form (w1|w2|...|wn), default remains";
    }
    return;
}
310
311				=item $obj->setCleanUp($switch)
312
313				Allows the user to toggle whether or not
314				glosses should be cleaned up.
315
316				Parameter: 0 or 1 to turn clean up off or on respectively
317
318				Returns: nothing
319
320				=cut
321
# Switches gloss clean-up on or off.
#
# Parameter: 0 turns clean-up off; any other value turns it on. May be
#            called as a method or as a plain function.
sub setCleanUp()
{
    # With two arguments the switch follows the object.
    my $switch = (scalar @_ == 2) ? $_[1] : $_[0];

    $cleanUp = ($switch == 0) ? 0 : 1;
}
339
340				=item $obj->addCleanUp($cleanUp)
341
342				Allows the user to add their own
343				regex for cleaning up the glosses.
344
345				Parameter: Regex representing the cleanup
346				the user wants performed.
347
348				Returns: Nothing
349
350				=cut
351
# Stores a user-supplied clean-up expression for the glosses.
#
# Parameter: the expression as a string that contains an 's/.../' or 't/.../'
#            style operation, optionally followed by 'g'. May be called as a
#            method or as a plain function.
# Returns:   nothing. If the argument is rejected the previous value is kept
#            and, when called on an object, a warning (error code 1) is
#            recorded for getError().
sub addCleanUp
{
    my $base = (scalar @_ == 2) ? 1 : 0;    # skip the object when called as a method
    my $tempCleanUp = $_[$base];

    # 's' or 't' as a real alternation; the listing's escaped pipe would only
    # match the three literal characters "s|t".
    if (defined $tempCleanUp && $tempCleanUp =~ /(s|t)\/.*\/g?/)
    {
        $userCleanUp = $tempCleanUp;
        return;
    }

    # A function-style call has no object to store the warning on.
    my $self = $_[0];
    if ($base == 1 && UNIVERSAL::isa($self, 'HASH'))
    {
        $self->{error} = 1;
        $self->{errorString} = "Clean Up not in regex format '/.../', default remains on";
    }
    return;
}
372
373				=item $obj->preProcessing()
374
375				Highly increases speed of program by making
376				as many outside calls as possible and storing
377				outside info to be used later.
378
379				Parameter: none
380
381				Returns: nothing
382
383				=cut
384
# Queries WordNet once for everything the scoring methods need and caches it
# in the package-level arrays and hashes, so later scoring makes no outside
# calls. Glosses are cleaned and stemmed according to the current $cleanUp,
# $userCleanUp and $stemming settings.
#
# Parameter: none
# Returns:   nothing
sub preProcessing
{
    $preProcessed = 1; #Flag that preProcessing has been called.
    @wordNetNouns = $wn->listAllWords('noun'); #Stores all nouns from wordNet for multiple uses.
    @wordNetVerbs = $wn->listAllWords('verb'); #Stores all verbs from wordNet for multiple uses.

    # Reset every cache. The sense arrays must be emptied explicitly: this sub
    # runs again whenever BwS()/baseline() switch stemming, and a bare
    # "@array;" statement leaves the old senses in place, duplicating them.
    %wnGlosses    = ();
    @wnNounSenses = ();
    @wnVerbSenses = ();
    %wnHypes      = ();
    %wnHypos      = ();
    %wnSyns       = ();
    %wnFreq       = ();

    _cacheSenses('n', \@wordNetNouns, \@wnNounSenses);
    _cacheSenses('v', \@wordNetVerbs, \@wnVerbSenses);
    return;
}

# Caches gloss, hypernyms, hyponyms, synset and frequency for every sense of
# every word in @$words, using $posTag ('n' or 'v') to build the WordNet
# query, and appends each sense to @$senseList.
sub _cacheSenses
{
    my ($posTag, $words, $senseList) = @_;

    foreach my $word (@{$words})
    {
        foreach my $sense ($wn->querySense("${word}#${posTag}")) #all senses for that word
        {
            push(@{$senseList}, $sense);

            my @glosses = $wn->querySense($sense, "glos");
            $wnGlosses{$sense} = _prepareSenseGloss($glosses[0]);

            $wnHypes{$sense} = [ $wn->querySense($sense, "hype") ];
            $wnHypos{$sense} = [ $wn->querySense($sense, "hypo") ];
            $wnSyns{$sense}  = [ $wn->querySense($sense, "syns") ];
            $wnFreq{$sense}  = $wn->frequency($sense);
        }
    }
    return;
}

# Returns $gloss after the configured clean-up and stemming steps.
sub _prepareSenseGloss
{
    my ($gloss) = @_;

    if ($cleanUp == 1)
    {
        $gloss =~ s/(\(|\)|\.)//g;                # drop parentheses and full stops
        # NOTE(review): this only removes the literal text "a-zA-Z" at the
        # start of the gloss; it may have been meant as a character class.
        # Left unchanged because a different pattern would alter every score.
        $gloss =~ s/^a-zA-Z//g;
        $gloss = lc $gloss;                       # converts all words to lowercase.
        $gloss =~ s/(^|\s)$stopList(\s|$)/ /g;    # remove stop words
    }

    # The old guard compared against the two-character string '""', so it was
    # true even when no user clean-up had been set.
    if ($userCleanUp ne "")
    {
        # NOTE(review): this only tests the gloss against the stored text as a
        # pattern; it does not perform the user's substitution - confirm intent.
        $gloss =~ $userCleanUp;
    }

    #if stemming is on, stem each word in the gloss
    if ($stemming == 1)
    {
        my @words = split(' ', $gloss);
        $gloss = join(' ', @{$stemmer->stem(@words)});
    }

    return $gloss;
}
498
499				=item $obj->processLemma(@inLemma)
500
501				Determines where the OOV Lemma should be
502				inserted into WordNet, returns the output.
503
504				Parameter: the lemma to be inserted in array form
505				(lemma, part-of-speech, item-id, definition, def source)
506
507				Returns: chosen lemma in array form
508				(item-id, WordNet sense, operation)
509
510				=cut
511
# Scores every cached WordNet sense of the lemma's part of speech against the
# lemma and picks the best one as the insertion point.
#
# Parameter: reference to the lemma array
#            (lemma, part-of-speech, item-id, definition, def source).
#            May be called as a method or as a plain function.
# Returns:   reference to an array (item-id, WordNet sense, operation), where
#            operation is "attach" when the chosen sense has frequency 0 and
#            "merge" otherwise. If preProcessing() has not been run, returns a
#            reference to an empty array and, when called on an object,
#            records error code 2 for getError().
sub processLemma
{
    my $base = (scalar @_ == 2) ? 1 : 0;    # skip the object when called as a method

    if ($preProcessed != 1)
    {
        # Record the error only when an object is available. Callers
        # dereference the result, so it must always be an array reference.
        my $self = $_[0];
        if ($base == 1 && UNIVERSAL::isa($self, 'HASH'))
        {
            $self->{error} = 2;
            $self->{errorString} = "PreProcessing must be run before processLemma() is called.";
        }
        return [];
    }

    my @inLemma = @{$_[$base]};

    # Use the cached sense list by reference instead of copying it.
    my $senses = ($inLemma[1] =~ /noun/) ? \@wnNounSenses : \@wnVerbSenses;

    my $highSenseScore = 0;
    my $highSense      = "";
    foreach my $curSense (@{$senses})
    {
        my $score = scoreSense(\@inLemma, $curSense);

        # ">=" means the last sense wins a tie
        if ($score >= $highSenseScore)
        {
            $highSenseScore = $score;
            $highSense      = $curSense;
        }
    }

    if ($refineSense == 1)
    {
        $highSense = refineSense(\@inLemma, $highSense);
    }

    my $attachMerge = ($wnFreq{$highSense} == 0) ? "attach" : "merge";

    return [ $inLemma[2], $highSense, $attachMerge ];
}
578
579				=item $obj->toggleCompareGlosses($hype,$hypo,$syns)
580
581				Toggles which glosses are used in score sense.
582				by default, the sense, the sense's hypernyms'
583				glosses,hyponyms' glosses, and synsets' glosses
584				are turned on. This method allows for toggling
585				of hypes,hypos,synsets, by passing in three
586				parameters, 1 for on and 0 for off.
587				Example: toggleCompareGlosses(0,0,0) toggles
588				all three off.
589
590				Parameters: 0 or 1 for toggling hypernyms, hyponyms,
591				and synset comparisons.
592
593				Returns: nothing
594
595				=cut
596
# Chooses which related glosses take part in scoring.
#
# Parameters: three switches ($hype, $hypo, $syns); 0 turns the hypernym,
#             hyponym or synset glosses off, any other value turns them on.
#             May be called as a method or as a plain function.
# Returns:    nothing
sub toggleCompareGlosses
{
    my $base = (scalar @_ == 4) ? 1 : 0;    # skip the object when called as a method
    my ($hype, $hypo, $syns) = @_[$base .. $base + 2];

    $useHypeGlosses = ($hype == 0) ? 0 : 1;
    # A misspelt variable name here used to make it impossible to switch the
    # hyponym glosses back on.
    $useHypoGlosses = ($hypo == 0) ? 0 : 1;
    $useSynsGlosses = ($syns == 0) ? 0 : 1;
    return;
}
636
637				=item $obj->setBonus($bonus)
638
639				Allows the user to set the bonus that will be
640				used when scoring lemmas that contain the
641				new lemma.
642
643				Parameter: the multiplier that should be used in
644				calculating the bonus.
645
646				Returns: nothing
647
648				=cut
649
# Sets the multiplier applied when a sense's own word occurs in the gloss of
# the new lemma.
#
# Parameter: the new bonus value. May be called as a method or as a function.
sub setBonus()
{
    # With two arguments the value follows the object.
    my $valueIndex = (scalar @_ == 2) ? 1 : 0;

    $bonus = $_[$valueIndex];
}
660
661				=item $obj->scoreSense(@inLemma, $compareSense)
662
663				Serves as a wrapper method to facilitate the
664				main program by directing it to the currently
665				chosen scoring method. By default the average
666				highest scoring method is chosen. This can be
667				changed with setScoreMethod().
668
669				Parameters: the in lemma in array form
670				(lemma, part-of-speech, item-id, definition, def source)
671				and the sense that the lemma is being compared to.
672
673				Returns: a score of how related the in lemma is to the
674				compareSense.
675
676				=cut
677
# Dispatches to the scoring routine selected by $scoringMethod.
#
# Parameters: reference to the lemma array
#             (lemma, part-of-speech, item-id, definition, def source)
#             and the sense it is compared to. May be called as a method or
#             as a plain function.
# Returns:    the score from baseline(), BwS() or Similarity(); 0 for any
#             other scoring method.
sub scoreSense()
{
    # Three arguments means an object precedes the real arguments.
    my $offset   = (scalar @_ == 3) ? 1 : 0;
    my @inLemma  = @{$_[$offset]};
    my $curSense = $_[$offset + 1];

    my $score = 0;
    if ($scoringMethod eq "baseline")
    {
        $score = baseline(\@inLemma, $curSense);
    }
    elsif ($scoringMethod eq "BwS")
    {
        $score = BwS(\@inLemma, $curSense);
    }
    elsif ($scoringMethod eq "Similarity")
    {
        $score = Similarity(\@inLemma, $curSense);
    }

    return $score;
}
706
707				=item $obj->setScoreMethod($scoreMethod)
708
709				Allows the user to choose which scoring method
710				should be used by default when running the
711				program from the top. Options are:
712				'baseline'
713				'BwS' - baseline system with stemming and lemmatization
714				--as more are added they will appear here.
715
716				Parameter: the chosen scoring method
717
718				Returns: nothing.
719
720				=cut
721
# Selects the scoring method used by locate() and scoreSense().
#
# Parameter: the name of the method; it must be exactly one of the entries of
#            @scoringMethods. Any other value is ignored and the current
#            method stays in force. May be called as a method or a function.
# Returns:   nothing.
sub setScoreMethod
{
    my $base = (scalar @_ == 2) ? 1 : 0;    # skip the object when called as a method
    my $scoreMethod = $_[$base];
    return unless defined $scoreMethod;

    # Compare with "eq": a pattern match would accept fragments such as
    # 'base' or '', and scoreSense() then returns 0 for every sense.
    if (grep { $_ eq $scoreMethod } @scoringMethods)
    {
        $scoringMethod = $scoreMethod;
    }
    return;
}
741
742				=item $obj->Similarity(@inLemma, $compareSense)
743
744				Calculates a score for the passed sense and returns
745				that score.
746
747				Parameters: the in lemma in array form
748				(lemma, part-of-speech, item-id, definition, def source)
749				and the sense that the lemma is being compared to.
750
751				Returns: a score of how related the in lemma is to the
752				compareSense.
753
754				=cut
755
# Scores a sense by summing the relatedness, as reported by $measure, between
# that sense and the first WordNet sense of each noun, verb and adjective in
# the lemma's definition.
#
# Parameters: reference to the lemma array
#             (lemma, part-of-speech, item-id, definition, def source)
#             and the sense it is compared to. May be called as a method or
#             as a plain function.
# Returns:    the summed relatedness score.
sub Similarity
{
    my $base     = (scalar @_ == 3) ? 1 : 0;    # skip the object when called as a method
    my @inLemma  = @{$_[$base]};
    my $curSense = $_[$base + 1];

    #split definition and stem the words
    my @listDef    = split(' ', $inLemma[3]);
    my @defStemmed = @{$stemmer->stem(@listDef)};

    #join definition back together and tag with pos
    my $tagged = $tagger->add_tags(join(' ', @defStemmed));

    #step through each tagged word and collect its first sense in WordNet
    my @similar = ();
    foreach my $cur (split(' ', $tagged))
    {
        # NOTE(review): the three tests in the listing were all /.*/, which
        # made every word a noun. They are restored here on the assumption
        # that the tagger marks nouns, verbs and adjectives with tags starting
        # <n, <v and <j - the same letters the strip pattern below removes.
        my $pos = ($cur =~ /<n[^>]*>/) ? 'n'
                : ($cur =~ /<v[^>]*>/) ? 'v'
                : ($cur =~ /<j[^>]*>/) ? 'a'
                :                        '';
        next if $pos eq '';

        $cur =~ s/<[nvj\/].{1,3}>//g;    # strip the opening and closing tags
        my @wnQuery = $wn->querySense("$cur#$pos");

        # words unknown to WordNet have no sense to compare against
        push @similar, $wnQuery[0] if defined $wnQuery[0];
    }

    my $score = 0;
    foreach my $curSim (@similar)
    {
        my $value = $measure->getRelatedness("$curSense", "$curSim");
        $score = $score + $value if defined $value;
    }

    return $score;
}
823
824				=item $obj->BwS(@inLemma, $compareSense)
825
826				Calculates a score for the passed sense and returns
827				that score. This is a modified baseline() method
828				which adds stemming to the data.
829
830				Parameters: the in lemma in array form
831				(lemma, part-of-speech, item-id, definition, def source)
832				and the sense that the lemma is being compared to.
833
834				Returns: a score of how related the in lemma is to the
835				compareSense.
836
837				=cut
838
# Baseline-with-stemming score: makes sure the cached glosses are stemmed,
# then delegates to simpleScoreSense().
#
# Parameters: reference to the lemma array
#             (lemma, part-of-speech, item-id, definition, def source)
#             and the sense it is compared to. May be called as a method or
#             as a plain function.
# Returns:    the score computed by simpleScoreSense().
sub BwS()
{
    # Three arguments means an object precedes the real arguments.
    my $first    = (scalar @_ == 3) ? 1 : 0;
    my @inLemma  = @{$_[$first]};
    my $curSense = $_[$first + 1];

    # Rebuild the cache with stemming the first time the glosses are unstemmed.
    if ($stemmed == 0)
    {
        $stemming = 1;
        preProcessing();
        $stemmed = 1;
    }

    return simpleScoreSense(\@inLemma, $curSense);
}
861
862				=item $obj->baseline(@inLemma, $compareSense)
863
864				Calculates a score for the passed sense then returns
865				that score. This class is a wrapper for the
866				simpleScoreSense() method as it makes sure no stemming
867				or lemmatization is present in the preProcessing().
868
869				Parameters: the in lemma in array form
870				(lemma, part-of-speech, item-id, definition, def source)
871				and the sense that the lemma is being compared to.
872
873				Returns: a score of how related the in lemma is to the
874				compareSense.
875
876				=cut
877
# Baseline score: makes sure the cached glosses are not stemmed, then
# delegates to simpleScoreSense().
#
# Parameters: reference to the lemma array
#             (lemma, part-of-speech, item-id, definition, def source)
#             and the sense it is compared to. May be called as a method or
#             as a plain function.
# Returns:    the score computed by simpleScoreSense().
sub baseline()
{
    # Three arguments means an object precedes the real arguments.
    my $first    = (scalar @_ == 3) ? 1 : 0;
    my @inLemma  = @{$_[$first]};
    my $curSense = $_[$first + 1];

    # Rebuild the cache without stemming if BwS() left stemmed glosses behind.
    if ($stemmed == 1)
    {
        $stemming = 0;
        preProcessing();
        $stemmed = 0;
    }

    return simpleScoreSense(\@inLemma, $curSense);
}
900
901				=item $obj->word2VecCompare(@inLemma)
902
903				Calculates a score for the passed sense by
904				using the gensim Word2Vec model trained on Google
905				news vectors.
906
907				Parameters: the in lemma in array form
908				(lemma, part-of-speech, item-id, definition, def source).
909				No comparison sense is passed; every WordNet word of that part of speech is a candidate.
910
911				Returns: the chosen location in array form
912				(item-id, WordNet sense, operation), or an empty array when no candidate is returned.
913
914				=cut
915
# Picks an insertion point with the external word2vec script. The confidence
# value, the lemma, its cleaned gloss and every candidate WordNet word are
# written to "tmpfile" in the current directory; the script's output is taken
# as the chosen word.
#
# Parameter: reference to the lemma array
#            (lemma, part-of-speech, item-id, definition, def source).
#            May be called as a method or as a plain function.
# Returns:   reference to an array (item-id, "word#pos#1", operation), or to
#            an empty array when the script prints nothing.
# Dies:      if "tmpfile" cannot be written.
sub word2VecCompare
{
    my $base    = (scalar @_ == 2) ? 1 : 0;    # skip the object when called as a method
    my @inLemma = @{$_[$base]};

    my $tempLemmaGloss = $inLemma[3];
    if ($cleanUp == 1)
    {
        #Clean up the words in the temp lemma gloss.
        $tempLemmaGloss =~ s/(\(|\)|\.)//g;                # drop parentheses and full stops
        $tempLemmaGloss =~ s/^a-zA-Z//g;                   # kept exactly as in the other clean-up code
        $tempLemmaGloss = lc $tempLemmaGloss;
        $tempLemmaGloss =~ s/(^|\s)$stopList(\s|$)/ /g;    # remove stop words
    }

    my $isNoun     = ($inLemma[1] eq 'noun');
    my $candidates = $isNoun ? \@wordNetNouns : \@wordNetVerbs;
    my $pos        = $isNoun ? 'n' : 'v';

    open(my $wnFile, '>', "tmpfile") or die "Cannot write tmpfile: $!";
    print {$wnFile} "$cValue\n";
    print {$wnFile} "$inLemma[0]\n"; #print OOV Lemma first which will be handled by python
    print {$wnFile} "$tempLemmaGloss\n";
    #list every candidate WordNet word, except words containing a stop word
    foreach my $curW (@{$candidates})
    {
        next if $curW =~ /(^|\s)$stopList(\s|$)/;
        print {$wnFile} "$curW\n";
    }
    # Buffered write errors only surface when the handle is closed.
    close $wnFile or die "Cannot close tmpfile: $!";

    my $ideal = `python -W ignore ~/bin/word2vecSimilarity.py tmpfile`;
    $ideal = "" unless defined $ideal;
    chomp $ideal;

    return [] if $ideal eq "";

    # %wnFreq is keyed by sense ("word#pos#n"), so look up the sense that is
    # returned; looking up the bare word never found an entry and always
    # produced "attach".
    my $sense       = "$ideal#$pos#1";
    my $attachMerge = $wnFreq{$sense} ? "merge" : "attach";

    return [ "$inLemma[2]", $sense, $attachMerge ];
}
1003
1004				=item $obj->setConfidenceValue()
1005
1006				Allows the user to set the confidence value for word2vecCompare().
1007				The confidence value is the cutoff for the similarity score. If
1008				the similarity score is below the confidence value it will be dropped.
1009				This aims to increase accuracy but will reduce recall.
1010
1011				Parameters: the new confidence value, default is set to 0
1012
1013				Returns: Nothing
1014
1015				=cut
1016
# Sets the confidence value that word2VecCompare() writes for the external
# word2vec script.
#
# Parameter: the new confidence value. May be called as a method or function.
sub setConfidenceValue()
{
    # With two arguments the value follows the object.
    my $valueIndex = (scalar @_ == 2) ? 1 : 0;

    $cValue = $_[$valueIndex];
}
1031
1032
1033				=item $obj->simpleScoreSense(@inLemma, $compareSense)
1034
1035				Calculates a score for the passed sense then
1036				returns that score. This is the baseline system which
1037				was submitted for SemEval16 task 14. This algorithm
1038				scores by overlapping words found in the lemma's gloss
1039				and also with the lemma's hypernym and hyponyms' glosses.
1040
1041				Parameters: the in lemma in array form
1042				(lemma, part-of-speech, item-id, definition, def source)
1043				and the sense that the lemma is being compared to.
1044
1045				Returns: a score of how related the in lemma is to the
1046				compareSense.
1047
1048				=cut
1049
# Baseline overlap score between the gloss of a new lemma and a WordNet sense.
# Each word of the lemma's gloss earns the length of every word of the sense's
# gloss and extended gloss that it matches. A bonus of $bonus times the word
# length is added when the gloss word contains the sense's own word, or is one
# half of a two-word sense name. The total is divided by the number of
# characters in the lemma's gloss words.
#
# Parameters: reference to the lemma array
#             (lemma, part-of-speech, item-id, definition, def source)
#             and the sense it is compared to. May be called as a method or
#             as a plain function.
# Returns:    the score; 0 when the lemma's gloss has no words left.
sub simpleScoreSense
{
    my $base     = (scalar @_ == 3) ? 1 : 0;    # skip the object when called as a method
    my @inLemma  = @{$_[$base]};
    my $curSense = $_[$base + 1];
    my $word     = substr($curSense, 0, index($curSense, '#')); #extracts base word.

    #_________________Sense Gloss_________________________________
    my @curSenseGloss = split(' ', $wnGlosses{$curSense});
    my @extendedGloss = getExtendedGloss($curSense);

    #________________Lemma Gloss_________________________________
    my $tempLemmaGloss = $inLemma[3];
    if ($cleanUp == 1)
    {
        #Clean up the words in the temp lemma gloss.
        $tempLemmaGloss =~ s/(\(|\)|\.)//g;                # drop parentheses and full stops
        $tempLemmaGloss =~ s/^a-zA-Z//g;                   # kept exactly as in the other clean-up code
        $tempLemmaGloss = lc $tempLemmaGloss;
        $tempLemmaGloss =~ s/(^|\s)$stopList(\s|$)/ /g;    # remove stop words
    }
    # The old guard compared against the two-character string '""', so it was
    # true even when no user clean-up had been set.
    if ($userCleanUp ne "")
    {
        # NOTE(review): this only tests the gloss against the stored text as a
        # pattern; it does not perform the user's substitution - confirm intent.
        $tempLemmaGloss =~ $userCleanUp;
    }
    my @curLemmaGloss = split(' ', $tempLemmaGloss);

    #__________________Overlaps__________________________________
    my $glossLength = 0;
    my $overlaps    = 0.0; #number of overlapped characters.

    # The sense word with spaces instead of underscores is the same for every
    # gloss word, so build it once.
    my $spaceWord = $word;
    $spaceWord =~ s/_/ /g;

    foreach my $lemmaWord (@curLemmaGloss)
    {
        $glossLength = $glossLength + length $lemmaWord;

        # Words are quoted with \Q..\E so they are compared literally and a
        # regex metacharacter in a word cannot break the pattern.
        if ($lemmaWord =~ /\b\Q$word\E\b/) #gloss word contains the sense's own word
        {
            $overlaps = $overlaps + $bonus * (length $word);
        }

        #gloss word is the first or second half of a two-word sense name
        if ($spaceWord =~ /(^\w+\s\b\Q$lemmaWord\E\b$)|(^\b\Q$lemmaWord\E\b\s\w+$)/)
        {
            $overlaps = $overlaps + $bonus * (length $lemmaWord);
        }

        # sense gloss first, then extended gloss
        foreach my $senseWord (@curSenseGloss, @extendedGloss)
        {
            if ($lemmaWord =~ /\b\Q$senseWord\E\b?/)
            {
                $overlaps = $overlaps + length $senseWord;
            }
        }
    }

    # An empty gloss (no definition, or only stop words) cannot overlap
    # anything; return 0 instead of dividing by zero.
    return 0 if $glossLength == 0;

    return $overlaps / $glossLength;
}
1129
=item $obj->getExtendedGloss($compareSense)

Builds the extended gloss for a sense: the words of the
glosses of its hypernyms, hyponyms and synset members,
for whichever of those sources are currently toggled on
($useHypeGlosses, $useHypoGlosses, $useSynsGlosses).

Parameter: the sense which the extended gloss is
based on

Returns: the words of the extended gloss, as a list in
list context and as a reference to an array in scalar
context. Senses with no recorded relations or gloss
contribute nothing.

=cut

sub getExtendedGloss
{
    my $base = 0;
    if(scalar @_ == 2) #checks if method entered by object.
    {
        $base = 1;
    }

    my $curSense = $_[$base];

    #extracts base word, used below to keep the sense's own entries out of the synset glosses.
    my $hashAt = index($curSense, '#');
    my $word = ($hashAt >= 0) ? substr($curSense, 0, $hashAt) : $curSense;

    #collect the related senses first, in the order hypernyms, hyponyms, synset members.
    my @relatedSenses = ();

    #__________________Hype Gloss_________________________________
    if($useHypeGlosses == 1)
    {
        push(@relatedSenses, @{ $wnHypes{$curSense} || [] });
    }

    #________________Hypo Gloss__________________________________
    if($useHypoGlosses == 1)
    {
        push(@relatedSenses, @{ $wnHypos{$curSense} || [] });
    }

    #_________________Syns Gloss_________________________________
    if($useSynsGlosses == 1)
    {
        #do not repeat sense: skip synset members that are the base word itself.
        push(@relatedSenses, grep { !/\b\Q$word\E\b/ } @{ $wnSyns{$curSense} || [] });
    }

    #look each gloss up by the related sense itself (not by its position in the list).
    my @extendedGloss = ();
    for my $relatedSense (@relatedSenses)
    {
        my $relatedGloss = $wnGlosses{$relatedSense};
        next if !defined $relatedGloss;

        push(@extendedGloss, split(' ', $relatedGloss));
    }

    #list-context callers (as documented) get the words themselves;
    #scalar-context callers keep receiving the array reference.
    return wantarray ? @extendedGloss : \@extendedGloss;
}
1218
=item $obj->toggleRefineSense($toggle)

Allows user to toggle refineSense() on/off.

Parameter: 0 or 1 to toggle the refine sense method
off or on respectively in the processLemma method.
Any non-zero value turns it on.

Returns: nothing

=cut

sub toggleRefineSense
{
    my $base = 0;
    if(scalar @_ == 2) #checks if method entered by object.
    {
        $base = 1;
    }

    #read the toggle after the object (if any); testing $_[0] directly
    #would compare the object itself to 0 and could never switch off.
    my $toggle = $_[$base];

    $refineSense = ($toggle == 0) ? 0 : 1;
}
1241
=item $obj->refineSense(\@inLemma, $highSense)

Refines the chosen sense, by determining which
numbered sense of the same word best matches the
lemma's definition.

Parameters: a reference to the in lemma array in form of
(lemma, part-of-speech, item-id, definition, def source)
and the sense which currently best matches the inlemma.

Returns: the new highest scoring sense. The first sense
("word#pos#1") is returned when no sense overlaps the
definition or the definition has no words.

=cut

sub refineSense
{
    my $base = 0;
    if(scalar @_ == 3) #checks if method entered by object.
    {
        $base = 1;
    }

    my @inLemma = @{$_[$base]};
    my $highSense = $_[$base + 1];

    #extracts base word; a sense without '#' is taken as the word itself.
    my $hashAt = index($highSense, '#');
    my $word = ($hashAt >= 0) ? substr($highSense, 0, $hashAt) : $highSense;

    my $shortSense = substr($inLemma[1], 0, 1);
    my $sense = $word . "#" . $shortSense;
    my $refineHigh = "$sense#1"; #assume first sense.
    my $highSenseScore = 0;
    my $tempLemmaGloss = $inLemma[3];

    if($cleanUp == 1)
    {
        #Clean up the words in the temp lemma gloss.
        $tempLemmaGloss =~ s/(\(|\)|\.)//g; #drop parentheses and periods
        #NOTE(review): without brackets this only removes a literal leading
        #"a-zA-Z"; a character class may have been intended - confirm.
        $tempLemmaGloss =~ s/^a-zA-Z//g;
        $tempLemmaGloss = lc $tempLemmaGloss;
        $tempLemmaGloss =~ s/(^|\s)$stopList(\s|$)/ /g; #remove stop words
    }
    if($userCleanUp ne "\"\"")
    {
        #NOTE(review): this matches against $userCleanUp but does not modify
        #the gloss - confirm how the user clean up is meant to be applied.
        $tempLemmaGloss =~ $userCleanUp;
    }

    my @refineLemmaGloss = split(' ', $tempLemmaGloss);

    #total length of the lemma gloss words; it is the same for every sense.
    my $rGlossLength = 0;
    for my $lemmaWord (@refineLemmaGloss)
    {
        $rGlossLength += length $lemmaWord;
    }
    if($rGlossLength == 0)
    {
        return $refineHigh; #nothing to compare (and nothing to divide by).
    }

    my @refineSenses = $wn->querySense($sense); #obtains the other senses for the same word.
    for my $rSense (@refineSenses)
    {
        my $tempSenseGloss = $wnGlosses{$rSense};
        next if !defined $tempSenseGloss;

        #overlap is counted per sense so earlier senses do not inflate later scores.
        my $rOverlaps = 0;
        for my $lemmaWord (@refineLemmaGloss)
        {
            next if $lemmaWord eq $word; #the word itself does not tell senses apart.

            #literal substring test, so punctuation in a word cannot act as regex syntax.
            if(index($tempSenseGloss, $lemmaWord) >= 0)
            {
                $rOverlaps += length $lemmaWord;
            }
        }

        my $rSenseScore = $rOverlaps/$rGlossLength;
        if($rSenseScore > $highSenseScore)
        {
            $highSenseScore = $rSenseScore;
            $refineHigh = $rSense;
        }
    }

    return $refineHigh;
}
1323
1324
#************printHelp()********************
# Calls printUsage(), then prints a short
# description of what the program does.
#***********************************************
sub printHelp()
{
    printUsage();

    #one entry per printed line; trailing spaces are part of the text.
    my @helpLines = (
        "Takes in lemmas from file and attempts to\n",
        "insert them into WordNet by first finding\n",
        "a hypernym, then either a) merging the \n",
        "lemma with the hypernym or b) attaching \n",
        "the lemma to the hypernym.\n",
    );

    #each line goes out through its own print call; the result of the
    #last print is handed back, as the final statement did before.
    my $printed = 1;
    for my $helpLine (@helpLines)
    {
        $printed = print $helpLine;
    }
    return $printed;
}
1337
1338				1;