File Coverage

blib/lib/WordNet/Extend/Locate.pm

Criterion	Covered	Total	%
statement	1	3	33.3
branch			n/a
condition			n/a
subroutine	1	1	100.0
pod			n/a
total	2	4	50.0

line	stmt	sub	time	code
1				# WordNet::Extend::Locate.pm version 0.030
2				# Updated: 01/16/17
3				#
4				# Ted Pedersen, University of Minnesota Duluth
5				# tpederse at d.umn.edu
6				#
7				# Jon Rusert, University of Minnesota Duluth
8				# ruse0008 at d.umn.edu
9				#
10				# This program is free software: you can redistribute it and/or modify
11				# it under the terms of the GNU General Public License as published by
12				# the Free Software Foundation, either version 3 of the License, or
13				# (at your option) any later version.
14				#
15				# This program is distributed in the hope that it will be useful,
16				# but WITHOUT ANY WARRANTY; without even the implied warranty of
17				# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18				# GNU General Public License for more details.
19				#
20				# You should have received a copy of the GNU General Public License
21				# along with this program. If not, see .
22				#
23
24				package WordNet::Extend::Locate;
25
26				=head1 NAME
27
28				WordNet::Extend::Locate - Perl modules for locating where in WordNet a
29				lemma should be inserted.
30
31				=head1 SYNOPSIS
32
33				=head2 Basic Usage Example
34
35				use WordNet::Extend::Locate;
36
37				my $locate = WordNet::Extend::Locate->new();
38
39				$locate->stopList('s/\b(the\|is\|at)\b//');
40
41				$locate->setCleanUp(1);
42
43				$locate->preProcessing();
44
45				$locate->toggleCompareGlosses(1,1,0);
46
47				$locate->setBonus(25);
48
49				$locate->toggleRefineSense(0);
50
51				print "Finding location for 'dog noun withdef.1 man's best friend'\n";
52
53				@location = @{$locate->locate("dog\tnoun\twithdef.1\tman\'s best friend")};
54
55				print "Location found: @location\n";
56
57				=head1 DESCRIPTION
58
59				=head2 Introduction
60
61				WordNet is a widely used tool in NLP and other research areas. A drawback of WordNet is the amount of time between updates. WordNet was last updated and released in December, 2006, and no further updates are planned. WordNet::Extend::Locate aims to help users decide where a good place to insert new lemmas into WordNet is by presenting several different methods to run. Users can then take the suggestion from Locate and use that with WordNet::Extend::Insert or simply use it as a guiding point and choose their own location.
62
63				=over
64				=cut
65
66	1	1	13174	use WordNet::QueryData;
	0
	0
67				#use Wiktionary::Parser;
68				use Getopt::Long;
69				use File::Spec;
70				use Lingua::Stem;
71				use Lingua::EN::Tagger;
72				use WordNet::Similarity::vector;
73				#use List::Util;
74
75				our ($VERSION, @ISA, @EXPORT, @EXPORT_OK, %EXPORT_TAGS);
76
77				@ISA = qw(Exporter);
78
79				%EXPORT_TAGS = ();
80
81				@EXPORT_OK = ();
82
83				@EXPORT = ();
84
85				$VERSION = '0.030';
86
87				#************Variables********************
88				$wn = WordNet::QueryData->new; #to be used to access data from wordnet
89				$stemmer = Lingua::Stem->new; #used to stem words for better overlaps etc.
90				$tagger = Lingua::EN::Tagger->new; #used to tag words' pos for similarity measure
91				$measure = WordNet::Similarity::vector->new ($wn); #used to measure similarity for Similarity
92				@wordNetNouns; #stores all words for noun sense from wordnet
93				@wordNetVerbs; #stores all words for verb sense from wordnet
94				%wnGlosses = ();
95				@wnNounSenses;
96				@wnVerbSenses;
97				%wnHypes = ();
98				%wnHypos = ();
99				%wnSyns = ();
100				%wnFreq = ();
101				#our $wikParser = Wiktionary::Parser->new(); #Parses data from wiktionary pages.
102				$stopList = "s/\b(the\|is\|at\|which\|on\|a\|an\|and\|or\|up)\b//g"; #default stop list.
103				$preProcessed = 0; #Flag to determine if preProcessing() has been called.
104				$cleanUp = 1; #If cleanUp is on, glosses will be cleanedUp, can be toggled with setCleanUp();
105				$userCleanUp = ""; #Cleanup step specified by user in addCleanUp();
106				$useHypeGlosses = 1; #Toggle for use of hypernym glosses in comparisons.
107				$useHypoGlosses = 1; #Toggle for use of hyponym glosses in comparisons.
108				$useSynsGlosses = 1; #Toggle for use of synset glosses in comparisons.
109				$bonus = 10; #Bonus to be used for lemmas that contain the new lemma. Can be set with setBonus();
110				$refineSense = 0; #Toggle for use of refineSense() method, default on.
111				$help = 0;
112				$scoringMethod = 'baseline';
113				@scoringMethods = ('baseline', 'BwS', 'Similarity');
114				$stemming = 0; #Toggle for stemming on or off.
115				$stemmed = 0; #flag for use in BwS
116				#*********************************************
117
118				GetOptions('help' => \$help);
119				if($help == 1)
120				{
121				printHelp();
122				exit(0);
123				}
124
125				=head2 Methods
126
127				The following methods are defined in this package:
128
129				=head3 Public methods
130
131				=over
132
133				=item $obj->new()
134
135				The constructor for WordNet::Extend::Locate objects.
136
137				Parameters: none.
138
139				Return value: the new blessed object
140
141				=cut
142
143				sub new
144				{
145				my $class = shift;
146				my $self = {};
147
148				$self->{errorString} = '';
149				$self->{error}=0;
150
151				bless $self, $class;
152
153				return $self;
154				}
155
156				=item $obj->getError()
157
158				Allows the object to check if any errors have occurred.
159				Returns an array ($error, $errorString), where $error
160				value equal to 1 represents a warning and 2 represents
161				an error and $errString contains the possible error.
162				For example, if a user forgets to run preProcessing() before
163				a method that relies on it, the error would be 2 and errorString
164				would mention that preProcessing had not been run.
165
166				Parameter: None
167
168				Returns: array of the form ($error, $errorString).
169
170				=cut
171				sub getError()
172				{
173				my $self = shift;
174				my $error = $self->{error};
175				my $errString = $self->{errorString};
176				$self->{error}=0;
177				$self->{errorString} = "";
178				$errString =~ s/^[\r\n\t ]+//;
179				return ($error, $errString);
180				}
181
182				=item $obj->locateFile($input_file, $output_file)
183
184				Attempts to locate best WordNet position for each word
185				from input file into WordNet, outputs results to output file.
186
187				Parameter: location of input file and output file respectively
188
189				Returns: nothing
190
191				=cut
192
193				sub locateFile()
194				{
195				my $input = File::Spec->canonpath($_[1]);
196				my $output = File::Spec->canonpath($_[2]);
197
198				#Attempts to open input data
199				open DATA, "$input" or die $!;
200				open (OUTDATA, '>', "$output") or die $!;
201
202				#if preProcessing() hasn't been called, call it.
203				if($preProcessed == 0)
204				{
205				preProcessing();
206				}
207
208				my @outLemma = ("","","");
209
210				while() #While lemmas are left in the input data
211				{
212				for $tempIn (split("\n")) #processes data line by line.
213				{
214				@outLemma = @{locate($tempIn)};
215
216				$tempOut = "$outLemma[0]\t$outLemma[1]\t$outLemma[2]\n";
217
218				print OUTDATA "$tempOut";
219				}
220				}
221				close DATA;
222				close OUTDATA;
223				}
224
225				=item $obj->locate($wordPosGloss)
226
227				Takes in single lemma with gloss and returns location of best
228				insertion point in WordNet.
229
230				Parameter: Lemma string in format of 'word\tpos\titem-id\tdef'
231				NOTE: String must only be separated by \t no space.
232
233				Returns: Array in format of (item-id, WordNet sense, operation)
234
235				=cut
236				sub locate()
237				{
238				my $base = 0;
239				if(scalar @_ == 2)#checks if method entered by object.
240				{
241				$base = 1;
242				}
243
244				#if preProcessing() hasn't been called, call it.
245				if($preProcessed == 0)
246				{
247				preProcessing();
248				}
249
250				my @inLemma = split("\t", $_[$base]); #stores lemma as formatted above
251				my @outLemma = @{processLemma(\@inLemma)};
252				return \@outLemma;
253
254				}
255
256				=item $obj->stopList($newStopList)
257
258				Takes in new stop list, in regex form
259
260				Parameter:the new stop list in regex substitution form s/.../g?
261
262				Returns: nothing
263
264				=cut
265
266				sub stopList()
267				{
268				my $base = 0;
269				if(scalar @_ == 2)#checks if method entered by object.
270				{
271				$base = 1;
272				}
273				my $tempStopList = $_[$base];
274				if($tempStopList =~ /s\/.*?\/g?/g)
275				{
276				$stopList = $tempStopList;
277				}
278				else
279				{
280				my $self = shift;
281				$self->{error} = 1;
282				$self->{errorString} = "Proposed stop list not in regex substition form s/.../g?, default remains";
283				}
284				}
285
286				=item $obj->setCleanUp($switch)
287
288				Allows the user to toggle whether or not
289				glosses should be cleaned up.
290
291				Parameter: 0 or 1 to turn clean up off or on respectively
292
293				Returns: nothing
294
295				=cut
296
297				sub setCleanUp()
298				{
299				my $base = 0;
300				if(scalar @_ == 2)#checks if method entered by object.
301				{
302				$base = 1;
303				}
304
305				if($_[$base] == 0) #turns cleanUp off.
306				{
307				$cleanUp = 0;
308				}
309				else #turns cleanUp on.
310				{
311				$cleanUp = 1;
312				}
313				}
314
315				=item $obj->addCleanUp($cleanUp)
316
317				Allows the user to add their own
318				regex for cleaning up the glosses.
319
320				Parameter: Regex representing the cleanup
321				the user wants performed.
322
323				Returns: Nothing
324
325				=cut
326
327				sub addCleanUp()
328				{
329				my $base = 0;
330				if(scalar @_ == 2)#checks if method entered by object.
331				{
332				$base = 1;
333				}
334
335				my $tempCleanUp = $_[$base];
336				if($tempCleanUp =~ /(s\|t)\/.*\/g?/g)
337				{
338				$userCleanUp = $tempCleanUp;
339				}
340				else
341				{
342				my $self = shift;
343				$self->{error} = 1;
344				$self->{errorString} = "Clean Up not in regex format '/.../', default remains on";
345				}
346				}
347
348				=item $obj->preProcessing()
349
350				Highly increases speed of program by making
351				as many outside calls as possible and storing
352				outside info to be used later.
353
354				Parameter: none
355
356				Returns: nothing
357
358				=cut
359
360				sub preProcessing()
361				{
362				$preProcessed = 1; #Flag that preProcessing has been called.
363				@wordNetNouns = $wn->listAllWords('noun'); #Stores all nouns from wordNet for multiple uses.
364				@wordNetVerbs = $wn->listAllWords('verb'); #Stores all verbs from wordNet for multiple uses.
365				#reset all glosses, senses, etc.
366				%wnGlosses = ();
367				@wnNounSenses;
368				@wnVerbSenses;
369				%wnHypes = ();
370				%wnHypos = ();
371				%wnSyns = ();
372				%wnFreq = ();
373
374
375				#Preemptively retrieves glosses, hypes, hypos, and syns for all senses as they will be used every iteration.
376				foreach my $noun (@wordNetNouns)
377				{
378				my @nSenses = $wn->querySense("$noun\#n"); #gets all senses for that word
379				foreach my $curNSense (@nSenses)
380				{
381				#stores in noun senses to differentiate from verbs.
382				push(@wnNounSenses, $curNSense);
383
384				#obtain each gloss and clean up before inserting into hash.
385				my @nGlosses = $wn->querySense($curNSense, "glos");
386				my $tempSenseGloss = $nGlosses[0];
387
388				if($cleanUp == 1)
389				{
390				#Clean up the words in the temporary sense gloss.
391				$tempSenseGloss =~ s/($\|$\|\.)//g;
392				$tempSenseGloss =~ s/^a-zA-Z//g;
393				$tempSenseGloss = lc $tempSenseGloss; #converts all words to lowercase.
394				$tempSenseGloss =~ $stopList; #remove stop words
395				}
396				if($userCleanUp ne "\"\"")
397				{
398				$tempSenseGloss =~ $userCleanUp;
399				}
400
401				#if stemming is on, stem each word in each gloss
402				if($stemming == 1)
403				{
404				my @tempStem = split(' ', $tempSenseGloss);
405				my @stemmedGloss = @{$stemmer->stem(@tempStem)};
406				$tempSenseGloss = join(' ', @stemmedGloss);
407				}
408
409				#maps each sense to its gloss
410				$wnGlosses{$curNSense} = $tempSenseGloss;
411
412				#obtains and stores, hypes, hypos, and syns
413				my @hypes = $wn->querySense($curNSense, "hype");
414				$wnHypes{$curNSense} = \@hypes;
415				my @hypos = $wn->querySense($curNSense, "hypo");
416				$wnHypos{$curNSense} = \@hypos;
417				my @syns = $wn->querySense($curNSense, "syns");
418				$wnSyns{$curNSense} = \@syns;
419				$wnFreq{$curNSense} = $wn->frequency($curNSense);
420				}
421				}
422
423				#stores verbs' senses' glosses, hypes, hypos, and syns.
424				foreach my $verb (@wordNetVerbs)
425				{
426				my @vSenses = $wn->querySense("$verb\#v"); #gets all senses for that word
427				foreach my $curVSense (@vSenses)
428				{
429				#stores in verb senses to differentiate later.
430				push(@wnVerbSenses, $curVSense);
431
432				#obtain each gloss and clean up before inserting into hash.
433				my @vGlosses = $wn->querySense($curVSense, "glos");
434				my $tempSenseGloss = $vGlosses[0];
435
436				if($cleanUp == 1)
437				{
438				#Clean up the words in the temporary sense gloss.
439				$tempSenseGloss =~ s/($\|$\|\.)//g;
440				$tempSenseGloss =~ s/^a-zA-Z//g;
441				$tempSenseGloss = lc $tempSenseGloss; #converts all words to lowercase.
442				$tempSenseGloss =~ s/\b$stopList\b//g; #remove stop words
443				}
444				if($userCleanUp ne "\"\"")
445				{
446				$tempSenseGloss =~ $userCleanUp;
447				}
448
449				#if stemming is on, stem each word in each gloss
450				if($stemming == 1)
451				{
452				my @tempStem = split(' ', $tempSenseGloss);
453				my @stemmedGloss = @{$stemmer->stem(@tempStem)};
454				$tempSenseGloss = join(' ', @stemmedGloss);
455				}
456
457				#maps each sense to its gloss
458				$wnGlosses{$curVSense} = $tempSenseGloss;
459
460				#obtains and stores, hypes, hypos, and syns
461				my @hypes = $wn->querySense($curVSense, "hype");
462				$wnHypes{$curVSense} = \@hypes;
463				my @hypos = $wn->querySense($curVSense, "hypo");
464				$wnHypos{$curVSense} = \@hypos;
465				my @syns = $wn->querySense($curVSense, "syns");
466				$wnSyns{$curVSense} = \@syns;
467				$wnFreq{$curVSense} = $wn->frequency($curVSense);
468				}
469				}
470
471
472				}
473
474				=item $obj->processLemma(@inLemma)
475
476				Determines where the OOV Lemma should be
477				inserted into WordNet, returns the output.
478
479				Parameter: the lemma to be inserted in array form
480				(lemma, part-of-speech, item-id, definition, def source)
481
482				Returns: chosen lemma in array form
483				(item-id, WordNet sense, operation)
484
485				=cut
486
487				sub processLemma()
488				{
489				my $base = 0;
490				if(scalar @_ == 2)#checks if method entered by object.
491				{
492				$base = 1;
493				}
494
495				my %senseScores = ();
496				my $highSenseScore = 0;
497				my $highSense = "";
498				my @inLemma = @{$_[$base]};
499				my @outLemma = ("","","");
500				my $attachMerge = "";
501				my @senses = ();
502
503				if($preProcessed == 1)
504				{
505				if($inLemma[1] =~ /noun/)
506				{
507				@senses = @wnNounSenses;
508				}
509				else
510				{
511				@senses = @wnVerbSenses;
512				}
513
514				foreach $curSense (@senses) #runs through each sense of current word
515				{
516				my $score = scoreSense(\@inLemma, $curSense);
517
518				if($score >= $highSenseScore)
519				{
520				$highSenseScore = $score;
521				$highSense = $curSense;
522				}
523
524				$senseScores{$curSense} = $score;
525				}
526
527				if($refineSense == 1)
528				{
529				$highSense = refineSense(\@inLemma, $highSense);
530				}
531
532				if($wnFreq{$highSense} == 0)
533				{
534				$attachMerge = "attach";
535				}
536				else
537				{
538				$attachMerge = "merge";
539				}
540
541				$outLemma[0] = $inLemma[2];
542				$outLemma[1] = $highSense;
543				$outLemma[2] = $attachMerge;
544				return \@outLemma;
545				}
546				else
547				{
548				my $self = shift;
549				$self->{error} = 2;
550				$self->{errorString} = "PreProcessing must be run before processLemma() is called.";
551				}
552				}
553
554				=item $obj->toggleCompareGlosses($hype,$hypo,$syns)
555
556				Toggles which glosses are used in score sense.
557				by default, the sense, the sense's hypernyms'
558				glosses,hyponyms' glosses, and synsets' glosses
559				are turned on. This method allows for toggling
560				of hypes,hypos,synsets, by passing in three
561				parameters, 1 for on and 0 for off.
562				Example: toggleCompareGlosses(0,0,0) toggles
563				all three off.
564
565				Parameters: 0 or 1 for toggling hypernyms, hyponyms,
566				and synset comparisons.
567
568				Returns: nothing
569
570				=cut
571
572				sub toggleCompareGlosses()
573				{
574				my $base = 0;
575				if(scalar @_ == 4)#checks if method entered by object.
576				{
577				$base = 1;
578				}
579
580				if($_[$base] == 0)
581				{
582				$useHypeGlosses = 0;
583				}
584				else
585				{
586				$useHypeGlosses = 1;
587				}
588
589				$base++;
590
591				if($_[$base] == 0)
592				{
593				$useHypoGlosses = 0;
594				}
595				else
596				{
597				$useHypoGlosess = 1;
598				}
599
600				$base++;
601
602				if($_[$base] == 0)
603				{
604				$useSynsGlosses = 0;
605				}
606				else
607				{
608				$useSynsGlosses = 1;
609				}
610				}
611
612				=item $obj->setBonus($bonus)
613
614				Allows the user to set the bonus that will be
615				used when scoring lemmas that contain the
616				new lemma.
617
618				Parameter: the multiplier that should be used in
619				calculating the bonus.
620
621				Returns: nothing
622
623				=cut
624
625				sub setBonus()
626				{
627				my $base = 0;
628				if(scalar @_ == 2)#checks if method entered by object.
629				{
630				$base = 1;
631				}
632
633				$bonus = $_[$base];
634				}
635
636				=item $obj->scoreSense(@inLemma, $compareSense)
637
638				Serves as a wrapper method to facilitate the
639				main program by directing it to the currently
640				chosen scoring method. By default the average
641				highest scoring method is chosen. This can be
642				changed with setScoreMethod().
643
644				Parameters: the in lemma in array form
645				(lemma, part-of-speech, item-id, definition, def source)
646				and the sense that the lemma is being compared to.
647
648				Returns: a score of how related the in lemma is to the
649				compareSense.
650
651				=cut
652
653				sub scoreSense()
654				{
655				my $base = 0;
656				if(scalar @_ == 3)#checks if method entered by object.
657				{
658				$base = 1;
659				}
660
661				my @inLemma = @{$_[$base]};
662				$base++;
663				my $curSense = $_[$base];
664
665				my $score = 0;
666				if($scoringMethod eq "baseline")
667				{
668				$score = baseline(\@inLemma, $curSense);
669				}
670				if($scoringMethod eq "BwS")
671				{
672				$score = BwS(\@inLemma, $curSense);
673				}
674				if($scoringMethod eq "Similarity")
675				{
676				$score = Similarity(\@inLemma, $curSense);
677				}
678
679				return $score;
680				}
681
682				=item $obj->setScoreMethod($scoreMethod)
683
684				Allows the user to choose which scoring method
685				should be used by default when running the
686				program from the top. Options are:
687				'baseline'
688				'BwS' - baseline system with stemming and lemmitization
689				--as more are added they will appear here.
690
691				Parameter: the chosen scoring method
692
693				Returns: nothing.
694
695				=cut
696
697				sub setScoreMethod()
698				{
699				my $base = 0;
700
701				if(scalar @_ == 2)#checks if method entered by object.
702				{
703				$base = 1;
704				}
705
706				my $scoreMethod = $_[$base];
707
708				#check if the score method is in scoring methods.
709				my @matches = grep(/$scoreMethod/, @scoringMethods);
710				if(scalar @matches > 0)
711				{
712				$scoringMethod = $scoreMethod;
713				}
714
715				}
716
717				=item $obj->Similarity(@inLemma, $compareSense)
718
719				Calculates a score for the passed sense and returns
720				that score.
721
722				Parameters: the in lemma in array form
723				(lemma, part-of-speech, item-id, definition, def source)
724				and the sense that the lemma is being compared to.
725
726				Returns: a score of how related the im lemma is to the
727				compareSense.
728
729				=cut
730
731				sub Similarity()
732				{
733				my $base = 0;
734				if(scalar @_ == 3)#checks if method entered by object.
735				{
736				$base = 1;
737				}
738
739				my @inLemma = @{$_[$base]};
740				$base++;
741				my $curSense = $_[$base];
742
743				my $def = @inLemma[3];
744
745				#split definition and stem the words
746				my @listDef =split(' ', $def);
747				my @defStemmed = @{$stemmer->stem(@listDef)};
748
749				#join definition back together and tag with pos
750				$def = join(' ', @defStemmed);
751				my $tagged = $tagger->add_tags($def);
752
753				#split the tagged definition for individual word processing
754				@tagArray = split(' ', $tagged);
755				my @similar = ();
756
757				#step through each tagged word and find the first sense in wordnet, then add that to the @similar list
758				foreach my $cur (@tagArray)
759				{
760				my $pos = '';
761				if($cur =~ /.*/)
762				{
763				$pos = 'n';
764				}
765				else
766				{
767				if($cur =~ /.*/)
768				{
769				$pos = 'v';
770				}
771				else
772				{
773				if($cur =~ /.*/)
774				{
775				$pos = 'a';
776				}
777				}
778				}
779
780				if(length $pos == 1)
781				{
782				$cur =~ s/<[nvj\/].{1,3}>//g;
783				@wnQuery = $wn->querySense("$cur#$pos");
784				push @similar, $wnQuery[0];
785				}
786				}
787
788				my $score = 0;
789
790				foreach my $curSim (@similar)
791				{
792				my $value = $measure->getRelatedness("$curSense", "$curSim");
793				$score = $score + $value;
794				}
795
796				return $score;
797				}
798
799				=item $obj->BwS(@inLemma, $compareSense)
800
801				Calculates a score for the passed sense and returns
802				that score. This is a modified baseline() method
803				which adds stemming to the data.
804
805				Parameters: the in lemma in array form
806				(lemma, part-of-speech, item-id, definition, def source)
807				and the sense that the lemma is being compared to.
808
809				Returns: a score of how related the in lemma is to the
810				compareSense.
811
812				=cut
813
814				sub BwS()
815				{
816				my $base = 0;
817				if(scalar @_ == 3)#checks if method entered by object.
818				{
819				$base = 1;
820				}
821
822				my @inLemma = @{$_[$base]};
823				$base++;
824				my $curSense = $_[$base];
825
826				if($stemmed == 0)
827				{
828				$stemming = 1;
829				preProcessing();
830				$stemmed = 1;
831				}
832
833				return simpleScoreSense(\@inLemma, $curSense);
834
835				}
836
837				=item $obj->baseline(@inLemma, $compareSense)
838
839				Calculates a score for the passed sense then returns
840				that score. This class is a wrapper for the
841				simpleScoreSense() method as it makes sure no stemming
842				or lemmatization is present in the preProcessing().
843
844				Parameters: the in lemma in array form
845				(lemma, part-of-speech, item-id, definition, def source)
846				and the sense that the lemma is being compared to.
847
848				Returns: a score of how related the in lemma is to the
849				compareSense.
850
851				=cut
852
853				sub baseline()
854				{
855				my $base = 0;
856				if(scalar @_ == 3)#checks if method entered by object.
857				{
858				$base = 1;
859				}
860
861				my @inLemma = @{$_[$base]};
862				$base++;
863				my $curSense = $_[$base];
864
865				if($stemmed == 1)
866				{
867				$stemming = 0;
868				preProcessing();
869				$stemmed = 0;
870				}
871
872				return simpleScoreSense(\@inLemma, $curSense);
873
874				}
875
876
877				=item $obj->simpleScoreSense(@inLemma, $compareSense)
878
879				Calculates a score for the passed sense then
880				returns that score. This is the baseline system which
881				was submitted for SemEval16 task 14. This algorithm
882				scores by overlapping words found in the lemma's gloss
883				and also with the lemma's hypernym and hyponyms' glosses.
884
885				Parameters: the in lemma in array form
886				(lemma, part-of-speech, item-id, definition, def source)
887				and the sense that the lemma is being compared to.
888
889				Returns: a score of how related the in lemma is to the
890				compareSense.
891
892				=cut
893
894				sub simpleScoreSense()
895				{
896				my $base = 0;
897				if(scalar @_ == 3)#checks if method entered by object.
898				{
899				$base = 1;
900				}
901
902				my @inLemma = @{$_[$base]};
903				$base++;
904				my $curSense = $_[$base];
905				my $word = substr($curSense, 0, index($curSense, '#')); #extracts base word.
906
907				#_________________Sense Gloss_________________________________
908				my @curSenseGloss = split (' ', $wnGlosses{$curSense}); #initialize current sense gloss.
909
910				my @extendedGloss = getExtendedGloss($curSense);
911
912				#________________Lemma Gloss_________________________________
913				my $tempLemmaGloss = $inLemma[3];
914
915
916				if($cleanUp == 1)
917				{
918				#Clean up the words in the temp lemma gloss.
919				$tempLemmaGloss =~ s/($\|$\|\.)//g;
920				$tempLemmaGloss =~ s/^a-zA-Z//g;
921				$tempLemmaGloss = lc $tempLemmaGloss;
922				$tempLemmaGloss =~ s/\b$stopList\b//g; #remove stop words
923				}
924				if($userCleanUp ne "\"\"")
925				{
926				$tempLemmaGloss =~ $userCleanUp;
927				}
928
929				my @curLemmaGloss = split(' ', $tempLemmaGloss);
930
931
932				#__________________Overlaps__________________________________
933				my $glossLength = 0;
934				my $overlaps = 0.0; #number of overlapped words.
935
936				#scan through each word from the sense gloss and see if any overlap on the lemma gloss.
937				for my $lWord (0..$#curLemmaGloss)
938				{
939				$glossLength = $glossLength + length $curLemmaGloss[$lWord];
940				if($curLemmaGloss[$lWord] =~ /\b$word\b/) #if lemma contains current word from sense itself
941				{
942				$overlaps = $overlaps + $bonus*(length $word);
943				}
944
945				$spaceWord = $word;
946				$spaceWord =~ s/_/ /g; #substitute underscores for spaces for comparison below
947				if($spaceWord =~ /(^\w+\s\b$curLemmaGloss[$lWord]\b$)\|(^\b$curLemmaGloss[$lWord]\b\s\w+$)/)
948				{
949				$overlaps = $overlaps + $bonus*(length $curLemmaGloss[$lWord]);
950				}
951
952				for my $sWord (0..$#curSenseGloss)
953				{
954				if($curLemmaGloss[$lWord] =~ /\b\Q$curSenseGloss[$sWord]\E\b?/)
955				{
956				$overlaps = $overlaps + length $curSenseGloss[$sWord];
957				}
958				}
959				for my $extWord (0..$#extendedGloss)
960				{
961				if($curLemmaGloss[$lWord] =~ /\b\Q$extendedGloss[$extWord]\E\b?/)
962				{
963				$overlaps = $overlaps + length $extendedGloss[$extWord];
964				}
965				}
966
967				}
968
969
970				$score = $overlaps/$glossLength;
971				return $score;
972				}
973
974				=item $obj->getExtendedGloss($compareSense)
975
976				Calculates the extended gloss based on which
977				glosses are toggled and returns an array
978				which contains the full glosses.
979
980				Parameter: the sense which the extended gloss is
981				based on
982
983				Returns: an array which contains the extended gloss
984
985				=cut
986
987				sub getExtendedGloss()
988				{
989				my $base = 0;
990				if(scalar @_ == 2)#checks if method entered by object.
991				{
992				$base = 1;
993				}
994
995				my $curSense = $_[$base];
996				my @extendedGloss = ();
997
998				#__________________Hype Gloss_________________________________
999				if($useHypeGlosses == 1)
1000				{
1001				#Now expands to hypernyms glosses in overlaps
1002				my @senseHypes = @{$wnHypes{$curSense}};
1003				my @senseHypeGloss = ();
1004				my $tempAllHypeGloss = "";
1005
1006				for my $hype (0..$#senseHypes)
1007				{
1008				my $tempHypeGloss = $wnGlosses{$hype};
1009
1010				$tempAllHypeGloss = $tempAllHypeGloss . " " . $tempHypeGloss;
1011				}
1012
1013				@senseHypeGloss = split(' ', $tempAllHypeGloss);
1014
1015				push(@extendedGloss, @senseHypeGloss);
1016				}
1017
1018				#________________Hypo Gloss__________________________________
1019				if($useHypoGlosses == 1)
1020				{
1021				#adds in hyponyms' glosses in overlaps
1022				my @senseHypos = @{$wnHypos{$curSense}};
1023				my @senseHypoGloss = ();
1024				my $tempAllHypoGloss = "";
1025
1026				for my $hypo (0..$#senseHypos)
1027				{
1028				my $tempHypoGloss = $wnGlosses{$hypo};
1029
1030				$tempAllHypoGloss = $tempAllHypoGloss . " " . $tempHypoGloss;
1031				}
1032
1033				@senseHypoGloss = split(' ', $tempAllHypoGloss);
1034				push(@extendedGloss, @senseHypoGloss);
1035				}
1036
1037				#_________________Syns Gloss_________________________________
1038				if($useSynsGlosses == 1)
1039				{
1040				#adds in synsets' glosses in overlaps
1041				my @senseSyns = @{$wnSyns{$curSense}};
1042				my @senseSynsGloss = ();
1043				my $tempAllSynsGloss = "";
1044
1045				for my $syns (0..$#senseSyns)
1046				{
1047				if(!($syns =~ /\b$word\b/)) #do not repeat sense
1048				{
1049				my $tempSynsGloss = $wnGlosses{$syns};
1050
1051				$tempAllSynsGloss = $tempAllSynsGloss . " " . $tempSynsGloss;
1052				}
1053				}
1054
1055				@senseSynsGloss = split(' ', $tempAllSynsGloss);
1056				push(@extendedGloss, @senseSynsGloss);
1057				}
1058
1059				return \@extendedGloss;
1060				}
1061
1062				=item $obj->toggleRefineSense($toggle)
1063
1064				Allows user to toggle refineSense() on/off.
1065
1066				Parameter: 0 or 1 to toggle the refine sense method
1067				on or off respectively in the processLemma method.
1068
1069				Returns: nothing
1070
1071				=cut
1072
1073				sub toggleRefineSense()
1074				{
1075				if($_[0] == 0)
1076				{
1077				$refineSense = 0;
1078				}
1079				else
1080				{
1081				$refineSense = 1;
1082				}
1083				}
1084
1085				=item $obj->refineSense(@inLemma, $highSense)
1086
1087				Refines chosen sense, by determing which
1088				numbered sense should be chosen.
1089
1090				Parameters: the in lemma in form of
1091				(lemma, part-of-speech, item-id, definition, def source)
1092				and the sense which currently bests matches the inlemma.
1093
1094				Returns:the new highest scoring sense
1095
1096				=cut
1097
1098				sub refineSense()
1099				{
1100				my $base = 0;
1101				if(scalar @_ == 3)#checks if method entered by object.
1102				{
1103				$base = 1;
1104				}
1105
1106				my @inLemma = @{$_[$base]};
1107
1108				$base++;
1109				my $highSense = $_[$base];
1110				my $word = substr($highSense, 0, index($highSense, '#')); #extracts base word.
1111				my $shortSense = substr($inLemma[1], 0, 1);
1112				my $sense = $word . "#" . $shortSense;
1113				my $highSenseScore = 0;
1114				my $rSenseScore = 0;
1115				my $refineHigh = "$sense#1"; #assume first sense.
1116				my $tempLemmaGloss = $inLemma[3];
1117
1118				if($cleanUp == 1)
1119				{
1120				#Clean up the words in the temp lemma gloss.
1121				$tempLemmaGloss =~ s/($\|$\|\.)//g;
1122				$tempLemmaGloss =~ s/^a-zA-Z//g;
1123				$tempLemmaGloss = lc $tempLemmaGloss;
1124				$tempLemmaGloss =~ $stopList; #remove stop words
1125				}
1126				if($userCleanUp ne "\"\"")
1127				{
1128				$tempLemmaGloss =~ $userCleanUp;
1129				}
1130
1131				my @refineLemmaGloss = split(' ', $tempLemmaGloss);
1132
1133				my $rGlossLength = 0.0;
1134				my $rOverlaps = 0.0;
1135				my @refineSenses = $wn->querySense($sense); #obtains the other senses for the same word.
1136				for my $rSense (0..$#refineSenses)
1137				{
1138				my $tempSenseGloss = $wnGlosses{$rSense};
1139
1140				for my $rLemma (0..$#refineLemmaGloss)
1141				{
1142				$rGlossLength = $rGlossLength + length $refineLemmaGloss[$rLemma];
1143				if($refineLemmaGlos[$rLemma] ne $word)
1144				{
1145				if($tempSenseGloss =~ /$refineLemmaGloss[$rLemma]/)
1146				{
1147				$rOverlaps = $rOverlaps + length $refineLemmaGloss[$rLemma];
1148				}
1149				}
1150
1151				}
1152
1153				$rSenseScore = $rOverlaps/$rGlossLength;
1154				if($rSenseScore > $highSenseScore)
1155				{
1156				$highSenseScore = $rSenseScore;
1157				$refineHigh = $rHypo;
1158				}
1159				}
1160
1161				$highSense = $refineHigh;
1162
1163				return $highSense;
1164
1165				}
1166
1167
1168				#************printHelp()********************
1169				# Prints indepth help guide to screen.
1170				#***********************************************
1171				sub printHelp()
1172				{
1173				printUsage();
1174				print "Takes in lemmas from file and attempts to\n";
1175				print "insert them into WordNet by first finding\n";
1176				print "a hypernym, then either a) merging the \n";
1177				print "lemma with the hypernym or b) attaching \n";
1178				print "the lemma to the hypernym.\n";
1179				}
1180
1181				1;