File Coverage

blib/lib/WordNet/Extend/Locate.pm
Criterion Covered Total %
statement 1 3 33.3
branch n/a
condition n/a
subroutine 1 1 100.0
pod n/a
total 2 4 50.0


line stmt bran cond sub pod time code
1             # WordNet::Extend::Locate.pm version 0.031
2             # Updated: 01/16/17
3             #
4             # Ted Pedersen, University of Minnesota Duluth
5             # tpederse at d.umn.edu
6             #
7             # Jon Rusert, University of Minnesota Duluth
8             # ruse0008 at d.umn.edu
9             #
10             # This program is free software: you can redistribute it and/or modify
11             # it under the terms of the GNU General Public License as published by
12             # the Free Software Foundation, either version 3 of the License, or
13             # (at your option) any later version.
14             #
15             # This program is distributed in the hope that it will be useful,
16             # but WITHOUT ANY WARRANTY; without even the implied warranty of
17             # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18             # GNU General Public License for more details.
19             #
20             # You should have received a copy of the GNU General Public License
21             # along with this program. If not, see .
22             #
23              
24             package WordNet::Extend::Locate;
25              
26             =head1 NAME
27              
28             WordNet::Extend::Locate - Perl modules for locating where in WordNet a
29             lemma should be inserted.
30              
31             =head1 SYNOPSIS
32              
33             =head2 Basic Usage Example
34              
35             use WordNet::Extend::Locate;
36              
37             my $locate = WordNet::Extend::Locate->new();
38              
39             $locate->stopList('s/\b(the|is|at)\b//');
40              
41             $locate->setCleanUp(1);
42              
43             $locate->preProcessing();
44              
45             $locate->toggleCompareGlosses(1,1,0);
46              
47             $locate->setBonus(25);
48              
49             $locate->toggleRefineSense(0);
50              
51             print "Finding location for 'dog noun withdef.1 man's best friend'\n";
52              
53             @location = @{$locate->locate("dog\tnoun\twithdef.1\tman\'s best friend")};
54              
55             print "Location found: @location\n";
56              
57             =head1 DESCRIPTION
58              
59             =head2 Introduction
60              
61             WordNet is a widely used tool in NLP and other research areas. A drawback of WordNet is the amount of time between updates. WordNet was last updated and released in December, 2006, and no further updates are planned. WordNet::Extend::Locate aims to help users decide where a good place to insert new lemmas into WordNet is by presenting several different methods to run. Users can then take the suggestion from Locate and use that with WordNet::Extend::Insert or simply use it as a guiding point and choose their own location.
62              
63             =over
64             =cut
65              
66 1     1   24572 use WordNet::QueryData;
  0            
  0            
67             #use Wiktionary::Parser;
68             use Getopt::Long;
69             use File::Spec;
70             use Lingua::Stem;
71             use Lingua::EN::Tagger;
72             use WordNet::Similarity::vector;
73             #use List::Util;
74              
75             our ($VERSION, @ISA, @EXPORT, @EXPORT_OK, %EXPORT_TAGS);
76              
77             @ISA = qw(Exporter);
78              
79             %EXPORT_TAGS = ();
80              
81             @EXPORT_OK = ();
82              
83             @EXPORT = ();
84              
85             $VERSION = '0.031';
86              
87             #**************Variables**********************
88             $wn = WordNet::QueryData->new; #to be used to access data from wordnet
89             $stemmer = Lingua::Stem->new; #used to stem words for better overlaps etc.
90             $tagger = Lingua::EN::Tagger->new; #used to tag words' pos for similarity measure
91             $measure = WordNet::Similarity::vector->new ($wn); #used to measure similarity for Similarity
92             @wordNetNouns; #stores all words for noun sense from wordnet
93             @wordNetVerbs; #stores all words for verb sense from wordnet
94             %wnGlosses = ();
95             @wnNounSenses;
96             @wnVerbSenses;
97             %wnHypes = ();
98             %wnHypos = ();
99             %wnSyns = ();
100             %wnFreq = ();
101             #our $wikParser = Wiktionary::Parser->new(); #Parses data from wiktionary pages.
102             $stopList = "s/\b(the|is|at|which|on|a|an|and|or|up)\b//g"; #default stop list.
103             $preProcessed = 0; #Flag to determine if preProcessing() has been called.
104             $cleanUp = 1; #If cleanUp is on, glosses will be cleanedUp, can be toggled with setCleanUp();
105             $userCleanUp = ""; #Cleanup step specified by user in addCleanUp();
106             $useHypeGlosses = 1; #Toggle for use of hypernym glosses in comparisons.
107             $useHypoGlosses = 1; #Toggle for use of hyponym glosses in comparisons.
108             $useSynsGlosses = 1; #Toggle for use of synset glosses in comparisons.
109             $bonus = 10; #Bonus to be used for lemmas that contain the new lemma. Can be set with setBonus();
110             $refineSense = 0; #Toggle for use of refineSense() method, default on.
111             $help = 0;
112             $scoringMethod = 'baseline';
113             @scoringMethods = ('baseline', 'BwS', 'Similarity');
114             $stemming = 0; #Toggle for stemming on or off.
115             $stemmed = 0; #flag for use in BwS
116             #*********************************************
117              
118             GetOptions('help' => \$help);
119             if($help == 1)
120             {
121             printHelp();
122             exit(0);
123             }
124              
125             =head2 Methods
126              
127             The following methods are defined in this package:
128              
129             =head3 Public methods
130              
131             =over
132              
133             =item $obj->new()
134              
135             The constructor for WordNet::Extend::Locate objects.
136              
137             Parameters: none.
138              
139             Return value: the new blessed object
140              
141             =cut
142              
143             sub new
144             {
145             my $class = shift;
146             my $self = {};
147              
148             $self->{errorString} = '';
149             $self->{error}=0;
150              
151             bless $self, $class;
152            
153             return $self;
154             }
155              
156             =item $obj->getError()
157              
158             Allows the object to check if any errors have occurred.
159             Returns an array ($error, $errorString), where $error
160             value equal to 1 represents a warning and 2 represents
161             an error and $errString contains the possible error.
162             For example, if a user forgets to run preProcessing() before
163             a method that relies on it, the error would be 2 and errorString
164             would mention that preProcessing had not been run.
165              
166             Parameter: None
167              
168             Returns: array of the form ($error, $errorString).
169              
170             =cut
171             sub getError()
172             {
173             my $self = shift;
174             my $error = $self->{error};
175             my $errString = $self->{errorString};
176             $self->{error}=0;
177             $self->{errorString} = "";
178             $errString =~ s/^[\r\n\t ]+//;
179             return ($error, $errString);
180             }
181              
182             =item $obj->locateFile($input_file, $output_file)
183              
184             Attempts to locate best WordNet position for each word
185             from input file into WordNet, outputs results to output file.
186              
187             Parameter: location of input file and output file respectively
188              
189             Returns: nothing
190              
191             =cut
192              
193             sub locateFile()
194             {
195             my $input = File::Spec->canonpath($_[1]);
196             my $output = File::Spec->canonpath($_[2]);
197            
198             #Attempts to open input data
199             open DATA, "$input" or die $!;
200             open (OUTDATA, '>', "$output") or die $!;
201            
202             #if preProcessing() hasn't been called, call it.
203             if($preProcessed == 0)
204             {
205             preProcessing();
206             }
207              
208             my @outLemma = ("","","");
209              
210             while() #While lemmas are left in the input data
211             {
212             for $tempIn (split("\n")) #processes data line by line.
213             {
214             @outLemma = @{locate($tempIn)};
215            
216             $tempOut = "$outLemma[0]\t$outLemma[1]\t$outLemma[2]\n";
217              
218             print OUTDATA "$tempOut";
219             }
220             }
221             close DATA;
222             close OUTDATA;
223             }
224              
225             =item $obj->locate($wordPosGloss)
226              
227             Takes in single lemma with gloss and returns location of best
228             insertion point in WordNet.
229              
230             Parameter: Lemma string in format of 'word\tpos\titem-id\tdef'
231             NOTE: String must only be separated by \t no space.
232              
233             Returns: Array in format of (item-id, WordNet sense, operation)
234              
235             =cut
236             sub locate()
237             {
238             my $base = 0;
239             if(scalar @_ == 2)#checks if method entered by object.
240             {
241             $base = 1;
242             }
243              
244             #if preProcessing() hasn't been called, call it.
245             if($preProcessed == 0)
246             {
247             preProcessing();
248             }
249            
250             my @inLemma = split("\t", $_[$base]); #stores lemma as formatted above
251             my @outLemma = @{processLemma(\@inLemma)};
252             return \@outLemma;
253            
254             }
255              
256             =item $obj->stopList($newStopList)
257              
258             Takes in new stop list, in regex form
259              
260             Parameter:the new stop list in regex substitution form s/.../g?
261              
262             Returns: nothing
263              
264             =cut
265              
266             sub stopList()
267             {
268             my $base = 0;
269             if(scalar @_ == 2)#checks if method entered by object.
270             {
271             $base = 1;
272             }
273             my $tempStopList = $_[$base];
274             if($tempStopList =~ /s\/.*?\/g?/g)
275             {
276             $stopList = $tempStopList;
277             }
278             else
279             {
280             my $self = shift;
281             $self->{error} = 1;
282             $self->{errorString} = "Proposed stop list not in regex substition form s/.../g?, default remains";
283             }
284             }
285              
286             =item $obj->setCleanUp($switch)
287              
288             Allows the user to toggle whether or not
289             glosses should be cleaned up.
290              
291             Parameter: 0 or 1 to turn clean up off or on respectively
292              
293             Returns: nothing
294              
295             =cut
296              
297             sub setCleanUp()
298             {
299             my $base = 0;
300             if(scalar @_ == 2)#checks if method entered by object.
301             {
302             $base = 1;
303             }
304              
305             if($_[$base] == 0) #turns cleanUp off.
306             {
307             $cleanUp = 0;
308             }
309             else #turns cleanUp on.
310             {
311             $cleanUp = 1;
312             }
313             }
314              
315             =item $obj->addCleanUp($cleanUp)
316              
317             Allows the user to add their own
318             regex for cleaning up the glosses.
319              
320             Parameter: Regex representing the cleanup
321             the user wants performed.
322              
323             Returns: Nothing
324              
325             =cut
326              
327             sub addCleanUp()
328             {
329             my $base = 0;
330             if(scalar @_ == 2)#checks if method entered by object.
331             {
332             $base = 1;
333             }
334              
335             my $tempCleanUp = $_[$base];
336             if($tempCleanUp =~ /(s|t)\/.*\/g?/g)
337             {
338             $userCleanUp = $tempCleanUp;
339             }
340             else
341             {
342             my $self = shift;
343             $self->{error} = 1;
344             $self->{errorString} = "Clean Up not in regex format '/.../', default remains on";
345             }
346             }
347              
348             =item $obj->preProcessing()
349              
350             Highly increases speed of program by making
351             as many outside calls as possible and storing
352             outside info to be used later.
353              
354             Parameter: none
355              
356             Returns: nothing
357            
358             =cut
359              
360             sub preProcessing()
361             {
362             $preProcessed = 1; #Flag that preProcessing has been called.
363             @wordNetNouns = $wn->listAllWords('noun'); #Stores all nouns from wordNet for multiple uses.
364             @wordNetVerbs = $wn->listAllWords('verb'); #Stores all verbs from wordNet for multiple uses.
365             #reset all glosses, senses, etc.
366             %wnGlosses = ();
367             @wnNounSenses;
368             @wnVerbSenses;
369             %wnHypes = ();
370             %wnHypos = ();
371             %wnSyns = ();
372             %wnFreq = ();
373            
374            
375             #Preemptively retrieves glosses, hypes, hypos, and syns for all senses as they will be used every iteration.
376             foreach my $noun (@wordNetNouns)
377             {
378             my @nSenses = $wn->querySense("$noun\#n"); #gets all senses for that word
379             foreach my $curNSense (@nSenses)
380             {
381             #stores in noun senses to differentiate from verbs.
382             push(@wnNounSenses, $curNSense);
383              
384             #obtain each gloss and clean up before inserting into hash.
385             my @nGlosses = $wn->querySense($curNSense, "glos");
386             my $tempSenseGloss = $nGlosses[0];
387            
388             if($cleanUp == 1)
389             {
390             #Clean up the words in the temporary sense gloss.
391             $tempSenseGloss =~ s/(\(|\)|\.)//g;
392             $tempSenseGloss =~ s/^a-zA-Z//g;
393             $tempSenseGloss = lc $tempSenseGloss; #converts all words to lowercase.
394             $tempSenseGloss =~ $stopList; #remove stop words
395             }
396             if($userCleanUp ne "\"\"")
397             {
398             $tempSenseGloss =~ $userCleanUp;
399             }
400              
401             #if stemming is on, stem each word in each gloss
402             if($stemming == 1)
403             {
404             my @tempStem = split(' ', $tempSenseGloss);
405             my @stemmedGloss = @{$stemmer->stem(@tempStem)};
406             $tempSenseGloss = join(' ', @stemmedGloss);
407             }
408            
409             #maps each sense to its gloss
410             $wnGlosses{$curNSense} = $tempSenseGloss;
411            
412             #obtains and stores, hypes, hypos, and syns
413             my @hypes = $wn->querySense($curNSense, "hype");
414             $wnHypes{$curNSense} = \@hypes;
415             my @hypos = $wn->querySense($curNSense, "hypo");
416             $wnHypos{$curNSense} = \@hypos;
417             my @syns = $wn->querySense($curNSense, "syns");
418             $wnSyns{$curNSense} = \@syns;
419             $wnFreq{$curNSense} = $wn->frequency($curNSense);
420             }
421             }
422              
423             #stores verbs' senses' glosses, hypes, hypos, and syns.
424             foreach my $verb (@wordNetVerbs)
425             {
426             my @vSenses = $wn->querySense("$verb\#v"); #gets all senses for that word
427             foreach my $curVSense (@vSenses)
428             {
429             #stores in verb senses to differentiate later.
430             push(@wnVerbSenses, $curVSense);
431              
432             #obtain each gloss and clean up before inserting into hash.
433             my @vGlosses = $wn->querySense($curVSense, "glos");
434             my $tempSenseGloss = $vGlosses[0];
435            
436             if($cleanUp == 1)
437             {
438             #Clean up the words in the temporary sense gloss.
439             $tempSenseGloss =~ s/(\(|\)|\.)//g;
440             $tempSenseGloss =~ s/^a-zA-Z//g;
441             $tempSenseGloss = lc $tempSenseGloss; #converts all words to lowercase.
442             $tempSenseGloss =~ s/\b$stopList\b//g; #remove stop words
443             }
444             if($userCleanUp ne "\"\"")
445             {
446             $tempSenseGloss =~ $userCleanUp;
447             }
448              
449             #if stemming is on, stem each word in each gloss
450             if($stemming == 1)
451             {
452             my @tempStem = split(' ', $tempSenseGloss);
453             my @stemmedGloss = @{$stemmer->stem(@tempStem)};
454             $tempSenseGloss = join(' ', @stemmedGloss);
455             }
456            
457             #maps each sense to its gloss
458             $wnGlosses{$curVSense} = $tempSenseGloss;
459              
460             #obtains and stores, hypes, hypos, and syns
461             my @hypes = $wn->querySense($curVSense, "hype");
462             $wnHypes{$curVSense} = \@hypes;
463             my @hypos = $wn->querySense($curVSense, "hypo");
464             $wnHypos{$curVSense} = \@hypos;
465             my @syns = $wn->querySense($curVSense, "syns");
466             $wnSyns{$curVSense} = \@syns;
467             $wnFreq{$curVSense} = $wn->frequency($curVSense);
468             }
469             }
470              
471              
472             }
473              
474             =item $obj->processLemma(@inLemma)
475              
476             Determines where the OOV Lemma should be
477             inserted into WordNet, returns the output.
478              
479             Parameter: the lemma to be inserted in array form
480             (lemma, part-of-speech, item-id, definition, def source)
481              
482             Returns: chosen lemma in array form
483             (item-id, WordNet sense, operation)
484              
485             =cut
486              
487             sub processLemma()
488             {
489             my $base = 0;
490             if(scalar @_ == 2)#checks if method entered by object.
491             {
492             $base = 1;
493             }
494              
495             my %senseScores = ();
496             my $highSenseScore = 0;
497             my $highSense = "";
498             my @inLemma = @{$_[$base]};
499             my @outLemma = ("","","");
500             my $attachMerge = "";
501             my @senses = ();
502              
503             if($preProcessed == 1)
504             {
505             if($inLemma[1] =~ /noun/)
506             {
507             @senses = @wnNounSenses;
508             }
509             else
510             {
511             @senses = @wnVerbSenses;
512             }
513            
514             foreach $curSense (@senses) #runs through each sense of current word
515             {
516             my $score = scoreSense(\@inLemma, $curSense);
517            
518             if($score >= $highSenseScore)
519             {
520             $highSenseScore = $score;
521             $highSense = $curSense;
522             }
523            
524             $senseScores{$curSense} = $score;
525             }
526            
527             if($refineSense == 1)
528             {
529             $highSense = refineSense(\@inLemma, $highSense);
530             }
531            
532             if($wnFreq{$highSense} == 0)
533             {
534             $attachMerge = "attach";
535             }
536             else
537             {
538             $attachMerge = "merge";
539             }
540            
541             $outLemma[0] = $inLemma[2];
542             $outLemma[1] = $highSense;
543             $outLemma[2] = $attachMerge;
544             return \@outLemma;
545             }
546             else
547             {
548             my $self = shift;
549             $self->{error} = 2;
550             $self->{errorString} = "PreProcessing must be run before processLemma() is called.";
551             }
552             }
553              
554             =item $obj->toggleCompareGlosses($hype,$hypo,$syns)
555              
556             Toggles which glosses are used in score sense.
557             by default, the sense, the sense's hypernyms'
558             glosses,hyponyms' glosses, and synsets' glosses
559             are turned on. This method allows for toggling
560             of hypes,hypos,synsets, by passing in three
561             parameters, 1 for on and 0 for off.
562             Example: toggleCompareGlosses(0,0,0) toggles
563             all three off.
564              
565             Parameters: 0 or 1 for toggling hypernyms, hyponyms,
566             and synset comparisons.
567              
568             Returns: nothing
569              
570             =cut
571              
572             sub toggleCompareGlosses()
573             {
574             my $base = 0;
575             if(scalar @_ == 4)#checks if method entered by object.
576             {
577             $base = 1;
578             }
579              
580             if($_[$base] == 0)
581             {
582             $useHypeGlosses = 0;
583             }
584             else
585             {
586             $useHypeGlosses = 1;
587             }
588              
589             $base++;
590            
591             if($_[$base] == 0)
592             {
593             $useHypoGlosses = 0;
594             }
595             else
596             {
597             $useHypoGlosess = 1;
598             }
599            
600             $base++;
601              
602             if($_[$base] == 0)
603             {
604             $useSynsGlosses = 0;
605             }
606             else
607             {
608             $useSynsGlosses = 1;
609             }
610             }
611              
612             =item $obj->setBonus($bonus)
613              
614             Allows the user to set the bonus that will be
615             used when scoring lemmas that contain the
616             new lemma.
617              
618             Parameter: the multiplier that should be used in
619             calculating the bonus.
620              
621             Returns: nothing
622              
623             =cut
624              
625             sub setBonus()
626             {
627             my $base = 0;
628             if(scalar @_ == 2)#checks if method entered by object.
629             {
630             $base = 1;
631             }
632              
633             $bonus = $_[$base];
634             }
635              
636             =item $obj->scoreSense(@inLemma, $compareSense)
637              
638             Serves as a wrapper method to facilitate the
639             main program by directing it to the currently
640             chosen scoring method. By default the average
641             highest scoring method is chosen. This can be
642             changed with setScoreMethod().
643              
644             Parameters: the in lemma in array form
645             (lemma, part-of-speech, item-id, definition, def source)
646             and the sense that the lemma is being compared to.
647              
648             Returns: a score of how related the in lemma is to the
649             compareSense.
650              
651             =cut
652              
653             sub scoreSense()
654             {
655             my $base = 0;
656             if(scalar @_ == 3)#checks if method entered by object.
657             {
658             $base = 1;
659             }
660              
661             my @inLemma = @{$_[$base]};
662             $base++;
663             my $curSense = $_[$base];
664              
665             my $score = 0;
666             if($scoringMethod eq "baseline")
667             {
668             $score = baseline(\@inLemma, $curSense);
669             }
670             if($scoringMethod eq "BwS")
671             {
672             $score = BwS(\@inLemma, $curSense);
673             }
674             if($scoringMethod eq "Similarity")
675             {
676             $score = Similarity(\@inLemma, $curSense);
677             }
678              
679             return $score;
680             }
681              
682             =item $obj->setScoreMethod($scoreMethod)
683              
684             Allows the user to choose which scoring method
685             should be used by default when running the
686             program from the top. Options are:
687             'baseline'
688             'BwS' - baseline system with stemming and lemmitization
689             --as more are added they will appear here.
690              
691             Parameter: the chosen scoring method
692              
693             Returns: nothing.
694              
695             =cut
696              
697             sub setScoreMethod()
698             {
699             my $base = 0;
700              
701             if(scalar @_ == 2)#checks if method entered by object.
702             {
703             $base = 1;
704             }
705              
706             my $scoreMethod = $_[$base];
707              
708             #check if the score method is in scoring methods.
709             my @matches = grep(/$scoreMethod/, @scoringMethods);
710             if(scalar @matches > 0)
711             {
712             $scoringMethod = $scoreMethod;
713             }
714            
715             }
716              
717             =item $obj->Similarity(@inLemma, $compareSense)
718              
719             Calculates a score for the passed sense and returns
720             that score.
721              
722             Parameters: the in lemma in array form
723             (lemma, part-of-speech, item-id, definition, def source)
724             and the sense that the lemma is being compared to.
725              
726             Returns: a score of how related the im lemma is to the
727             compareSense.
728              
729             =cut
730              
731             sub Similarity()
732             {
733             my $base = 0;
734             if(scalar @_ == 3)#checks if method entered by object.
735             {
736             $base = 1;
737             }
738              
739             my @inLemma = @{$_[$base]};
740             $base++;
741             my $curSense = $_[$base];
742              
743             my $def = @inLemma[3];
744              
745             #split definition and stem the words
746             my @listDef =split(' ', $def);
747             my @defStemmed = @{$stemmer->stem(@listDef)};
748              
749             #join definition back together and tag with pos
750             $def = join(' ', @defStemmed);
751             my $tagged = $tagger->add_tags($def);
752              
753             #split the tagged definition for individual word processing
754             @tagArray = split(' ', $tagged);
755             my @similar = ();
756              
757             #step through each tagged word and find the first sense in wordnet, then add that to the @similar list
758             foreach my $cur (@tagArray)
759             {
760             my $pos = '';
761             if($cur =~ /.*/)
762             {
763             $pos = 'n';
764             }
765             else
766             {
767             if($cur =~ /.*/)
768             {
769             $pos = 'v';
770             }
771             else
772             {
773             if($cur =~ /.*/)
774             {
775             $pos = 'a';
776             }
777             }
778             }
779              
780             if(length $pos == 1)
781             {
782             $cur =~ s/<[nvj\/].{1,3}>//g;
783             @wnQuery = $wn->querySense("$cur#$pos");
784             push @similar, $wnQuery[0];
785             }
786             }
787              
788             my $score = 0;
789              
790             foreach my $curSim (@similar)
791             {
792             my $value = $measure->getRelatedness("$curSense", "$curSim");
793             $score = $score + $value;
794             }
795            
796             return $score;
797             }
798              
799             =item $obj->BwS(@inLemma, $compareSense)
800              
801             Calculates a score for the passed sense and returns
802             that score. This is a modified baseline() method
803             which adds stemming to the data.
804              
805             Parameters: the in lemma in array form
806             (lemma, part-of-speech, item-id, definition, def source)
807             and the sense that the lemma is being compared to.
808            
809             Returns: a score of how related the in lemma is to the
810             compareSense.
811              
812             =cut
813              
814             sub BwS()
815             {
816             my $base = 0;
817             if(scalar @_ == 3)#checks if method entered by object.
818             {
819             $base = 1;
820             }
821              
822             my @inLemma = @{$_[$base]};
823             $base++;
824             my $curSense = $_[$base];
825              
826             if($stemmed == 0)
827             {
828             $stemming = 1;
829             preProcessing();
830             $stemmed = 1;
831             }
832              
833             return simpleScoreSense(\@inLemma, $curSense);
834              
835             }
836              
837             =item $obj->baseline(@inLemma, $compareSense)
838              
839             Calculates a score for the passed sense then returns
840             that score. This class is a wrapper for the
841             simpleScoreSense() method as it makes sure no stemming
842             or lemmatization is present in the preProcessing().
843              
844             Parameters: the in lemma in array form
845             (lemma, part-of-speech, item-id, definition, def source)
846             and the sense that the lemma is being compared to.
847            
848             Returns: a score of how related the in lemma is to the
849             compareSense.
850            
851             =cut
852              
853             sub baseline()
854             {
855             my $base = 0;
856             if(scalar @_ == 3)#checks if method entered by object.
857             {
858             $base = 1;
859             }
860              
861             my @inLemma = @{$_[$base]};
862             $base++;
863             my $curSense = $_[$base];
864              
865             if($stemmed == 1)
866             {
867             $stemming = 0;
868             preProcessing();
869             $stemmed = 0;
870             }
871              
872             return simpleScoreSense(\@inLemma, $curSense);
873            
874             }
875            
876              
877             =item $obj->simpleScoreSense(@inLemma, $compareSense)
878              
879             Calculates a score for the passed sense then
880             returns that score. This is the baseline system which
881             was submitted for SemEval16 task 14. This algorithm
882             scores by overlapping words found in the lemma's gloss
883             and also with the lemma's hypernym and hyponyms' glosses.
884              
885             Parameters: the in lemma in array form
886             (lemma, part-of-speech, item-id, definition, def source)
887             and the sense that the lemma is being compared to.
888              
889             Returns: a score of how related the in lemma is to the
890             compareSense.
891              
892             =cut
893            
894             sub simpleScoreSense()
895             {
896             my $base = 0;
897             if(scalar @_ == 3)#checks if method entered by object.
898             {
899             $base = 1;
900             }
901              
902             my @inLemma = @{$_[$base]};
903             $base++;
904             my $curSense = $_[$base];
905             my $word = substr($curSense, 0, index($curSense, '#')); #extracts base word.
906              
907             #_________________Sense Gloss_________________________________
908             my @curSenseGloss = split (' ', $wnGlosses{$curSense}); #initialize current sense gloss.
909            
910             my @extendedGloss = getExtendedGloss($curSense);
911              
912             #________________Lemma Gloss_________________________________
913             my $tempLemmaGloss = $inLemma[3];
914              
915            
916             if($cleanUp == 1)
917             {
918             #Clean up the words in the temp lemma gloss.
919             $tempLemmaGloss =~ s/(\(|\)|\.)//g;
920             $tempLemmaGloss =~ s/^a-zA-Z//g;
921             $tempLemmaGloss = lc $tempLemmaGloss;
922             $tempLemmaGloss =~ s/\b$stopList\b//g; #remove stop words
923             }
924             if($userCleanUp ne "\"\"")
925             {
926             $tempLemmaGloss =~ $userCleanUp;
927             }
928            
929             my @curLemmaGloss = split(' ', $tempLemmaGloss);
930              
931              
932             #__________________Overlaps__________________________________
933             my $glossLength = 0;
934             my $overlaps = 0.0; #number of overlapped words.
935              
936             #scan through each word from the sense gloss and see if any overlap on the lemma gloss.
937             for my $lWord (0..$#curLemmaGloss)
938             {
939             $glossLength = $glossLength + length $curLemmaGloss[$lWord];
940             if($curLemmaGloss[$lWord] =~ /\b$word\b/) #if lemma contains current word from sense itself
941             {
942             $overlaps = $overlaps + $bonus*(length $word);
943             }
944            
945             $spaceWord = $word;
946             $spaceWord =~ s/_/ /g; #substitute underscores for spaces for comparison below
947             if($spaceWord =~ /(^\w+\s\b$curLemmaGloss[$lWord]\b$)|(^\b$curLemmaGloss[$lWord]\b\s\w+$)/)
948             {
949             $overlaps = $overlaps + $bonus*(length $curLemmaGloss[$lWord]);
950             }
951              
952             for my $sWord (0..$#curSenseGloss)
953             {
954             if($curLemmaGloss[$lWord] =~ /\b\Q$curSenseGloss[$sWord]\E\b?/)
955             {
956             $overlaps = $overlaps + length $curSenseGloss[$sWord];
957             }
958             }
959             for my $extWord (0..$#extendedGloss)
960             {
961             if($curLemmaGloss[$lWord] =~ /\b\Q$extendedGloss[$extWord]\E\b?/)
962             {
963             $overlaps = $overlaps + length $extendedGloss[$extWord];
964             }
965             }
966            
967             }
968              
969              
970             $score = $overlaps/$glossLength;
971             return $score;
972             }
973              
974             =item $obj->getExtendedGloss($compareSense)
975              
976             Calculates the extended gloss based on which
977             glosses are toggled and returns an array
978             which contains the full glosses.
979              
980             Parameter: the sense which the extended gloss is
981             based on
982              
983             Returns: an array which contains the extended gloss
984              
985             =cut
986              
987             sub getExtendedGloss()
988             {
989             my $base = 0;
990             if(scalar @_ == 2)#checks if method entered by object.
991             {
992             $base = 1;
993             }
994              
995             my $curSense = $_[$base];
996             my @extendedGloss = ();
997              
998             #__________________Hype Gloss_________________________________
999             if($useHypeGlosses == 1)
1000             {
1001             #Now expands to hypernyms glosses in overlaps
1002             my @senseHypes = @{$wnHypes{$curSense}};
1003             my @senseHypeGloss = ();
1004             my $tempAllHypeGloss = "";
1005            
1006             for my $hype (0..$#senseHypes)
1007             {
1008             my $tempHypeGloss = $wnGlosses{$hype};
1009            
1010             $tempAllHypeGloss = $tempAllHypeGloss . " " . $tempHypeGloss;
1011             }
1012            
1013             @senseHypeGloss = split(' ', $tempAllHypeGloss);
1014            
1015             push(@extendedGloss, @senseHypeGloss);
1016             }
1017            
1018             #________________Hypo Gloss__________________________________
1019             if($useHypoGlosses == 1)
1020             {
1021             #adds in hyponyms' glosses in overlaps
1022             my @senseHypos = @{$wnHypos{$curSense}};
1023             my @senseHypoGloss = ();
1024             my $tempAllHypoGloss = "";
1025            
1026             for my $hypo (0..$#senseHypos)
1027             {
1028             my $tempHypoGloss = $wnGlosses{$hypo};
1029            
1030             $tempAllHypoGloss = $tempAllHypoGloss . " " . $tempHypoGloss;
1031             }
1032            
1033             @senseHypoGloss = split(' ', $tempAllHypoGloss);
1034             push(@extendedGloss, @senseHypoGloss);
1035             }
1036              
1037             #_________________Syns Gloss_________________________________
1038             if($useSynsGlosses == 1)
1039             {
1040             #adds in synsets' glosses in overlaps
1041             my @senseSyns = @{$wnSyns{$curSense}};
1042             my @senseSynsGloss = ();
1043             my $tempAllSynsGloss = "";
1044            
1045             for my $syns (0..$#senseSyns)
1046             {
1047             if(!($syns =~ /\b$word\b/)) #do not repeat sense
1048             {
1049             my $tempSynsGloss = $wnGlosses{$syns};
1050            
1051             $tempAllSynsGloss = $tempAllSynsGloss . " " . $tempSynsGloss;
1052             }
1053             }
1054            
1055             @senseSynsGloss = split(' ', $tempAllSynsGloss);
1056             push(@extendedGloss, @senseSynsGloss);
1057             }
1058              
1059             return \@extendedGloss;
1060             }
1061              
1062             =item $obj->toggleRefineSense($toggle)
1063            
1064             Allows user to toggle refineSense() on/off.
1065            
1066             Parameter: 0 or 1 to toggle the refine sense method
1067             on or off respectively in the processLemma method.
1068              
1069             Returns: nothing
1070              
1071             =cut
1072              
1073             sub toggleRefineSense()
1074             {
1075             if($_[0] == 0)
1076             {
1077             $refineSense = 0;
1078             }
1079             else
1080             {
1081             $refineSense = 1;
1082             }
1083             }
1084              
1085             =item $obj->refineSense(@inLemma, $highSense)
1086            
1087             Refines chosen sense, by determing which
1088             numbered sense should be chosen.
1089              
1090             Parameters: the in lemma in form of
1091             (lemma, part-of-speech, item-id, definition, def source)
1092             and the sense which currently bests matches the inlemma.
1093              
1094             Returns:the new highest scoring sense
1095              
1096             =cut
1097              
1098             sub refineSense()
1099             {
1100             my $base = 0;
1101             if(scalar @_ == 3)#checks if method entered by object.
1102             {
1103             $base = 1;
1104             }
1105              
1106             my @inLemma = @{$_[$base]};
1107            
1108             $base++;
1109             my $highSense = $_[$base];
1110             my $word = substr($highSense, 0, index($highSense, '#')); #extracts base word.
1111             my $shortSense = substr($inLemma[1], 0, 1);
1112             my $sense = $word . "#" . $shortSense;
1113             my $highSenseScore = 0;
1114             my $rSenseScore = 0;
1115             my $refineHigh = "$sense#1"; #assume first sense.
1116             my $tempLemmaGloss = $inLemma[3];
1117            
1118             if($cleanUp == 1)
1119             {
1120             #Clean up the words in the temp lemma gloss.
1121             $tempLemmaGloss =~ s/(\(|\)|\.)//g;
1122             $tempLemmaGloss =~ s/^a-zA-Z//g;
1123             $tempLemmaGloss = lc $tempLemmaGloss;
1124             $tempLemmaGloss =~ $stopList; #remove stop words
1125             }
1126             if($userCleanUp ne "\"\"")
1127             {
1128             $tempLemmaGloss =~ $userCleanUp;
1129             }
1130              
1131             my @refineLemmaGloss = split(' ', $tempLemmaGloss);
1132            
1133             my $rGlossLength = 0.0;
1134             my $rOverlaps = 0.0;
1135             my @refineSenses = $wn->querySense($sense); #obtains the other senses for the same word.
1136             for my $rSense (0..$#refineSenses)
1137             {
1138             my $tempSenseGloss = $wnGlosses{$rSense};
1139            
1140             for my $rLemma (0..$#refineLemmaGloss)
1141             {
1142             $rGlossLength = $rGlossLength + length $refineLemmaGloss[$rLemma];
1143             if($refineLemmaGlos[$rLemma] ne $word)
1144             {
1145             if($tempSenseGloss =~ /$refineLemmaGloss[$rLemma]/)
1146             {
1147             $rOverlaps = $rOverlaps + length $refineLemmaGloss[$rLemma];
1148             }
1149             }
1150            
1151             }
1152              
1153             $rSenseScore = $rOverlaps/$rGlossLength;
1154             if($rSenseScore > $highSenseScore)
1155             {
1156             $highSenseScore = $rSenseScore;
1157             $refineHigh = $rHypo;
1158             }
1159             }
1160            
1161             $highSense = $refineHigh;
1162              
1163             return $highSense;
1164            
1165             }
1166              
1167              
1168             #**************printHelp()**********************
1169             # Prints indepth help guide to screen.
1170             #***********************************************
1171             sub printHelp()
1172             {
1173             printUsage();
1174             print "Takes in lemmas from file and attempts to\n";
1175             print "insert them into WordNet by first finding\n";
1176             print "a hypernym, then either a) merging the \n";
1177             print "lemma with the hypernym or b) attaching \n";
1178             print "the lemma to the hypernym.\n";
1179             }
1180              
1181             1;