File Coverage

blib/lib/WordNet/Extend/Locate.pm
Criterion Covered Total %
statement 1 3 33.3
branch n/a
condition n/a
subroutine 1 1 100.0
pod n/a
total 2 4 50.0


line stmt bran cond sub pod time code
1             # WordNet::Extend::Locate.pm version 0.041
2             # Updated: 08/06/17
3             #
4             # Jon Rusert, University of Minnesota Duluth
5             # ruse0008 at d.umn.edu
6             #
7             # Ted Pedersen, University of Minnesota Duluth
8             # tpederse at d.umn.edu
9             #
10             # This program is free software: you can redistribute it and/or modify
11             # it under the terms of the GNU General Public License as published by
12             # the Free Software Foundation, either version 3 of the License, or
13             # (at your option) any later version.
14             #
15             # This program is distributed in the hope that it will be useful,
16             # but WITHOUT ANY WARRANTY; without even the implied warranty of
17             # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18             # GNU General Public License for more details.
19             #
20             # You should have received a copy of the GNU General Public License
21             # along with this program. If not, see .
22             #
23              
24             package WordNet::Extend::Locate;
25              
26             =head1 NAME
27              
28             WordNet::Extend::Locate - Perl modules for locating where in WordNet a
29             lemma should be inserted.
30              
31             =head1 SYNOPSIS
32              
33             =head2 Basic Usage Example
34              
35             use WordNet::Extend::Locate;
36              
37             my $locate = WordNet::Extend::Locate->new();
38              
39             $locate->stopList('(the|is|at)');
40              
41             $locate->setCleanUp(1);
42              
43             $locate->preProcessing();
44              
45             $locate->toggleCompareGlosses(1,1,0);
46              
47             $locate->setBonus(25);
48              
49             $locate->toggleRefineSense(0);
50              
51             print "Finding location for 'dog noun withdef.1 man's best friend'\n";
52              
53             @location = @{$locate->locate("dog\tnoun\twithdef.1\tman\'s best friend")};
54              
55             print "Location found: @location\n";
56              
57             =head1 DESCRIPTION
58              
59             =head2 Introduction
60              
61             WordNet is a widely used tool in NLP and other research areas. A drawback of WordNet is the amount of time between updates. WordNet was last updated and released in December, 2006, and no further updates are planned. WordNet::Extend::Locate aims to help users decide where a good place to insert new lemmas into WordNet is by presenting several different methods to run. Users can then take the suggestion from Locate and use that with WordNet::Extend::Insert or simply use it as a guiding point and choose their own location.
62              
63             =over
64             =cut
65              
66 1     1   80426 use WordNet::QueryData;
  0            
  0            
67             #use Wiktionary::Parser;
68             use Getopt::Long;
69             use File::Spec;
70             use Lingua::Stem;
71             use Lingua::EN::Tagger;
72             use WordNet::Similarity::vector;
73             #use List::Util;
74              
75             our ($VERSION, @ISA, @EXPORT, @EXPORT_OK, %EXPORT_TAGS);
76              
77             @ISA = qw(Exporter);
78              
79             %EXPORT_TAGS = ();
80              
81             @EXPORT_OK = ();
82              
83             @EXPORT = ();
84              
85             $VERSION = '0.041';
86              
87             #**************Variables**********************
88             $wn = WordNet::QueryData->new; #to be used to access data from wordnet
89             $stemmer = Lingua::Stem->new; #used to stem words for better overlaps etc.
90             $tagger = Lingua::EN::Tagger->new; #used to tag words' pos for similarity measure
91             $measure = WordNet::Similarity::vector->new ($wn); #used to measure similarity for Similarity
92             @wordNetNouns; #stores all words for noun sense from wordnet
93             @wordNetVerbs; #stores all words for verb sense from wordnet
94             %wnGlosses = ();
95             @wnNounSenses;
96             @wnVerbSenses;
97             %wnHypes = ();
98             %wnHypos = ();
99             %wnSyns = ();
100             %wnFreq = ();
101             #our $wikParser = Wiktionary::Parser->new(); #Parses data from wiktionary pages.
102             #$stopList = "(the|is|at|which|on|a|an|and|or|up|in|so)"; #default stop list.
103             $stopList = "(a|about|above|after|again|against|all|am|an|and|any|are|aren't|as|at|be|because|been|before|being|below|between|both|but|by|can't|cannot|could|couldn't|did|didn't|do|does|doesn't|doing|don't|down|during|each|few|for|from|further|had|hadn't|has|hasn't|have|haven't|having|he|he'd|he'll|he's|her|here|here's|hers|herself|him|himself|his|how|how's|i|i'd|i'll|i'm|i've|if|in|into|is|isn't|it|it's|its|itself|let's|me|more|most|mustn't|my|myself|no|nor|not|of|off|on|once|only|or|other|ought|our|ours|ourselves|out|over|own|same|shan't|she|she'd|she'll|she's|should|shouldn't|so|some|such|than|that|that's|the|their|theirs|them|themselves|then|there|there's|these|they|they'd|they'll|they're|they've|this|those|through|to|too|under|until|up|very|was|wasn't|we|we'd|we'll|we're|we've|were|weren't|what|what's|when|when's|where|where's|which|while|who|who's|whom|why|why's|with|won't|would|wouldn't|you|you'd|you'll|you're|you've|your|yours|yourself|yourselves)";
104             $preProcessed = 0; #Flag to determine if preProcessing() has been called.
105             $cleanUp = 1; #If cleanUp is on, glosses will be cleanedUp, can be toggled with setCleanUp();
106             $userCleanUp = ""; #Cleanup step specified by user in addCleanUp();
107             $useHypeGlosses = 1; #Toggle for use of hypernym glosses in comparisons.
108             $useHypoGlosses = 1; #Toggle for use of hyponym glosses in comparisons.
109             $useSynsGlosses = 1; #Toggle for use of synset glosses in comparisons.
110             $bonus = 10; #Bonus to be used for lemmas that contain the new lemma. Can be set with setBonus();
111             $refineSense = 0; #Toggle for use of refineSense() method, default on.
112             $help = 0;
113             $scoringMethod = 'baseline';
114             @scoringMethods = ('baseline', 'BwS', 'Similarity', 'Word2Vec');
115             $stemming = 0; #Toggle for stemming on or off.
116             $stemmed = 0; #flag for use in BwS
117             $cValue = 0; #Confidence value for w2veccompare can be set in setConfidenceValue()
118             #*********************************************
119              
120             GetOptions('help' => \$help);
121             if($help == 1)
122             {
123             printHelp();
124             exit(0);
125             }
126              
127             =head2 Methods
128              
129             The following methods are defined in this package:
130              
131             =head3 Public methods
132              
133             =over
134              
135             =item $obj->new()
136              
137             The constructor for WordNet::Extend::Locate objects.
138              
139             Parameters: none.
140              
141             Return value: the new blessed object
142              
143             =cut
144              
145             sub new
146             {
147             my $class = shift;
148             my $self = {};
149              
150             $self->{errorString} = '';
151             $self->{error}=0;
152              
153             bless $self, $class;
154            
155             return $self;
156             }
157              
158             =item $obj->getError()
159              
160             Allows the object to check if any errors have occurred.
161             Returns an array ($error, $errorString), where $error
162             value equal to 1 represents a warning and 2 represents
163             an error and $errString contains the possible error.
164             For example, if a user forgets to run preProcessing() before
165             a method that relies on it, the error would be 2 and errorString
166             would mention that preProcessing had not been run.
167              
168             Parameter: None
169              
170             Returns: array of the form ($error, $errorString).
171              
172             =cut
173             sub getError()
174             {
175             my $self = shift;
176             my $error = $self->{error};
177             my $errString = $self->{errorString};
178             $self->{error}=0;
179             $self->{errorString} = "";
180             $errString =~ s/^[\r\n\t ]+//;
181             return ($error, $errString);
182             }
183              
184             =item $obj->locateFile($input_file, $output_file)
185              
186             Attempts to locate best WordNet position for each word
187             from input file into WordNet, outputs results to output file.
188              
189             Parameter: location of input file and output file respectively
190              
191             Returns: nothing
192              
193             =cut
194              
195             sub locateFile()
196             {
197             my $input = File::Spec->canonpath($_[1]);
198             my $output = File::Spec->canonpath($_[2]);
199            
200             #Attempts to open input data
201             open DATA, "$input" or die $!;
202             open (OUTDATA, '>', "$output") or die $!;
203            
204             #if preProcessing() hasn't been called, call it.
205             if($preProcessed == 0)
206             {
207             preProcessing();
208             }
209              
210             my @outLemma = ("","","");
211              
212             while() #While lemmas are left in the input data
213             {
214             for $tempIn (split("\n")) #processes data line by line.
215             {
216             @outLemma = @{locate($tempIn)};
217              
218             if(scalar @outLemma > 0)#only print if ideal lemma found
219             {
220             $tempOut = "$outLemma[0]\t$outLemma[1]\t$outLemma[2]\n";
221              
222             print OUTDATA "$tempOut";
223             }
224            
225             }
226             }
227             close DATA;
228             close OUTDATA;
229             }
230              
231             =item $obj->locate($wordPosGloss)
232              
233             Takes in single lemma with gloss and returns location of best
234             insertion point in WordNet.
235              
236             Parameter: Lemma string in format of 'word\tpos\titem-id\tdef'
237             NOTE: String must only be separated by \t no space.
238              
239             Returns: Array in format of (item-id, WordNet sense, operation)
240              
241             =cut
242             sub locate()
243             {
244             my $base = 0;
245             if(scalar @_ == 2)#checks if method entered by object.
246             {
247             $base = 1;
248             }
249              
250             #if preProcessing() hasn't been called, call it.
251             if($preProcessed == 0)
252             {
253             preProcessing();
254             }
255              
256             my @inLemma = ();
257             if(ref($_[$base]) eq 'ARRAY') #distinguishes between lemmas sent in as array vs string in \t format
258             {
259             @inLemma =@{$_[$base]};
260             }
261             else
262             {
263             @inLemma = split("\t", $_[$base]); #stores lemma as formatted above
264             }
265              
266             my @outLemma = ();
267             #word2vec handles all the wordnet words at once, while the other methods handle them one at a time
268             if($scoringMethod eq 'Word2Vec')
269             {
270             @outLemma = @{word2VecCompare(\@inLemma)};
271             }
272             else
273             {
274             @outLemma = @{processLemma(\@inLemma)};
275             }
276            
277             return \@outLemma;
278            
279             }
280              
281             =item $obj->stopList($newStopList)
282              
283             Takes in new stop list, in regex form
284              
285             Parameter:the new stop list in regex substitution form (w1|w2|...|wn)
286              
287             Returns: nothing
288              
289             =cut
290              
291             sub stopList()
292             {
293             my $base = 0;
294             if(scalar @_ == 2)#checks if method entered by object.
295             {
296             $base = 1;
297             }
298             my $tempStopList = $_[$base];
299             if($tempStopList =~ /\(.*(\|.*)?\)/g)
300             {
301             $stopList = $tempStopList;
302             }
303             else
304             {
305             my $self = shift;
306             $self->{error} = 1;
307             $self->{errorString} = "Proposed stop list not in regex substition form (w1|w2|...|wn), default remains";
308             }
309             }
310              
311             =item $obj->setCleanUp($switch)
312              
313             Allows the user to toggle whether or not
314             glosses should be cleaned up.
315              
316             Parameter: 0 or 1 to turn clean up off or on respectively
317              
318             Returns: nothing
319              
320             =cut
321              
322             sub setCleanUp()
323             {
324             my $base = 0;
325             if(scalar @_ == 2)#checks if method entered by object.
326             {
327             $base = 1;
328             }
329              
330             if($_[$base] == 0) #turns cleanUp off.
331             {
332             $cleanUp = 0;
333             }
334             else #turns cleanUp on.
335             {
336             $cleanUp = 1;
337             }
338             }
339              
340             =item $obj->addCleanUp($cleanUp)
341              
342             Allows the user to add their own
343             regex for cleaning up the glosses.
344              
345             Parameter: Regex representing the cleanup
346             the user wants performed.
347              
348             Returns: Nothing
349              
350             =cut
351              
352             sub addCleanUp()
353             {
354             my $base = 0;
355             if(scalar @_ == 2)#checks if method entered by object.
356             {
357             $base = 1;
358             }
359              
360             my $tempCleanUp = $_[$base];
361             if($tempCleanUp =~ /(s|t)\/.*\/g?/g)
362             {
363             $userCleanUp = $tempCleanUp;
364             }
365             else
366             {
367             my $self = shift;
368             $self->{error} = 1;
369             $self->{errorString} = "Clean Up not in regex format '/.../', default remains on";
370             }
371             }
372              
373             =item $obj->preProcessing()
374              
375             Highly increases speed of program by making
376             as many outside calls as possible and storing
377             outside info to be used later.
378              
379             Parameter: none
380              
381             Returns: nothing
382            
383             =cut
384              
385             sub preProcessing()
386             {
387             $preProcessed = 1; #Flag that preProcessing has been called.
388             @wordNetNouns = $wn->listAllWords('noun'); #Stores all nouns from wordNet for multiple uses.
389             @wordNetVerbs = $wn->listAllWords('verb'); #Stores all verbs from wordNet for multiple uses.
390             #reset all glosses, senses, etc.
391             %wnGlosses = ();
392             @wnNounSenses;
393             @wnVerbSenses;
394             %wnHypes = ();
395             %wnHypos = ();
396             %wnSyns = ();
397             %wnFreq = ();
398            
399            
400             #Preemptively retrieves glosses, hypes, hypos, and syns for all senses as they will be used every iteration.
401             foreach my $noun (@wordNetNouns)
402             {
403             my @nSenses = $wn->querySense("$noun\#n"); #gets all senses for that word
404             foreach my $curNSense (@nSenses)
405             {
406             #stores in noun senses to differentiate from verbs.
407             push(@wnNounSenses, $curNSense);
408              
409             #obtain each gloss and clean up before inserting into hash.
410             my @nGlosses = $wn->querySense($curNSense, "glos");
411             my $tempSenseGloss = $nGlosses[0];
412            
413             if($cleanUp == 1)
414             {
415             #Clean up the words in the temporary sense gloss.
416             $tempSenseGloss =~ s/(\(|\)|\.)//g;
417             $tempSenseGloss =~ s/^a-zA-Z//g;
418             $tempSenseGloss = lc $tempSenseGloss; #converts all words to lowercase.
419             $tempSenseGloss =~ s/(^|\s)$stopList(\s|$)/ /g; #remove stop words
420             }
421             if($userCleanUp ne "\"\"")
422             {
423             $tempSenseGloss =~ $userCleanUp;
424             }
425              
426             #if stemming is on, stem each word in each gloss
427             if($stemming == 1)
428             {
429             my @tempStem = split(' ', $tempSenseGloss);
430             my @stemmedGloss = @{$stemmer->stem(@tempStem)};
431             $tempSenseGloss = join(' ', @stemmedGloss);
432             }
433            
434             #maps each sense to its gloss
435             $wnGlosses{$curNSense} = $tempSenseGloss;
436            
437             #obtains and stores, hypes, hypos, and syns
438             my @hypes = $wn->querySense($curNSense, "hype");
439             $wnHypes{$curNSense} = \@hypes;
440             my @hypos = $wn->querySense($curNSense, "hypo");
441             $wnHypos{$curNSense} = \@hypos;
442             my @syns = $wn->querySense($curNSense, "syns");
443             $wnSyns{$curNSense} = \@syns;
444             $wnFreq{$curNSense} = $wn->frequency($curNSense);
445             }
446             }
447              
448             #stores verbs' senses' glosses, hypes, hypos, and syns.
449             foreach my $verb (@wordNetVerbs)
450             {
451             my @vSenses = $wn->querySense("$verb\#v"); #gets all senses for that word
452             foreach my $curVSense (@vSenses)
453             {
454             #stores in verb senses to differentiate later.
455             push(@wnVerbSenses, $curVSense);
456              
457             #obtain each gloss and clean up before inserting into hash.
458             my @vGlosses = $wn->querySense($curVSense, "glos");
459             my $tempSenseGloss = $vGlosses[0];
460            
461             if($cleanUp == 1)
462             {
463             #Clean up the words in the temporary sense gloss.
464             $tempSenseGloss =~ s/(\(|\)|\.)//g;
465             $tempSenseGloss =~ s/^a-zA-Z//g;
466             $tempSenseGloss = lc $tempSenseGloss; #converts all words to lowercase.
467             $tempSenseGloss =~ s/(^|\s)$stopList(\s|$)/ /g; #remove stop words
468             }
469             if($userCleanUp ne "\"\"")
470             {
471             $tempSenseGloss =~ $userCleanUp;
472             }
473              
474             #if stemming is on, stem each word in each gloss
475             if($stemming == 1)
476             {
477             my @tempStem = split(' ', $tempSenseGloss);
478             my @stemmedGloss = @{$stemmer->stem(@tempStem)};
479             $tempSenseGloss = join(' ', @stemmedGloss);
480             }
481            
482             #maps each sense to its gloss
483             $wnGlosses{$curVSense} = $tempSenseGloss;
484              
485             #obtains and stores, hypes, hypos, and syns
486             my @hypes = $wn->querySense($curVSense, "hype");
487             $wnHypes{$curVSense} = \@hypes;
488             my @hypos = $wn->querySense($curVSense, "hypo");
489             $wnHypos{$curVSense} = \@hypos;
490             my @syns = $wn->querySense($curVSense, "syns");
491             $wnSyns{$curVSense} = \@syns;
492             $wnFreq{$curVSense} = $wn->frequency($curVSense);
493             }
494             }
495              
496              
497             }
498              
499             =item $obj->processLemma(@inLemma)
500              
501             Determines where the OOV Lemma should be
502             inserted into WordNet, returns the output.
503              
504             Parameter: the lemma to be inserted in array form
505             (lemma, part-of-speech, item-id, definition, def source)
506              
507             Returns: chosen lemma in array form
508             (item-id, WordNet sense, operation)
509              
510             =cut
511              
512             sub processLemma()
513             {
514             my $base = 0;
515             if(scalar @_ == 2)#checks if method entered by object.
516             {
517             $base = 1;
518             }
519              
520             my %senseScores = ();
521             my $highSenseScore = 0;
522             my $highSense = "";
523             my @inLemma = @{$_[$base]};
524             my @outLemma = ("","","");
525             my $attachMerge = "";
526             my @senses = ();
527              
528             if($preProcessed == 1)
529             {
530             if($inLemma[1] =~ /noun/)
531             {
532             @senses = @wnNounSenses;
533             }
534             else
535             {
536             @senses = @wnVerbSenses;
537             }
538            
539             foreach $curSense (@senses) #runs through each sense of current word
540             {
541             my $score = scoreSense(\@inLemma, $curSense);
542            
543             if($score >= $highSenseScore)
544             {
545             $highSenseScore = $score;
546             $highSense = $curSense;
547             }
548            
549             $senseScores{$curSense} = $score;
550             }
551            
552             if($refineSense == 1)
553             {
554             $highSense = refineSense(\@inLemma, $highSense);
555             }
556            
557             if($wnFreq{$highSense} == 0)
558             {
559             $attachMerge = "attach";
560             }
561             else
562             {
563             $attachMerge = "merge";
564             }
565            
566             $outLemma[0] = $inLemma[2];
567             $outLemma[1] = $highSense;
568             $outLemma[2] = $attachMerge;
569             return \@outLemma;
570             }
571             else
572             {
573             my $self = shift;
574             $self->{error} = 2;
575             $self->{errorString} = "PreProcessing must be run before processLemma() is called.";
576             }
577             }
578              
579             =item $obj->toggleCompareGlosses($hype,$hypo,$syns)
580              
581             Toggles which glosses are used in score sense.
582             by default, the sense, the sense's hypernyms'
583             glosses,hyponyms' glosses, and synsets' glosses
584             are turned on. This method allows for toggling
585             of hypes,hypos,synsets, by passing in three
586             parameters, 1 for on and 0 for off.
587             Example: toggleCompareGlosses(0,0,0) toggles
588             all three off.
589              
590             Parameters: 0 or 1 for toggling hypernyms, hyponyms,
591             and synset comparisons.
592              
593             Returns: nothing
594              
595             =cut
596              
597             sub toggleCompareGlosses()
598             {
599             my $base = 0;
600             if(scalar @_ == 4)#checks if method entered by object.
601             {
602             $base = 1;
603             }
604              
605             if($_[$base] == 0)
606             {
607             $useHypeGlosses = 0;
608             }
609             else
610             {
611             $useHypeGlosses = 1;
612             }
613              
614             $base++;
615            
616             if($_[$base] == 0)
617             {
618             $useHypoGlosses = 0;
619             }
620             else
621             {
622             $useHypoGlosess = 1;
623             }
624            
625             $base++;
626              
627             if($_[$base] == 0)
628             {
629             $useSynsGlosses = 0;
630             }
631             else
632             {
633             $useSynsGlosses = 1;
634             }
635             }
636              
637             =item $obj->setBonus($bonus)
638              
639             Allows the user to set the bonus that will be
640             used when scoring lemmas that contain the
641             new lemma.
642              
643             Parameter: the multiplier that should be used in
644             calculating the bonus.
645              
646             Returns: nothing
647              
648             =ctu
649              
650             sub setBonus()
651             {
652             my $base = 0;
653             if(scalar @_ == 2)#checks if method entered by object.
654             {
655             $base = 1;
656             }
657              
658             $bonus = $_[$base];
659             }
660              
661             =item $obj->scoreSense(@inLemma, $compareSense)
662              
663             Serves as a wrapper method to facilitate the
664             main program by directing it to the currently
665             chosen scoring method. By default the average
666             highest scoring method is chosen. This can be
667             changed with setScoreMethod().
668              
669             Parameters: the in lemma in array form
670             (lemma, part-of-speech, item-id, definition, def source)
671             and the sense that the lemma is being compared to.
672              
673             Returns: a score of how related the in lemma is to the
674             compareSense.
675              
676             =cut
677              
678             sub scoreSense()
679             {
680             my $base = 0;
681             if(scalar @_ == 3)#checks if method entered by object.
682             {
683             $base = 1;
684             }
685              
686             my @inLemma = @{$_[$base]};
687             $base++;
688             my $curSense = $_[$base];
689              
690             my $score = 0;
691             if($scoringMethod eq "baseline")
692             {
693             $score = baseline(\@inLemma, $curSense);
694             }
695             if($scoringMethod eq "BwS")
696             {
697             $score = BwS(\@inLemma, $curSense);
698             }
699             if($scoringMethod eq "Similarity")
700             {
701             $score = Similarity(\@inLemma, $curSense);
702             }
703              
704             return $score;
705             }
706              
707             =item $obj->setScoreMethod($scoreMethod)
708              
709             Allows the user to choose which scoring method
710             should be used by default when running the
711             program from the top. Options are:
712             'baseline'
713             'BwS' - baseline system with stemming and lemmitization
714             --as more are added they will appear here.
715              
716             Parameter: the chosen scoring method
717              
718             Returns: nothing.
719              
720             =cut
721              
722             sub setScoreMethod()
723             {
724             my $base = 0;
725              
726             if(scalar @_ == 2)#checks if method entered by object.
727             {
728             $base = 1;
729             }
730              
731             my $scoreMethod = $_[$base];
732              
733             #check if the score method is in scoring methods.
734             my @matches = grep(/$scoreMethod/, @scoringMethods);
735             if(scalar @matches > 0)
736             {
737             $scoringMethod = $scoreMethod;
738             }
739            
740             }
741              
742             =item $obj->Similarity(@inLemma, $compareSense)
743              
744             Calculates a score for the passed sense and returns
745             that score.
746              
747             Parameters: the in lemma in array form
748             (lemma, part-of-speech, item-id, definition, def source)
749             and the sense that the lemma is being compared to.
750              
751             Returns: a score of how related the im lemma is to the
752             compareSense.
753              
754             =cut
755              
756             sub Similarity()
757             {
758             my $base = 0;
759             if(scalar @_ == 3)#checks if method entered by object.
760             {
761             $base = 1;
762             }
763              
764             my @inLemma = @{$_[$base]};
765             $base++;
766             my $curSense = $_[$base];
767              
768             my $def = @inLemma[3];
769              
770             #split definition and stem the words
771             my @listDef =split(' ', $def);
772             my @defStemmed = @{$stemmer->stem(@listDef)};
773              
774             #join definition back together and tag with pos
775             $def = join(' ', @defStemmed);
776             my $tagged = $tagger->add_tags($def);
777              
778             #split the tagged definition for individual word processing
779             @tagArray = split(' ', $tagged);
780             my @similar = ();
781              
782             #step through each tagged word and find the first sense in wordnet, then add that to the @similar list
783             foreach my $cur (@tagArray)
784             {
785             my $pos = '';
786             if($cur =~ /.*/)
787             {
788             $pos = 'n';
789             }
790             else
791             {
792             if($cur =~ /.*/)
793             {
794             $pos = 'v';
795             }
796             else
797             {
798             if($cur =~ /.*/)
799             {
800             $pos = 'a';
801             }
802             }
803             }
804              
805             if(length $pos == 1)
806             {
807             $cur =~ s/<[nvj\/].{1,3}>//g;
808             @wnQuery = $wn->querySense("$cur#$pos");
809             push @similar, $wnQuery[0];
810             }
811             }
812              
813             my $score = 0;
814              
815             foreach my $curSim (@similar)
816             {
817             my $value = $measure->getRelatedness("$curSense", "$curSim");
818             $score = $score + $value;
819             }
820            
821             return $score;
822             }
823              
824             =item $obj->BwS(@inLemma, $compareSense)
825              
826             Calculates a score for the passed sense and returns
827             that score. This is a modified baseline() method
828             which adds stemming to the data.
829              
830             Parameters: the in lemma in array form
831             (lemma, part-of-speech, item-id, definition, def source)
832             and the sense that the lemma is being compared to.
833            
834             Returns: a score of how related the in lemma is to the
835             compareSense.
836              
837             =cut
838              
839             sub BwS()
840             {
841             my $base = 0;
842             if(scalar @_ == 3)#checks if method entered by object.
843             {
844             $base = 1;
845             }
846              
847             my @inLemma = @{$_[$base]};
848             $base++;
849             my $curSense = $_[$base];
850              
851             if($stemmed == 0)
852             {
853             $stemming = 1;
854             preProcessing();
855             $stemmed = 1;
856             }
857              
858             return simpleScoreSense(\@inLemma, $curSense);
859              
860             }
861              
862             =item $obj->baseline(@inLemma, $compareSense)
863              
864             Calculates a score for the passed sense then returns
865             that score. This class is a wrapper for the
866             simpleScoreSense() method as it makes sure no stemming
867             or lemmatization is present in the preProcessing().
868              
869             Parameters: the in lemma in array form
870             (lemma, part-of-speech, item-id, definition, def source)
871             and the sense that the lemma is being compared to.
872            
873             Returns: a score of how related the in lemma is to the
874             compareSense.
875            
876             =cut
877              
878             sub baseline()
879             {
880             my $base = 0;
881             if(scalar @_ == 3)#checks if method entered by object.
882             {
883             $base = 1;
884             }
885              
886             my @inLemma = @{$_[$base]};
887             $base++;
888             my $curSense = $_[$base];
889              
890             if($stemmed == 1)
891             {
892             $stemming = 0;
893             preProcessing();
894             $stemmed = 0;
895             }
896              
897             return simpleScoreSense(\@inLemma, $curSense);
898            
899             }
900              
901             =item $obj->word2VecCompare(@inLemma)
902              
903             Calculates a score for the passed sense by
904             using the gensim Word2Vec model trained on Google
905             news vectors.
906              
907             Parameters: the in lemma in array form
908             (lemma, part-of-speech, item-id, definition, def source)
909             and the sense that the lemma is being compared to.
910              
911             Returns: a score of how related the in lemma is to the
912             compareSense.
913              
914             =cut
915              
916             sub word2VecCompare()
917             {
918             my $base = 0;
919             if(scalar @_ == 2)#checks if method entered by object.
920             {
921             $base = 1;
922             }
923              
924             my @inLemma = @{$_[$base]};
925             my @candidateArray = ();
926              
927             my $tempLemmaGloss = $inLemma[3];
928            
929             if($cleanUp == 1)
930             {
931             #Clean up the words in the temp lemma gloss.
932             $tempLemmaGloss =~ s/(\(|\)|\.)//g;
933             $tempLemmaGloss =~ s/^a-zA-Z//g;
934             $tempLemmaGloss = lc $tempLemmaGloss;
935             $tempLemmaGloss =~ s/(^|\s)$stopList(\s|$)/ /g; #remove stop words
936             }
937            
938            
939             if($inLemma[1] eq 'noun')
940             {
941             @candidateArray = @wordNetNouns;
942             }
943             else
944             {
945             @candidateArray = @wordNetVerbs;
946             }
947              
948             open (WNFILE, '>', "tmpfile") or die $!;
949             print WNFILE "$cValue\n";
950             print WNFILE "$inLemma[0]\n"; #print OOV Lemma first which will be handled by python
951             print WNFILE "$tempLemmaGloss\n";
952             #create a file of all candidate WordNet words to be passed to python word2vec
953             foreach $curW (@candidateArray)
954             {
955             if($curW !~ /(^|\s)$stopList(\s|$)/g)
956             {
957             print WNFILE "$curW\n";
958             }
959             }
960             close WNFILE;
961            
962             #open(my $ideal, "|-", "python ~/WordNet-Extend/word2vecSimilarity.py tmpfile $inLemma[0]") or die "Cannot run python script: $!";
963              
964             $ideal =`python -W ignore ~/bin/word2vecSimilarity.py tmpfile`;
965              
966             chomp $ideal;
967             my $attachMerge = "";
968             if($wnFreq{$ideal} == 0)
969             {
970             $attachMerge = "attach";
971             }
972             else
973             {
974             $attachMerge = "merge";
975             }
976            
977             my $pos = "";
978             if($inLemma[1] eq 'noun')
979             {
980             $pos = 'n';
981             }
982             else
983             {
984             $pos = 'v'
985             }
986              
987             my @outLemma = ();
988             if($ideal ne "")
989             {
990             @outLemma = ("$inLemma[2]", "$ideal#$pos#1", "$attachMerge");
991             }
992             # else
993             # {
994             # my $self = shift;
995             # $self->{error} = 1;
996             # $self->{errorString} = "No ideal found, consider changing confidence value";
997             # }
998             #unlink 'tmpfile';
999            
1000             return \@outLemma;
1001            
1002             }
1003              
1004             =item $obj->setConfidenceValue()
1005              
1006             Allows the user to set the confidence value for word2vecCompare().
1007             The confidence value is the cutoff for the similarity score. If
1008             the similarity score is below the confidence value it will be dropped.
1009             This aims to increase accuracy but will reduce recall.
1010              
1011             Parameters: the new confidence value, default is set to 0
1012              
1013             Returns: Nothing
1014              
1015             =cut
1016              
1017             sub setConfidenceValue()
1018             {
1019             my $base = 0;
1020              
1021             if(scalar @_ == 2)#checks if method entered by object
1022             {
1023             $base = 1;
1024             }
1025            
1026             my $newCValue = $_[$base];
1027              
1028             $cValue = $newCValue;
1029            
1030             }
1031              
1032              
1033             =item $obj->simpleScoreSense(@inLemma, $compareSense)
1034              
1035             Calculates a score for the passed sense then
1036             returns that score. This is the baseline system which
1037             was submitted for SemEval16 task 14. This algorithm
1038             scores by overlapping words found in the lemma's gloss
1039             and also with the lemma's hypernym and hyponyms' glosses.
1040              
1041             Parameters: the in lemma in array form
1042             (lemma, part-of-speech, item-id, definition, def source)
1043             and the sense that the lemma is being compared to.
1044              
1045             Returns: a score of how related the in lemma is to the
1046             compareSense.
1047              
1048             =cut
1049            
1050             sub simpleScoreSense()
1051             {
1052             my $base = 0;
1053             if(scalar @_ == 3)#checks if method entered by object.
1054             {
1055             $base = 1;
1056             }
1057              
1058             my @inLemma = @{$_[$base]};
1059             $base++;
1060             my $curSense = $_[$base];
1061             my $word = substr($curSense, 0, index($curSense, '#')); #extracts base word.
1062              
1063             #_________________Sense Gloss_________________________________
1064             my @curSenseGloss = split (' ', $wnGlosses{$curSense}); #initialize current sense gloss.
1065            
1066             my @extendedGloss = getExtendedGloss($curSense);
1067              
1068             #________________Lemma Gloss_________________________________
1069             my $tempLemmaGloss = $inLemma[3];
1070              
1071            
1072             if($cleanUp == 1)
1073             {
1074             #Clean up the words in the temp lemma gloss.
1075             $tempLemmaGloss =~ s/(\(|\)|\.)//g;
1076             $tempLemmaGloss =~ s/^a-zA-Z//g;
1077             $tempLemmaGloss = lc $tempLemmaGloss;
1078             $tempLemmaGloss =~ s/(^|\s)$stopList(\s|$)/ /g; #remove stop words
1079             }
1080             if($userCleanUp ne "\"\"")
1081             {
1082             $tempLemmaGloss =~ $userCleanUp;
1083             }
1084            
1085             my @curLemmaGloss = split(' ', $tempLemmaGloss);
1086              
1087              
1088             #__________________Overlaps__________________________________
1089             my $glossLength = 0;
1090             my $overlaps = 0.0; #number of overlapped words.
1091              
1092             #scan through each word from the sense gloss and see if any overlap on the lemma gloss.
1093             for my $lWord (0..$#curLemmaGloss)
1094             {
1095             $glossLength = $glossLength + length $curLemmaGloss[$lWord];
1096             if($curLemmaGloss[$lWord] =~ /\b$word\b/) #if lemma contains current word from sense itself
1097             {
1098             $overlaps = $overlaps + $bonus*(length $word);
1099             }
1100            
1101             $spaceWord = $word;
1102             $spaceWord =~ s/_/ /g; #substitute underscores for spaces for comparison below
1103             if($spaceWord =~ /(^\w+\s\b$curLemmaGloss[$lWord]\b$)|(^\b$curLemmaGloss[$lWord]\b\s\w+$)/)
1104             {
1105             $overlaps = $overlaps + $bonus*(length $curLemmaGloss[$lWord]);
1106             }
1107              
1108             for my $sWord (0..$#curSenseGloss)
1109             {
1110             if($curLemmaGloss[$lWord] =~ /\b\Q$curSenseGloss[$sWord]\E\b?/)
1111             {
1112             $overlaps = $overlaps + length $curSenseGloss[$sWord];
1113             }
1114             }
1115             for my $extWord (0..$#extendedGloss)
1116             {
1117             if($curLemmaGloss[$lWord] =~ /\b\Q$extendedGloss[$extWord]\E\b?/)
1118             {
1119             $overlaps = $overlaps + length $extendedGloss[$extWord];
1120             }
1121             }
1122            
1123             }
1124            
1125             $score = $overlaps/$glossLength;
1126            
1127             return $score;
1128             }
1129              
1130             =item $obj->getExtendedGloss($compareSense)
1131              
1132             Calculates the extended gloss based on which
1133             glosses are toggled and returns an array
1134              
1135             which contains the full glosses.
1136              
1137             Parameter: the sense which the extended gloss is
1138             based on
1139              
1140             Returns: an array which contains the extended gloss
1141              
1142             =cut
1143              
1144             sub getExtendedGloss()
1145             {
1146             my $base = 0;
1147             if(scalar @_ == 2)#checks if method entered by object.
1148             {
1149             $base = 1;
1150             }
1151              
1152             my $curSense = $_[$base];
1153             my @extendedGloss = ();
1154              
1155             #__________________Hype Gloss_________________________________
1156             if($useHypeGlosses == 1)
1157             {
1158             #Now expands to hypernyms glosses in overlaps
1159             my @senseHypes = @{$wnHypes{$curSense}};
1160             my @senseHypeGloss = ();
1161             my $tempAllHypeGloss = "";
1162            
1163             for my $hype (0..$#senseHypes)
1164             {
1165             my $tempHypeGloss = $wnGlosses{$hype};
1166            
1167             $tempAllHypeGloss = $tempAllHypeGloss . " " . $tempHypeGloss;
1168             }
1169            
1170             @senseHypeGloss = split(' ', $tempAllHypeGloss);
1171            
1172             push(@extendedGloss, @senseHypeGloss);
1173             }
1174            
1175             #________________Hypo Gloss__________________________________
1176             if($useHypoGlosses == 1)
1177             {
1178             #adds in hyponyms' glosses in overlaps
1179             my @senseHypos = @{$wnHypos{$curSense}};
1180             my @senseHypoGloss = ();
1181             my $tempAllHypoGloss = "";
1182            
1183             for my $hypo (0..$#senseHypos)
1184             {
1185             my $tempHypoGloss = $wnGlosses{$hypo};
1186            
1187             $tempAllHypoGloss = $tempAllHypoGloss . " " . $tempHypoGloss;
1188             }
1189            
1190             @senseHypoGloss = split(' ', $tempAllHypoGloss);
1191             push(@extendedGloss, @senseHypoGloss);
1192             }
1193              
1194             #_________________Syns Gloss_________________________________
1195             if($useSynsGlosses == 1)
1196             {
1197             #adds in synsets' glosses in overlaps
1198             my @senseSyns = @{$wnSyns{$curSense}};
1199             my @senseSynsGloss = ();
1200             my $tempAllSynsGloss = "";
1201            
1202             for my $syns (0..$#senseSyns)
1203             {
1204             if(!($syns =~ /\b$word\b/)) #do not repeat sense
1205             {
1206             my $tempSynsGloss = $wnGlosses{$syns};
1207            
1208             $tempAllSynsGloss = $tempAllSynsGloss . " " . $tempSynsGloss;
1209             }
1210             }
1211            
1212             @senseSynsGloss = split(' ', $tempAllSynsGloss);
1213             push(@extendedGloss, @senseSynsGloss);
1214             }
1215              
1216             return \@extendedGloss;
1217             }
1218              
1219             =item $obj->toggleRefineSense($toggle)
1220            
1221             Allows user to toggle refineSense() on/off.
1222            
1223             Parameter: 0 or 1 to toggle the refine sense method
1224             on or off respectively in the processLemma method.
1225              
1226             Returns: nothing
1227              
1228             =cut
1229              
1230             sub toggleRefineSense()
1231             {
1232             if($_[0] == 0)
1233             {
1234             $refineSense = 0;
1235             }
1236             else
1237             {
1238             $refineSense = 1;
1239             }
1240             }
1241              
1242             =item $obj->refineSense(@inLemma, $highSense)
1243            
1244             Refines chosen sense, by determing which
1245             numbered sense should be chosen.
1246              
1247             Parameters: the in lemma in form of
1248             (lemma, part-of-speech, item-id, definition, def source)
1249             and the sense which currently bests matches the inlemma.
1250              
1251             Returns:the new highest scoring sense
1252              
1253             =cut
1254              
1255             sub refineSense()
1256             {
1257             my $base = 0;
1258             if(scalar @_ == 3)#checks if method entered by object.
1259             {
1260             $base = 1;
1261             }
1262              
1263             my @inLemma = @{$_[$base]};
1264            
1265             $base++;
1266             my $highSense = $_[$base];
1267             my $word = substr($highSense, 0, index($highSense, '#')); #extracts base word.
1268             my $shortSense = substr($inLemma[1], 0, 1);
1269             my $sense = $word . "#" . $shortSense;
1270             my $highSenseScore = 0;
1271             my $rSenseScore = 0;
1272             my $refineHigh = "$sense#1"; #assume first sense.
1273             my $tempLemmaGloss = $inLemma[3];
1274            
1275             if($cleanUp == 1)
1276             {
1277             #Clean up the words in the temp lemma gloss.
1278             $tempLemmaGloss =~ s/(\(|\)|\.)//g;
1279             $tempLemmaGloss =~ s/^a-zA-Z//g;
1280             $tempLemmaGloss = lc $tempLemmaGloss;
1281             $tempLemmaGloss =~ s/(^|\s)$stopList(\s|$)/ /g; #remove stop words
1282             }
1283             if($userCleanUp ne "\"\"")
1284             {
1285             $tempLemmaGloss =~ $userCleanUp;
1286             }
1287              
1288             my @refineLemmaGloss = split(' ', $tempLemmaGloss);
1289            
1290             my $rGlossLength = 0.0;
1291             my $rOverlaps = 0.0;
1292             my @refineSenses = $wn->querySense($sense); #obtains the other senses for the same word.
1293             for my $rSense (0..$#refineSenses)
1294             {
1295             my $tempSenseGloss = $wnGlosses{$rSense};
1296            
1297             for my $rLemma (0..$#refineLemmaGloss)
1298             {
1299             $rGlossLength = $rGlossLength + length $refineLemmaGloss[$rLemma];
1300             if($refineLemmaGlos[$rLemma] ne $word)
1301             {
1302             if($tempSenseGloss =~ /$refineLemmaGloss[$rLemma]/)
1303             {
1304             $rOverlaps = $rOverlaps + length $refineLemmaGloss[$rLemma];
1305             }
1306             }
1307            
1308             }
1309              
1310             $rSenseScore = $rOverlaps/$rGlossLength;
1311             if($rSenseScore > $highSenseScore)
1312             {
1313             $highSenseScore = $rSenseScore;
1314             $refineHigh = $rHypo;
1315             }
1316             }
1317            
1318             $highSense = $refineHigh;
1319              
1320             return $highSense;
1321            
1322             }
1323              
1324              
1325             #**************printHelp()**********************
1326             # Prints indepth help guide to screen.
1327             #***********************************************
1328             sub printHelp()
1329             {
1330             printUsage();
1331             print "Takes in lemmas from file and attempts to\n";
1332             print "insert them into WordNet by first finding\n";
1333             print "a hypernym, then either a) merging the \n";
1334             print "lemma with the hypernym or b) attaching \n";
1335             print "the lemma to the hypernym.\n";
1336             }
1337              
1338             1;