File Coverage

blib/lib/WordNet/Similarity/lesk.pm
Criterion Covered Total %
statement 10 12 83.3
branch n/a
condition n/a
subroutine 4 4 100.0
pod n/a
total 14 16 87.5


line stmt bran cond sub pod time code
1             # WordNet::Similarity::lesk.pm version 2.04
2             # (Last updated $Id: lesk.pm,v 1.30 2015/10/04 15:06:16 tpederse Exp $)
3             #
4             # Module to accept two WordNet synsets and to return a floating point
5             # number that indicates how similar those two synsets are, using an
6             # adaptation of the Lesk method as outlined in
7             # Satanjeev Banerjee, Ted Pedersen (2002).
8             #
9             # Copyright (c) 2005,
10             #
11             # Ted Pedersen, University of Minnesota Duluth
12             # tpederse at d.umn.edu
13             #
14             # Satanjeev Banerjee, Carnegie Mellon University, Pittsburgh
15             # banerjee+ at cs.cmu.edu
16             #
17             # Siddharth Patwardhan, University of Utah, Salt Lake City
18             # sidd at cs.utah.edu
19             #
20             # This program is free software; you can redistribute it and/or
21             # modify it under the terms of the GNU General Public License
22             # as published by the Free Software Foundation; either version 2
23             # of the License, or (at your option) any later version.
24             #
25             # This program is distributed in the hope that it will be useful,
26             # but WITHOUT ANY WARRANTY; without even the implied warranty of
27             # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
28             # GNU General Public License for more details.
29             #
30             # You should have received a copy of the GNU General Public License
31             # along with this program; if not, write to
32             #
33             # The Free Software Foundation, Inc.,
34             # 59 Temple Place - Suite 330,
35             # Boston, MA 02111-1307, USA.
36             #
37             # ------------------------------------------------------------------
38              
39             package WordNet::Similarity::lesk;
40              
41             =head1 NAME
42              
43             WordNet::Similarity::lesk - Perl module for computing semantic relatedness
44             of word senses using gloss overlaps as described by Banerjee and Pedersen
45             (2002) -- a method that adapts the Lesk approach to WordNet.
46              
47             =head1 SYNOPSIS
48              
49             use WordNet::Similarity::lesk;
50              
51             use WordNet::QueryData;
52              
53             my $wn = WordNet::QueryData->new();
54              
55             my $lesk = WordNet::Similarity::lesk->new($wn);
56              
57             my $value = $lesk->getRelatedness("car#n#1", "bus#n#2");
58              
59             ($error, $errorString) = $lesk->getError();
60              
61             die "$errorString\n" if($error);
62              
63             print "car (sense 1) <-> bus (sense 2) = $value\n";
64              
65             =head1 DESCRIPTION
66              
67             Lesk (1985) proposed that the relatedness of two words is proportional to
68             the extent of overlaps of their dictionary definitions. Banerjee and
69             Pedersen (2002) extended this notion to use WordNet as the dictionary
70             for the word definitions. This notion was further extended to use the rich
71             network of relationships between concepts present in WordNet. This adapted
72             lesk measure has been implemented in this module.
73              
74             =head2 Methods
75              
76             =over
77              
78             =cut
79              
80 3     3   6258 use strict;
  3         5  
  3         113  
81 3     3   12 use warnings;
  3         5  
  3         73  
82 3     3   1947 use Text::OverlapFinder;
  3         4896  
  3         103  
83 3     3   100 use WordNet::Similarity;
  0            
  0            
84             use File::Spec;
85             use WordNet::Similarity::GlossFinder;
86              
87             our @ISA = qw(WordNet::Similarity::GlossFinder);
88             our $VERSION = '2.04';
89              
90             WordNet::Similarity::addConfigOption ("normalize", 0, "i", 0);
91              
92             =item $measure->initialize($file)
93              
94             Overrides the initialize method in the parent class (GlossFinder.pm). This method
95             essentially initializes the measure for use.
96              
97             Parameters: $file -- configuration file.
98              
99             Returns: none.
100              
101             =cut
102              
# Initialization of the WordNet::Similarity::lesk object. Resets the
# measure-specific options, locates a default lesk relation file on @INC
# (unless $self->{relationDefault} is already set), hands the remaining
# arguments to the parent class's initialize(), and finally creates the
# Text::OverlapFinder object stored in $self->{finder}.
# INPUT PARAMS  : $paramFile .. File containing the module specific params.
#                 It is not inspected here; @_ is forwarded unchanged to
#                 SUPER::initialize().
# RETURN VALUES : (none)
sub initialize
{
    my $self = shift;

    # Stemming? Normalizing? Both start out switched off, before the
    # parent's initialize() runs.
    $self->{stem}      = 0;
    $self->{normalize} = 0;

    # Look for the default lesk relation file...
    if (!defined $self->{relationDefault})
    {
        my @possiblePaths = ();

        # Look for all possible default data files installed.
        foreach my $dir (@INC)
        {
            # @INC may also contain hook references (code refs, objects);
            # those are not directories and cannot hold a data file.
            next if ref $dir;

            # JM 1-16-04 -- modified to use File::Spec
            my $file = File::Spec->catfile($dir, 'WordNet', 'lesk-relation.dat');
            push @possiblePaths, $file if (-e $file);
        }

        # If there are multiple possibilities, get the first one in the
        # correct format, i.e. whose first line mentions "RelationFile".
        foreach my $path (@possiblePaths)
        {
            # Three-argument open with a lexical handle: the path is never
            # interpreted as a mode, and the handle cannot leak out of
            # this loop. Unreadable candidates are simply skipped.
            open(my $relations, '<', $path) or next;
            my $header = <$relations>;
            close($relations);

            # An empty file has no header line at all.
            next if (!defined $header);

            $header =~ s/\s+//g;
            if ($header =~ /RelationFile/)
            {
                $self->{relationDefault} = $path;
                last;
            }
        }
    }

    # Call the initialize method in the parent...
    $self->SUPER::initialize(@_);

    # Set up the string comparison (overlap finding) module. The only
    # argument passed on is the stop list, and only when one is defined.
    my @finder_args = ();

    if (defined $self->{stop})
    {
        push @finder_args, stoplist => $self->{stop};
    }
    # lesk doesn't use a comp file, so we can ignore that

    $self->{finder} = Text::OverlapFinder->new(@finder_args);
}
164              
165             =item $lesk->traceOptions()
166              
167             This method is internally called to determine the extra options
168             specified by this measure (apart from the default options specified
169             in the WordNet::Similarity base class).
170              
171             Parameters: none.
172              
173             Returns: none.
174              
175             =cut
176              
# 12/5/03 JM (#1)
# Reports the configuration options specific to this module: appends the
# current "normalize" setting to the trace string and then delegates to
# the parent class's traceOptions(), whose result is passed back as-is.
sub traceOptions
{
    my ($self) = @_;

    my $normalizeLine = "normalize :: $self->{normalize}\n";
    $self->{traceString} .= $normalizeLine;

    return $self->SUPER::traceOptions();
}
185              
186             =item $lesk->getRelatedness
187              
188             Computes the relatedness of two word senses using the Extended Gloss
189             Overlaps algorithm.
190              
191             Parameters: two word senses in "word#pos#sense" format.
192              
193             Returns: Unless a problem occurs, the return value is the relatedness
194             score, which is greater-than or equal-to 0. If an error occurs,
195             then the error level is set to non-zero and an error
196             string is created (see the description of getError()).
197              
198             =cut
199              
# Computes the relatedness of two word senses by scoring the overlaps
# between the gloss strings that getSuperGlosses() returns for them.
# INPUT PARAMS  : $wps1, $wps2 .. the two word senses ("word#pos#sense").
# RETURN VALUES : the relatedness score; a cached score when caching is on
#                 and fetchFromCache() has one; undef when no
#                 WordNet::QueryData object is available (error level 2 and
#                 an error string are set) or when parseWps() does not
#                 return a reference.
# SIDE EFFECTS  : resets/extends $self->{traceString}; stores the score via
#                 storeToCache() when $self->{doCache} is true.
sub getRelatedness
{
    my $self  = shift;
    my $wps1  = shift;
    my $wps2  = shift;
    my $wn    = $self->{wn};
    my $class = ref $self || $self;

    # Check the existence of the WordNet::QueryData object.
    unless ($wn)
    {
        $self->{errorString} .= "\nError (${class}::getRelatedness()) - ";
        $self->{errorString} .= "A WordNet::QueryData object is required.";
        $self->{error} = 2;
        return undef;
    }

    # Using validation code from parseWps() in a super-class
    my $ret = $self->parseWps($wps1, $wps2);
    ref $ret or return undef;

    # Initialize traces.
    $self->{traceString} = "";

    # Now check if the similarity value for these two synsets is in
    # fact in the cache... if so return the cached value.
    my $relatedness =
        $self->{doCache} ? $self->fetchFromCache($wps1, $wps2) : undef;
    defined $relatedness and return $relatedness;

    # Read the trace level once. An unset level counts as 0 so that the
    # numeric comparisons below never operate on undef.
    my $trace = $self->{trace} || 0;

    # Now get down to really finding the relatedness of these two.
    # If any traces are required, start with the synset names.
    if ($trace)
    {
        $self->{traceString} = "Synset 1: $wps1\n";
        $self->{traceString} .= "Synset 2: $wps2\n";
    }

    # NOTE: Thanks to Wybo Wiersma for contributing optimizations
    # in the following code.

    # Get the gloss strings, one entry per relation pair, together with
    # the pair's weight and its printable name.
    my ($firstStringArray, $secondStringArray, $weightsArray, $functionsStringArray)
        = $self->getSuperGlosses($wps1, $wps2);

    my $score     = 0;
    my $pairCount = scalar(@{$weightsArray});

    for my $i (0 .. $pairCount - 1)
    {
        my $functionsScore = 0;
        my $string1        = $firstStringArray->[$i];
        my $string2        = $secondStringArray->[$i];

        # Those are the two strings for this relation pair. Get the string
        # overlaps, but only when both strings are present and non-empty.
        my ($overlaps, $wc1, $wc2);
        if (defined($string1) && defined($string2)
            && $string1 ne "" && $string2 ne "")
        {
            ($overlaps, $wc1, $wc2) = $self->{finder}->getOverlaps($string1, $string2);
        }

        # No comparison means no overlaps. Say so explicitly instead of
        # relying on keys() autovivifying an undefined reference.
        $overlaps = {} unless defined $overlaps;

        my $overlapsTraceString = "";

        # modified by tdp oct 4 2015 to make matching more robust:
        # results from keys can come back in various orders, so sort them
        foreach my $key (sort keys %{$overlaps})
        {
            # Score of one overlap: the number of words in the overlapping
            # phrase, squared, times the value recorded for that phrase.
            # The relation weight is applied once per pair further down.
            my @words = split(/\s+/, $key);
            my $value = scalar(@words) * scalar(@words) * $overlaps->{$key};
            $functionsScore += $value;

            # put this overlap into the trace string, if necessary
            if ($trace == 1)
            {
                $overlapsTraceString .= "$overlaps->{$key} x \"$key\" ";
            }
        }

        # normalize the function score computed above if required; skipped
        # when either word count is missing or zero
        if ($self->{normalize} && defined($wc1) && defined($wc2) && ($wc1 * $wc2))
        {
            $functionsScore /= $wc1 * $wc2;
        }

        # weight functionsScore with weight of this function
        $functionsScore *= $weightsArray->[$i];

        # add to main score for this sense
        $score += $functionsScore;

        # Trace levels 1 and 2 are mutually exclusive, so each branch
        # prints the function name itself. Missing entries are shown as
        # empty strings rather than triggering "uninitialized" warnings.
        my $functionsString =
            defined $functionsStringArray->[$i] ? $functionsStringArray->[$i] : "";

        # Level 1: if we have an overlap, report the function name, its
        # score and the overlaps found.
        if ($trace == 1 && $overlapsTraceString ne "")
        {
            $self->{traceString} .= "" . $functionsString . ": $functionsScore\n";
            $self->{traceString} .= "Overlaps: $overlapsTraceString\n";
        }

        # Level 2: report the function name and the two strings compared.
        if ($trace == 2)
        {
            my $shown1 = defined $string1 ? $string1 : "";
            my $shown2 = defined $string2 ? $string2 : "";

            $self->{traceString} .= "" . $functionsString . "\n";
            $self->{traceString} .= "String 1: \"" . $shown1 . "\"\n";
            $self->{traceString} .= "String 2: \"" . $shown2 . "\"\n";
        }
    }

    # that does all the scoring. Put in cache if doing caching. Then
    # return the score.
    $self->{doCache} and $self->storeToCache($wps1, $wps2, $score);
    return $score;
}
323              
324             1;
325             __END__