File Coverage

blib/lib/WordNet/Similarity/lesk.pm
Criterion Covered Total %
statement 10 12 83.3
branch n/a
condition n/a
subroutine 4 4 100.0
pod n/a
total 14 16 87.5


line stmt bran cond sub pod time code
1             # WordNet::Similarity::lesk.pm version 2.04
2             # (Last updated $Id: lesk.pm,v 1.29 2008/03/27 06:21:17 sidz1979 Exp $)
3             #
4             # Module to accept two WordNet synsets and to return a floating point
5             # number that indicates how similar those two synsets are, using an
6             # adaptation of the Lesk method as outlined in
7             # Satanjeev Banerjee, Ted Pedersen>
8             #
9             # Copyright (c) 2005,
10             #
11             # Ted Pedersen, University of Minnesota Duluth
12             # tpederse at d.umn.edu
13             #
14             # Satanjeev Banerjee, Carnegie Mellon University, Pittsburgh
15             # banerjee+ at cs.cmu.edu
16             #
17             # Siddharth Patwardhan, University of Utah, Salt Lake City
18             # sidd at cs.utah.edu
19             #
20             # This program is free software; you can redistribute it and/or
21             # modify it under the terms of the GNU General Public License
22             # as published by the Free Software Foundation; either version 2
23             # of the License, or (at your option) any later version.
24             #
25             # This program is distributed in the hope that it will be useful,
26             # but WITHOUT ANY WARRANTY; without even the implied warranty of
27             # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
28             # GNU General Public License for more details.
29             #
30             # You should have received a copy of the GNU General Public License
31             # along with this program; if not, write to
32             #
33             # The Free Software Foundation, Inc.,
34             # 59 Temple Place - Suite 330,
35             # Boston, MA 02111-1307, USA.
36             #
37             # ------------------------------------------------------------------
38              
39             package WordNet::Similarity::lesk;
40              
41             =head1 NAME
42              
43             WordNet::Similarity::lesk - Perl module for computing semantic relatedness
44             of word senses using gloss overlaps as described by Banerjee and Pedersen
45             (2002) -- a method that adapts the Lesk approach to WordNet.
46              
47             =head1 SYNOPSIS
48              
49             use WordNet::Similarity::lesk;
50              
51             use WordNet::QueryData;
52              
53             my $wn = WordNet::QueryData->new();
54              
55             my $lesk = WordNet::Similarity::lesk->new($wn);
56              
57             my $value = $lesk->getRelatedness("car#n#1", "bus#n#2");
58              
59             ($error, $errorString) = $lesk->getError();
60              
61             die "$errorString\n" if($error);
62              
63             print "car (sense 1) <-> bus (sense 2) = $value\n";
64              
65             =head1 DESCRIPTION
66              
67             Lesk (1985) proposed that the relatedness of two words is proportional to
68             the extent of overlaps of their dictionary definitions. Banerjee and
69             Pedersen (2002) extended this notion to use WordNet as the dictionary
70             for the word definitions. This notion was further extended to use the rich
71             network of relationships between concepts present in WordNet. This adapted
72             lesk measure has been implemented in this module.
73              
74             =head2 Methods
75              
76             =over
77              
78             =cut
79              
80 3     3   6826 use strict;
  3         5  
  3         114  
81 3     3   15 use warnings;
  3         7  
  3         77  
82 3     3   3180 use Text::OverlapFinder;
  3         5728  
  3         123  
83 3     3   141 use WordNet::Similarity;
  0            
  0            
84             use File::Spec;
85             use WordNet::Similarity::GlossFinder;
86              
87             our @ISA = qw(WordNet::Similarity::GlossFinder);
88             our $VERSION = '2.04';
89              
90             WordNet::Similarity::addConfigOption ("normalize", 0, "i", 0);
91              
92             =item $measure->initialize($file)
93              
94             Overrides the initialize method in the parent class (GlossFinder.pm). This method
95             essentially initializes the measure for use.
96              
97             Parameters: $file -- configuration file.
98              
99             Returns: none.
100              
101             =cut
102              
# Initialization of the WordNet::Similarity::lesk object.
#
# Steps, in order:
#   1. reset the measure-specific options (stem, normalize) to 0;
#   2. unless $self->{relationDefault} is already set, search @INC for an
#      installed WordNet/lesk-relation.dat whose first line identifies it
#      as a relation file, and remember its path;
#   3. hand all remaining arguments to the parent class initialize();
#   4. build the Text::OverlapFinder object used by getRelatedness().
#
# INPUT PARAMS  : any arguments are passed through untouched to the parent
#                 initialize() (per the POD: the configuration file).
# RETURN VALUES : (none documented; the value of the last assignment is
#                 what the sub happens to evaluate to)
sub initialize
{
    my $self = shift;

    # Stemming? Normalizing? Both default to off; these are assigned
    # before the parent's initialize() is invoked below.
    $self->{stem} = 0;
    $self->{normalize} = 0;

    # Look for the default lesk relation file...
    if(!defined $self->{relationDefault})
    {
        foreach my $dir (@INC)
        {
            # @INC may hold hooks (references) which are not directories.
            next if(ref $dir);

            # JM 1-16-04 -- modified to use File::Spec
            my $file = File::Spec->catfile($dir, 'WordNet', 'lesk-relation.dat');
            next if(!-e $file);

            # If there are multiple possibilities, take the first one that
            # is in the correct format. Unreadable candidates are skipped.
            open(my $relations, '<', $file) or next;
            my $header = <$relations>;
            close($relations);

            # An empty file yields no header line at all.
            next if(!defined $header);

            $header =~ s/\s+//g;
            if($header =~ /RelationFile/)
            {
                $self->{relationDefault} = $file;
                last;
            }
        }
    }

    # Call the initialize method in the parent...
    $self->SUPER::initialize (@_);

    # Set up the overlap finder. A stop list is only passed along when
    # one has been configured; lesk doesn't use a comp file, so that
    # option is ignored here.
    my @finder_args = ();
    if (defined $self->{stop}) {
        push @finder_args, stoplist => $self->{stop};
    }

    $self->{finder} = Text::OverlapFinder->new (@finder_args);
}
164              
165             =item $lesk->traceOptions()
166              
167             This method is internally called to determine the extra options
168             specified by this measure (apart from the default options specified
169             in the WordNet::Similarity base class).
170              
171             Parameters: none.
172              
173             Returns: none.
174              
175             =cut
176              
# 12/5/03 JM (#1)
# Appends the setting of the module specific "normalize" option to the
# trace string, then lets the parent class report its own options.
sub traceOptions
{
    my ($self) = @_;

    my $normalizeLine = 'normalize :: ' . $self->{normalize} . "\n";
    $self->{traceString} .= $normalizeLine;

    return $self->SUPER::traceOptions();
}
185              
186             =item $lesk->getRelatedness
187              
188             Computes the relatedness of two word senses using the Extended Gloss
189             Overlaps algorithm.
190              
191             Parameters: two word senses in "word#pos#sense" format.
192              
193             Returns: Unless a problem occurs, the return value is the relatedness
194             score, which is greater-than or equal-to 0. If an error occurs,
195             then the error level is set to non-zero and an error
196             string is created (see the description of getError()).
197              
198             =cut
199              
# Computes the relatedness of two word senses from the overlaps between
# their gloss strings.
#
# For every relation pair returned by getSuperGlosses(), each overlap
# found by the overlap finder contributes
#     (number of words in the overlap)^2 * (overlap count)
# to that pair's score. The pair score is optionally divided by the
# product of the two word counts (when the normalize option is on and
# that product is non-zero), multiplied by the pair's weight and added to
# the total.
#
# INPUT PARAMS  : $wps1, $wps2 .. the two word senses ("word#pos#sense").
# RETURN VALUES : the relatedness score, or undef if no WordNet::QueryData
#                 object is available (error level 2 is set) or if
#                 parseWps() does not return a reference.
# SIDE EFFECTS  : resets and fills $self->{traceString}; reads and writes
#                 the cache when $self->{doCache} is set.
sub getRelatedness
{
    my $self = shift;
    my $wps1 = shift;
    my $wps2 = shift;
    my $wn = $self->{wn};
    my $class = ref $self || $self;

    # Check the existence of the WordNet::QueryData object.
    unless($wn)
    {
        $self->{errorString} .= "\nError (${class}::getRelatedness()) - ";
        $self->{errorString} .= "A WordNet::QueryData object is required.";
        $self->{error} = 2;
        return undef;
    }

    # Using validation code from parseWps() in a super-class
    my $ret = $self->parseWps($wps1, $wps2);
    ref $ret or return undef;

    # Initialize traces.
    $self->{traceString} = "";

    # Now check if the similarity value for these two synsets is in
    # fact in the cache... if so return the cached value.
    my $relatedness =
        $self->{doCache} ? $self->fetchFromCache ($wps1, $wps2) : undef;
    defined $relatedness and return $relatedness;

    # Read the trace level once; an unset level counts as 0 so that the
    # numeric comparisons below never see an undefined value.
    my $trace = $self->{trace} || 0;

    # Any trace level at all gets the synset names.
    if($trace)
    {
        $self->{traceString} = "Synset 1: $wps1\n";
        $self->{traceString} .= "Synset 2: $wps2\n";
    }

    # NOTE: Thanks to Wybo Wiersma for contributing optimizations
    # in the following code.

    # Get the gloss strings, weights and relation names, one entry per
    # relation pair.
    my ($firstStringArray, $secondStringArray, $weightsArray, $functionsStringArray) = $self->getSuperGlosses($wps1, $wps2);
    my $score = 0;
    my $pairCount = scalar(@{$weightsArray});

    foreach my $i (0 .. $pairCount - 1)
    {
        my $functionsScore = 0;
        my $funcStringPrinted = 0;

        my $string1 = $firstStringArray->[$i];
        my $string2 = $secondStringArray->[$i];

        # Only non-empty string pairs are handed to the overlap finder;
        # otherwise there are no overlaps and no word counts.
        my ($overlaps, $wc1, $wc2);
        if(defined($string1) && defined($string2)
           && $string1 ne "" && $string2 ne "")
        {
            ($overlaps, $wc1, $wc2) = $self->{finder}->getOverlaps($string1, $string2);
        }
        $overlaps ||= {};

        my $overlapsTraceString = "";
        foreach my $key (keys %{$overlaps})
        {
            # Square the number of words in the overlap and multiply it
            # with the number of times the overlap occurred.
            my @overlapWords = split(/\s+/, $key);
            my $length = scalar(@overlapWords);
            $functionsScore += $length * $length * $overlaps->{$key};

            # put this overlap into the trace string, if necessary
            if($trace == 1)
            {
                $overlapsTraceString .= "$overlaps->{$key} x \"$key\" ";
            }
        }

        # normalize the function score computed above if required
        if ($self->{normalize} && defined($wc1) && defined($wc2) && ($wc1 * $wc2))
        {
            $functionsScore /= $wc1 * $wc2;
        }

        # weight functionsScore with weight of this function
        $functionsScore *= $weightsArray->[$i];

        # add to main score for this sense
        $score += $functionsScore;

        # Relation names and strings may be missing for a pair; fall back
        # to the empty string so trace output does not raise warnings.
        my $functionName = $functionsStringArray->[$i];
        $functionName = "" unless defined $functionName;

        # Trace level 1: report the relation, its score and its overlaps,
        # but only when there was at least one overlap.
        if($trace == 1 && $overlapsTraceString ne "")
        {
            $self->{traceString} .= "$functionName: $functionsScore\n";
            $funcStringPrinted = 1;

            $self->{traceString} .= "Overlaps: $overlapsTraceString\n";
        }

        # Trace level 2: report the two strings that were compared.
        if ($trace == 2)
        {
            if(!$funcStringPrinted)
            {
                $self->{traceString} .= "$functionName\n";
                $funcStringPrinted = 1;
            }

            my $shown1 = defined $string1 ? $string1 : "";
            my $shown2 = defined $string2 ? $string2 : "";
            $self->{traceString} .= "String 1: \"$shown1\"\n";
            $self->{traceString} .= "String 2: \"$shown2\"\n";
        }
    }

    # that does all the scoring. Put in cache if doing caching. Then
    # return the score.
    $self->{doCache} and $self->storeToCache($wps1, $wps2, $score);
    return $score;
}
320              
321             1;
322             __END__