File Coverage

blib/lib/WordNet/SenseKey.pm
Criterion Covered Total %
statement 14 97 14.4
branch 0 20 0.0
condition n/a
subroutine 5 11 45.4
pod 4 6 66.6
total 23 134 17.1


line stmt bran cond sub pod time code
1             # -*- perl -*-
2             #
3             # WordNet::SenseKey.pm version 1.03
4             #
5             # Given a WordNet file offset, return the corresponding sense key
6             # Meant to be used with WordNet::Similarity, which does not normally
7             # manipulate data using sense keys.
8             #
9             # Copyright (c) 2008 Linas Vepstas linasvepstas at gmail.com
10             #
11             # This program is free software; you can redistribute it and/or
12             # modify it under the terms of the GNU General Public License
13             # as published by the Free Software Foundation; either version 2
14             # of the License, or (at your option) any later version.
15             #
16             # This program is distributed in the hope that it will be useful,
17             # but WITHOUT ANY WARRANTY; without even the implied warranty of
18             # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19             # GNU General Public License for more details.
20             #
21             # You should have received a copy of the GNU General Public License
22             # along with this program; if not, write to
23             #
24             # The Free Software Foundation, Inc.,
25             # 59 Temple Place - Suite 330,
26             # Boston, MA 02111-1307, USA.
27             #
28             # ------------------------------------------------------------------
29              
30             package WordNet::SenseKey;
31              
32             =head1 NAME
33              
34             WordNet::SenseKey - convert WordNet sense keys to sense numbers, and v.v.
35              
36             =head1 SYNOPSIS
37              
38             use WordNet::QueryData;
39             use WordNet::SenseKey;
40              
41             my $wn = WordNet::QueryData->new("/usr/share/wordnet");
42             my $sk = WordNet::SenseKey->new($wn);
43              
44             my $skey = $sk->get_sense_key("run#v#2");
45             print "Found the sense key $skey for run#v#2\n";
46              
47             my $sense = $sk->get_sense_num($skey);
48             print "Found sense $sense for key $skey\n";
49              
50             my @synset = $sk->get_synset($skey);
51             print "Synset is @synset\n";
52              
53             my $can = $sk->get_canonical_sense("escape", "run%2:38:04::");
54             print "Found sense $can\n";
55              
56             =head1 DESCRIPTION
57              
58             The WordNet::Similarity package is designed to work with words in the
59             form of lemma#pos#num where "lemma" is the word lemma, "pos" is the
60             part of speech, and "num" is the sense number. Unfortunately, the
61             sense numbering is not stable from one WordNet release to another.
62             Thus, for external programs, it can often be more useful to work with
63             sense keys. Unfortunately, the WordNet::Similarity package is unaware
64             of sense keys. This class fills that gap.
65              
66             WordNet sense keys are described in greater detail in
67              
68             http://wordnet.princeton.edu/man/senseidx.5WN.html
69              
70             There are four routines implemented here:
71              
72             get_sense_key($sense);
73             get_sense_num($sense_key);
74             get_synset($sense_key);
75             get_canonical_sense($lemma, $sense_key);
76              
77             =head2 get_sense_key
78              
79             Given a word sense, in the form of lemma#pos#num, this method returns
80             the corresponding sense key, as defined by WordNet. Here, "lemma" is the
81             word lemma, "pos" is the part of speech, and "num" is the sense number.
82             The format of WordNet sense keys is documented in senseidx(5WN), one of
83             the WordNet man pages.
84              
85             Returns an undefined value if the sense key cannot be found.
86             The 'get_sense_num' method performs the inverse operation.
87              
88             =head2 get_sense_num
89              
90             Given a WordNet sense key, this method returns the corresponding
91             word-sense string, in the lemma#pos#num format. This function is the
92             inverse of the get_sense_key method; calling one, and then the other,
93             should always return exactly the original input.
94              
95             Returns an undefined value if the sense cannot be found.
96              
97             =head2 get_synset
98              
99             Given a WordNet sense key, this method returns a list of other sense
100             keys that belong to the same synset.
101              
102             =head2 get_canonical_sense
103              
104             Senses in a synset all have different lemmas. This function selects
105             one particular element of a synset, given a lemma, and any other member
106             of the synset. Thus, for example, run%2:38:04:: and escape%2:38:02::
107             belong to the same synset. Then
108              
109             get_canonical_sense("escape", "run%2:38:04::");
110              
111             will return escape%2:38:02::, as this is the sense of "escape" that
112             belongs to the same synset as run%2:38:04::. Returns an undefined
113             value if the sense cannot be found.
114              
115             =head1 SEE ALSO
116              
117             senseidx(5WN), WordNet::Similarity(3), WordNet::QueryData(3)
118              
119             http://wordnet.princeton.edu/
120             http://www.ai.mit.edu/~jrennie/WordNet
121             http://groups.yahoo.com/group/wn-similarity
122              
123             =head1 AUTHOR
124              
125             Linas Vepstas
126              
127             =head1 COPYRIGHT AND LICENSE
128              
129             Copyright (c) 2008, 2009 Linas Vepstas
130              
131             This program is free software; you can redistribute it and/or
132             modify it under the terms of the GNU General Public License
133             as published by the Free Software Foundation; either version 2
134             of the License, or (at your option) any later version.
135              
136             This program is distributed in the hope that it will be useful,
137             but WITHOUT ANY WARRANTY; without even the implied warranty of
138             MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
139             GNU General Public License for more details.
140              
141             You should have received a copy of the GNU General Public License
142             along with this program; if not, write to
143              
144             The Free Software Foundation, Inc.,
145             59 Temple Place - Suite 330,
146             Boston, MA 02111-1307, USA.
147              
148             Note: a copy of the GNU General Public License is available on the web
149             at http://www.gnu.org/licenses/gpl.txt and is included in this
150             distribution as GPL.txt.
151              
152             =cut
153              
154 1     1   28107 use strict;
  1         4  
  1         37  
155 1     1   6 use warnings;
  1         2  
  1         50  
156             require Exporter;
157              
158             BEGIN {
159 1     1   15 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK);
  1         6  
  1         110  
160             # List of classes from which we are inheriting methods
161 1     1   18 @ISA = qw(Exporter);
162             # Names exported by default -- none; all access is through methods
163 1         2 @EXPORT = qw();
164             # Names that may be imported on request -- also none
165 1         2 @EXPORT_OK = qw();
166 1         1200 $VERSION = '1.03';
167             }
168              
169 1     1   8352 END { } # module clean-up code here (global destructor)
170              
171             # ------------------------------------------------------
172             # Constructor -- takes the class name and a WordNet::QueryData object ($wn).
173             # Reads index.sense from $wn->dataPath(), or from /usr/share/wordnet if that is undefined.
174             # Builds two hashes: file offset => list of sense keys, and sense key => sense number. Dies if the file cannot be opened.
175             sub new
176             {
177 0     0 0   my ($class, $wn) = @_;
178 0           my $self = {
179             senseidx_path => "/usr/share/wordnet",
180             senseidx_file => "/usr/share/wordnet/index.sense",
181             wn => $wn,
182             reversed_index => undef,
183             forward_index => undef
184             };
185 0           bless $self, $class;
186              
187             # Get a valid data path from WordNet::QueryData; the defaults above are kept if it is undefined.
188 0           my $path = $wn->dataPath();
189 0 0         if (defined($path))
190             {
191 0           $self->{senseidx_path} = $path;
192 0           $self->{senseidx_file} = $path . "/index.sense";
193             }
194              
195             # Open the file for reading. NOTE(review): FileHandle is not loaded in the visible code -- presumably WordNet::QueryData pulls it in; confirm.
196 0           my $fh = new FileHandle($self->{senseidx_file});
197 0 0         if (!defined($fh))
198             {
199 0           die "Unable to open $self->{senseidx_file}: $!";
200             }
201            
202             # Build the reverse index (offset => sense keys) and the forward index (sense key => sense number).
203 0           my %rev_idx = ();
204 0           my %fwd_idx = ();
205 0           while (<$fh>)
206             {
207 0           my ($skey, $offset, $snum, $tag_cnt) = split;
208 0           my $keys = $rev_idx{$offset};
209             # $keys is a reference to an array (undef the first time an offset is seen; the push below creates the array)
210 0           push @$keys, $skey;
211 0           $rev_idx{$offset} = [@$keys];
212             # print "index entry $skey and $offset so -- @$keys\n";
213 0           $fwd_idx{$skey} = $snum;
214             }
215 0           undef $fh;
216              
217             # Remember that \% yields a hash reference; the object keeps references to both indexes.
218 0           $self->{reversed_index} = \%rev_idx;
219 0           $self->{forward_index} = \%fwd_idx;
220              
221 0           return $self;
222             }
223              
224             # Report the WordNet data directory recorded by the constructor (the senseidx_path field).
225 0     0 0   sub dataPath { my $self = shift; return $self->{senseidx_path}; }
  0            
226              
227             # ------------------------------------------------------
228              
229             sub get_sense_key
230             {
231 0     0 1   my ($self, $lempos) = @_;
232 0           my $wn = $self->{wn};
233              
234             # If no offset is found for the sense, or the argument is undefined, return an undefined value.
235 0           my $offset = $wn->offset($lempos);
236 0 0         if (!defined($offset))
237             {
238 0           return $offset;
239             }
240 0 0         if (!defined($lempos))
241             {
242 0           return $lempos;
243             }
244              
245             # Keep only the lemma: strip everything from the first '#' onward.
246 0 0         if ($lempos) {
247 0           $lempos =~ s/#.*//;
248              
249             # Tight matching -- fails to find %5 synsets, e.g. sane#a#2 which
250             # maps to sane%5:00:00:rational:00
251             # $lempos =~ s/#/%/;
252             # $lempos =~ s/%n/%1/;
253             # $lempos =~ s/%v/%2/;
254             # $lempos =~ s/%a/%3/;
255             # $lempos =~ s/%r/%4/;
256              
257             # make sure it's lower-case too.
258 0           $lempos =~ tr/[A-Z]/[a-z]/;
259             }
260              
261             # pad the offset with leading zeroes to 8 characters, if it's too short to be a valid offset.
262 0           my $len = 8 - length($offset);
263 0           for (my $i=0; $i< $len; $i++) {
264 0           $offset = "0" . $offset;
265             }
266              
267             # get the hash reference (offset => array of sense keys) built by the constructor
268 0           my $rev_idx = $self->{reversed_index};
269              
270 0           my $keys = $rev_idx->{$offset};
271             # print "key candidates are @$keys\n";
272              
273             # Loop over all entries in the synset; the first key matching the lemma wins. NOTE(review): $lempos is applied as an unanchored, unquoted regex, and "" (not undef) is returned when nothing matches.
274 0           my $foundkey = "";
275 0           foreach my $sensekey (@$keys)
276             {
277 0 0         if ($sensekey =~ $lempos) {
278 0           $foundkey = $sensekey;
279 0           last;
280             }
281             }
282              
283 0           return $foundkey;
284             }
285              
286             # ------------------------------------------------------
287             sub get_sense_num
288             {
289 0     0 1   my ($self, $sense_key) = @_;
290              
291 0           $sense_key =~ m/([\w\.]+)%(\d+):*/;
292 0           my $lemma = $1;
293 0           my $pos = $2;
294 0           $pos =~ s/1/n/;
295 0           $pos =~ s/2/v/;
296 0           $pos =~ s/3/a/;
297 0           $pos =~ s/4/r/;
298              
299             # XXX what about 5 ?? It is not translated above, so $pos stays "5" for such keys (cf. the sane%5:... example in get_sense_key).
300              
301 0           my $fwd_idx = $self->{forward_index};
302 0           my $sense_num = $fwd_idx->{$sense_key};
303              
304 0 0         if (!defined($sense_num)) { return $sense_num; }
  0            
305              
306 0           return $lemma . "#" . $pos . "#" . $sense_num;
307             }
308              
309              
310             # ------------------------------------------------------
311             # get_synset -- return a WordNet synset.
312             # Given a sense key as input, this will return a list of the sense keys
313             # in the synset, or an empty list if get_sense_num cannot resolve the key.
314             sub get_synset
315             {
316 0     0 1   my ($self, $sense_key) = @_;
317 0           my $sense_str = $self->get_sense_num($sense_key);
318              
319 0 0         if (!defined($sense_str)) { return (); }
  0            
320              
321 0           my $wn = $self->{wn};
322 0           my @synset = $wn->querySense($sense_str, "syns");
323 0           my @keyset = ();
324 0           foreach (@synset)
325             {
326 0           my $lempos = $_;
327 0           my $skey = $self->get_sense_key($lempos);
328 0           push @keyset, $skey;
329             }
330              
331 0           return @keyset;
332             }
333              
334             # ------------------------------------------------------
335              
336             # get_canonical_sense -- get matching lemma from a synset.
337             # Return an alternate sense key that belongs to the same
338             # synset as the input sense key, but has the lemmatized
339             # form $lemma at its root.
340             #
341             # Thus, for example:
342             #
343             # get_canonical_sense("join#v", "connect%2:42:02::");
344             #
345             # will return "join%2:42:01", because "join%2:42:01" is in the same
346             # synset as "connect%2:42:02::", but has "join" as its root.
347             #
348             sub get_canonical_sense
349             {
350 0     0 1   my ($self, $lemma, $sense) = @_;
351 0           my $wn = $self->{wn};
352              
353             # strip off the part-of-speech marker (everything from the '#') from the lemma, if one is present.
354 0           $lemma =~ m/([\w\.]+)#/;
355 0 0         if (defined($1))
356             {
357 0           $lemma = $1;
358             }
359              
360             # Loop over the synset, looking for a sense key whose lemma part (the text before '%') equals $lemma.
361 0           my @synset = $self->get_synset($sense);
362 0           foreach (@synset)
363             {
364 0           my $altsense = $_;
365 0           $altsense =~ m/([\w\.]+)%/;
366 0 0         if ($1 eq $lemma)
367             {
368 0           return $altsense;
369             }
370             }
371              
372 0           my $notfound; # this is undefined! -- returned when no member of the synset matches
373 0           return $notfound;
374             }
375              
376              
377             # module must return true
378             1;
379             __END__