File Coverage

blib/lib/WordNet/Similarity/ICFinder.pm
Criterion Covered Total %
statement 7 9 77.7
branch n/a
condition n/a
subroutine 3 3 100.0
pod n/a
total 10 12 83.3


line stmt bran cond sub pod time code
1             # WordNet::Similarity::ICFinder.pm version 2.04
2             # (Last updated $Id: ICFinder.pm,v 1.19 2008/03/27 06:21:17 sidz1979 Exp $)
3             #
4             # A generic (and abstract) information content measure--this is not a
5             # real measure. The res, lin, and jcn measures inherit from this class.
6             #
7             # Copyright (c) 2005,
8             #
9             # Ted Pedersen, University of Minnesota Duluth
10             # tpederse at d.umn.edu
11             #
12             # Jason Michelizzi, University of Minnesota Duluth
13             # mich0212 at d.umn.edu
14             #
15             # Siddharth Patwardhan, University of Utah, Salt Lake City
16             # sidd at cs.utah.edu
17             #
18             # This program is free software; you can redistribute it and/or
19             # modify it under the terms of the GNU General Public License
20             # as published by the Free Software Foundation; either version 2
21             # of the License, or (at your option) any later version.
22             #
23             # This program is distributed in the hope that it will be useful,
24             # but WITHOUT ANY WARRANTY; without even the implied warranty of
25             # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26             # GNU General Public License for more details.
27             #
28             # You should have received a copy of the GNU General Public License
29             # along with this program; if not, write to
30             #
31             # The Free Software Foundation, Inc.,
32             # 59 Temple Place - Suite 330,
33             # Boston, MA 02111-1307, USA.
34             #
35             # ------------------------------------------------------------------
36              
37             package WordNet::Similarity::ICFinder;
38              
39             =head1 NAME
40              
41             WordNet::Similarity::ICFinder - a module for finding the information content
42             of concepts in WordNet
43              
44             =head1 SYNOPSIS
45              
46             use WordNet::QueryData;
47             my $wn = WordNet::QueryData->new;
48             defined $wn or die "Construction of WordNet::QueryData failed";
49              
50             use WordNet::Similarity::ICFinder;
51             my $obj = WordNet::Similarity::ICFinder->new ($wn);
52             my ($err, $errString) = $obj->getError ();
53             $err and die $errString;
54              
55             my $wps1 = 'cat#n#1';
56             my $wps2 = 'feline#n#1';
57              
58             my $offset1 = $wn -> offset ($wps1);
59             my $offset2 = $wn -> offset ($wps2);
60              
61             # using the wps mode
62              
63             my $ic = $obj->IC ($wps1, 'n', 'wps');
64             my $prob = $obj->probability ($wps1, 'n', 'wps');
65             my $freq = $obj->getFrequency ($wps1, 'n', 'wps');
66             print "$wps1 has frequency $freq, probability $prob, and IC $ic\n";
67              
68             my $ic = $obj->IC ($wps2, 'n', 'wps');
69             my $prob = $obj->probability ($wps2, 'n', 'wps');
70             my $freq = $obj->getFrequency ($wps2, 'n', 'wps');
71             print "$wps2 has frequency $freq, probability $prob, and IC $ic\n";
72              
73             my @lcsbyic = $obj -> getLCSbyIC($wps1,$wps2,'n','wps');
74             print "$wps1 and $wps2 have LCS $lcsbyic[0]->[0] with IC $lcsbyic[0]->[1]\n";
75              
76             # doing the same thing in the offset mode
77              
78             my $ic = $obj->IC ($offset1, 'n', 'offset');
79             my $prob = $obj->probability ($offset1, 'n', 'offset');
80             my $freq = $obj->getFrequency ($offset1, 'n', 'offset');
81             print "$offset1 has frequency $freq, probability $prob, and IC $ic\n";
82              
83             my $ic = $obj->IC ($offset2, 'n', 'offset');
84             my $prob = $obj->probability ($offset2, 'n', 'offset');
85             my $freq = $obj->getFrequency ($offset2, 'n', 'offset');
86             print "$offset2 has frequency $freq, probability $prob, and IC $ic\n";
87              
88             my @lcsbyic = $obj -> getLCSbyIC($offset1,$offset2,'n','offset');
89             print "$offset1 and $offset2 have LCS $lcsbyic[0]->[0] with IC $lcsbyic[0]->[1]\n";
90              
91             =head1 DESCRIPTION
92              
93             =head2 Introduction
94              
95             Three of the measures provided within the package require information
96             content values of concepts (WordNet synsets) for computing the semantic
97             relatedness of concepts. Resnik (1995) describes a method for computing the
98             information content of concepts from large corpora of text. In order to
99             compute information content of concepts, according to the method described
100             in the paper, we require the frequency of occurrence of every concept in a
101             large corpus of text. We provide these frequency counts to the three
102             measures (Resnik, Jiang-Conrath and Lin measures) in files that we call
103             information content files. These files contain a list of WordNet synset
104             offsets along with their part of speech and frequency count. The files are
105             also used to determine the topmost nodes of the noun and verb 'is-a'
106             hierarchies in WordNet. The information content file to be used is specified
107             in the configuration file for the measure. If no information content file is
108             specified, then the default information content file, generated at the time
109             of the installation of the WordNet::Similarity modules, is used. A description
110             of the format of these files follows. The FIRST LINE of this file must contain
111             the hash-code of WordNet that the file was created with. This should be present
112             as a string of the form
113              
114             wnver::<hashcode>
115              
116             For example, if WordNet version 2.1 with the hash-code
117             LL1BZMsWkr0YOuiewfbiL656+Q4 was used for creation of the information content
118             file, the following line would be present at the start of the information
119             content file.
120              
121             wnver::LL1BZMsWkr0YOuiewfbiL656+Q4
122              
123             The rest of the file contains on each line, a WordNet synset offset,
124             part-of-speech and a frequency count, of the form
125              
126             <offset><part-of-speech> <frequency> [ROOT]
127              
128             without any leading or trailing spaces. For example, one of the lines of an
129             information content file may be as follows.
130              
131             63723n 667
132              
133             where '63723' is a noun synset offset and 667 is its frequency
134             count. Suppose the noun synset with offset 1740 is the root node of one of
135             the noun taxonomies and has a frequency count of 17625. Then this synset would
136             appear in an information content file as follows:
137              
138             1740n 17625 ROOT
139              
140             The ROOT tags are extremely significant in determining the top of the
141             hierarchies and must not be omitted. Typically, frequency counts for the noun
142             and verb hierarchies are present in each information content file.
143             A number of support programs to generate these files from various corpora
144             are present in the '/utils' directory of the package. A sample information
145             content file has been provided in the '/samples' directory of the package.
146              
147             =head2 Methods
148              
149             The following methods are provided by this module.
150              
151             =head3 Public Methods
152              
153             =over
154              
155             =cut
156              
157 5     5   32929 use strict;
  5         12  
  5         190  
158 5     5   29 use warnings;
  5         9  
  5         215  
159              
160 5     5   5286 use WordNet::Similarity::PathFinder;
  0            
  0            
161              
# This class inherits from the path-finding class.
our @ISA = ('WordNet::Similarity::PathFinder');

# Version of this module.
our $VERSION = '2.04';

# Register the single configuration option specific to this module.
WordNet::Similarity::addConfigOption('infocontent', 0, 'p', undef);
167              
168              
169             =item $module->traceOptions (Z<>)
170              
171             Prints status of configuration options specific to this module to
172             the trace string. This module has only one such option: infocontent.
173              
174             =cut
175              
# Appends the status of this module's own configuration option
# (infocontent) to the trace string, then delegates to the parent class so
# that it can append the status of its options.
#
# Returns: whatever the parent's traceOptions method returns.
sub traceOptions {
  my $self = shift;

  # infocontent is undefined when no information content file has been
  # configured; print it as an empty value rather than triggering an
  # "uninitialized value" warning.
  my $infocontent = defined $self->{infocontent} ? $self->{infocontent} : '';
  $self->{traceString} .= "infocontent :: $infocontent\n";

  return $self->SUPER::traceOptions;
}
181              
182              
183              
184             =item $module->probability ($synset, $pos, $mode)
185              
186             Returns the probability of $synset in a corpus (using frequency values
187             from whatever information content file is being used). If $synset
188             is a wps string, then $mode must be 'wps'; if $synset is an offset,
189             then $mode must be 'offset'.
190              
191             =cut
192              
# Returns the probability of a synset: its frequency divided by the
# frequency stored under offset 0 for the same part of speech.
#
# Parameters:
#   $offset - a synset offset, or a wps string when $mode is 'wps'
#   $pos    - part of speech, used as the key into $self->{offsetFreq}
#   $mode   - 'wps' to have $offset converted through the QueryData object;
#             any other value (or undef) treats $offset as an offset
#
# Returns: a number between 0 and 1. Returns 0 when no frequency data is
# available, and also when the synset frequency exceeds the root frequency
# (in which case $self->{error} is set to 2 and the error string extended).
sub probability {
  my $self = shift;
  my ($offset, $pos, $mode) = @_;
  my $class = ref $self || $self;

  if (defined $mode && $mode eq 'wps') {
    $offset = $self->{wn}->offset ($offset);
  }

  # Without an offset or a part of speech there is nothing to look up.
  return 0 unless defined $offset && defined $pos;

  # Fetch the per-POS table once; this also avoids creating empty
  # per-POS tables as a side effect of a failed lookup.
  my $freqForPos = $self->{offsetFreq}->{$pos};
  return 0 unless defined $freqForPos;

  my $rootFreq = $freqForPos->{0};
  my $offFreq = $freqForPos->{$offset};
  return 0 unless $rootFreq && defined $offFreq;

  return $offFreq / $rootFreq if $offFreq <= $rootFreq;

  # A synset cannot be more frequent than the root of its hierarchy.
  $self->{errorString} .= "\nError (${class}::probability()) - ";
  $self->{errorString} .= "Probability greater than 1? (Check information content file)";
  $self->{error} = 2;
  return 0;
}
214              
215              
216             =item $module->IC ($synset, $pos, $mode)
217              
218             Returns the information content of $synset. If $synset is a wps string,
219             then $mode must be 'wps'; if $synset is an offset, then $mode must be
220             'offset'.
221              
222             =cut
223              
# Returns the information content of a synset, i.e. the negative natural
# logarithm of its probability.
#
# Parameters:
#   $offset - a synset offset, or a wps string when $mode is 'wps'
#   $pos    - part of speech; only values matching /[nv]/ have information
#             content, everything else yields 0
#   $mode   - 'wps' to have $offset converted through the QueryData object
#
# Returns: the information content, or 0 when the probability is 0 or the
# part of speech is not supported.
sub IC
{
  my $self = shift;
  my ($offset, $pos, $mode) = @_;

  if (defined $mode && $mode eq 'wps') {
    $offset = $self->{wn}->offset ($offset);
  }

  return 0 unless defined $pos && $pos =~ /[nv]/;

  my $prob = $self->probability ($offset, $pos, 'offset');

  # A probability of exactly 1 has zero information content. Return a plain
  # 0 here, because -log(1) evaluates to negative zero, which can be
  # rendered as "-0" / "-0.000000" in output.
  return 0 unless $prob > 0 && $prob < 1;

  return -log ($prob);
}
238              
239             =item $module->getFrequency ($synset, $pos, $mode)
240              
241             Returns the frequency of $synset in whatever information content file
242             is currently being used.
243              
244             If $synset is a wps string, then the mode must be 'wps'; if $synset
245             is an offset, then $mode must be 'offset'.
246              
247             Usually the C<IC()> and C<probability()> methods will be more useful
248             than this method. This method is useful in determining if the
249             frequency of a synset was 0.
250              
251             =cut
252              
# Returns the raw frequency recorded for a synset in the loaded
# information content data ($self->{offsetFreq}).
#
# Parameters:
#   $synset - a synset offset, or a wps string when $mode is 'wps'
#   $pos    - part of speech, used as the key into $self->{offsetFreq}
#   $mode   - 'wps' to have $synset converted through the QueryData object;
#             any other value (or undef) treats $synset as an offset
#
# Returns: the stored frequency, or undef when none is recorded.
sub getFrequency
{
  my $self = shift;
  my ($synset, $pos, $mode) = @_;

  # Guard against an undefined mode the same way probability() and IC() do.
  my $offset = (defined $mode && $mode eq 'wps')
    ? $self->{wn}->offset ($synset)
    : $synset;

  my $freq;
  if (defined $offset && defined $pos) {
    $freq = $self->{offsetFreq}->{$pos}->{$offset};
  }
  return $freq;
}
269              
270             =item getLCSbyIC($synset1, $synset2, $pos, $mode)
271              
272             Given two input synsets, finds the least common subsumer (LCS) of them. If
273             there are multiple candidates for the LCS, then the candidate(s) with the greatest
274             information content are returned.
275              
276             Parameters: two synsets, a part of speech, and a mode.
277              
278             Returns: a list of array references of the form [$lcs, $ic], where $lcs is an
279             LCS and $ic is its information content (undef if no LCS is found).
280              
281             =cut
282              
# Finds the least common subsumer(s) of two synsets, ranked by information
# content.
#
# Parameters:
#   $synset1, $synset2 - the two synsets (wps strings or offsets)
#   $pos               - part of speech
#   $mode              - 'wps' or 'offset', describing the synset arguments
#
# Returns: a list of array references [$subsumer, $ic], one for every
# subsumer that shares the highest information content. Returns undef when
# getAllPaths finds no path; in that case a warning is appended to the error
# string and $self->{error} is raised to at least 1.
sub getLCSbyIC
{
  my $self = shift;
  my ($synset1, $synset2, $pos, $mode) = @_;
  my $class = ref $self || $self;
  my $wn = $self->{wn};

  # Used for comparisons only, so that an undefined mode does not warn; the
  # caller's original $mode is what gets passed on to other methods.
  my $modeName = defined $mode ? $mode : '';

  my @paths = $self->getAllPaths ($synset1, $synset2, $pos, $mode);

  # check to see if any paths were found
  unless (defined $paths[0]) {
    if (!defined $self->{error} || $self->{error} < 1) {
      $self->{error} = 1;
    }
    $self->{errorString} .= "\nWarning (${class}::getLCSbyIC()) - ";

    my $wps1 = $modeName eq 'wps' ? $synset1 : $wn->getSense ($synset1, $pos);
    my $wps2 = $modeName eq 'wps' ? $synset2 : $wn->getSense ($synset2, $pos);

    $self->{errorString} .= "No LCS found for $wps1 and $wps2.";

    if ($self->{trace}) {
      $self->{traceString} .= "\nNo LCS found for ";
      $self->printSet ($pos, $mode, $synset1);
      $self->{traceString} .= ", ";
      $self->printSet ($pos, $mode, $synset2);
      $self->{traceString} .= ".";
    }

    # Kept as an explicit undef (rather than a bare return) so that existing
    # callers see exactly the same value as before.
    return undef;
  }

  my %IC;

  # get the IC of each subsumer, put it in a hash
  foreach my $path (@paths) {
    my $subsumer = $path->[0];

    # each distinct subsumer only needs to be looked up once
    next if defined $IC{$subsumer};

    my $off;
    if ($modeName eq 'offset') {
      $off = $subsumer;
    }
    else {
      # the artificial '*Root*' node has no offset in WordNet; it is
      # represented by offset 0
      $off = (index ($subsumer, '*Root*') < 0) ? $wn->offset ($subsumer) : 0;
    }

    # A failed lookup is treated as offset 0, which is what the numeric
    # conversion below would produce anyway (only with a warning).
    $off = 0 unless defined $off;

    # the "0 + $off" below is a hack to cope with an unfortunate problem:
    # The offsets in the WordNet data files are zero-padded, eight-digit
    # decimal numbers. Sometimes these numbers get stripped off (QueryData's
    # offset() method does this). As a result, it is much easier to compare
    # the offsets as numbers rather than as strings:
    # '00001740' ne '1740', BUT 0 + '00001740' == 0 + '1740'
    $IC{$subsumer} = $self->IC (0 + $off, $pos) || 0;
  }

  # sort lcs by info content, highest first
  my @ranked = sort {$b->[1] <=> $a->[1]} map {[$_, $IC{$_}]} keys %IC;

  if ($self->{trace}) {
    $self->{traceString} .= "Lowest Common Subsumer(s): ";
  }

  my @best;

  # determine which subsumers have the highest info content; do some
  # tracing as well
  foreach my $candidate (@ranked) {
    if ($self->{trace}) {
      $self->printSet ($pos, $mode, $candidate->[0]);
      $self->{traceString} .= " (IC=";
      $self->{traceString} .= sprintf ("%.6f", $candidate->[1]);
      $self->{traceString} .= ") ";
    }

    if ($candidate->[1] == $ranked[0]->[1]) {
      push @best, $candidate;
    }
  }

  if ($self->{trace}) {
    $self->{traceString} .= "\n";
  }

  return @best;
}
368              
369              
370             =item $module->configure (Z<>)
371              
372             Overrides the configure method of WordNet::Similarity to process the
373             information content file (also calls WordNet::Similarity::configure()
374             so that all the work done by that method is still accomplished).
375              
376             =cut
377              
# Configures the module: runs the parent class's configure, locates an
# information content file if none was configured, and loads its frequency
# counts into $self->{offsetFreq}->{$pos}->{$offset}. The frequencies of all
# lines tagged ROOT are summed under offset 0 of their part of speech.
#
# Parameters: passed through unchanged to the parent's configure.
#
# Returns: nothing useful on failure (a bare return, with $self->{error} set
# to 2 and a message appended to $self->{errorString}); on success the
# result of closing the information content file.
sub configure {
  my $self = shift;
  $self->SUPER::configure (@_);
  my $wntools = $self->{wntools};
  my $class = ref $self || $self;
  my $where = "Error (${class}::configure()) - ";

  # Records a fatal configuration error; the caller returns its result.
  my $fail = sub {
    my ($message) = @_;
    $self->{errorString} .= $message;
    $self->{error} = 2;
    return;
  };

  # Reads the first line of an open information content file and returns
  # the WordNet hash-code following "wnver::", or undef when the line does
  # not have that form (an empty file is treated the same way).
  my $readHashCode = sub {
    my ($handle) = @_;
    my $line = <$handle>;
    $line = '' unless defined $line;
    $line =~ s/[\r\f\n\t ]+//g;
    my ($hashCode) = $line =~ /wnver::(.*)/;
    return $hashCode;
  };

  unless (defined $self->{infocontent}) {
    # Look for all possible default data files installed.
    my @possiblePaths = ();
    foreach my $dir (@INC) {
      if (-e $dir."/WordNet/ic-semcor.dat") {
        push @possiblePaths, $dir."/WordNet/ic-semcor.dat";
      }
      elsif (-e $dir."\\WordNet\\ic-semcor.dat") {
        push @possiblePaths, $dir."\\WordNet\\ic-semcor.dat";
      }
    }

    # If there are multiple possibilities, get the one that matches the
    # installed version (hash-code) of WordNet.
    foreach my $candidate (@possiblePaths) {
      open (my $probe, '<', $candidate) or next;
      my $hashCode = $readHashCode->($probe);
      close ($probe);
      if (defined $hashCode && $hashCode eq $wntools->hashCode()) {
        $self->{infocontent} = $candidate;
        last;
      }
    }
  }

  unless (defined $self->{infocontent}) {
    return $fail->($where . "Could not find a default information content file\n");
  }

  my $icf;
  unless (open ($icf, '<', $self->{infocontent})) {
    return $fail->($where . "Could not open information content file $self->{infocontent}\n");
  }

  # load the info content file data; the first line must carry the
  # hash-code of the WordNet version the file was generated from
  my $hashCode = $readHashCode->($icf);
  unless (defined $hashCode) {
    close ($icf);
    return $fail->("\n" . $where . "Bad file format ($self->{infocontent}).");
  }
  unless ($hashCode eq $wntools->hashCode()) {
    close ($icf);
    return $fail->("\n" . $where . "WordNet version does not match data file.");
  }

  $self->{offsetFreq}->{n}->{0} = 0;
  $self->{offsetFreq}->{v}->{0} = 0;

  # A lexical line variable is used so the caller's $_ is left untouched.
  while (my $line = <$icf>) {
    $line =~ s/[\r\f\n]//g;
    $line =~ s/^\s+//;
    $line =~ s/\s+$//;
    my ($offsetPOS, $frequency, $topmost) = split /\s+/, $line, 3;

    # Every remaining line must start with <offset><part-of-speech>; a
    # blank line therefore counts as a malformed file.
    my ($curOffset, $curPOS) = defined $offsetPOS
      ? $offsetPOS =~ /([0-9]+)([nvar])/
      : ();
    unless (defined $curPOS) {
      close ($icf);
      return $fail->("\n" . $where . "Bad file format ($self->{infocontent}).");
    }

    $self->{offsetFreq}->{$curPOS}->{$curOffset} = $frequency;
    if (defined $topmost && $topmost =~ /ROOT/) {
      $self->{offsetFreq}->{$curPOS}->{0} += $self->{offsetFreq}->{$curPOS}->{$curOffset};
    }
  }

  # The sub has always ended on the close call, so its result stays the
  # return value on success.
  return close ($icf);
}
482              
483             =back
484              
485             =head3 Private Methods
486              
487             =over
488              
489             =item $module->_loadInfoContentFile ($file)
490              
491             Subroutine to load frequency counts from an information content file.
492              
493             =cut
494              
# Loads frequency counts from an information content file.
#
# The first line of the file must be "wnver::<hashcode>" with the hash-code
# reported by $self->{wntools}; every following line must start with
# <offset><part-of-speech>, followed by a frequency and an optional ROOT
# tag. Frequencies of ROOT lines are summed under offset 0 of their part of
# speech.
#
# Parameters:
#   $infoContentFile - path of the file to load
#
# Returns: an empty string on success, in which case $self->{offsetFreq} is
# replaced by the freshly loaded table; otherwise an error message, and
# $self->{offsetFreq} is left untouched.
sub _loadInfoContentFile
{
  my $self = shift;
  my $infoContentFile = shift;
  my $wntools = $self->{'wntools'};
  my $localFreq = {};

  open (my $fh, '<', $infoContentFile)
    or return "Unable to open '$infoContentFile'.";

  # An empty file has no version line at all; treat it as a bad format.
  my $firstLine = <$fh>;
  $firstLine = '' unless defined $firstLine;
  $firstLine =~ s/\s+//g;
  my ($wnver) = $firstLine =~ /wnver::(.*)/;

  unless (defined $wnver) {
    close ($fh);
    return "Bad file format ($infoContentFile).";
  }
  unless ($wnver eq $wntools->hashCode()) {
    close ($fh);
    return "WordNet version does not match data file.";
  }

  $localFreq->{"n"}->{0} = 0;
  $localFreq->{"v"}->{0} = 0;

  # A lexical line variable is used so the caller's $_ is left untouched.
  while (my $line = <$fh>) {
    $line =~ s/[\r\f\n]//g;
    $line =~ s/^\s+//;
    $line =~ s/\s+$//;
    my ($offsetPOS, $frequency, $topmost) = split /\s+/, $line, 3;

    # A blank line yields no fields and counts as a malformed file.
    my ($curOffset, $curPOS) = defined $offsetPOS
      ? $offsetPOS =~ /([0-9]+)([nvar])/
      : ();
    unless (defined $curPOS) {
      close ($fh);
      return "Bad file format ($infoContentFile).";
    }

    $localFreq->{$curPOS}->{$curOffset} = $frequency;
    if (defined $topmost && $topmost =~ /ROOT/) {
      $localFreq->{$curPOS}->{0} += $localFreq->{$curPOS}->{$curOffset};
    }
  }
  close ($fh);

  $self->{'offsetFreq'} = $localFreq;

  return "";
}
564              
565             =item $module->_isValidInfoContentFile ($filename)
566              
567             Subroutine that checks the validity of an information content file.
568              
569             =cut
570              
# Checks whether a file is an information content file generated for the
# WordNet version in use: its first line must be "wnver::<hashcode>" with
# the hash-code reported by $self->{wntools}. Only the first line is
# inspected.
#
# Parameters:
#   $path - path of the file to check
#
# Returns: 1 if the file can be opened and its version line matches,
# 0 otherwise.
sub _isValidInfoContentFile
{
  my $self = shift;
  my $path = shift;
  my $wntools = $self->{'wntools'};

  open (my $fh, '<', $path) or return 0;
  my $firstLine = <$fh>;
  close ($fh);

  # An empty file has no version line and is therefore not valid.
  return 0 unless defined $firstLine;

  $firstLine =~ s/\s+//g;
  my ($wnver) = $firstLine =~ /wnver::(.*)/;

  return (defined $wnver && $wnver eq $wntools->hashCode()) ? 1 : 0;
}
598              
599             1;
600              
601             __END__