File Coverage

blib/lib/WordNet/stem.pm
Criterion Covered Total %
statement 9 43 20.9
branch 0 8 0.0
condition n/a
subroutine 3 6 50.0
pod 3 3 100.0
total 15 60 25.0


line stmt bran cond sub pod time code
1             # WordNet::stem.pm version 2.04
2             # (Last updated $Id: stem.pm,v 1.1 2008/03/27 05:13:01 sidz1979 Exp $)
3             #
4             # Package used by WordNet::Similarity::lesk module that
5             # computes semantic relatedness of word senses in WordNet
6             # using gloss overlaps.
7             #
8             # Copyright (c) 2005,
9             #
10             # Ted Pedersen, University of Minnesota Duluth
11             # tpederse at d.umn.edu
12             #
13             # Satanjeev Banerjee, Carnegie Mellon University, Pittsburgh
14             # banerjee+ at cs.cmu.edu
15             #
16             # This program is free software; you can redistribute it and/or
17             # modify it under the terms of the GNU General Public License
18             # as published by the Free Software Foundation; either version 2
19             # of the License, or (at your option) any later version.
20             #
21             # This program is distributed in the hope that it will be useful,
22             # but WITHOUT ANY WARRANTY; without even the implied warranty of
23             # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24             # GNU General Public License for more details.
25             #
26             # You should have received a copy of the GNU General Public License
27             # along with this program; if not, write to
28             #
29             # The Free Software Foundation, Inc.,
30             # 59 Temple Place - Suite 330,
31             # Boston, MA 02111-1307, USA.
32             #
33             # ------------------------------------------------------------------
34              
35             package WordNet::stem;
36              
37             =head1 NAME
38              
39             WordNet::stem - Module that finds the stem of a word or the stems of a
40             string of words, using WordNet.
41              
42             =head1 SYNOPSIS
43              
44             use WordNet::stem;
45              
46             my $wn = WordNet::QueryData->new();
47              
48             my $stemmer = WordNet::stem->new($wn);
49              
50             my @stems = $stemmer->stemWord($word);
51              
52             my $string = $stemmer->stemString($inString, $cache);
53              
54             =head1 DESCRIPTION
55              
56             This module uses the internal stemming algorithm of WordNet to
57             stem words and strings of words. This module is used by the
58             lesk measure of the WordNet::Similarity package.
59              
60             =head2 Methods
61              
62             =over
63              
64             =cut
65              
66 1     1   6 use strict;
  1         2  
  1         32  
67 1     1   5 use Exporter;
  1         2  
  1         52  
68 1     1   11 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
  1         2  
  1         596  
69              
70             @ISA = qw(Exporter);
71              
72             %EXPORT_TAGS = ();
73              
74             @EXPORT_OK = ();
75              
76             @EXPORT = ();
77              
78             $VERSION = '2.04';
79              
80             =item new
81              
82             Creates a new stemmer object and initializes it with a
83             WordNet::QueryData object.
84              
85             Parameters: $wn
86              
87             Returns: $stemmer
88              
89             =cut
90              
91             # function to create the stemmer object
92             sub new
93             {
94 0     0 1   my $className = shift;
95 0           my $wn = shift;
96 0           my $self = {};
97              
98 0           $self->{wn} = $wn;
99 0           $self->{wordStemHash} = ();
100 0           $self->{stringStemHash} = ();
101 0           bless($self, $className);
102              
103 0           return $self;
104             }
105              
106             =item stemString
107              
108             Takes a string of words as input and returns a string of stemmed words.
109              
110             Parameters: $inString, $cache (optional; if defined, the stemmed string is cached)
111              
112             Returns: $retString
113              
114             =cut
115              
116             # Function to take a string, and process it in such a way that all the
117             # words in it get stemmed. Note that if a single word has two or more
118             # possible stems, we return the original surface form since there is
119             # no way to select from the competing stems. The stem of the string
120             # can be cached if requested. Useful if the calling function knows
121             # which strings it will have to stem over and over again. Strings that
122             # will be stemmed only once need not be cached - thereby saving space.
123             sub stemString
124             {
125 0     0 1   my $self = shift;
126 0           my $inputString = shift;
127 0           my $cache = shift;
128            
129             # whether or not this string has been requested for caching,
130             # check in the cache
131 0 0         return $self->{'stringStemHash'}->{$inputString} if (defined $self->{'stringStemHash'}->{$inputString});
132            
133             # Not in cache. Stem.
134            
135             # for each word in the input get the stem and put in the output string
136 0           my $outputString = "";
137 0           while ($inputString =~ /(\w+)/g)
138             {
139 0           my $word = $1;
140 0           my @stems = $self->stemWord($word);
141            
142             # if multiple or no stems, use surface form.
143 0 0         $outputString .= ($#stems != 0) ? "$word " : "$stems[0] ";
144             }
145            
146             # if cache required, do so
147 0 0         $self->{'stringStemHash'}->{$inputString} = $outputString if (defined($cache));
148            
149             # return the string
150 0           return($outputString);
151             }
152              
153             =item stemWord
154              
155             Takes a word as input and returns its stems. A word may have more than
156             one stem. All are returned.
157              
158             Parameters: $word
159              
160             Returns: @stems
161              
162             =back
163              
164             =cut
165              
166             # stem the word passed to this function and return an array of words
167             # that contain all the possible stems of this word. All possible stems
168             # of the word may include the surface form too if it's a valid WordNet
169             # lemma.
170             sub stemWord
171             {
172 0     0 1   my $self = shift;
173 0           my $word = shift;
174 0           my $wn = $self->{wn};
175 0           my @stems = ();
176            
177             # if not in the cache, create and put in cache
178 0 0         if (!defined $self->{wordStemHash}->{$word})
179             {
180             # So not in the hash. gotta check for all possible parts of speech.
181 0           my %stems = ();
182 0           my $possiblePartsOfSpeech = "nvar";
183            
184 0           my $pos;
185 0           while ("nvar" =~ /(.)/g)
186             {
187 0           foreach ($wn->validForms("$word\#$1"))
188             {
189             # put underscore for space
190 0           $_ =~ s/ /_/g;
191            
192             # remove part of speech if any
193 0           $_ =~ s/\#\w$//;
194            
195             # put in stems hash (the hash allows us to not worry about
196             # multiple copies of the same stem!)
197 0           $stems{$_} = 1;
198             }
199             }
200            
201             # put in the cache
202 0           $self->{wordStemHash}->{$word} = join(" ", (keys %stems));
203             }
204            
205             # return the stems
206 0           return (split / /, $self->{wordStemHash}->{$word});
207             }
208              
209             1;
210              
211             __END__