File Coverage

blib/lib/WordNet/Tools.pm
Criterion Covered Total %
statement 10 12 83.3
branch n/a
condition n/a
subroutine 4 4 100.0
pod n/a
total 14 16 87.5


line stmt bran cond sub pod time code
1             # WordNet::Tools v2.05
2             # (Last updated $Id: Tools.pm,v 1.5 2008/06/04 18:38:01 sidz1979 Exp $)
3             #
4             # This module provides some WordNet tools for use with the
5             # WordNet::Similarity modules.
6             #
7             # Copyright (c) 2005,
8             #
9             # Ted Pedersen, University of Minnesota Duluth
10             # tpederse at d.umn.edu
11             #
12             # Siddharth Patwardhan, University of Utah, Salt Lake City
13             # sidd at cs.utah.edu
14             #
15             # This program is free software; you can redistribute it and/or
16             # modify it under the terms of the GNU General Public License
17             # as published by the Free Software Foundation; either version 2
18             # of the License, or (at your option) any later version.
19             #
20             # This program is distributed in the hope that it will be useful,
21             # but WITHOUT ANY WARRANTY; without even the implied warranty of
22             # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23             # GNU General Public License for more details.
24             #
25             # You should have received a copy of the GNU General Public License
26             # along with this program; if not, write to
27             #
28             # The Free Software Foundation, Inc.,
29             # 59 Temple Place - Suite 330,
30             # Boston, MA 02111-1307, USA.
31             #
32             # ------------------------------------------------------------------
33              
34             package WordNet::Tools;
35              
36             =head1 NAME
37              
38             WordNet::Tools - Some tools for use with WordNet.
39              
40             =head1 SYNOPSIS
41              
42             use WordNet::QueryData;
43              
44             use WordNet::Tools;
45              
46             my $wn = WordNet::QueryData->new;
47              
48             my $wntools = WordNet::Tools->new($wn);
49              
50             my $wnHashCode = $wntools->hashCode();
51              
52             my $newstring = $wntools->compoundify("find compound words like new york city in this text");
53              
54             =head1 DESCRIPTION
55              
56             This module provides some tools for use with WordNet. For example, the
57             'compoundify' method detects compound words (as found in WordNet) in a
58             text string and it combines these words into single tokens using
59             underscore separators. Another tool in this module generates a unique
60             hash code corresponding to a WordNet distribution. This hash code is
61             meant to replace the "version" information in WordNet, which is no
62             longer reliable.
63              
64             =head1 METHODS
65              
66             The following methods are defined:
67              
68             =over
69              
70             =cut
71              
72 19     19   133227 use strict;
  19         34  
  19         614  
73 19     19   96 use warnings;
  19         33  
  19         560  
74 19     19   101 use Exporter;
  19         39  
  19         912  
75 19     19   33286 use WordNet::QueryData;
  0            
  0            
76             use Digest::SHA1 qw(sha1_base64);
77              
78             use constant MAX_COMPOUND_SIZE => 9;   # compoundify() joins at most this many consecutive words into one candidate compound
79              
80             our @ISA = qw(Exporter);   # this package inherits from Exporter
81             our $VERSION = '2.05';
82              
83             =item WordNet::Tools->new($wn)
84              
85             This is a constructor for this class (and creates a new object of this
86             class). It requires a WordNet::QueryData object as a parameter.
87              
88             Parameters: $wn -- a WordNet::QueryData object.
89              
90             Returns: a new WordNet::Tools object.
91              
92             =cut
93              
94             # Constructor for this module
               #
               # Builds a WordNet::Tools object around an existing WordNet::QueryData
               # object, caches the set of WordNet compounds (words containing '_')
               # and computes the hash code of the WordNet distribution.
               #
               # Parameters: $wn -- a WordNet::QueryData object.
               # Returns:    the new object, or undef if $wn is missing or not a
               #             WordNet::QueryData object, or if no hash code could be
               #             computed.
               #
               # NOTE(review): the type check compares ref($wn) against the exact
               # string "WordNet::QueryData", so an object blessed into a subclass
               # is rejected.
               # NOTE(review): the failure paths use "return undef", which yields a
               # one-element list (undef) when new() is called in list context.
95             sub new
96             {
97             my $class = shift;
98             my $wn = shift;
99             my $self = {};
100              
101             # Allow new() to be invoked on a class name or on an existing object
102             $class = ref $class || $class;
103             bless($self, $class);
104              
105             # Verify the given WordNet::QueryData object
106             return undef if(!defined $wn || !ref $wn || ref($wn) ne "WordNet::QueryData");
107             $self->{wn} = $wn;
108              
109             # Get the compounds from WordNet: for each of the four part-of-speech
                # codes, keep every listed word that contains an underscore
110             foreach my $pos ('n', 'v', 'a', 'r')
111             {
112             foreach my $word ($wn->listAllWords($pos))
113             {
114             $self->{compounds}->{$word} = 1 if ($word =~ /_/);
115             }
116             }
117              
118             # Compute the WordNet hash-code and store; give up if none was produced
119             $self->{hashcode} = $self->_computeHashCode();
120             return undef if(!defined($self->{hashcode}));
121              
122             return $self;
123             }
124              
125             =item $wntools->compoundify($string)
126              
127             This method identifies all compound words occurring in the given input
128             string. Compound words are multi-word tokens appearing in WordNet.
129              
130             Parameters: $string -- an input text string.
131              
132             Returns: the words of the input separated by single spaces, with each detected compound joined by underscores.
133              
134             =cut
135              
136             # Detect compounds in a block of text
                #
                # Splits the input into words, then scans left to right and replaces
                # each run of consecutive words that forms a known compound with the
                # underscore-joined compound. Matching is greedy: at every position
                # the longest candidate (up to MAX_COMPOUND_SIZE words) is tried first.
                #
                # Parameters: $block -- the input text string.
                # Returns:    the words separated by single spaces with compounds
                #             joined by '_'. $block is returned unchanged when it is
                #             undef, when the method is not called on an object, or
                #             when the object has no compounds table.
                #
                # Note: the result is rebuilt from the extracted words, so original
                # spacing and any character outside the word pattern are not kept.
                # Note: candidates are looked up exactly as they appear in the text;
                # no case folding is applied here.
137             sub compoundify
138             {
139             my $self = shift;
140             my $block = shift;
141              
142             return $block if(!defined $block || !ref $self || !defined $self->{compounds});
143              
144             my $string;
145             my $done;
146             my $temp;
147             my $firstPointer;
148             my $secondPointer;
149             my @wordsArray;
150              
151             # get all the words into an array; a word is a run of letters, digits, '_', '.', '-', '/' or apostrophes
152             @wordsArray = ();
153             while($block =~ /([a-zA-Z0-9_\.\-\/\']+)/g)
154             {
155             push(@wordsArray, $1);
156             }
157              
158             # now compoundify, GREEDILY!!
159             $firstPointer = 0;
160             $string = "";
161              
162             while($firstPointer <= $#wordsArray)
163             {
                # start with the widest window allowed: MAX_COMPOUND_SIZE words, or
                # fewer if the end of the word list comes first
164             $secondPointer = (($#wordsArray > ($firstPointer + MAX_COMPOUND_SIZE - 1)) ? ($firstPointer + MAX_COMPOUND_SIZE - 1) : ($#wordsArray));
165             $done = 0;
                # shrink the window from the right until a known compound is found
                # or only the single word at $firstPointer is left
166             while(($secondPointer > $firstPointer) && !$done)
167             {
168             $temp = join("_", @wordsArray[$firstPointer .. $secondPointer]);
169             if(defined $self->{compounds}->{$temp})
170             {
171             $string .= "$temp ";
172             $done = 1;
173             }
174             else
175             {
176             $secondPointer--;
177             }
178             }
                # no compound starts here: emit the single word. In that case the
                # inner loop ended with $secondPointer == $firstPointer, so the
                # assignment below advances by exactly one word; after a match it
                # skips past the whole compound.
179             $string .= "$wordsArray[$firstPointer] " unless($done);
180             $firstPointer = $secondPointer + 1;
181             }
                # drop the trailing separator left by the last appended token
182             $string =~ s/\s+$//;
183              
184             return $string;
185             }
186              
187             =item $wntools->getCompoundsList()
188              
189             This method returns the list of compound words present in WordNet.
190              
191             Parameters: none
192              
193             Returns: reference to an array of compounds.
194              
195             =cut
196              
197             # Return the list of WordNet compounds
198             # A new array holding a copy of every key of the compounds table is built
199             # on each call, so this method can be slow and should be used sparingly.
                #
                # Parameters: none.
                # Returns:    reference to a freshly built array of compound strings,
                #             in whatever order keys() yields them.
200             sub getCompoundsList
201             {
202             my $self = shift;
203             my @cList = keys(%{$self->{compounds}});
204             return \@cList;
205             }
206              
207             =item $wntools->hashCode()
208              
209             This method returns a unique identifier representing a specific
210             distribution of WordNet.
211              
212             Parameters: none.
213              
214             Returns: a unique identifier (string).
215              
216             =cut
217              
218             # Return the computed hash-code
                # This is the value stored in $self->{hashcode} by the constructor;
                # nothing is recomputed here.
219             sub hashCode
220             {
221             my $self = shift;
222             return $self->{hashcode};
223             }
224              
225             # Compute the hash code for the given WordNet distribution
226             # Most of this code was written by Ben Haskell
                #
                # The code is the base64 SHA-1 digest of the sizes (in bytes) of the
                # WordNet database files found in the QueryData data directory, joined
                # with '.'. Only the file sizes, taken in sorted file-name order, feed
                # the digest; file contents are not read.
                #
                # Parameters: none (uses the WordNet::QueryData object in $self->{wn}).
                # Returns:    the digest string, or undef if no QueryData object is set.
                #             If no files match, the digest of the empty string is
                #             returned rather than undef.
227             sub _computeHashCode
228             {
229             my $self = shift;
230             my $qd = $self->{wn};
231             return undef if(!defined($qd));
232              
233             my $dir = $qd->dataPath();
234             my $pos = '{noun,verb,adj,adv}';
                # Two naming schemes are tried: {index,data}.<pos> and <pos>.{idx,dat}
                # (NOTE(review): presumably the Unix and Windows WordNet layouts --
                # confirm). \Q..\E quotes the directory name so that it is not itself
                # expanded as a glob pattern; grep -f keeps only plain files.
235             my @files = sort grep -f, map glob("\Q$dir\E/$_"), "{index,data}.$pos", "$pos.{idx,dat}";
236              
237             # (stat)[7] returns file size in bytes
238             my $concat = join '.', map { (stat)[7] } @files;
239             return sha1_base64($concat);
240             }
241              
242             1;
243              
244             __END__