File Coverage

blib/lib/WordNet/SenseRelate/AllWords.pm

Criterion	Covered	Total	%
statement	33	565	5.8
branch	0	300	0.0
condition	0	114	0.0
subroutine	11	31	35.4
pod	3	14	21.4
total	47	1024	4.5

line	stmt	bran	cond	sub	pod	time	code
1							package WordNet::SenseRelate::AllWords;
2
3							# $Id: AllWords.pm,v 1.40 2009/05/27 20:58:27 kvarada Exp $
4
5							=head1 NAME
6
7							WordNet::SenseRelate::AllWords - Disambiguate All Words in a Text based on semantic similarity and relatedness in WordNet
8
9							=head1 SYNOPSIS
10
11							use WordNet::SenseRelate::AllWords;
12							use WordNet::QueryData;
13							use WordNet::Tools;
14							my $qd = WordNet::QueryData->new;
15							defined $qd or die "Construction of WordNet::QueryData failed";
16							my $wntools = WordNet::Tools->new($qd);
17							defined $wntools or die "\nCouldn't construct WordNet::Tools object";
18
19							my $wsd = WordNet::SenseRelate::AllWords->new (wordnet => $qd,
20							wntools => $wntools,
21							measure => 'WordNet::Similarity::lesk');
22
23							my @context = qw/the bridge is held up by red tape/;
24							my @results = $wsd->disambiguate (window => 3,
25							context => [@context]);
26							print "@results\n";
27
28							=head1 DESCRIPTION
29
30							WordNet::SenseRelate::AllWords implements an algorithm for Word Sense
31							Disambiguation that uses measures of semantic relatedness. The algorithm
32							is an extension of an algorithm described by Pedersen, Banerjee, and
33							Patwardhan[1]. This implementation is similar to the original SenseRelate
34							package but disambiguates every word in the given context rather than just
35							single word.
36
37							=head2 Methods
38
39							Note: the methods below will die() on serious errors. Wrap calls to the
40							methods in an eval BLOCK to catch the exceptions. See
41							'perldoc -f eval' for more information.
42
43							Example:
44
45							my @res;
46							eval {@res = $wsd->disambiguate (args...)}
47
48							if ($@){
49							print STDERR "An exception occurred ($@)\n";
50							}
51
52							=over
53
54							=cut
55
56	3			3		3233	use 5.006;
	3					12
	3					140
57	3			3		18	use strict;
	3					4
	3					117
58	3			3		16	use warnings;
	3					17
	3					111
59	3			3		167	use Carp;
	3					6
	3					617
60
61							our @ISA = ();
62
63							our $VERSION = '0.19';
64
65							my %wordnet;
66							my %wntools;
67							my %simMeasure; # the similarity/relatedness measure
68							my %stoplist;
69							my %pairScore;
70							my %contextScore;
71							my %trace;
72							my %outfile;
73							my %forcepos;
74							my %nocompoundify;
75							my %usemono;
76							my %backoff;
77							my %wnformat;
78							my %fixed;
79
80							# closed class words
81	3					275	use constant {CLOSED => 'c',
82	3			3		27	NOINFO => 'f'};
	3					6
83
84							# constants used to specify trace levels
85	3			3		15	use constant TR_CONTEXT => 1; # show the context window
	3					5
	3					164
86	3			3		15	use constant TR_BESTSCORE => 2; # show the best score
	3					5
	3					149
87	3			3		13	use constant TR_ALLSCORES => 4; # show all non-zero scores
	3					7
	3					135
88	3			3		14	use constant TR_PAIRWISE => 8; # show all the non-zero similarity scores
	3					5
	3					139
89	3			3		30	use constant TR_ZERO => 16;
	3					6
	3					120
90	3			3		14	use constant TR_MEASURE => 32; # show similarity measure traces
	3					5
	3					69206
91
92							# Penn tagset
93							my %wnTag = (
94							JJ => 'a',
95							JJR => 'a',
96							JJS => 'a',
97							CD => 'a',
98							RB => 'r',
99							RBR => 'r',
100							RBS => 'r',
101							RP => 'r',
102							WRB => CLOSED,
103							CC => CLOSED,
104							IN => 'r',
105							DT => CLOSED,
106							PDT => CLOSED,
107							CC => CLOSED,
108							'PRP$' => CLOSED,
109							PRP => CLOSED,
110							WDT => CLOSED,
111							'WP$' => CLOSED,
112							NN => 'n',
113							NNS => 'n',
114							NNP => 'n',
115							NNPS => 'n',
116							PRP => CLOSED,
117							WP => CLOSED,
118							EX => CLOSED,
119							VBP => 'v',
120							VB => 'v',
121							VBD => 'v',
122							VBG => 'v',
123							VBN => 'v',
124							VBZ => 'v',
125							VBP => 'v',
126							MD => 'v',
127							TO => CLOSED,
128							POS => undef,
129							UH => CLOSED,
130							'.' => undef,
131							':' => undef,
132							',' => undef,
133							_ => undef,
134							'$' => undef,
135							'(' => undef,
136							')' => undef,
137							'"' => undef,
138							FW => NOINFO,
139							SYM => undef,
140							LS => undef,
141							);
142
143							=item BZ<>
144
145							Z<>The constructor for this class. It will create a new instance and
146							return a reference to the constructed object.
147
148							Parameters:
149
150							wordnet => REFERENCE : WordNet::QueryData object
151							wntools => REFERENCE : WordNet::Tools object
152							measure => STRING : name of a WordNet::Similarity measure
153							config => FILENAME : config file for above measure
154							outfile => FILENAME : name of a file for output (optional)
155							stoplist => FILENAME : file containing list of stop words
156							pairScore => INTEGER : minimum pairwise score (default: 0)
157							contextScore => INTEGER : minimum overall score (default: 0)
158							trace => INTEGER : generate traces (default: 0)
159							forcepos => INTEGER : do part-of-speech coercion (default: 0)
160							nocompoundify => INTEGER : disable compoundify (default: 0)
161							usemono => INTEGER : enable assigning the available sense to usemono (default: 0)
162							backoff => INTEGER : enable assigning most frequent sense if the measure can't assign sense (default: 0)
163
164							Returns:
165
166							A reference to the constructed object.
167
168							Example:
169
170							WordNet::SenseRelate::AllWords->new (wordnet => $query_data_obj,
171							wntools => $wordnet_tools_obj,
172							measure => 'WordNet::Similarity::lesk',
173							trace => 1);
174
175							The trace levels are:
176
177							1 Show the context window for each pass through the algorithm.
178
179							2 Display winning score for each pass (i.e., for each target word).
180
181							4 Display the non-zero scores for each sense of each target
182							word (overrides 2).
183
184							8 Display the non-zero values from the semantic relatedness measures.
185
186							16 Show the zero values as well when combined with either 4 or 8.
187							When not used with 4 or 8, this has no effect.
188
189							32 Display traces from the semantic relatedness module.
190
191							These trace levels can be added together. For example, by specifying
192							a trace level of 3, the context window will be displayed along with
193							the winning score for each pass.
194
195							=cut
196
197							sub new
198							{
199	0			0	1		my $class = shift;
200	0						my %args = @_;
201	0		0				$class = ref $class \|\| $class;
202
203	0						my $qd;
204							my $wnt;
205	0						my $measure;
206	0						my $measure_config;
207	0						my $stoplist;
208	0						my $pairScore = 0;
209	0						my $contextScore = 0;
210	0						my $trace;
211							my $outfile;
212	0						my $forcepos;
213	0						my $nocompoundify=0;
214	0						my $usemono=0;
215	0						my $backoff=0;
216	0						my $fixed = 0;
217	0						my $wnformat = 0;
218
219	0						while (my ($key, $val) = each %args) {
220	0	0					if ($key eq 'wordnet') {
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
221	0						$qd = $val;
222							}
223							elsif ($key eq 'wntools')
224							{
225	0						$wnt = $val;
226							}
227							elsif ($key eq 'measure') {
228	0						$measure = $val;
229							}
230							elsif ($key eq 'config') {
231	0						$measure_config = $val;
232							}
233							elsif ($key eq 'stoplist') {
234	0						$stoplist = $val;
235							}
236							elsif ($key eq 'pairScore') {
237	0						$pairScore = $val;
238							}
239							elsif ($key eq 'contextScore') {
240	0						$contextScore = $val;
241							}
242							elsif($key eq 'nocompoundify'){
243	0						$nocompoundify=$val;
244							}
245							elsif($key eq 'usemono'){
246	0						$usemono=$val;
247							}
248							elsif($key eq 'backoff'){
249	0						$backoff=$val;
250							}
251							elsif ($key eq 'trace') {
252	0						$trace = $val;
253	0	0					$trace = defined $trace ? $trace : 0;
254							}
255							elsif ($key eq 'outfile') {
256	0						$outfile = $val;
257							}
258							elsif ($key eq 'forcepos') {
259	0						$forcepos = $val;
260							}
261							elsif ($key eq 'fixed') {
262	0						$fixed = $val;
263							}
264							elsif ($key eq 'wnformat') {
265	0						$wnformat = $val;
266							}
267							else {
268	0						croak "Unknown parameter type '$key'";
269							}
270							}
271
272	0	0					unless (ref $qd) {
273	0						croak "No WordNet::QueryData object supplied";
274							}
275
276	0	0					unless (ref $wnt) {
277	0						croak "No WordNet::Tools object supplied";
278							}
279
280	0	0					unless ($measure) {
281	0						croak "No relatedness measure supplied";
282							}
283
284	0						my $self = bless [], $class;
285
286							# initialize tracing;
287	0	0					if (defined $trace) {
288	0						$trace{$self} = {level => $trace, string => ''};
289							}
290							else {
291	0						$trace{$self} = {level => 0, string => ''};
292							}
293
294							# require the relatedness modules
295	0						my $file = $measure;
296	0						$file =~ s/::/\//g;
297	0						require "${file}.pm";
298
299							# construct the relatedness object
300	0	0					if (defined $measure_config) {
301	0						$simMeasure{$self} = $measure->new ($qd, $measure_config);
302							}
303							else {
304	0						$simMeasure{$self} = $measure->new ($qd);
305							}
306
307							# check for errors
308	0						my ($errCode, $errStr) = $simMeasure{$self}->getError;
309	0	0					if ($errCode) {
310	0						carp $errStr;
311							}
312
313							# turn on traces in the relatedness measure if required
314	0	0					if ($trace{$self}->{level} & TR_MEASURE) {
315	0						$simMeasure{$self}->{trace} = 1;
316							}
317							else {
318	0						$simMeasure{$self}->{trace} = 0;
319							}
320
321
322							# save ref to WordNet::QueryData obj
323	0						$wordnet{$self} = $qd;
324
325							# save ref to WordNet::Tools obj
326	0						$wntools{$self} = $wnt;
327
328	0	0					$self->_loadStoplist ($stoplist) if defined $stoplist;
329
330							# store threshold values
331	0						$pairScore{$self} = $pairScore;
332	0						$contextScore{$self} = $contextScore;
333
334							# save output file name
335	0						$outfile{$self} = $outfile;
336	0	0	0				if ($outfile and -e $outfile) {
337	0						unlink $outfile;
338							}
339
340	0	0					if (defined $forcepos) {
341	0						$forcepos{$self} = $forcepos;
342							}
343							else {
344	0						$forcepos{$self} = 0;
345							}
346
347	0	0					if (defined $nocompoundify) {
348	0						$nocompoundify{$self} = $nocompoundify;
349							}
350							else {
351	0						$nocompoundify{$self} = 0;
352							}
353
354	0	0					if (defined $usemono) {
355	0						$usemono{$self} = $usemono;
356							}
357							else {
358	0						$usemono{$self} = 0;
359							}
360
361	0	0					if (defined $backoff) {
362	0						$backoff{$self} = $backoff;
363							}
364							else {
365	0						$backoff{$self} = 0;
366							}
367
368
369
370
371	0						$fixed{$self} = $fixed;
372
373	0						$wnformat{$self} = $wnformat;
374
375	0						return $self;
376							}
377
378							# the destructor for this class. You shouldn't need to call this
379							# explicitly (but if you really want to, you can see what happens)
380							sub DESTROY
381							{
382	0			0			my $self = shift;
383	0						delete $wordnet{$self};
384	0						delete $wntools{$self};
385	0						delete $simMeasure{$self};
386	0						delete $stoplist{$self};
387	0						delete $pairScore{$self};
388	0						delete $contextScore{$self};
389	0						delete $trace{$self};
390	0						delete $outfile{$self};
391	0						delete $forcepos{$self};
392	0						delete $nocompoundify{$self};
393	0						delete $usemono{$self};
394	0						delete $backoff{$self};
395	0						delete $wnformat{$self};
396	0						delete $fixed{$self};
397
398	0						1;
399							}
400
401							sub wordnet : lvalue
402							{
403	0			0	0		my $self = shift;
404	0						$wordnet{$self};
405							}
406
407							=item B
408
409							Disambiguates all the words in the specified context and returns them
410							as a list. If a word cannot be disambiguated, then it is returned "as is".
411							A word cannot be disambiguated if it is not in WordNet or if no value
412							exceeds the specified threshold.
413
414							The context parameter specifies the
415							words to be disambiguated. It treats the value as one sentence. To
416							disambiguate a document with multiple sentences, make one call to
417							disambiguate() for each sentence.
418
419							Parameters:
420
421							window => INTEGER : the window size to use. A window size of N means
422							that the window will include N words, including
423							the target word. If N is an even number, there
424							will be one more word on the left side of the
425							target word than on the right.
426							tagged => BOOLEAN : true if the text is tagged, false otherwise
427							scheme => normal\|sense1\|random\|fixed : the disambiguation scheme to use
428							context => ARRAY_REF : reference to an array of words to disambiguate
429
430							Returns: An array of disambiguated words.
431
432							Example:
433
434							my @results =
435							$wsd->disambiguate (window => 3, tagged => 0, context => [@words]);
436
437							Rules for attaching suffixes:
438
439							Suffixes are attached to the words in the context in order to ignore those while disambiguation.
440							Note that after converting the tags to WordNet tags, tagged text is treated same as wntagged text.
441
442							Below is the ordered enumeration of the words which are ignored for disambiguation and the suffixes attached to those words.
443
444							Note that we check for such words in the order below:
445
446							1 stopwords => #o
447
448							2 Only for tagged text :
449
450							i) Closed Class words => #CL
451
452							ii) Invalid Tag => #IT
453
454							iii) Missing Word => #MW
455
456							3 For tagged and wntagged text:
457
458							i) No Tag => #NT
459
460							ii) Missing Word => #MW
461
462							iii) Invalid Tag => #IT
463
464							4 Not in WordNet => #ND
465
466							5 No Relatedness found with the surrounding words => #NR
467
468							=cut
469
470							#The scheme can have three different values:
471							#
472							#=over
473							#
474							#=item normal
475							#
476							#This is the normal mode of operation, where disambiguation is done by
477							#measuring the semantic relatedness of the senses of each word with the
478							#surrounding words.
479							#
480							#=item sense1
481							#
482							#In this mode, the first sense number (i.e., sense number 1) is assigned
483							#to each word. In WordNet, the first sense of a word is I the
484							#most frequent sense.
485							#
486							#=item random
487							#
488							#In this mode, sense numbers are randomly assigned to each word from the
489							#set of valid sense numbers for each word. For example, the noun 'hart'
490							#has three senses in WordNet 2.0, so the word would randomly be assigned
491							#1, 2, or 3. This may be useful for comparison purposes when evaluating
492							#experimental results.
493							#
494							#
495							#
496							#
497							#=cut
498
499							sub disambiguate
500							{
501	0			0	1		my $self = shift;
502	0						my %options = @_;
503	0						my $contextScore;
504							my $pairScore;
505	0						my $window = 3; # default the window to 3 to avoid failure if omitted
506	0						my $tagged;
507							my @context;
508	0						my $scheme = 'normal';
509
510	0						while (my ($key, $value) = each %options){
511	0	0					if ($key eq 'window') {
		0
		0
		0
512	0						$window = $value;
513							}
514							elsif ($key eq 'tagged') {
515	0						$tagged = $value;
516							}
517							elsif ($key eq 'context') {
518	0						@context = @$value;
519							}
520							elsif ($key eq 'scheme') {
521	0						$scheme = $value;
522							}
523							else {
524	0						croak "Unknown option '$key'";
525							}
526							}
527
528							# _initializeContext method
529							# 1) compoundifies the text
530							# 2) checks if the word is a stopword. If it is a stopword, attaches \#o
531							# 3) converts position tags if we have tagged text
532	0						my @newcontext = $self->_initializeContext ($tagged, @context);
533
534	0	0	0				if($tagged \|\| $wnformat{$self}){
535	0						foreach my $word (@newcontext) {
536	0	0	0				if ($word !~ /\#/) {
		0
		0
537	0						$word = $word . "#NT";
538							}
539							elsif( $word =~ /^#/)
540							{
541	0						$word = $word . "#MW";
542							}
543							elsif ( $word !~ /\#[nvar]$/ && $word !~ /\#o\b/ && $word !~ /\#CL\b/ && $word !~ /\#IT\b/) {
544	0						$word = $word . "#IT";
545							}
546							}
547							}
548
549	0						my @results;
550	0	0	0				if ($scheme eq 'sense1') {
		0
		0
551	0						@results = $self->doSense1 (@newcontext);
552							}
553							elsif ($scheme eq 'random') {
554	0						@results = $self->doRandom (@newcontext);
555							}
556							elsif (($scheme eq 'normal') or ($scheme eq 'fixed')) {
557	0	0					$fixed{$self} = 1 if $scheme eq 'fixed';
558	0						@results = $self->doNormal ($pairScore, $contextScore, $window,
559							@newcontext);
560							}
561							else {
562	0						croak ("Bad scheme '$scheme'.\n",
563							"Scheme must be 'normal', 'sense1', 'random', or 'fixed'");
564							}
565
566							# my @rval = map {s/\#o//; $_} @results;
567	0						my @rval = @results;
568
569	0	0					if ($outfile{$self}) {
570	0	0					open OFH, '>>', $outfile{$self} or croak "Cannot open outfile: $!";
571	0						print OFH "\n\n";
572	0						print OFH "Results after disambiguation...\n";
573	0						for my $i (0..$#newcontext) {
574	0						my $orig_word = $newcontext[$i];
575	0						my $new_word = $rval[$i];
576	0						my ($w, $p, $s) = $new_word =~ /([^\#]+)(?:\#([^\#]+)(?:\#([^\#]+))?)?/;
577	0						printf OFH "%25s", $orig_word;
578	0						printf OFH " %24s", $w;
579	0	0					printf OFH "%3s", $p if defined $p;
580	0	0					printf OFH "%3s", $s if defined $s;
581	0						print OFH "\n";
582							}
583
584	0						close OFH;
585							}
586
587	0						return @rval;
588							}
589
590							sub _initializeContext
591							{
592	0			0			my $self = shift;
593	0						my $tagged = shift;
594	0						my $wn = $wordnet{$self};
595	0						my $wnt = $wntools{$self};
596	0						my $nocompoundify = $nocompoundify{$self};
597
598	0						my @context = @_;
599
600							# compoundify the words (if the text is raw)
601	0	0	0				if (defined $wnt and $nocompoundify == 0 and defined !$tagged and !$wnformat{$self} ) {
			0
			0
602	0						@context = split(/ +/,$wnt->compoundify("@context"));
603							}
604
605	0						my @newcontext;
606							# do stoplisting
607	0	0					if ($stoplist{$self}) {
608	0						foreach my $word (@context) {
609	0	0					if ($self->isStop ($word)) {
610	0						push @newcontext, $word."#o";
611							}
612							else {
613	0						push @newcontext, $word;
614							}
615							}
616							}
617							else {
618	0						@newcontext = @context;
619							}
620
621							# convert POS tags, if we have tagged text
622	0	0					if ($tagged) {
623	0						foreach my $wpos (@newcontext) {
624	0						$wpos = $self->convertTag ($wpos);
625	0	0					if (!defined $wpos) {
626	0						$wpos="#MW";
627							}
628							}
629							}
630
631	0						return @newcontext;
632							}
633
634							sub doNormal {
635	0			0	0		my $self = shift;
636	0						my $pairScore = shift;
637	0						my $contextScore = shift;
638	0						my $window = shift;
639	0						my @context = @_;
640
641	0						my $lwindow = $window >> 1; # simply divide by 2 & throw away remainder
642	0						my $rwindow = $window - $lwindow - 1;
643
644							# get all the senses for each word
645	0						my @senses = $self->_getSenses (\@context);
646
647
648							# disambiguate
649	0						my @results;
650
651	0						local $\| = 1;
652
653	0						my $sense1firstword = 0;
654
655							# for each word in the context, disambiguate the (target) word
656	0						for my $targetIdx (0..$#context) {
657	0						my @target_scores;
658
659	0	0					unless (ref $senses[$targetIdx]) {
660	0						$results[$targetIdx] = $context[$targetIdx];
661	0						next;
662							}
663
664
665							# figure out which words are in the window
666	0						my $lower = $targetIdx - $lwindow;
667	0	0					$lower = 0 if $lower < 0;
668	0						my $upper = $targetIdx + $rwindow;
669	0	0					$upper = $#context if $upper > $#context;
670
671							# expand context window to the left, if necessary
672	0						my $i = $targetIdx - 1;
673	0						while ($i >= $lower) {
674	0	0					last if $lower == 0;
675	0	0	0				unless (defined $senses[$i] and (scalar @{$senses[$i]} > 0)) {
	0
676	0						$lower--;
677							}
678	0						$i--;
679							}
680
681							# expand context window to the right, if necessary
682	0						my $j = $targetIdx + 1;
683	0						while ($j <= $upper) {
684	0	0					last if $upper >= scalar $#context;
685	0	0	0				unless (defined $senses[$j] and (scalar @{$senses[$j]} > 0)) {
	0
686	0						$upper++;
687							}
688	0						$j++;
689							}
690
691							# If it is the first word in a sentence and the window size is 2, we'll
692							# consider a word at the right of the target word. Otherwise it will not be
693							# assigned any sense. In the previous version we were simply doing sense1
694							# which gave a boost to window=2 results.
695
696	0	0	0				if ($targetIdx==0 && $window == 2){
697	0						$upper=1;
698							}
699
700							# do some tracing
701	0	0	0				if ($trace{$self} and ($trace{$self}->{level} & TR_CONTEXT)) {
702	0						$trace{$self}->{string} .= "Context: ";
703	0	0					if ($lower < $targetIdx) {
704	0						$trace{$self}->{string} .=
705							join (' ', @context[$lower..$targetIdx-1]) . ' ';
706
707							}
708
709	0						$trace{$self}->{string} .=
710							"$context[$targetIdx]";
711
712	0	0					if ($targetIdx < $upper) {
713	0						$trace{$self}->{string} .= ' ' .
714							join (' ', @context[($targetIdx+1)..$upper]);
715							}
716
717	0						$trace{$self}->{string} .= "\n";
718							}
719
720	0						my $result;
721	0	0					if ($sense1firstword) {
722							##########################
723	0						my $word = $context[$targetIdx];
724
725	0						my $t = $self->getSense1 (\$context[$targetIdx]);
726	0	0					if (defined $t) {
727	0						$sense1firstword = 0;
728	0						$result = $t;
729							}
730							else {
731	0						$result = $context[$targetIdx];
732							}
733							}
734							else {
735	0	0					if ($forcepos{$self}) {
736	0						$result = $self->_forcedPosDisambig ($lower, $targetIdx,
737							$upper, \@senses,
738							\@context);
739							}
740							else {
741	0						$result = $self->_normalDisambig ($lower, $targetIdx, $upper,
742							\@senses, \@context);
743							}
744							}
745
746	0	0					if ($fixed{$self}) {
747	0	0					if ($result =~ /\#[nvars]\#\d/) {
748	0						$senses[$targetIdx] = [$result];
749							}
750							}
751
752	0						push @results, $result;
753							}
754
755	0						return @results;
756							}
757
758							=item B
759
760							Gets the current trace string and resets it to "".
761
762							Parameters:
763
764							None
765
766							Returns:
767
768							The current trace string (before resetting it). If the returned string
769							is not empty, it will end with a newline.
770
771							Example:
772
773							my $str = $wsd->getTrace ();
774							print $str;
775
776							=cut
777
778							sub getTrace
779							{
780	0			0	1		my $self = shift;
781	0						my $str = $trace{$self}->{string};
782	0						$trace{$self}->{string} = '';
783	0						return $str;
784							}
785
786							# does sense 1 disambiguation
787							sub doSense1
788							{
789	0			0	0		my $self = shift;
790	0						my @words = @_;
791	0						my $wn = $wordnet{$self};
792
793	0						my @disambiguated;
794
795	0						foreach my $word (@words) {
796	0						my $tmp = $self->getSense1 (\$word);
797	0	0					if (defined $tmp) {
798	0						push @disambiguated, $tmp;
799							}
800							else {
801	0						push @disambiguated, $word;
802							}
803							}
804
805	0						return @disambiguated;
806							}
807
808							# gets sense number 1 for the specified word. If the word has multiple forms,
809							# then the most frequent sense is returned. If there is more than one
810							# most frequent sense with sense number 1, a sense is chosen at random.
811							#
812							# this is not quite the same as choosing the most frequent sense of a word.
813							# The sense number 1 in wordnet is often the most frequent but not always.
814							sub getSense1
815							{
816	0			0	0		my $self = shift;
817	0						my $word_ref = shift;
818	0						my $wn = $wordnet{$self};
819	0						my %senses;
820
821							# check if word has error suffix in it, if it does, we can't do anything with it
822	0	0					if (${$word_ref} =~ /\#o\|\#IT\|\#CL\|\#NT\|\#MW/) {
	0
823	0						return undef;
824							}
825
826	0						my @forms;
827	0	0					unless ($wnformat{$self}) {
828	0						@forms = $wn->validForms (${$word_ref});
	0
829							}
830							else {
831	0						@forms = ${$word_ref};
	0
832							}
833	0	0					if (scalar @forms == 0) {
834	0						${$word_ref}= "${$word_ref}"."#ND";
	0
	0
835							}
836							else{
837	0						foreach my $form (@forms) {
838	0						my @t = $wn->querySense ($form);
839	0	0					if (scalar @t > 0) {
840	0						$senses{$form} = $t[0];
841							}
842							}
843							}
844	0						my @best_senses;
845
846	0						foreach my $key (keys %senses) {
847	0						my $sense = $senses{$key};
848
849	0						my $freq = $wn->frequency ($sense);
850
851	0	0					if ($#best_senses < 0) {
		0
		0
852	0						push @best_senses, [$sense, $freq];
853							}
854							elsif ($best_senses[$#best_senses]->[1] < $freq) {
855	0						@best_senses = ([$sense, $freq]);
856							}
857							elsif ($best_senses[$#best_senses]->[1] == $freq) {
858	0						push @best_senses, [$sense, $freq];
859							}
860							else {
861							# do nothing
862							}
863							}
864
865	0	0					if (scalar @best_senses) {
866	0						my $i = int (rand (scalar @best_senses));
867
868	0						return $best_senses[$i]->[0];
869							}
870
871	0						return undef;
872							}
873
874							# does random guessing. This could be considered a baseline approach
875							# of sorts. Also try running normal disambiguation using the
876							# WordNet::Similarity::random measure
877							sub doRandom
878							{
879	0			0	0		my $self = shift;
880	0						my @words = @_;
881	0						my $wn = $wordnet{$self};
882
883	0						my $datapath = $wn->dataPath;
884
885	0						my @disambiguated;
886
887	0						foreach my $word (@words) {
888	0	0					if ( $word =~ /\#o\|\#IT\|\#CL\|\#NT\|\#MW/) {
889							# push the string into the array
890	0						push @disambiguated, $word;
891	0						next;
892							}
893
894	0						my @forms;
895	0	0					unless ($wnformat{$self}) {
896	0						@forms = $wn->validForms ($word);
897							}
898							else {
899	0						@forms = $word;
900							}
901	0						my @senses;
902	0	0					if (scalar @forms == 0) {
903	0						$word= "$word"."#ND";
904							}
905							else {
906	0						foreach my $form (@forms) {
907	0						my @t = $wn->querySense ($form);
908	0	0					if (scalar @t > 0) {
909	0						push @senses, @t;
910							}
911							}
912							}
913	0	0					if (scalar @senses) {
914	0						my $i = int (rand (scalar @senses));
915	0						push @disambiguated, $senses[$i];
916							}
917							else {
918	0						push @disambiguated, $word;
919							}
920
921
922							}
923	0						return @disambiguated;
924							}
925
926							sub _forcedPosDisambig
927							{
928	0			0			my $self = shift;
929	0						my $lower = shift;
930	0						my $targetIdx = shift;
931	0						my $upper = shift;
932	0						my $senses_ref = shift;
933	0						my $context_ref = shift;
934	0						my $measure = $simMeasure{$self};
935	0						my $result;
936							my @traces;
937	0						my @target_scores;
938
939
940							# for each sense of the target word ...
941	0						for my $i (0..$#{$senses_ref->[$targetIdx]}) {
	0
942	0	0	0				unless (ref $senses_ref->[$targetIdx]
943							and defined $senses_ref->[$targetIdx][$i]) {
944	0						$target_scores[$i] = -1;
945	0						next;
946							}
947
948	0						$target_scores[$i] = 0;
949
950	0						my $target_pos = getPos ($senses_ref->[$targetIdx][$i]);
951
952							# for each (context) word in the window around the target word
953	0						for my $contextIdx ($lower..$upper) {
954	0	0					next if $contextIdx == $targetIdx;
955	0	0					next unless ref $senses_ref->[$contextIdx];
956
957	0						my @tempScores;
958
959							my @goodsenses;
960							# * check if senses for context word work with target word *
961	0	0					if (needCoercePos ($target_pos, $senses_ref->[$contextIdx])) {
962	0						@goodsenses = $self->coercePos ($context_ref->[$contextIdx],
963							$target_pos);
964							}
965							else {
966	0						@goodsenses = @{$senses_ref->[$contextIdx]};
	0
967							}
968
969							# for each sense of the context word in the window
970	0						for my $k (0..$#{$senses_ref->[$contextIdx]}) {
	0
971	0	0					unless (defined $senses_ref->[$contextIdx][$k]) {
972	0						$tempScores[$k] = -1;
973	0						next;
974							}
975
976	0						$tempScores[$k] =
977							$measure->getRelatedness ($senses_ref->[$targetIdx][$i],
978							$senses_ref->[$contextIdx][$k]);
979
980	0	0					if ($trace{$self}->{level} & TR_PAIRWISE) {
981							# only trace zero values if TR_ZERO is specified
982	0	0	0				if ((defined $tempScores[$k] and $tempScores[$k] > 0)
			0
983							or ($trace{$self}->{level} & TR_ZERO)) {
984	0	0					my $s = " "
985							. $senses_ref->[$targetIdx][$i] . ' '
986							. $senses_ref->[$contextIdx][$k] . ' '
987							. (defined $tempScores[$k]
988							? $tempScores[$k]
989							: 'undef');
990	0						push @{$traces[$i]}, $s;
	0
991							}
992							}
993
994	0	0	0				if ($trace{$self}->{level} & TR_MEASURE
			0
995							and ((defined $tempScores[$k] and $tempScores[$k] > 0)
996							or ($trace{$self}->{level} & TR_ZERO))) {
997	0						push @{$traces[$i]}, $measure->getTraceString ();
	0
998							}
999							# clear errors in Similarity object
1000	0	0					$measure->getError () unless defined $tempScores[$k];
1001							}
1002	0						my $best = -2;
1003	0						foreach my $temp (@tempScores) {
1004	0	0					next unless defined $temp;
1005	0	0					$best = $temp if $temp > $best;
1006							}
1007
1008	0	0					if ($best > $pairScore{$self}) {
1009	0						$target_scores[$i] += $best;
1010							}
1011							}
1012							}
1013
1014							# find the best score for this sense of the target word
1015
1016							# first, do a bit of tracing
1017	0	0	0				if (ref $trace{$self} and ($trace{$self}->{level} & TR_ALLSCORES)) {
1018	0						$trace{$self}->{string} .= " Scores for $context_ref->[$targetIdx]\n";
1019							}
1020
1021							# now find the best sense
1022	0						my $best_tscore = -1;
1023	0						foreach my $i (0..$#target_scores) {
1024	0						my $tscore = $target_scores[$i];
1025	0	0					next unless defined $tscore;
1026
1027	0	0	0				if ($trace{$self}->{level} & TR_ALLSCORES
			0
1028							and (($tscore > 0) or ($trace{$self}->{level} & TR_ZERO))) {
1029	0						$trace{$self}->{string} .= " $senses_ref->[$targetIdx][$i]: $tscore\n";
1030							}
1031
1032	0	0	0				if (($trace{$self}->{level} & TR_MEASURE
			0
1033							or $trace{$self}->{level} & TR_PAIRWISE)
1034							and defined $traces[$i]) {
1035	0						foreach my $str (@{$traces[$i]}) {
	0
1036	0						$trace{$self}->{string} .= $str . "\n";
1037							}
1038							}
1039
1040							# ignore scores less than the threshold
1041	0	0					next unless $tscore > $contextScore{$self};
1042
1043	0	0					if ($tscore > $best_tscore) {
1044	0						$result = $senses_ref->[$targetIdx][$i];
1045	0						$best_tscore = $tscore;
1046							}
1047							}
1048
1049	0	0					if ($best_tscore < 0) {
1050	0						$result = $context_ref->[$targetIdx];
1051							}
1052
1053	0	0	0				if (ref $trace{$self} and $trace{$self}->{level} & TR_BESTSCORE) {
1054	0						$trace{$self}->{string} .= " Winning score: $best_tscore\n";
1055							}
1056
1057	0						return $result;
1058							}
1059
1060							sub _normalDisambig
1061							{
1062	0			0			my $self = shift;
1063	0						my $lower = shift;
1064	0						my $targetIdx = shift;
1065	0						my $upper = shift;
1066	0						my $senses_ref = shift;
1067	0						my $context_ref = shift;
1068	0						my $measure = $simMeasure{$self};
1069	0						my $usemono = $usemono{$self};
1070	0						my $backoff = $backoff{$self};
1071
1072	0						my $result;
1073
1074							my @traces;
1075	0						my @target_scores;
1076
1077							# for each sense of the target word ...
1078	0						for my $i (0..$#{$senses_ref->[$targetIdx]}) {
	0
1079	0	0	0				unless (ref $senses_ref->[$targetIdx]
1080							and defined $senses_ref->[$targetIdx][$i]) {
1081	0						$target_scores[$i] = -1;
1082	0						next;
1083							}
1084	0						$target_scores[$i] = 0;
1085							# If --usemono flag is on and the word has only one sense then assign it.
1086							# This flag will be off by default.
1087	0	0	0				if($usemono == 1 && $#{$senses_ref->[$targetIdx]} == 0){
	0
1088	0						$result = $senses_ref->[$targetIdx][0];
1089	0						return $result;
1090							}
1091							#my @tempScores;
1092
1093
1094							# for each (context) word in the window around the target word
1095	0						for my $contextIdx ($lower..$upper) {
1096	0						my @tempScores = ();
1097	0	0					next if $contextIdx == $targetIdx;
1098	0	0					next unless ref $senses_ref->[$contextIdx];
1099
1100							# for each sense of the context word in the window
1101	0						for my $k (0..$#{$senses_ref->[$contextIdx]}) {
	0
1102	0	0					unless (defined $senses_ref->[$contextIdx][$k]) {
1103	0						$tempScores[$k] = -1;
1104	0						next;
1105							}
1106
1107	0						$tempScores[$k] =
1108							$measure->getRelatedness ($senses_ref->[$targetIdx][$i],
1109							$senses_ref->[$contextIdx][$k]);
1110
1111	0	0					if ($trace{$self}->{level} & TR_PAIRWISE) {
1112							# only trace zero values if TR_ZERO is specified
1113	0	0	0				if ((defined $tempScores[$k] and $tempScores[$k] > 0)
			0
1114							or ($trace{$self}->{level} & TR_ZERO)) {
1115	0	0					my $s = " "
1116							.$senses_ref->[$targetIdx][$i] . ' '
1117							.$senses_ref->[$contextIdx][$k] . ' '
1118							. (defined $tempScores[$k]
1119							? $tempScores[$k]
1120							: 'undef');
1121
1122	0						push @{$traces[$i]}, $s;
	0
1123							}
1124							}
1125
1126	0	0	0				if ($trace{$self}->{level} & TR_MEASURE
			0
1127							and ((defined $tempScores[$k] and $tempScores[$k] > 0)
1128							or ($trace{$self}->{level} & TR_ZERO))) {
1129	0						push @{$traces[$i]}, $measure->getTraceString ();
	0
1130							}
1131
1132							# clear errors in Similarity object
1133	0	0					$measure->getError () unless defined $tempScores[$k];
1134							}
1135	0						my $best = -2;
1136	0						foreach my $temp (@tempScores) {
1137	0	0					next unless defined $temp;
1138
1139	0	0					$best = $temp if $temp > $best;
1140							}
1141
1142	0	0					if ($best > $pairScore{$self}) {
1143	0						$target_scores[$i] += $best;
1144							}
1145							}
1146							}
1147
1148							# find the best score for this sense of the target word
1149
1150							# first, do a bit of tracing
1151	0	0	0				if (ref $trace{$self} and ($trace{$self}->{level} & TR_ALLSCORES)) {
1152	0						$trace{$self}->{string} .= " Scores for $context_ref->[$targetIdx]\n";
1153							}
1154
1155							# now find the best sense
1156	0						my $best_tscore = -1;
1157
1158	0						foreach my $i (0..$#target_scores) {
1159	0						my $tscore = $target_scores[$i];
1160	0	0					next unless defined $tscore;
1161
1162	0	0	0				if ($trace{$self}->{level} & TR_ALLSCORES
			0
1163							&& (($tscore > 0) or ($trace{$self}->{level} & TR_ZERO))) {
1164	0						$trace{$self}->{string} .= " $senses_ref->[$targetIdx][$i]: $tscore\n";
1165							}
1166
1167	0	0	0				if (($trace{$self}->{level} & TR_MEASURE
			0
1168							or $trace{$self}->{level} & TR_PAIRWISE)
1169							and defined $traces[$i]) {
1170	0						foreach my $str (@{$traces[$i]}) {
	0
1171	0						$trace{$self}->{string} .= $str . "\n";
1172							}
1173							}
1174
1175							# ignore scores less than the threshold
1176	0	0					next unless $tscore > $contextScore{$self};
1177
1178	0	0					if ($tscore > $best_tscore) {
1179	0						$result = $senses_ref->[$targetIdx][$i];
1180	0						$best_tscore = $tscore;
1181							}
1182							}
1183
1184	0	0					if ($best_tscore < 0) {
1185							#$result = $context_ref->[$targetIdx];
1186	0						$result = "$context_ref->[$targetIdx]"."#NR";
1187	0	0					if($backoff == 1){
1188	0						$result = $self->getSense1(\$context_ref->[$targetIdx]);
1189							}
1190							}
1191
1192	0	0	0				if (ref $trace{$self} and $trace{$self}->{level} & TR_BESTSCORE) {
1193	0						$trace{$self}->{string} .= " Winning score: $best_tscore\n";
1194							}
1195
1196							# if ($trace{$self}->{level} & 8) { # \| $trace{$self}->{level} & TR_PAIRWISE) {
1197							# foreach my $str (@traces) {
1198							# $trace{$self}->{string} .= "$str\n";
1199							# }
1200							# @traces = ();
1201							# }
1202	0						return $result;
1203							}
1204
1205							sub isStop
1206							{
1207	0			0	0		my $self = shift;
1208	0						my $word = shift;
1209	0						foreach my $re (@{$stoplist{$self}}) {
	0
1210	0	0					if ($word =~ /^$re$/) {
1211	0						return 1;
1212							}
1213							}
1214	0						return 0;
1215							}
1216
1217							# checks to see if the POS of at least one word#pos#sense string in $aref
1218							# is $pos
1219							sub needCoercePos
1220							{
1221	0			0	0		my $pos = shift;
1222
1223							# Only coerce if target POS is noun or verb.
1224							# The measures that take advantage of POS coercion only work with
1225							# nouns and verbs.
1226	0	0	0				unless ($pos eq 'n' or $pos eq 'v') {
1227	0						return 0;
1228							}
1229
1230	0						my $aref = shift;
1231	0						foreach my $wps (@$aref) {
1232	0	0					if ($pos eq getPos ($wps)) {
1233	0						return 0;
1234							}
1235							}
1236	0						return 1;
1237							}
1238
1239							sub convertTag
1240							{
1241	0			0	0		my $self = shift;
1242	0						my $wordpos = shift;
1243	0						my $index = index $wordpos, "/";
1244
1245	0	0					if ($index < 0) {
		0
		0
1246	0						return $wordpos;
1247							}
1248							elsif ($index == 0) {
1249	0						return undef;
1250							}
1251							elsif (index ($wordpos, "'") == 0) {
1252							# we have a contraction
1253	0						my $word = substr $wordpos, 0, $index;
1254	0						my $tag = substr $wordpos, $index + 1;
1255	0						return $self->convertContraction ($word, $tag);
1256							}
1257							else {
1258	0						my $word = substr $wordpos, 0, $index;
1259	0						my $old_pos_tag = substr $wordpos, $index + 1;
1260	0						my $new_pos_tag = $wnTag{$old_pos_tag};
1261
1262	0	0	0				if ((defined $new_pos_tag) and ($new_pos_tag =~ /[nvar]/)) {
		0	0
		0
1263	0						return $word . '#' . $new_pos_tag;
1264							}
1265							elsif((defined $new_pos_tag) and ($new_pos_tag =~ /[cf]/)){
1266	0						return $word . '#CL';
1267							}
1268							elsif(!(defined $new_pos_tag)){
1269	0						return $word . '#IT';
1270							}
1271							else {
1272	0						return $word;
1273							}
1274							}
1275							}
1276
1277
1278							sub convertContraction
1279							{
1280	0			0	0		my ($self, $word, $tag) = @_;
1281	0	0					if ($word eq "'s") {
		0
		0
		0
		0
		0
		0
		0
1282	0	0					if ($tag =~ /^V/) {
1283	0						return "is#v";
1284							}
1285							else {
1286	0						return "";
1287							}
1288							}
1289							elsif ($word eq "'re") {
1290	0						return "are#v";
1291							}
1292							elsif ($word eq "'d") {
1293	0						return "had#v"; # actually this could be would as well
1294							}
1295							elsif ($word eq "'ll") {
1296	0						return "will#v";
1297							}
1298							elsif ($word eq "'em") {
1299	0						return "";
1300							}
1301							elsif ($word eq "'ve") {
1302	0						return "have#v";
1303							}
1304							elsif ($word eq "'m") {
1305	0						return "am#v";
1306							}
1307							elsif ($word eq "'t") { # HELP should be n't
1308	0						return "not";
1309							}
1310							else {
1311	0						return "$word#$tag";
1312							}
1313
1314							}
1315
1316							# noun to non-noun ptr symbols, with frequencies
1317							# -u 329 (dmnu) - cf. domn (all domains)
1318							# -r 80 (dmnr)
1319							# = 648 (attr)
1320							# -c 2372 (dmnc)
1321							# + 21390 (deri) lexical
1322
1323							# verb to non-verb ptr symbols, with frequencies
1324							# ;u 16 (dmtu) - cf. domt (all domains)
1325							# ;c 1213 (dmtc)
1326							# ;r 2 (dmtr)
1327							# + 21095 (deri) lexical
1328
1329							# adj to non-adj
1330							# \ 4672 (pert) pertains to noun ; lexical
1331							# ;u 233
1332							# ;c 1125
1333							# = 648 (attr)
1334							# < 124 (part) particple of verb ; lexical
1335							# ;r 76
1336
1337							# adv to non-adv
1338							# \ 3208 (derived from adj)
1339							# ;u 74
1340							# ;c 37
1341							# ;r 2
1342
1343							sub coercePos
1344							{
1345	0			0	0		my $self = shift;
1346	0						my $word = shift;
1347	0						my $pos = shift;
1348	0						my $wn = $wordnet{$self};
1349
1350							# remove pos tag, if present
1351	0						$word =~ s/\#.*//;
1352
1353	0						my @forms = $wn->validForms ($word);
1354
1355	0	0					if (0 >= scalar @forms) {
1356	0						return undef;
1357							}
1358
1359							# pre-compile the pattern
1360	0						my $cpattern = qr/\#$pos/;
1361
1362	0						foreach my $form (@forms) {
1363	0	0					if ($form =~ /$cpattern/) {
1364	0						return $form;
1365							}
1366							}
1367
1368							# didn't find a surface match, look along cross-pos relations
1369
1370	0						my @goodforms;
1371	0						foreach my $form (@forms) {
1372	0						my @cands = $wn->queryWord ($form, "deri");
1373	0						foreach my $candidate (@cands) {
1374	0	0					if ($candidate =~ /$cpattern/) {
1375	0						push @goodforms, $candidate;
1376							}
1377							}
1378							}
1379
1380	0						return @goodforms;
1381							}
1382
1383							# get all senses for each context word
1384							sub _getSenses
1385							{
1386	0			0			my $self = shift;
1387	0						my $context_ref = shift;
1388	0						my @senses;
1389	0						for my $i (0..$#{$context_ref}){
	0
1390							# first get all forms for each POS
1391	0	0					if ( (${$context_ref}[$i] =~ /\#o\|\#IT\|\#CL\|\#NT\|\#MW/) ) {
	0
1392	0						$senses[$i] = undef;
1393							}
1394							else {
1395	0						my @forms;
1396	0	0					unless ($wnformat{$self}) {
1397	0						@forms = $self->wordnet->validForms (${$context_ref}[$i]);
	0
1398							}
1399							else {
1400	0						@forms = ${$context_ref}[$i];
	0
1401							}
1402	0	0					if (scalar @forms == 0) {
1403	0						${$context_ref}[$i]= "${$context_ref}[$i]"."#ND";
	0
	0
1404							}
1405							else {
1406							# now get all the senses for each form
1407	0						foreach my $form (@forms) {
1408	0						my @temps = $self->wordnet->querySense ($form);
1409	0						push @{$senses[$i]}, @temps;
	0
1410							}
1411							}
1412							}
1413							}
1414	0						return @senses;
1415							}
1416
1417							sub _loadStoplist
1418							{
1419	0			0			my $self = shift;
1420	0						my $file = shift;
1421	0	0					open SFH, '<', $file or die "Cannot open stoplist $file: $!";
1422	0						$stoplist{$self} = [];
1423	0						while (my $line = ) {
1424	0						chomp $line;
1425	0	0					if ($line =~ m\|/(.*)/\|) {
1426	0						push @{$stoplist{$self}}, qr/$1/;
	0
1427							}
1428							else {
1429	0						warn "Line $. of the stoplist '$file' is malformed\n";
1430							}
1431							}
1432	0						close SFH;
1433							}
1434
1435							sub getPos
1436							{
1437	0			0	0		my $string = shift;
1438	0						my $p = index $string, "#";
1439	0	0					return undef if $p < 0;
1440	0						my $pos = substr $string, $p+1, 1;
1441	0						return $pos;
1442							}
1443
1444							1;
1445
1446							__END__