File Coverage

blib/lib/Lingua/Stem/En.pm

Criterion	Covered	Total	%
statement	64	118	54.2
branch	24	50	48.0
condition	6	10	60.0
subroutine	7	45	15.5
pod	3	3	100.0
total	104	226	46.0

line	stmt	bran	cond	sub	pod	time	code
1							package Lingua::Stem::En;
2
3							# $RCSfile: En.pm,v $ $Revision: 1.4 $ $Date: 1999/06/24 23:33:37 $ $Author: snowhare $
4
5							=head1 NAME
6
7							Lingua::Stem::En - Porter's stemming algorithm for 'generic' English
8
9							=head1 SYNOPSIS
10
11							use Lingua::Stem::En;
12							my $stems = Lingua::Stem::En::stem({ -words => $word_list_reference,
13							-locale => 'en',
14							-exceptions => $exceptions_hash,
15							});
16
17							=head1 DESCRIPTION
18
19							This routine applies the Porter Stemming Algorithm to its parameters,
20							returning the stemmed words.
21
22							It is derived from the C program "stemmer.c"
23							as found in freewais and elsewhere, which contains these notes:
24
25							Purpose: Implementation of the Porter stemming algorithm documented
26							in: Porter, M.F., "An Algorithm For Suffix Stripping,"
27							Program 14 (3), July 1980, pp. 130-137.
28							Provenance: Written by B. Frakes and C. Cox, 1986.
29
30							I have re-interpreted areas that use Frakes and Cox's "WordSize"
31							function. My version may misbehave on short words starting with "y",
32							but I can't think of any examples.
33
34							The step numbers correspond to Frakes and Cox, and are probably in
35							Porter's article (which I've not seen).
36							Porter's algorithm still has rough spots (e.g current/currency, -ings words),
37							which I've not attempted to cure, although I have added
38							support for the British -ise suffix.
39
40							=head1 CHANGES
41
42
43							1999.06.15 - Changed to '.pm' module, moved into Lingua::Stem namespace,
44							optionalized the export of the 'stem' routine
45							into the caller's namespace, added named parameters
46
47							1999.06.24 - Switch core implementation of the Porter stemmer to
48							the one written by Jim Richardson
49
50							2000.08.25 - 2.11 Added stemming cache
51
52							2000.09.14 - 2.12 Fixed major :( implementation error of Porter's algorithm
53							Error was entirely my fault - I completely forgot to include
54							rule sets 2,3, and 4 starting with Lingua::Stem 0.30.
55							-- Benjamin Franz
56
57							2003.09.28 - 2.13 Corrected documentation error pointed out by Simon Cozens.
58
59							2005.11.20 - 2.14 Changed rule declarations to conform to Perl style convention
60							for 'private' subroutines. Changed Exporter invokation to more
61							portable 'require' vice 'use'.
62
63							2006.02.14 - 2.15 Added ability to pass word list by 'handle' for in-place stemming.
64
65							2009.07.27 2.16 Documentation Fix
66							=cut
67
68							#######################################################################
69							# Initialization
70							#######################################################################
71
72	2			2		12	use strict;
	2					4
	2					95
73							require Exporter;
74	2			2		12	use Carp;
	2					3
	2					162
75	2			2		11	use vars qw (@ISA @EXPORT_OK @EXPORT %EXPORT_TAGS $VERSION);
	2					8
	2					382
76							BEGIN {
77	2			2		5	$VERSION = "2.16";
78	2					46	@ISA = qw (Exporter);
79	2					5	@EXPORT = ();
80	2					4	@EXPORT_OK = qw (stem clear_stem_cache stem_caching);
81	2					7926	%EXPORT_TAGS = ();
82							}
83
84							my $Stem_Caching = 0;
85							my $Stem_Cache = {};
86							my %Stem_Cache2 = ();
87
88							#
89							#V Porter.pm V2.11 25 Aug 2000 stemming cache
90							# Porter.pm V2.1 21 Jun 1999 with '&$sub if defined' not 'eval ""'
91							# Porter.pm V2.0 25 Nov 1994 (for Perl 5.000)
92							# porter.pl V1.0 10 Aug 1994 (for Perl 4.036)
93							# Jim Richardson, University of Sydney
94							# jimr@maths.usyd.edu.au or http://www.maths.usyd.edu.au:8000/jimr.html
95
96							# Find a canonical stem for a word, assumed to consist entirely of
97							# lower-case letters. The approach is from
98							#
99							# M. F. Porter, An algorithm for suffix stripping, Program (Automated
100							# Library and Information Systems) 14 (3) 130-7, July 1980.
101							#
102							# This algorithm is used by WAIS: for example, see freeWAIS-0.3 at
103							#
104							# http://kudzu.cnidr.org/cnidr_projects/cnidr_projects.html
105
106							# Some additional rules are used here, mainly to allow for British spellings
107							# like -ise. They are marked ** in the code.
108
109							# Initialization required before using subroutine stem:
110
111							# We count syllables slightly differently from Porter: we say the syllable
112							# count increases on each occurrence in the word of an adjacent pair
113							#
114							# [aeiouy][^aeiou]
115							#
116							# This avoids any need to define vowels and consonants, or confusion over
117							# 'y'. It also works slightly better: our definition gives two syllables
118							# in 'yttrium', while Porter's gives only one because the initial 'y' is
119							# taken to be a consonant. But it is not quite obvious: for example,
120							# consider 'mayfly' where, when working backwards (see below), the 'yf'
121							# matches the above pattern, even though it is the 'ay' which in Porter's
122							# terms increments the syllable count.
123							#
124							# We wish to match the above in context, working backwards from the end of
125							# the word: the appropriate regular expression is
126
127							my $syl = '[aeiou][^aeiou][^aeiouy][aeiouy]';
128
129							# (This works because [^aeiouy] is a subset of [^aeiou].) If we want two
130							# syllables ("m>1" in Porter's terminology) we can just match $syl$syl.
131
132							# For step 1b we need to be able to detect the presence of a vowel: here
133							# we revert to Porter's definition that a vowel is [aeiou], or y preceded
134							# by a consonant. (If the . below is a vowel, then the . is the desired
135							# vowel; if the . is a consonant the y is the desired vowel.)
136
137							my $hasvow = '[^aeiouy]*([aeiou]\|y.)';
138
139							=head1 METHODS
140
141							=cut
142
143							#######################################################################
144
145							=over 4
146
147							=item stem({ -words => \@words, -locale => 'en', -exceptions => \%exceptions });
148
149							Stems a list of passed words using the rules of US English. Returns
150							an anonymous array reference to the stemmed words.
151
152							Example:
153
154							my @words = ( 'wordy', 'another' );
155							my $stemmed_words = Lingua::Stem::En::stem({ -words => \@words,
156							-locale => 'en',
157							-exceptions => \%exceptions,
158							});
159
160							If the first element of @words is a list reference, then the stemming is performed 'in place'
161							on that list (modifying the passed list directly instead of copying it to a new array).
162
163							This is only useful if you do not need to keep the original list. If you B need to keep
164							the original list, use the normal semantic of having 'stem' return a new list instead - that
165							is faster than making your own copy B using the 'in place' semantics since the primary
166							difference between 'in place' and 'by value' stemming is the creation of a copy of the original
167							list. If you B need the original list, then the 'in place' stemming is about 60% faster.
168
169							Example of 'in place' stemming:
170
171							my $words = [ 'wordy', 'another' ];
172							my $stemmed_words = Lingua::Stem::En::stem({ -words => [$words],
173							-locale => 'en',
174							-exceptions => \%exceptions,
175							});
176
177							The 'in place' mode returns a reference to the original list with the words stemmed.
178
179							=back
180
181							=cut
182
183							sub stem {
184	18	50		18	1	49	return [] if ($#_ == -1);
185	18					19	my $parm_ref;
186	18	50				38	if (ref $_[0]) {
187	18					22	$parm_ref = shift;
188							} else {
189	0					0	$parm_ref = { @_ };
190							}
191
192	18					28	my $words = [];
193	18					24	my $locale = 'en';
194	18					24	my $exceptions = {};
195	18					70	foreach (keys %$parm_ref) {
196	54					77	my $key = lc ($_);
197	54					79	my $value = $parm_ref->{$key};
198	54	100				127	if ($key eq '-words') {
		100
		50
199	18					71	@$words = @$value;
200	18	100				128	if (ref($words->[0]) eq 'ARRAY'){
201	3					7	$words = $words->[0];
202							}
203							} elsif ($key eq '-exceptions') {
204	18					55	$exceptions = $parm_ref->{$key};
205							} elsif ($key eq '-locale') {
206	18					42	$locale = $parm_ref->{$key};
207							} else {
208	0					0	croak (__PACKAGE__ . "::stem() - Unknown parameter '$key' with value '$parm_ref->{$key}'\n");
209							}
210							}
211
212	18					33	local( $_ );
213
214	18					32	foreach (@$words) {
215
216							# Flatten case
217	180					250	$_ = lc $_;
218
219							# Check against cache of stemmed words
220	180	50				324	if (exists $Stem_Cache2{$_}) {
221	0					0	$_ = $Stem_Cache2{$_};
222	0					0	next;
223							}
224
225							# Check against exceptions list
226	180	100				348	if (exists $exceptions->{$_}) {
227	6					11	$_ = $exceptions->{$_};
228	6					14	next;
229							}
230
231	174					179	my $original_word = $_;
232
233							# Step 0 - remove punctuation
234	174					218	s/'s$//; s/^[^a-z]+//; s/[^a-z]+$//;
	174					512
	174					227
235	174	50				643	next unless /^[a-z]+$/;
236
237							# Reverse the word so we can easily apply pattern matching to the end:
238	174					263	$_ = reverse $_;
239
240							# Step 1a: plurals -- sses->ss, ies->i, ss->ss, s->0
241
242	174	100	50			479	m!^s! && ( s!^se(ss\|i)!$1! \|\| s!^s([^s])!$1! );
243
244							# Step 1b: participles -- SYLeed->SYLee, VOWed->VOW, VOWing->VOW;
245							# but ated->ate etc
246
247	174	50	50			1335	s!^dee($syl)!ee$1!o \|\|
			100
248							(
249							s!^(de\|gni)($hasvow)!$2!o &&
250							(
251							# at->ate, bl->ble, iz->ize, is->ise
252							s!^(ta\|lb\|[sz]i)!e$1! \|\| # ** ise as well as ize
253							# CC->C (C consonant other than l, s, z)
254							s!^([^aeioulsz])\1!$1! \|\|
255							# (m=1) CVD->CVDe (C consonant, V vowel, D consonant not w, x, y)
256							s!^([^aeiouwxy][aeiouy][^aeiou]+)$!e$1!
257							)
258							);
259
260							# Step 1c: change y to i: happy->happi, sky->sky
261
262	174					378	s!^y($hasvow)!i$1!o;
263
264							# Step 2: double and triple suffices (part 1)
265
266							# Switch on last three letters (fails harmlessly if subroutine undefined) --
267							# thanks to Ian Phillipps who wrote
268							# CPAN authors/id/IANPX/Stem-0.1.tar.gz
269							# for suggesting the replacement of
270							# eval( '&S2' . unpack( 'a3', $_ ) );
271							# (where the eval ignores undefined subroutines) by the much faster
272							# eval { &{ 'S2' . substr( $_, 0, 3 ) } };
273							# But the following is slightly faster still:
274
275							{
276	2			2		24	no strict 'refs';
	2					3
	2					5731
	174					256
277
278	174					156	my $sub;
279
280							# Step 3: double and triple suffices, etc (part 2)
281
282	174	50				159	&$sub if defined &{ $sub = '_S2' . substr( $_, 0, 3 ) };
	174					833
283
284							# Step 3: double and triple suffices, etc (part 2)
285
286	174	50				202	&$sub if defined &{ $sub = '_S3' . substr( $_, 0, 3 ) };
	174					700
287
288							# Step 4: single suffices on polysyllables
289
290	174	100				176	&$sub if defined &{ $sub = '_S4' . substr( $_, 0, 2 ) };
	174					731
291
292							}
293							# Step 5a: tidy up final e -- probate->probat, rate->rate; cease->ceas
294
295	174	100	50			658	m!^e! && ( s!^e($syl$syl)!$1!o \|\|
			50
296
297							# Porter's ( m=1 and not *o ) E where o = cvd with d a consonant
298							# not w, x or y:
299
300							! m!^e[^aeiouwxy][aeiouy][^aeiou]! && # not *o E
301							s!^e($syl[aeiouy][^aeiou])$!$1!o # m=1
302							);
303
304							# Step 5b: double l -- controll->control, roll->roll
305							# ** Note correction: Porter has m>1 here ($syl$syl), but it seems m>0
306							# ($syl) is wanted to strip an l off controll.
307
308	174					229	s!^ll($syl)!l$1!o;
309
310	174					254	$_ = scalar( reverse $_ );
311
312	174	50				383	$Stem_Cache2{$original_word} = $_ if $Stem_Caching;
313							}
314	18	50				54	%Stem_Cache2 = () if ($Stem_Caching < 2);
315
316	18					124	return $words;
317							}
318
319							##############################################################
320							# Rule set 4
321
322							sub _S4la {
323							# SYLSYLal -> SYLSYL
324	0			0		0	s!^la($syl$syl)!$1!o;
325							}
326
327							sub _S4ec {
328							# SYLSYL[ae]nce -> SYLSYL
329	0			0		0	s!^ecn[ae]($syl$syl)!$1!o;
330							}
331
332							sub _S4re {
333							# SYLSYLer -> SYLSYL
334	18			18		107	s!^re($syl$syl)!$1!o;
335							}
336
337							sub _S4ci {
338							# SYLSYLic -> SYLSYL
339	0			0			s!^ci($syl$syl)!$1!o;
340							}
341
342							sub _S4el {
343							# SYLSYL[ai]ble -> SYLSYL
344	0			0			s!^elb[ai]($syl$syl)!$1!o;
345							}
346
347							sub _S4tn {
348							# SYLSYLant -> SYLSYL, SYLSYLe?ment -> SYLSYL, SYLSYLent -> SYLSYL
349	0			0			s!^tn(a\|e(me?)?)($syl$syl)!$3!o;
350							}
351							sub _S4no {
352							# SYLSYL[st]ion -> SYLSYL[st]
353	0			0			s!^noi([st]$syl$syl)!$1!o;
354							}
355
356							sub _S4uo {
357							# SYLSYLou -> SYLSYL e.g. homologou -> homolog
358	0			0			s!^uo($syl$syl)!$1!o;
359							}
360
361							sub _S4ms {
362							# SYLSYLism -> SYLSYL
363	0			0			s!^msi($syl$syl)!$1!o;
364							}
365
366							sub _S4et {
367							# SYLSYLate -> SYLSYL
368	0			0			s!^eta($syl$syl)!$1!o;
369							}
370
371							sub _S4it {
372							# SYLSYLiti -> SYLSYL
373	0			0			s!^iti($syl$syl)!$1!o;
374							}
375
376							sub _S4su {
377							# SYLSYLous -> SYLSYL
378	0			0			s!^suo($syl$syl)!$1!o;
379							}
380
381							sub _S4ev {
382							# SYLSYLive -> SYLSYL
383	0			0			s!^evi($syl$syl)!$1!o;
384							}
385
386							sub _S4ez {
387							# SYLSYLize -> SYLSYL
388	0			0			s!^ezi($syl$syl)!$1!o;
389							}
390
391							sub _S4es {
392							# SYLSYLise -> SYLSYL **
393	0			0			s!^esi($syl$syl)!$1!o;
394							}
395
396							##############################################################
397							# Rule set 2
398
399							sub _S2lan {
400							# SYLational -> SYLate, SYLtional -> SYLtion
401	0	0		0			s!^lanoita($syl)!eta$1!o \|\| s!^lanoit($syl)!noit$1!o;
402							}
403
404							sub _S2icn {
405							# SYLanci -> SYLance, SYLency ->SYLence
406	0			0			s!^icn([ae]$syl)!ecn$1!o;
407							}
408
409							sub _S2res {
410							# SYLiser -> SYLise **
411	0			0			&_S2rez;
412							}
413
414							sub _S2rez {
415							# SYLizer -> SYLize
416	0			0			s!^re(.)i($syl)!e$1i$2!o;
417							}
418
419							sub _S2ilb {
420							# SYLabli -> SYLable, SYLibli -> SYLible ** (e.g. incredibli)
421	0			0			s!^ilb([ai]$syl)!elb$1!o;
422							}
423
424							sub _S2ill {
425							# SYLalli -> SYLal
426	0			0			s!^illa($syl)!la$1!o;
427							}
428
429							sub _S2ilt {
430							# SYLentli -> SYLent
431	0			0			s!^iltne($syl)!tne$1!o
432							}
433
434							sub _S2ile {
435							# SYLeli -> SYLe
436	0			0			s!^ile($syl)!e$1!o;
437							}
438
439							sub _S2ils {
440							# SYLousli -> SYLous
441	0			0			s!^ilsuo($syl)!suo$1!o;
442							}
443
444							sub _S2noi {
445							# SYLization -> SYLize, SYLisation -> SYLise**, SYLation -> SYLate
446	0	0		0			s!^noita([sz])i($syl)!e$1i$2!o \|\| s!^noita($syl)!eta$1!o;
447							}
448
449							sub _S2rot {
450							# SYLator -> SYLate
451	0			0			s!^rota($syl)!eta$1!o;
452							}
453
454							sub _S2msi {
455							# SYLalism -> SYLal
456	0			0			s!^msila($syl)!la$1!o;
457							}
458
459							sub _S2sse {
460							# SYLiveness -> SYLive, SYLfulness -> SYLful, SYLousness -> SYLous
461	0			0			s!^ssen(evi\|luf\|suo)($syl)!$1$2!o;
462							}
463
464							sub _S2iti {
465							# SYLaliti -> SYLal, SYLiviti -> SYLive, SYLbiliti ->SYLble
466	0	0		0			s!^iti(la\|lib\|vi)($syl)! ( $1 eq 'la' ? 'la' : $1 eq 'lib' ? 'elb' : 'evi' )
	0	0
467							. $2 !eo;
468							}
469
470							##############################################################
471							# Rule set 3
472
473							sub _S3eta {
474							# SYLicate -> SYLic
475	0			0			s!^etaci($syl)!ci$1!o;
476							}
477
478							sub _S3evi {
479							# SYLative -> SYL
480	0			0			s!^evita($syl)!$1!o;
481							}
482
483							sub _S3ezi
484							{
485							# SYLalize -> SYLal
486	0			0			s!^ezila($syl)!la$1!o;
487							}
488
489							sub _S3esi {
490							# SYLalise -> SYLal **
491	0			0			s!^esila($syl)!la$1!o;
492							}
493
494							sub _S3iti {
495							# SYLiciti -> SYLic
496	0			0			s!^itici($syl)!ci$1!o;
497							}
498
499							sub _S3lac {
500							# SYLical -> SYLic
501	0			0			s!^laci($syl)!ci$1!o;
502							}
503							sub _S3luf {
504							# SYLful -> SYL
505	0			0			s!^luf($syl)!$1!o;
506							}
507
508							sub _S3sse {
509							# SYLness -> SYL
510	0			0			s!^ssen($syl)!$1!o;
511							}
512
513
514							##############################################################
515
516							=over 4
517
518							=item stem_caching({ -level => 0\|1\|2 });
519
520							Sets the level of stem caching.
521
522							'0' means 'no caching'. This is the default level.
523
524							'1' means 'cache per run'. This caches stemming results during a single
525							call to 'stem'.
526
527							'2' means 'cache indefinitely'. This caches stemming results until
528							either the process exits or the 'clear_stem_cache' method is called.
529
530							=back
531
532							=cut
533
534							sub stem_caching {
535	0			0	1		my $parm_ref;
536	0	0					if (ref $_[0]) {
537	0						$parm_ref = shift;
538							} else {
539	0						$parm_ref = { @_ };
540							}
541	0						my $caching_level = $parm_ref->{-level};
542	0	0					if (defined $caching_level) {
543	0	0					if ($caching_level !~ m/^[012]$/) {
544	0						croak(__PACKAGE__ . "::stem_caching() - Legal values are '0','1' or '2'. '$caching_level' is not a legal value");
545							}
546	0						$Stem_Caching = $caching_level;
547	0	0					if ($caching_level < 2) {
548	0						%Stem_Cache2 = ();
549							}
550							}
551	0						return $Stem_Caching;
552							}
553
554							##############################################################
555
556							=over 4
557
558							=item clear_stem_cache;
559
560							Clears the cache of stemmed words
561
562							=back
563
564							=cut
565
566							sub clear_stem_cache {
567	0			0	1		%Stem_Cache2 = ();
568							}
569
570							##############################################################
571
572							=head1 NOTES
573
574							This code is almost entirely derived from the Porter 2.1 module
575							written by Jim Richardson.
576
577							=head1 SEE ALSO
578
579							Lingua::Stem
580
581							=head1 AUTHOR
582
583							Jim Richardson, University of Sydney
584							jimr@maths.usyd.edu.au or http://www.maths.usyd.edu.au:8000/jimr.html
585
586							Integration in Lingua::Stem by
587							Benjamin Franz, FreeRun Technologies,
588							snowhare@nihongo.org or http://www.nihongo.org/snowhare/
589
590							=head1 COPYRIGHT
591
592							Jim Richardson, University of Sydney
593							Benjamin Franz, FreeRun Technologies
594
595							This code is freely available under the same terms as Perl.
596
597							=head1 BUGS
598
599							=head1 TODO
600
601							=cut
602
603							1;