File Coverage

blib/lib/Lingua/Stem/En.pm

Criterion	Covered	Total	%
statement	67	121	55.3
branch	24	50	48.0
condition	6	10	60.0
subroutine	8	46	17.3
pod	3	3	100.0
total	108	230	46.9

line	stmt	bran	cond	sub	pod	time	code
1							package Lingua::Stem::En;
2
3							=head1 NAME
4
5							Lingua::Stem::En - Porter's stemming algorithm for 'generic' English
6
7							=head1 SYNOPSIS
8
9							use Lingua::Stem::En;
10							my $stems = Lingua::Stem::En::stem({ -words => $word_list_reference,
11							-locale => 'en',
12							-exceptions => $exceptions_hash,
13							});
14
15							=head1 DESCRIPTION
16
17							This routine applies the Porter Stemming Algorithm to its parameters,
18							returning the stemmed words.
19
20							It is derived from the C program "stemmer.c"
21							as found in freewais and elsewhere, which contains these notes:
22
23							Purpose: Implementation of the Porter stemming algorithm documented
24							in: Porter, M.F., "An Algorithm For Suffix Stripping,"
25							Program 14 (3), July 1980, pp. 130-137.
26							Provenance: Written by B. Frakes and C. Cox, 1986.
27
28							I have re-interpreted areas that use Frakes and Cox's "WordSize"
29							function. My version may misbehave on short words starting with "y",
30							but I can't think of any examples.
31
32							The step numbers correspond to Frakes and Cox, and are probably in
33							Porter's article (which I've not seen).
34							Porter's algorithm still has rough spots (e.g current/currency, -ings words),
35							which I've not attempted to cure, although I have added
36							support for the British -ise suffix.
37
38							=head1 CHANGES
39
40
41							1999.06.15 - Changed to '.pm' module, moved into Lingua::Stem namespace,
42							optionalized the export of the 'stem' routine
43							into the caller's namespace, added named parameters
44
45							1999.06.24 - Switch core implementation of the Porter stemmer to
46							the one written by Jim Richardson
47
48							2000.08.25 - 2.11 Added stemming cache
49
50							2000.09.14 - 2.12 Fixed major :( implementation error of Porter's algorithm
51							Error was entirely my fault - I completely forgot to include
52							rule sets 2,3, and 4 starting with Lingua::Stem 0.30.
53							-- Jerilyn Franz
54
55							2003.09.28 - 2.13 Corrected documentation error pointed out by Simon Cozens.
56
57							2005.11.20 - 2.14 Changed rule declarations to conform to Perl style convention
58							for 'private' subroutines. Changed Exporter invokation to more
59							portable 'require' vice 'use'.
60
61							2006.02.14 - 2.15 Added ability to pass word list by 'handle' for in-place stemming.
62
63							2009.07.27 - 2.16 Documentation Fix
64
65							2020.06.20 - 2.30 Version renumber for module consistency.
66
67							2020.09.26 - 2.31 Fix for Latin1/UTF8 issue in documentation
68
69							=cut
70
71							#######################################################################
72							# Initialization
73							#######################################################################
74
75	2			2		13	use strict;
	2					4
	2					62
76	2			2		10	use warnings;
	2					3
	2					60
77							require Exporter;
78	2			2		10	use Carp;
	2					4
	2					129
79	2			2		21	use vars qw (@ISA @EXPORT_OK @EXPORT %EXPORT_TAGS $VERSION);
	2					3
	2					219
80							BEGIN {
81	2			2		8	$VERSION = "2.31";
82	2					68	@ISA = qw (Exporter);
83	2					14	@EXPORT = ();
84	2					5	@EXPORT_OK = qw (stem clear_stem_cache stem_caching);
85	2					1214	%EXPORT_TAGS = ();
86							}
87
88							my $Stem_Caching = 0;
89							my $Stem_Cache = {};
90							my %Stem_Cache2 = ();
91
92							#
93							#V Porter.pm V2.11 25 Aug 2000 stemming cache
94							# Porter.pm V2.1 21 Jun 1999 with '&$sub if defined' not 'eval ""'
95							# Porter.pm V2.0 25 Nov 1994 (for Perl 5.000)
96							# porter.pl V1.0 10 Aug 1994 (for Perl 4.036)
97							# Jim Richardson, University of Sydney
98							# jimr@maths.usyd.edu.au or http://www.maths.usyd.edu.au:8000/jimr.html
99
100							# Find a canonical stem for a word, assumed to consist entirely of
101							# lower-case letters. The approach is from
102							#
103							# M. F. Porter, An algorithm for suffix stripping, Program (Automated
104							# Library and Information Systems) 14 (3) 130-7, July 1980.
105							#
106							# This algorithm is used by WAIS: for example, see freeWAIS-0.3 at
107							#
108							# http://kudzu.cnidr.org/cnidr_projects/cnidr_projects.html
109
110							# Some additional rules are used here, mainly to allow for British spellings
111							# like -ise. They are marked ** in the code.
112
113							# Initialization required before using subroutine stem:
114
115							# We count syllables slightly differently from Porter: we say the syllable
116							# count increases on each occurrence in the word of an adjacent pair
117							#
118							# [aeiouy][^aeiou]
119							#
120							# This avoids any need to define vowels and consonants, or confusion over
121							# 'y'. It also works slightly better: our definition gives two syllables
122							# in 'yttrium', while Porter's gives only one because the initial 'y' is
123							# taken to be a consonant. But it is not quite obvious: for example,
124							# consider 'mayfly' where, when working backwards (see below), the 'yf'
125							# matches the above pattern, even though it is the 'ay' which in Porter's
126							# terms increments the syllable count.
127							#
128							# We wish to match the above in context, working backwards from the end of
129							# the word: the appropriate regular expression is
130
131							my $syl = '[aeiou][^aeiou][^aeiouy][aeiouy]';
132
133							# (This works because [^aeiouy] is a subset of [^aeiou].) If we want two
134							# syllables ("m>1" in Porter's terminology) we can just match $syl$syl.
135
136							# For step 1b we need to be able to detect the presence of a vowel: here
137							# we revert to Porter's definition that a vowel is [aeiou], or y preceded
138							# by a consonant. (If the . below is a vowel, then the . is the desired
139							# vowel; if the . is a consonant the y is the desired vowel.)
140
141							my $hasvow = '[^aeiouy]*([aeiou]\|y.)';
142
143							=head1 METHODS
144
145							=cut
146
147							#######################################################################
148
149							=over 4
150
151							=item stem({ -words => \@words, -locale => 'en', -exceptions => \%exceptions });
152
153							Stems a list of passed words using the rules of US English. Returns
154							an anonymous array reference to the stemmed words.
155
156							Example:
157
158							my @words = ( 'wordy', 'another' );
159							my $stemmed_words = Lingua::Stem::En::stem({ -words => \@words,
160							-locale => 'en',
161							-exceptions => \%exceptions,
162							});
163
164							If the first element of @words is a list reference, then the stemming is performed 'in place'
165							on that list (modifying the passed list directly instead of copying it to a new array).
166
167							This is only useful if you do not need to keep the original list. If you B need to keep
168							the original list, use the normal semantic of having 'stem' return a new list instead - that
169							is faster than making your own copy B using the 'in place' semantics since the primary
170							difference between 'in place' and 'by value' stemming is the creation of a copy of the original
171							list. If you B need the original list, then the 'in place' stemming is about 60% faster.
172
173							Example of 'in place' stemming:
174
175							my $words = [ 'wordy', 'another' ];
176							my $stemmed_words = Lingua::Stem::En::stem({ -words => [$words],
177							-locale => 'en',
178							-exceptions => \%exceptions,
179							});
180
181							The 'in place' mode returns a reference to the original list with the words stemmed.
182
183							=back
184
185							=cut
186
187							sub stem {
188	18	50		18	1	40	return [] if ($#_ == -1);
189	18					20	my $parm_ref;
190	18	50				37	if (ref $_[0]) {
191	18					25	$parm_ref = shift;
192							} else {
193	0					0	$parm_ref = { @_ };
194							}
195
196	18					25	my $words = [];
197	18					26	my $locale = 'en';
198	18					24	my $exceptions = {};
199	18					51	foreach (keys %$parm_ref) {
200	54					80	my $key = lc ($_);
201	54					77	my $value = $parm_ref->{$key};
202	54	100				107	if ($key eq '-words') {
		100
		50
203	18					48	@$words = @$value;
204	18	100				46	if (ref($words->[0]) eq 'ARRAY'){
205	3					6	$words = $words->[0];
206							}
207							} elsif ($key eq '-exceptions') {
208	18					48	$exceptions = $parm_ref->{$key};
209							} elsif ($key eq '-locale') {
210	18					35	$locale = $parm_ref->{$key};
211							} else {
212	0					0	croak (__PACKAGE__ . "::stem() - Unknown parameter '$key' with value '$parm_ref->{$key}'\n");
213							}
214							}
215
216	18					29	local( $_ );
217
218	18					26	foreach (@$words) {
219
220							# Flatten case
221	180					300	$_ = lc $_;
222
223							# Check against cache of stemmed words
224	180	50				272	if (exists $Stem_Cache2{$_}) {
225	0					0	$_ = $Stem_Cache2{$_};
226	0					0	next;
227							}
228
229							# Check against exceptions list
230	180	100				281	if (exists $exceptions->{$_}) {
231	6					8	$_ = $exceptions->{$_};
232	6					14	next;
233							}
234
235	174					193	my $original_word = $_;
236
237							# Step 0 - remove punctuation
238	174					229	s/'s$//; s/^[^a-z]+//; s/[^a-z]+$//;
	174					288
	174					247
239	174	50				433	next unless /^[a-z]+$/;
240
241							# Reverse the word so we can easily apply pattern matching to the end:
242	174					320	$_ = reverse $_;
243
244							# Step 1a: plurals -- sses->ss, ies->i, ss->ss, s->0
245
246	174	100	50			334	m!^s! && ( s!^se(ss\|i)!$1! \|\| s!^s([^s])!$1! );
247
248							# Step 1b: participles -- SYLeed->SYLee, VOWed->VOW, VOWing->VOW;
249							# but ated->ate etc
250
251	174	50	50			671	s!^dee($syl)!ee$1!o \|\|
			100
252							(
253							s!^(de\|gni)($hasvow)!$2!o &&
254							(
255							# at->ate, bl->ble, iz->ize, is->ise
256							s!^(ta\|lb\|[sz]i)!e$1! \|\| # ** ise as well as ize
257							# CC->C (C consonant other than l, s, z)
258							s!^([^aeioulsz])\1!$1! \|\|
259							# (m=1) CVD->CVDe (C consonant, V vowel, D consonant not w, x, y)
260							s!^([^aeiouwxy][aeiouy][^aeiou]+)$!e$1!
261							)
262							);
263
264							# Step 1c: change y to i: happy->happi, sky->sky
265
266	174					379	s!^y($hasvow)!i$1!o;
267
268							# Step 2: double and triple suffices (part 1)
269
270							# Switch on last three letters (fails harmlessly if subroutine undefined) --
271							# thanks to Ian Phillipps who wrote
272							# CPAN authors/id/IANPX/Stem-0.1.tar.gz
273							# for suggesting the replacement of
274							# eval( '&S2' . unpack( 'a3', $_ ) );
275							# (where the eval ignores undefined subroutines) by the much faster
276							# eval { &{ 'S2' . substr( $_, 0, 3 ) } };
277							# But the following is slightly faster still:
278
279							{
280	2			2		16	no strict 'refs';
	2					4
	2					3048
	174					201
281
282	174					178	my $sub;
283
284							# Step 3: double and triple suffices, etc (part 2)
285
286	174	50				176	&$sub if defined &{ $sub = '_S2' . substr( $_, 0, 3 ) };
	174					692
287
288							# Step 3: double and triple suffices, etc (part 2)
289
290	174	50				221	&$sub if defined &{ $sub = '_S3' . substr( $_, 0, 3 ) };
	174					508
291
292							# Step 4: single suffices on polysyllables
293
294	174	100				212	&$sub if defined &{ $sub = '_S4' . substr( $_, 0, 2 ) };
	174					604
295
296							}
297							# Step 5a: tidy up final e -- probate->probat, rate->rate; cease->ceas
298
299	174	100	50			590	m!^e! && ( s!^e($syl$syl)!$1!o \|\|
			50
300
301							# Porter's ( m=1 and not *o ) E where o = cvd with d a consonant
302							# not w, x or y:
303
304							! m!^e[^aeiouwxy][aeiouy][^aeiou]! && # not *o E
305							s!^e($syl[aeiouy][^aeiou])$!$1!o # m=1
306							);
307
308							# Step 5b: double l -- controll->control, roll->roll
309							# ** Note correction: Porter has m>1 here ($syl$syl), but it seems m>0
310							# ($syl) is wanted to strip an l off controll.
311
312	174					253	s!^ll($syl)!l$1!o;
313
314	174					270	$_ = scalar( reverse $_ );
315
316	174	50				294	$Stem_Cache2{$original_word} = $_ if $Stem_Caching;
317							}
318	18	50				40	%Stem_Cache2 = () if ($Stem_Caching < 2);
319
320	18					61	return $words;
321							}
322
323							##############################################################
324							# Rule set 4
325
326							sub _S4la {
327							# SYLSYLal -> SYLSYL
328	0			0		0	s!^la($syl$syl)!$1!o;
329							}
330
331							sub _S4ec {
332							# SYLSYL[ae]nce -> SYLSYL
333	0			0		0	s!^ecn[ae]($syl$syl)!$1!o;
334							}
335
336							sub _S4re {
337							# SYLSYLer -> SYLSYL
338	18			18		89	s!^re($syl$syl)!$1!o;
339							}
340
341							sub _S4ci {
342							# SYLSYLic -> SYLSYL
343	0			0			s!^ci($syl$syl)!$1!o;
344							}
345
346							sub _S4el {
347							# SYLSYL[ai]ble -> SYLSYL
348	0			0			s!^elb[ai]($syl$syl)!$1!o;
349							}
350
351							sub _S4tn {
352							# SYLSYLant -> SYLSYL, SYLSYLe?ment -> SYLSYL, SYLSYLent -> SYLSYL
353	0			0			s!^tn(a\|e(me?)?)($syl$syl)!$3!o;
354							}
355							sub _S4no {
356							# SYLSYL[st]ion -> SYLSYL[st]
357	0			0			s!^noi([st]$syl$syl)!$1!o;
358							}
359
360							sub _S4uo {
361							# SYLSYLou -> SYLSYL e.g. homologou -> homolog
362	0			0			s!^uo($syl$syl)!$1!o;
363							}
364
365							sub _S4ms {
366							# SYLSYLism -> SYLSYL
367	0			0			s!^msi($syl$syl)!$1!o;
368							}
369
370							sub _S4et {
371							# SYLSYLate -> SYLSYL
372	0			0			s!^eta($syl$syl)!$1!o;
373							}
374
375							sub _S4it {
376							# SYLSYLiti -> SYLSYL
377	0			0			s!^iti($syl$syl)!$1!o;
378							}
379
380							sub _S4su {
381							# SYLSYLous -> SYLSYL
382	0			0			s!^suo($syl$syl)!$1!o;
383							}
384
385							sub _S4ev {
386							# SYLSYLive -> SYLSYL
387	0			0			s!^evi($syl$syl)!$1!o;
388							}
389
390							sub _S4ez {
391							# SYLSYLize -> SYLSYL
392	0			0			s!^ezi($syl$syl)!$1!o;
393							}
394
395							sub _S4es {
396							# SYLSYLise -> SYLSYL **
397	0			0			s!^esi($syl$syl)!$1!o;
398							}
399
400							##############################################################
401							# Rule set 2
402
403							sub _S2lan {
404							# SYLational -> SYLate, SYLtional -> SYLtion
405	0	0		0			s!^lanoita($syl)!eta$1!o \|\| s!^lanoit($syl)!noit$1!o;
406							}
407
408							sub _S2icn {
409							# SYLanci -> SYLance, SYLency ->SYLence
410	0			0			s!^icn([ae]$syl)!ecn$1!o;
411							}
412
413							sub _S2res {
414							# SYLiser -> SYLise **
415	0			0			&_S2rez;
416							}
417
418							sub _S2rez {
419							# SYLizer -> SYLize
420	0			0			s!^re(.)i($syl)!e$1i$2!o;
421							}
422
423							sub _S2ilb {
424							# SYLabli -> SYLable, SYLibli -> SYLible ** (e.g. incredibli)
425	0			0			s!^ilb([ai]$syl)!elb$1!o;
426							}
427
428							sub _S2ill {
429							# SYLalli -> SYLal
430	0			0			s!^illa($syl)!la$1!o;
431							}
432
433							sub _S2ilt {
434							# SYLentli -> SYLent
435	0			0			s!^iltne($syl)!tne$1!o
436							}
437
438							sub _S2ile {
439							# SYLeli -> SYLe
440	0			0			s!^ile($syl)!e$1!o;
441							}
442
443							sub _S2ils {
444							# SYLousli -> SYLous
445	0			0			s!^ilsuo($syl)!suo$1!o;
446							}
447
448							sub _S2noi {
449							# SYLization -> SYLize, SYLisation -> SYLise**, SYLation -> SYLate
450	0	0		0			s!^noita([sz])i($syl)!e$1i$2!o \|\| s!^noita($syl)!eta$1!o;
451							}
452
453							sub _S2rot {
454							# SYLator -> SYLate
455	0			0			s!^rota($syl)!eta$1!o;
456							}
457
458							sub _S2msi {
459							# SYLalism -> SYLal
460	0			0			s!^msila($syl)!la$1!o;
461							}
462
463							sub _S2sse {
464							# SYLiveness -> SYLive, SYLfulness -> SYLful, SYLousness -> SYLous
465	0			0			s!^ssen(evi\|luf\|suo)($syl)!$1$2!o;
466							}
467
468							sub _S2iti {
469							# SYLaliti -> SYLal, SYLiviti -> SYLive, SYLbiliti ->SYLble
470	0	0		0			s!^iti(la\|lib\|vi)($syl)! ( $1 eq 'la' ? 'la' : $1 eq 'lib' ? 'elb' : 'evi' )
	0	0
471							. $2 !eo;
472							}
473
474							##############################################################
475							# Rule set 3
476
477							sub _S3eta {
478							# SYLicate -> SYLic
479	0			0			s!^etaci($syl)!ci$1!o;
480							}
481
482							sub _S3evi {
483							# SYLative -> SYL
484	0			0			s!^evita($syl)!$1!o;
485							}
486
487							sub _S3ezi
488							{
489							# SYLalize -> SYLal
490	0			0			s!^ezila($syl)!la$1!o;
491							}
492
493							sub _S3esi {
494							# SYLalise -> SYLal **
495	0			0			s!^esila($syl)!la$1!o;
496							}
497
498							sub _S3iti {
499							# SYLiciti -> SYLic
500	0			0			s!^itici($syl)!ci$1!o;
501							}
502
503							sub _S3lac {
504							# SYLical -> SYLic
505	0			0			s!^laci($syl)!ci$1!o;
506							}
507							sub _S3luf {
508							# SYLful -> SYL
509	0			0			s!^luf($syl)!$1!o;
510							}
511
512							sub _S3sse {
513							# SYLness -> SYL
514	0			0			s!^ssen($syl)!$1!o;
515							}
516
517
518							##############################################################
519
520							=over 4
521
522							=item stem_caching({ -level => 0\|1\|2 });
523
524							Sets the level of stem caching.
525
526							'0' means 'no caching'. This is the default level.
527
528							'1' means 'cache per run'. This caches stemming results during a single
529							call to 'stem'.
530
531							'2' means 'cache indefinitely'. This caches stemming results until
532							either the process exits or the 'clear_stem_cache' method is called.
533
534							=back
535
536							=cut
537
538							sub stem_caching {
539	0			0	1		my $parm_ref;
540	0	0					if (ref $_[0]) {
541	0						$parm_ref = shift;
542							} else {
543	0						$parm_ref = { @_ };
544							}
545	0						my $caching_level = $parm_ref->{-level};
546	0	0					if (defined $caching_level) {
547	0	0					if ($caching_level !~ m/^[012]$/) {
548	0						croak(__PACKAGE__ . "::stem_caching() - Legal values are '0','1' or '2'. '$caching_level' is not a legal value");
549							}
550	0						$Stem_Caching = $caching_level;
551	0	0					if ($caching_level < 2) {
552	0						%Stem_Cache2 = ();
553							}
554							}
555	0						return $Stem_Caching;
556							}
557
558							##############################################################
559
560							=over 4
561
562							=item clear_stem_cache;
563
564							Clears the cache of stemmed words
565
566							=back
567
568							=cut
569
570							sub clear_stem_cache {
571	0			0	1		%Stem_Cache2 = ();
572							}
573
574							##############################################################
575
576							=head1 NOTES
577
578							This code is almost entirely derived from the Porter 2.1 module
579							written by Jim Richardson.
580
581							=head1 SEE ALSO
582
583							Lingua::Stem
584
585							=head1 AUTHOR
586
587							Jim Richardson, University of Sydney
588							jimr@maths.usyd.edu.au or http://www.maths.usyd.edu.au:8000/jimr.html
589
590							Integration in Lingua::Stem by
591							Jerilyn Franz, FreeRun Technologies,
592
593
594							=head1 COPYRIGHT
595
596							Jim Richardson, University of Sydney
597							Jerilyn Franz, FreeRun Technologies
598
599							This code is freely available under the same terms as Perl.
600
601							=head1 BUGS
602
603							=head1 TODO
604
605							=cut
606
607							1;