File Coverage

blib/lib/Lingua/Stem/En.pm
Criterion Covered Total %
statement 67 121 55.3
branch 24 50 48.0
condition 6 10 60.0
subroutine 8 46 17.3
pod 3 3 100.0
total 108 230 46.9


line stmt bran cond sub pod time code
1             package Lingua::Stem::En;
2              
3             =head1 NAME
4              
5             Lingua::Stem::En - Porter's stemming algorithm for 'generic' English
6              
7             =head1 SYNOPSIS
8              
9             use Lingua::Stem::En;
10             my $stems = Lingua::Stem::En::stem({ -words => $word_list_reference,
11             -locale => 'en',
12             -exceptions => $exceptions_hash,
13             });
14              
15             =head1 DESCRIPTION
16              
17             This routine applies the Porter Stemming Algorithm to its parameters,
18             returning the stemmed words.
19              
20             It is derived from the C program "stemmer.c"
21             as found in freewais and elsewhere, which contains these notes:
22              
23             Purpose: Implementation of the Porter stemming algorithm documented
24             in: Porter, M.F., "An Algorithm For Suffix Stripping,"
25             Program 14 (3), July 1980, pp. 130-137.
26             Provenance: Written by B. Frakes and C. Cox, 1986.
27              
28             I have re-interpreted areas that use Frakes and Cox's "WordSize"
29             function. My version may misbehave on short words starting with "y",
30             but I can't think of any examples.
31              
32             The step numbers correspond to Frakes and Cox, and are probably in
33             Porter's article (which I've not seen).
34             Porter's algorithm still has rough spots (e.g current/currency, -ings words),
35             which I've not attempted to cure, although I have added
36             support for the British -ise suffix.
37              
38             =head1 CHANGES
39              
40            
41             1999.06.15 - Changed to '.pm' module, moved into Lingua::Stem namespace,
42             optionalized the export of the 'stem' routine
43             into the caller's namespace, added named parameters
44              
45             1999.06.24 - Switch core implementation of the Porter stemmer to
46             the one written by Jim Richardson
47              
48             2000.08.25 - 2.11 Added stemming cache
49              
50             2000.09.14 - 2.12 Fixed *major* :( implementation error of Porter's algorithm
51             Error was entirely my fault - I completely forgot to include
52             rule sets 2,3, and 4 starting with Lingua::Stem 0.30.
53             -- Jerilyn Franz
54              
55             2003.09.28 - 2.13 Corrected documentation error pointed out by Simon Cozens.
56              
57             2005.11.20 - 2.14 Changed rule declarations to conform to Perl style convention
58             for 'private' subroutines. Changed Exporter invokation to more
59             portable 'require' vice 'use'.
60              
61             2006.02.14 - 2.15 Added ability to pass word list by 'handle' for in-place stemming.
62              
63             2009.07.27 - 2.16 Documentation Fix
64              
65             2020.06.20 - 2.30 Version renumber for module consistency.
66              
67             2020.09.26 - 2.31 Fix for Latin1/UTF8 issue in documentation
68              
69             =cut
70              
71             #######################################################################
72             # Initialization
73             #######################################################################
74              
75 2     2   13 use strict;
  2         4  
  2         62  
76 2     2   10 use warnings;
  2         3  
  2         60  
77             require Exporter;
78 2     2   10 use Carp;
  2         4  
  2         129  
79 2     2   21 use vars qw (@ISA @EXPORT_OK @EXPORT %EXPORT_TAGS $VERSION);
  2         3  
  2         219  
80             BEGIN {
81 2     2   8 $VERSION = "2.31";
82 2         68 @ISA = qw (Exporter);
83 2         14 @EXPORT = ();
84 2         5 @EXPORT_OK = qw (stem clear_stem_cache stem_caching);
85 2         1214 %EXPORT_TAGS = ();
86             }
87              
88             my $Stem_Caching = 0;
89             my $Stem_Cache = {};
90             my %Stem_Cache2 = ();
91              
92             #
93             #V Porter.pm V2.11 25 Aug 2000 stemming cache
94             # Porter.pm V2.1 21 Jun 1999 with '&$sub if defined' not 'eval ""'
95             # Porter.pm V2.0 25 Nov 1994 (for Perl 5.000)
96             # porter.pl V1.0 10 Aug 1994 (for Perl 4.036)
97             # Jim Richardson, University of Sydney
98             # jimr@maths.usyd.edu.au or http://www.maths.usyd.edu.au:8000/jimr.html
99              
100             # Find a canonical stem for a word, assumed to consist entirely of
101             # lower-case letters. The approach is from
102             #
103             # M. F. Porter, An algorithm for suffix stripping, Program (Automated
104             # Library and Information Systems) 14 (3) 130-7, July 1980.
105             #
106             # This algorithm is used by WAIS: for example, see freeWAIS-0.3 at
107             #
108             # http://kudzu.cnidr.org/cnidr_projects/cnidr_projects.html
109              
110             # Some additional rules are used here, mainly to allow for British spellings
111             # like -ise. They are marked ** in the code.
112              
113             # Initialization required before using subroutine stem:
114              
115             # We count syllables slightly differently from Porter: we say the syllable
116             # count increases on each occurrence in the word of an adjacent pair
117             #
118             # [aeiouy][^aeiou]
119             #
120             # This avoids any need to define vowels and consonants, or confusion over
121             # 'y'. It also works slightly better: our definition gives two syllables
122             # in 'yttrium', while Porter's gives only one because the initial 'y' is
123             # taken to be a consonant. But it is not quite obvious: for example,
124             # consider 'mayfly' where, when working backwards (see below), the 'yf'
125             # matches the above pattern, even though it is the 'ay' which in Porter's
126             # terms increments the syllable count.
127             #
128             # We wish to match the above in context, working backwards from the end of
129             # the word: the appropriate regular expression is
130              
131             my $syl = '[aeiou]*[^aeiou][^aeiouy]*[aeiouy]';
132              
133             # (This works because [^aeiouy] is a subset of [^aeiou].) If we want two
134             # syllables ("m>1" in Porter's terminology) we can just match $syl$syl.
135              
136             # For step 1b we need to be able to detect the presence of a vowel: here
137             # we revert to Porter's definition that a vowel is [aeiou], or y preceded
138             # by a consonant. (If the . below is a vowel, then the . is the desired
139             # vowel; if the . is a consonant the y is the desired vowel.)
140              
141             my $hasvow = '[^aeiouy]*([aeiou]|y.)';
142              
143             =head1 METHODS
144              
145             =cut
146              
147             #######################################################################
148              
149             =over 4
150              
151             =item stem({ -words => \@words, -locale => 'en', -exceptions => \%exceptions });
152              
153             Stems a list of passed words using the rules of US English. Returns
154             an anonymous array reference to the stemmed words.
155              
156             Example:
157              
158             my @words = ( 'wordy', 'another' );
159             my $stemmed_words = Lingua::Stem::En::stem({ -words => \@words,
160             -locale => 'en',
161             -exceptions => \%exceptions,
162             });
163              
164             If the first element of @words is a list reference, then the stemming is performed 'in place'
165             on that list (modifying the passed list directly instead of copying it to a new array).
166              
167             This is only useful if you do not need to keep the original list. If you B need to keep
168             the original list, use the normal semantic of having 'stem' return a new list instead - that
169             is faster than making your own copy B using the 'in place' semantics since the primary
170             difference between 'in place' and 'by value' stemming is the creation of a copy of the original
171             list. If you B need the original list, then the 'in place' stemming is about 60% faster.
172              
173             Example of 'in place' stemming:
174              
175             my $words = [ 'wordy', 'another' ];
176             my $stemmed_words = Lingua::Stem::En::stem({ -words => [$words],
177             -locale => 'en',
178             -exceptions => \%exceptions,
179             });
180              
181             The 'in place' mode returns a reference to the original list with the words stemmed.
182              
183             =back
184              
185             =cut
186              
187             sub stem {
188 18 50   18 1 40 return [] if ($#_ == -1);
189 18         20 my $parm_ref;
190 18 50       37 if (ref $_[0]) {
191 18         25 $parm_ref = shift;
192             } else {
193 0         0 $parm_ref = { @_ };
194             }
195            
196 18         25 my $words = [];
197 18         26 my $locale = 'en';
198 18         24 my $exceptions = {};
199 18         51 foreach (keys %$parm_ref) {
200 54         80 my $key = lc ($_);
201 54         77 my $value = $parm_ref->{$key};
202 54 100       107 if ($key eq '-words') {
    100          
    50          
203 18         48 @$words = @$value;
204 18 100       46 if (ref($words->[0]) eq 'ARRAY'){
205 3         6 $words = $words->[0];
206             }
207             } elsif ($key eq '-exceptions') {
208 18         48 $exceptions = $parm_ref->{$key};
209             } elsif ($key eq '-locale') {
210 18         35 $locale = $parm_ref->{$key};
211             } else {
212 0         0 croak (__PACKAGE__ . "::stem() - Unknown parameter '$key' with value '$parm_ref->{$key}'\n");
213             }
214             }
215            
216 18         29 local( $_ );
217              
218 18         26 foreach (@$words) {
219              
220             # Flatten case
221 180         300 $_ = lc $_;
222              
223             # Check against cache of stemmed words
224 180 50       272 if (exists $Stem_Cache2{$_}) {
225 0         0 $_ = $Stem_Cache2{$_};
226 0         0 next;
227             }
228              
229             # Check against exceptions list
230 180 100       281 if (exists $exceptions->{$_}) {
231 6         8 $_ = $exceptions->{$_};
232 6         14 next;
233             }
234              
235 174         193 my $original_word = $_;
236              
237             # Step 0 - remove punctuation
238 174         229 s/'s$//; s/^[^a-z]+//; s/[^a-z]+$//;
  174         288  
  174         247  
239 174 50       433 next unless /^[a-z]+$/;
240              
241             # Reverse the word so we can easily apply pattern matching to the end:
242 174         320 $_ = reverse $_;
243            
244             # Step 1a: plurals -- sses->ss, ies->i, ss->ss, s->0
245            
246 174 100 50     334 m!^s! && ( s!^se(ss|i)!$1! || s!^s([^s])!$1! );
247            
248             # Step 1b: participles -- SYLeed->SYLee, VOWed->VOW, VOWing->VOW;
249             # but ated->ate etc
250            
251 174 50 50     671 s!^dee($syl)!ee$1!o ||
      100        
252             (
253             s!^(de|gni)($hasvow)!$2!o &&
254             (
255             # at->ate, bl->ble, iz->ize, is->ise
256             s!^(ta|lb|[sz]i)!e$1! || # ** ise as well as ize
257             # CC->C (C consonant other than l, s, z)
258             s!^([^aeioulsz])\1!$1! ||
259             # (m=1) CVD->CVDe (C consonant, V vowel, D consonant not w, x, y)
260             s!^([^aeiouwxy][aeiouy][^aeiou]+)$!e$1!
261             )
262             );
263            
264             # Step 1c: change y to i: happy->happi, sky->sky
265            
266 174         379 s!^y($hasvow)!i$1!o;
267            
268             # Step 2: double and triple suffices (part 1)
269            
270             # Switch on last three letters (fails harmlessly if subroutine undefined) --
271             # thanks to Ian Phillipps who wrote
272             # CPAN authors/id/IANPX/Stem-0.1.tar.gz
273             # for suggesting the replacement of
274             # eval( '&S2' . unpack( 'a3', $_ ) );
275             # (where the eval ignores undefined subroutines) by the much faster
276             # eval { &{ 'S2' . substr( $_, 0, 3 ) } };
277             # But the following is slightly faster still:
278              
279             {
280 2     2   16 no strict 'refs';
  2         4  
  2         3048  
  174         201  
281            
282 174         178 my $sub;
283            
284             # Step 3: double and triple suffices, etc (part 2)
285              
286 174 50       176 &$sub if defined &{ $sub = '_S2' . substr( $_, 0, 3 ) };
  174         692  
287            
288             # Step 3: double and triple suffices, etc (part 2)
289            
290 174 50       221 &$sub if defined &{ $sub = '_S3' . substr( $_, 0, 3 ) };
  174         508  
291            
292             # Step 4: single suffices on polysyllables
293            
294 174 100       212 &$sub if defined &{ $sub = '_S4' . substr( $_, 0, 2 ) };
  174         604  
295            
296             }
297             # Step 5a: tidy up final e -- probate->probat, rate->rate; cease->ceas
298            
299 174 100 50     590 m!^e! && ( s!^e($syl$syl)!$1!o ||
      50        
300            
301             # Porter's ( m=1 and not *o ) E where o = cvd with d a consonant
302             # not w, x or y:
303            
304             ! m!^e[^aeiouwxy][aeiouy][^aeiou]! && # not *o E
305             s!^e($syl[aeiouy]*[^aeiou]*)$!$1!o # m=1
306             );
307            
308             # Step 5b: double l -- controll->control, roll->roll
309             # ** Note correction: Porter has m>1 here ($syl$syl), but it seems m>0
310             # ($syl) is wanted to strip an l off controll.
311            
312 174         253 s!^ll($syl)!l$1!o;
313            
314 174         270 $_ = scalar( reverse $_ );
315              
316 174 50       294 $Stem_Cache2{$original_word} = $_ if $Stem_Caching;
317             }
318 18 50       40 %Stem_Cache2 = () if ($Stem_Caching < 2);
319            
320 18         61 return $words;
321             }
322              
323             ##############################################################
324             # Rule set 4
325              
326             sub _S4la {
327             # SYLSYLal -> SYLSYL
328 0     0   0 s!^la($syl$syl)!$1!o;
329             }
330              
331             sub _S4ec {
332             # SYLSYL[ae]nce -> SYLSYL
333 0     0   0 s!^ecn[ae]($syl$syl)!$1!o;
334             }
335              
336             sub _S4re {
337             # SYLSYLer -> SYLSYL
338 18     18   89 s!^re($syl$syl)!$1!o;
339             }
340              
341             sub _S4ci {
342             # SYLSYLic -> SYLSYL
343 0     0     s!^ci($syl$syl)!$1!o;
344             }
345              
346             sub _S4el {
347             # SYLSYL[ai]ble -> SYLSYL
348 0     0     s!^elb[ai]($syl$syl)!$1!o;
349             }
350              
351             sub _S4tn {
352             # SYLSYLant -> SYLSYL, SYLSYLe?ment -> SYLSYL, SYLSYLent -> SYLSYL
353 0     0     s!^tn(a|e(me?)?)($syl$syl)!$3!o;
354             }
355             sub _S4no {
356             # SYLSYL[st]ion -> SYLSYL[st]
357 0     0     s!^noi([st]$syl$syl)!$1!o;
358             }
359              
360             sub _S4uo {
361             # SYLSYLou -> SYLSYL e.g. homologou -> homolog
362 0     0     s!^uo($syl$syl)!$1!o;
363             }
364              
365             sub _S4ms {
366             # SYLSYLism -> SYLSYL
367 0     0     s!^msi($syl$syl)!$1!o;
368             }
369              
370             sub _S4et {
371             # SYLSYLate -> SYLSYL
372 0     0     s!^eta($syl$syl)!$1!o;
373             }
374              
375             sub _S4it {
376             # SYLSYLiti -> SYLSYL
377 0     0     s!^iti($syl$syl)!$1!o;
378             }
379              
380             sub _S4su {
381             # SYLSYLous -> SYLSYL
382 0     0     s!^suo($syl$syl)!$1!o;
383             }
384              
385             sub _S4ev {
386             # SYLSYLive -> SYLSYL
387 0     0     s!^evi($syl$syl)!$1!o;
388             }
389              
390             sub _S4ez {
391             # SYLSYLize -> SYLSYL
392 0     0     s!^ezi($syl$syl)!$1!o;
393             }
394              
395             sub _S4es {
396             # SYLSYLise -> SYLSYL **
397 0     0     s!^esi($syl$syl)!$1!o;
398             }
399              
400             ##############################################################
401             # Rule set 2
402              
403             sub _S2lan {
404             # SYLational -> SYLate, SYLtional -> SYLtion
405 0 0   0     s!^lanoita($syl)!eta$1!o || s!^lanoit($syl)!noit$1!o;
406             }
407              
408             sub _S2icn {
409             # SYLanci -> SYLance, SYLency ->SYLence
410 0     0     s!^icn([ae]$syl)!ecn$1!o;
411             }
412              
413             sub _S2res {
414             # SYLiser -> SYLise **
415 0     0     &_S2rez;
416             }
417              
418             sub _S2rez {
419             # SYLizer -> SYLize
420 0     0     s!^re(.)i($syl)!e$1i$2!o;
421             }
422              
423             sub _S2ilb {
424             # SYLabli -> SYLable, SYLibli -> SYLible ** (e.g. incredibli)
425 0     0     s!^ilb([ai]$syl)!elb$1!o;
426             }
427              
428             sub _S2ill {
429             # SYLalli -> SYLal
430 0     0     s!^illa($syl)!la$1!o;
431             }
432              
433             sub _S2ilt {
434             # SYLentli -> SYLent
435 0     0     s!^iltne($syl)!tne$1!o
436             }
437              
438             sub _S2ile {
439             # SYLeli -> SYLe
440 0     0     s!^ile($syl)!e$1!o;
441             }
442              
443             sub _S2ils {
444             # SYLousli -> SYLous
445 0     0     s!^ilsuo($syl)!suo$1!o;
446             }
447              
448             sub _S2noi {
449             # SYLization -> SYLize, SYLisation -> SYLise**, SYLation -> SYLate
450 0 0   0     s!^noita([sz])i($syl)!e$1i$2!o || s!^noita($syl)!eta$1!o;
451             }
452              
453             sub _S2rot {
454             # SYLator -> SYLate
455 0     0     s!^rota($syl)!eta$1!o;
456             }
457              
458             sub _S2msi {
459             # SYLalism -> SYLal
460 0     0     s!^msila($syl)!la$1!o;
461             }
462              
463             sub _S2sse {
464             # SYLiveness -> SYLive, SYLfulness -> SYLful, SYLousness -> SYLous
465 0     0     s!^ssen(evi|luf|suo)($syl)!$1$2!o;
466             }
467              
468             sub _S2iti {
469             # SYLaliti -> SYLal, SYLiviti -> SYLive, SYLbiliti ->SYLble
470 0 0   0     s!^iti(la|lib|vi)($syl)! ( $1 eq 'la' ? 'la' : $1 eq 'lib' ? 'elb' : 'evi' )
  0 0          
471             . $2 !eo;
472             }
473              
474             ##############################################################
475             # Rule set 3
476              
477             sub _S3eta {
478             # SYLicate -> SYLic
479 0     0     s!^etaci($syl)!ci$1!o;
480             }
481              
482             sub _S3evi {
483             # SYLative -> SYL
484 0     0     s!^evita($syl)!$1!o;
485             }
486              
487             sub _S3ezi
488             {
489             # SYLalize -> SYLal
490 0     0     s!^ezila($syl)!la$1!o;
491             }
492              
493             sub _S3esi {
494             # SYLalise -> SYLal **
495 0     0     s!^esila($syl)!la$1!o;
496             }
497              
498             sub _S3iti {
499             # SYLiciti -> SYLic
500 0     0     s!^itici($syl)!ci$1!o;
501             }
502              
503             sub _S3lac {
504             # SYLical -> SYLic
505 0     0     s!^laci($syl)!ci$1!o;
506             }
507             sub _S3luf {
508             # SYLful -> SYL
509 0     0     s!^luf($syl)!$1!o;
510             }
511              
512             sub _S3sse {
513             # SYLness -> SYL
514 0     0     s!^ssen($syl)!$1!o;
515             }
516              
517              
518             ##############################################################
519              
520             =over 4
521              
522             =item stem_caching({ -level => 0|1|2 });
523              
524             Sets the level of stem caching.
525              
526             '0' means 'no caching'. This is the default level.
527              
528             '1' means 'cache per run'. This caches stemming results during a single
529             call to 'stem'.
530              
531             '2' means 'cache indefinitely'. This caches stemming results until
532             either the process exits or the 'clear_stem_cache' method is called.
533              
534             =back
535              
536             =cut
537              
538             sub stem_caching {
539 0     0 1   my $parm_ref;
540 0 0         if (ref $_[0]) {
541 0           $parm_ref = shift;
542             } else {
543 0           $parm_ref = { @_ };
544             }
545 0           my $caching_level = $parm_ref->{-level};
546 0 0         if (defined $caching_level) {
547 0 0         if ($caching_level !~ m/^[012]$/) {
548 0           croak(__PACKAGE__ . "::stem_caching() - Legal values are '0','1' or '2'. '$caching_level' is not a legal value");
549             }
550 0           $Stem_Caching = $caching_level;
551 0 0         if ($caching_level < 2) {
552 0           %Stem_Cache2 = ();
553             }
554             }
555 0           return $Stem_Caching;
556             }
557            
558             ##############################################################
559              
560             =over 4
561              
562             =item clear_stem_cache;
563              
564             Clears the cache of stemmed words
565              
566             =back
567              
568             =cut
569              
570             sub clear_stem_cache {
571 0     0 1   %Stem_Cache2 = ();
572             }
573              
574             ##############################################################
575              
576             =head1 NOTES
577              
578             This code is almost entirely derived from the Porter 2.1 module
579             written by Jim Richardson.
580              
581             =head1 SEE ALSO
582              
583             Lingua::Stem
584              
585             =head1 AUTHOR
586              
587             Jim Richardson, University of Sydney
588             jimr@maths.usyd.edu.au or http://www.maths.usyd.edu.au:8000/jimr.html
589              
590             Integration in Lingua::Stem by
591             Jerilyn Franz, FreeRun Technologies,
592            
593              
594             =head1 COPYRIGHT
595              
596             Jim Richardson, University of Sydney
597             Jerilyn Franz, FreeRun Technologies
598              
599             This code is freely available under the same terms as Perl.
600              
601             =head1 BUGS
602              
603             =head1 TODO
604              
605             =cut
606              
607             1;