File Coverage

blib/lib/Lingua/Stem/En.pm
Criterion Covered Total %
statement 64 118 54.2
branch 24 50 48.0
condition 6 10 60.0
subroutine 7 45 15.5
pod 3 3 100.0
total 104 226 46.0


line stmt bran cond sub pod time code
1             package Lingua::Stem::En;
2              
3             # $RCSfile: En.pm,v $ $Revision: 1.4 $ $Date: 1999/06/24 23:33:37 $ $Author: snowhare $
4              
5             =head1 NAME
6              
7             Lingua::Stem::En - Porter's stemming algorithm for 'generic' English
8              
9             =head1 SYNOPSIS
10              
11             use Lingua::Stem::En;
12             my $stems = Lingua::Stem::En::stem({ -words => $word_list_reference,
13             -locale => 'en',
14             -exceptions => $exceptions_hash,
15             });
16              
17             =head1 DESCRIPTION
18              
19             This routine applies the Porter Stemming Algorithm to its parameters,
20             returning the stemmed words.
21              
22             It is derived from the C program "stemmer.c"
23             as found in freewais and elsewhere, which contains these notes:
24              
25             Purpose: Implementation of the Porter stemming algorithm documented
26             in: Porter, M.F., "An Algorithm For Suffix Stripping,"
27             Program 14 (3), July 1980, pp. 130-137.
28             Provenance: Written by B. Frakes and C. Cox, 1986.
29              
30             I have re-interpreted areas that use Frakes and Cox's "WordSize"
31             function. My version may misbehave on short words starting with "y",
32             but I can't think of any examples.
33              
34             The step numbers correspond to Frakes and Cox, and are probably in
35             Porter's article (which I've not seen).
36             Porter's algorithm still has rough spots (e.g current/currency, -ings words),
37             which I've not attempted to cure, although I have added
38             support for the British -ise suffix.
39              
40             =head1 CHANGES
41              
42            
43             1999.06.15 - Changed to '.pm' module, moved into Lingua::Stem namespace,
44             optionalized the export of the 'stem' routine
45             into the caller's namespace, added named parameters
46              
47             1999.06.24 - Switch core implementation of the Porter stemmer to
48             the one written by Jim Richardson
49              
50             2000.08.25 - 2.11 Added stemming cache
51              
52             2000.09.14 - 2.12 Fixed *major* :( implementation error of Porter's algorithm
53             Error was entirely my fault - I completely forgot to include
54             rule sets 2,3, and 4 starting with Lingua::Stem 0.30.
55             -- Benjamin Franz
56              
57             2003.09.28 - 2.13 Corrected documentation error pointed out by Simon Cozens.
58              
59             2005.11.20 - 2.14 Changed rule declarations to conform to Perl style convention
60             for 'private' subroutines. Changed Exporter invokation to more
61             portable 'require' vice 'use'.
62              
63             2006.02.14 - 2.15 Added ability to pass word list by 'handle' for in-place stemming.
64              
65             2009.07.27 2.16 Documentation Fix
66             =cut
67              
68             #######################################################################
69             # Initialization
70             #######################################################################
71              
72 2     2   12 use strict;
  2         4  
  2         95  
73             require Exporter;
74 2     2   12 use Carp;
  2         3  
  2         162  
75 2     2   11 use vars qw (@ISA @EXPORT_OK @EXPORT %EXPORT_TAGS $VERSION);
  2         8  
  2         382  
76             BEGIN {
77 2     2   5 $VERSION = "2.16";
78 2         46 @ISA = qw (Exporter);
79 2         5 @EXPORT = ();
80 2         4 @EXPORT_OK = qw (stem clear_stem_cache stem_caching);
81 2         7926 %EXPORT_TAGS = ();
82             }
83              
84             my $Stem_Caching = 0;
85             my $Stem_Cache = {};
86             my %Stem_Cache2 = ();
87              
88             #
89             #V Porter.pm V2.11 25 Aug 2000 stemming cache
90             # Porter.pm V2.1 21 Jun 1999 with '&$sub if defined' not 'eval ""'
91             # Porter.pm V2.0 25 Nov 1994 (for Perl 5.000)
92             # porter.pl V1.0 10 Aug 1994 (for Perl 4.036)
93             # Jim Richardson, University of Sydney
94             # jimr@maths.usyd.edu.au or http://www.maths.usyd.edu.au:8000/jimr.html
95              
96             # Find a canonical stem for a word, assumed to consist entirely of
97             # lower-case letters. The approach is from
98             #
99             # M. F. Porter, An algorithm for suffix stripping, Program (Automated
100             # Library and Information Systems) 14 (3) 130-7, July 1980.
101             #
102             # This algorithm is used by WAIS: for example, see freeWAIS-0.3 at
103             #
104             # http://kudzu.cnidr.org/cnidr_projects/cnidr_projects.html
105              
106             # Some additional rules are used here, mainly to allow for British spellings
107             # like -ise. They are marked ** in the code.
108              
109             # Initialization required before using subroutine stem:
110              
111             # We count syllables slightly differently from Porter: we say the syllable
112             # count increases on each occurrence in the word of an adjacent pair
113             #
114             # [aeiouy][^aeiou]
115             #
116             # This avoids any need to define vowels and consonants, or confusion over
117             # 'y'. It also works slightly better: our definition gives two syllables
118             # in 'yttrium', while Porter's gives only one because the initial 'y' is
119             # taken to be a consonant. But it is not quite obvious: for example,
120             # consider 'mayfly' where, when working backwards (see below), the 'yf'
121             # matches the above pattern, even though it is the 'ay' which in Porter's
122             # terms increments the syllable count.
123             #
124             # We wish to match the above in context, working backwards from the end of
125             # the word: the appropriate regular expression is
126              
127             my $syl = '[aeiou]*[^aeiou][^aeiouy]*[aeiouy]';
128              
129             # (This works because [^aeiouy] is a subset of [^aeiou].) If we want two
130             # syllables ("m>1" in Porter's terminology) we can just match $syl$syl.
131              
132             # For step 1b we need to be able to detect the presence of a vowel: here
133             # we revert to Porter's definition that a vowel is [aeiou], or y preceded
134             # by a consonant. (If the . below is a vowel, then the . is the desired
135             # vowel; if the . is a consonant the y is the desired vowel.)
136              
137             my $hasvow = '[^aeiouy]*([aeiou]|y.)';
138              
139             =head1 METHODS
140              
141             =cut
142              
143             #######################################################################
144              
145             =over 4
146              
147             =item stem({ -words => \@words, -locale => 'en', -exceptions => \%exceptions });
148              
149             Stems a list of passed words using the rules of US English. Returns
150             an anonymous array reference to the stemmed words.
151              
152             Example:
153              
154             my @words = ( 'wordy', 'another' );
155             my $stemmed_words = Lingua::Stem::En::stem({ -words => \@words,
156             -locale => 'en',
157             -exceptions => \%exceptions,
158             });
159              
160             If the first element of @words is a list reference, then the stemming is performed 'in place'
161             on that list (modifying the passed list directly instead of copying it to a new array).
162              
163             This is only useful if you do not need to keep the original list. If you B need to keep
164             the original list, use the normal semantic of having 'stem' return a new list instead - that
165             is faster than making your own copy B using the 'in place' semantics since the primary
166             difference between 'in place' and 'by value' stemming is the creation of a copy of the original
167             list. If you B need the original list, then the 'in place' stemming is about 60% faster.
168              
169             Example of 'in place' stemming:
170              
171             my $words = [ 'wordy', 'another' ];
172             my $stemmed_words = Lingua::Stem::En::stem({ -words => [$words],
173             -locale => 'en',
174             -exceptions => \%exceptions,
175             });
176              
177             The 'in place' mode returns a reference to the original list with the words stemmed.
178              
179             =back
180              
181             =cut
182              
183             sub stem {
184 18 50   18 1 49 return [] if ($#_ == -1);
185 18         19 my $parm_ref;
186 18 50       38 if (ref $_[0]) {
187 18         22 $parm_ref = shift;
188             } else {
189 0         0 $parm_ref = { @_ };
190             }
191            
192 18         28 my $words = [];
193 18         24 my $locale = 'en';
194 18         24 my $exceptions = {};
195 18         70 foreach (keys %$parm_ref) {
196 54         77 my $key = lc ($_);
197 54         79 my $value = $parm_ref->{$key};
198 54 100       127 if ($key eq '-words') {
    100          
    50          
199 18         71 @$words = @$value;
200 18 100       128 if (ref($words->[0]) eq 'ARRAY'){
201 3         7 $words = $words->[0];
202             }
203             } elsif ($key eq '-exceptions') {
204 18         55 $exceptions = $parm_ref->{$key};
205             } elsif ($key eq '-locale') {
206 18         42 $locale = $parm_ref->{$key};
207             } else {
208 0         0 croak (__PACKAGE__ . "::stem() - Unknown parameter '$key' with value '$parm_ref->{$key}'\n");
209             }
210             }
211            
212 18         33 local( $_ );
213              
214 18         32 foreach (@$words) {
215              
216             # Flatten case
217 180         250 $_ = lc $_;
218              
219             # Check against cache of stemmed words
220 180 50       324 if (exists $Stem_Cache2{$_}) {
221 0         0 $_ = $Stem_Cache2{$_};
222 0         0 next;
223             }
224              
225             # Check against exceptions list
226 180 100       348 if (exists $exceptions->{$_}) {
227 6         11 $_ = $exceptions->{$_};
228 6         14 next;
229             }
230              
231 174         179 my $original_word = $_;
232              
233             # Step 0 - remove punctuation
234 174         218 s/'s$//; s/^[^a-z]+//; s/[^a-z]+$//;
  174         512  
  174         227  
235 174 50       643 next unless /^[a-z]+$/;
236              
237             # Reverse the word so we can easily apply pattern matching to the end:
238 174         263 $_ = reverse $_;
239            
240             # Step 1a: plurals -- sses->ss, ies->i, ss->ss, s->0
241            
242 174 100 50     479 m!^s! && ( s!^se(ss|i)!$1! || s!^s([^s])!$1! );
243            
244             # Step 1b: participles -- SYLeed->SYLee, VOWed->VOW, VOWing->VOW;
245             # but ated->ate etc
246            
247 174 50 50     1335 s!^dee($syl)!ee$1!o ||
      100        
248             (
249             s!^(de|gni)($hasvow)!$2!o &&
250             (
251             # at->ate, bl->ble, iz->ize, is->ise
252             s!^(ta|lb|[sz]i)!e$1! || # ** ise as well as ize
253             # CC->C (C consonant other than l, s, z)
254             s!^([^aeioulsz])\1!$1! ||
255             # (m=1) CVD->CVDe (C consonant, V vowel, D consonant not w, x, y)
256             s!^([^aeiouwxy][aeiouy][^aeiou]+)$!e$1!
257             )
258             );
259            
260             # Step 1c: change y to i: happy->happi, sky->sky
261            
262 174         378 s!^y($hasvow)!i$1!o;
263            
264             # Step 2: double and triple suffices (part 1)
265            
266             # Switch on last three letters (fails harmlessly if subroutine undefined) --
267             # thanks to Ian Phillipps who wrote
268             # CPAN authors/id/IANPX/Stem-0.1.tar.gz
269             # for suggesting the replacement of
270             # eval( '&S2' . unpack( 'a3', $_ ) );
271             # (where the eval ignores undefined subroutines) by the much faster
272             # eval { &{ 'S2' . substr( $_, 0, 3 ) } };
273             # But the following is slightly faster still:
274              
275             {
276 2     2   24 no strict 'refs';
  2         3  
  2         5731  
  174         256  
277            
278 174         156 my $sub;
279            
280             # Step 3: double and triple suffices, etc (part 2)
281              
282 174 50       159 &$sub if defined &{ $sub = '_S2' . substr( $_, 0, 3 ) };
  174         833  
283            
284             # Step 3: double and triple suffices, etc (part 2)
285            
286 174 50       202 &$sub if defined &{ $sub = '_S3' . substr( $_, 0, 3 ) };
  174         700  
287            
288             # Step 4: single suffices on polysyllables
289            
290 174 100       176 &$sub if defined &{ $sub = '_S4' . substr( $_, 0, 2 ) };
  174         731  
291            
292             }
293             # Step 5a: tidy up final e -- probate->probat, rate->rate; cease->ceas
294            
295 174 100 50     658 m!^e! && ( s!^e($syl$syl)!$1!o ||
      50        
296            
297             # Porter's ( m=1 and not *o ) E where o = cvd with d a consonant
298             # not w, x or y:
299            
300             ! m!^e[^aeiouwxy][aeiouy][^aeiou]! && # not *o E
301             s!^e($syl[aeiouy]*[^aeiou]*)$!$1!o # m=1
302             );
303            
304             # Step 5b: double l -- controll->control, roll->roll
305             # ** Note correction: Porter has m>1 here ($syl$syl), but it seems m>0
306             # ($syl) is wanted to strip an l off controll.
307            
308 174         229 s!^ll($syl)!l$1!o;
309            
310 174         254 $_ = scalar( reverse $_ );
311              
312 174 50       383 $Stem_Cache2{$original_word} = $_ if $Stem_Caching;
313             }
314 18 50       54 %Stem_Cache2 = () if ($Stem_Caching < 2);
315            
316 18         124 return $words;
317             }
318              
319             ##############################################################
320             # Rule set 4
321              
322             sub _S4la {
323             # SYLSYLal -> SYLSYL
324 0     0   0 s!^la($syl$syl)!$1!o;
325             }
326              
327             sub _S4ec {
328             # SYLSYL[ae]nce -> SYLSYL
329 0     0   0 s!^ecn[ae]($syl$syl)!$1!o;
330             }
331              
332             sub _S4re {
333             # SYLSYLer -> SYLSYL
334 18     18   107 s!^re($syl$syl)!$1!o;
335             }
336              
337             sub _S4ci {
338             # SYLSYLic -> SYLSYL
339 0     0     s!^ci($syl$syl)!$1!o;
340             }
341              
342             sub _S4el {
343             # SYLSYL[ai]ble -> SYLSYL
344 0     0     s!^elb[ai]($syl$syl)!$1!o;
345             }
346              
347             sub _S4tn {
348             # SYLSYLant -> SYLSYL, SYLSYLe?ment -> SYLSYL, SYLSYLent -> SYLSYL
349 0     0     s!^tn(a|e(me?)?)($syl$syl)!$3!o;
350             }
351             sub _S4no {
352             # SYLSYL[st]ion -> SYLSYL[st]
353 0     0     s!^noi([st]$syl$syl)!$1!o;
354             }
355              
356             sub _S4uo {
357             # SYLSYLou -> SYLSYL e.g. homologou -> homolog
358 0     0     s!^uo($syl$syl)!$1!o;
359             }
360              
361             sub _S4ms {
362             # SYLSYLism -> SYLSYL
363 0     0     s!^msi($syl$syl)!$1!o;
364             }
365              
366             sub _S4et {
367             # SYLSYLate -> SYLSYL
368 0     0     s!^eta($syl$syl)!$1!o;
369             }
370              
371             sub _S4it {
372             # SYLSYLiti -> SYLSYL
373 0     0     s!^iti($syl$syl)!$1!o;
374             }
375              
376             sub _S4su {
377             # SYLSYLous -> SYLSYL
378 0     0     s!^suo($syl$syl)!$1!o;
379             }
380              
381             sub _S4ev {
382             # SYLSYLive -> SYLSYL
383 0     0     s!^evi($syl$syl)!$1!o;
384             }
385              
386             sub _S4ez {
387             # SYLSYLize -> SYLSYL
388 0     0     s!^ezi($syl$syl)!$1!o;
389             }
390              
391             sub _S4es {
392             # SYLSYLise -> SYLSYL **
393 0     0     s!^esi($syl$syl)!$1!o;
394             }
395              
396             ##############################################################
397             # Rule set 2
398              
399             sub _S2lan {
400             # SYLational -> SYLate, SYLtional -> SYLtion
401 0 0   0     s!^lanoita($syl)!eta$1!o || s!^lanoit($syl)!noit$1!o;
402             }
403              
404             sub _S2icn {
405             # SYLanci -> SYLance, SYLency ->SYLence
406 0     0     s!^icn([ae]$syl)!ecn$1!o;
407             }
408              
409             sub _S2res {
410             # SYLiser -> SYLise **
411 0     0     &_S2rez;
412             }
413              
414             sub _S2rez {
415             # SYLizer -> SYLize
416 0     0     s!^re(.)i($syl)!e$1i$2!o;
417             }
418              
419             sub _S2ilb {
420             # SYLabli -> SYLable, SYLibli -> SYLible ** (e.g. incredibli)
421 0     0     s!^ilb([ai]$syl)!elb$1!o;
422             }
423              
424             sub _S2ill {
425             # SYLalli -> SYLal
426 0     0     s!^illa($syl)!la$1!o;
427             }
428              
429             sub _S2ilt {
430             # SYLentli -> SYLent
431 0     0     s!^iltne($syl)!tne$1!o
432             }
433              
434             sub _S2ile {
435             # SYLeli -> SYLe
436 0     0     s!^ile($syl)!e$1!o;
437             }
438              
439             sub _S2ils {
440             # SYLousli -> SYLous
441 0     0     s!^ilsuo($syl)!suo$1!o;
442             }
443              
444             sub _S2noi {
445             # SYLization -> SYLize, SYLisation -> SYLise**, SYLation -> SYLate
446 0 0   0     s!^noita([sz])i($syl)!e$1i$2!o || s!^noita($syl)!eta$1!o;
447             }
448              
449             sub _S2rot {
450             # SYLator -> SYLate
451 0     0     s!^rota($syl)!eta$1!o;
452             }
453              
454             sub _S2msi {
455             # SYLalism -> SYLal
456 0     0     s!^msila($syl)!la$1!o;
457             }
458              
459             sub _S2sse {
460             # SYLiveness -> SYLive, SYLfulness -> SYLful, SYLousness -> SYLous
461 0     0     s!^ssen(evi|luf|suo)($syl)!$1$2!o;
462             }
463              
464             sub _S2iti {
465             # SYLaliti -> SYLal, SYLiviti -> SYLive, SYLbiliti ->SYLble
466 0 0   0     s!^iti(la|lib|vi)($syl)! ( $1 eq 'la' ? 'la' : $1 eq 'lib' ? 'elb' : 'evi' )
  0 0          
467             . $2 !eo;
468             }
469              
470             ##############################################################
471             # Rule set 3
472              
473             sub _S3eta {
474             # SYLicate -> SYLic
475 0     0     s!^etaci($syl)!ci$1!o;
476             }
477              
478             sub _S3evi {
479             # SYLative -> SYL
480 0     0     s!^evita($syl)!$1!o;
481             }
482              
483             sub _S3ezi
484             {
485             # SYLalize -> SYLal
486 0     0     s!^ezila($syl)!la$1!o;
487             }
488              
489             sub _S3esi {
490             # SYLalise -> SYLal **
491 0     0     s!^esila($syl)!la$1!o;
492             }
493              
494             sub _S3iti {
495             # SYLiciti -> SYLic
496 0     0     s!^itici($syl)!ci$1!o;
497             }
498              
499             sub _S3lac {
500             # SYLical -> SYLic
501 0     0     s!^laci($syl)!ci$1!o;
502             }
503             sub _S3luf {
504             # SYLful -> SYL
505 0     0     s!^luf($syl)!$1!o;
506             }
507              
508             sub _S3sse {
509             # SYLness -> SYL
510 0     0     s!^ssen($syl)!$1!o;
511             }
512              
513              
514             ##############################################################
515              
516             =over 4
517              
518             =item stem_caching({ -level => 0|1|2 });
519              
520             Sets the level of stem caching.
521              
522             '0' means 'no caching'. This is the default level.
523              
524             '1' means 'cache per run'. This caches stemming results during a single
525             call to 'stem'.
526              
527             '2' means 'cache indefinitely'. This caches stemming results until
528             either the process exits or the 'clear_stem_cache' method is called.
529              
530             =back
531              
532             =cut
533              
534             sub stem_caching {
535 0     0 1   my $parm_ref;
536 0 0         if (ref $_[0]) {
537 0           $parm_ref = shift;
538             } else {
539 0           $parm_ref = { @_ };
540             }
541 0           my $caching_level = $parm_ref->{-level};
542 0 0         if (defined $caching_level) {
543 0 0         if ($caching_level !~ m/^[012]$/) {
544 0           croak(__PACKAGE__ . "::stem_caching() - Legal values are '0','1' or '2'. '$caching_level' is not a legal value");
545             }
546 0           $Stem_Caching = $caching_level;
547 0 0         if ($caching_level < 2) {
548 0           %Stem_Cache2 = ();
549             }
550             }
551 0           return $Stem_Caching;
552             }
553            
554             ##############################################################
555              
556             =over 4
557              
558             =item clear_stem_cache;
559              
560             Clears the cache of stemmed words
561              
562             =back
563              
564             =cut
565              
566             sub clear_stem_cache {
567 0     0 1   %Stem_Cache2 = ();
568             }
569              
570             ##############################################################
571              
572             =head1 NOTES
573              
574             This code is almost entirely derived from the Porter 2.1 module
575             written by Jim Richardson.
576              
577             =head1 SEE ALSO
578              
579             Lingua::Stem
580              
581             =head1 AUTHOR
582              
583             Jim Richardson, University of Sydney
584             jimr@maths.usyd.edu.au or http://www.maths.usyd.edu.au:8000/jimr.html
585              
586             Integration in Lingua::Stem by
587             Benjamin Franz, FreeRun Technologies,
588             snowhare@nihongo.org or http://www.nihongo.org/snowhare/
589              
590             =head1 COPYRIGHT
591              
592             Jim Richardson, University of Sydney
593             Benjamin Franz, FreeRun Technologies
594              
595             This code is freely available under the same terms as Perl.
596              
597             =head1 BUGS
598              
599             =head1 TODO
600              
601             =cut
602              
603             1;