File Coverage

blib/lib/Lingua/Stem/Snowball/Se.pm
Criterion Covered Total %
statement 91 97 93.8
branch 40 50 80.0
condition 1 3 33.3
subroutine 9 9 100.0
pod 0 4 0.0
total 141 163 86.5


line stmt bran cond sub pod time code
1             package Lingua::Stem::Snowball::Se;
2 2     2   209987 use strict;
  2         6  
  2         74  
3 2     2   1049 use bytes;
  2         12  
  2         13  
4             # -+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
5             # Lingua::Stem::Snowball::Se - Swedish stemmer
6             # :: based upon the swedish stemmer algorithm at snowball.tartarus.org.
7             # by Martin Porter.
8             # (c) 2001-2007 Ask Solem Hoel
9             #
10             # This program is free software; you can redistribute it and/or modify
11             # it under the terms of the GNU General Public License version 2,
12             # *NOT* "earlier versions", as published by the Free Software Foundation.
13             #
14             # This program is distributed in the hope that it will be useful,
15             # but WITHOUT ANY WARRANTY; without even the implied warranty of
16             # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17             # GNU General Public License for more details.
18             #
19             # You should have received a copy of the GNU General Public License
20             # along with this program; if not, write to the Free Software
21             # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22             # -+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
23             #####
24              
25              
26 2     2   72 use vars qw($VERSION);
  2         2  
  2         611  
27             $VERSION = 1.2;
28              
29             # special characters
30             my $ae = chr(0xe4); # a"
31             my $ao = chr(0xe5); # ao
32             my $oe = chr(0xf6); # o"
33              
34             # delete the s if a "s ending" is preceeded by one
35             # of these characters.
36             my $s_ending = "bcdfghjklmnoprtvy";
37              
38             # swedish vowels.
39             my $vowels = "aeiouy$ae$ao$oe";
40              
41             # ####
42             # the endings in step 1
43             # XXX: these must be sorted by length
44             # to save time we've done it already, you can do it like this:
45             # my $bylength = sub {
46             # length $a <=> length $b;
47             # }
48             # @endings = reverse sort $bylength @endings;
49             my @endings = qw/
50             heterna hetens ornas ernas andet heter arnas arens heten andes
51             anden orna erna erns arna ades aren aste arne ande ast het ens
52             ern ade are er at es ad as en or ar e a s
53             /;
54              
55             # the endings in step 2
56             # XXX: these must be sorted by length, like @endings in step 1.
57             my @endings2 = ('fullt', "l${oe}st", 'els', 'lig', 'ig');
58              
59             my %cache = ( );
60              
61             sub new {
62 1     1 0 1237 my $pkg = shift;
63 1   33     7 $pkg = ref $pkg || $pkg;
64 1         4 my %arg = @_;
65 1         2 my $self = {};
66 1         3 bless $self, $pkg;
67 1 50       4 if($arg{use_cache}) {
68 0         0 $self->use_cache(1);
69             }
70 1         3 return $self;
71             }
72              
73             sub use_cache {
74 60756     60756 0 104016 my($self, $use_cache) = @_;
75 60756 50       177750 if($use_cache) {
76 0         0 $self->{USE_CACHE} = 1;
77             }
78 60756         208144 return $self->{USE_CACHE};
79             }
80              
81             sub stem {
82 30623     30623 0 19899551 my ($self, $word) = @_;
83 2     2   12 no warnings;
  2         3  
  2         1182  
84 30623         42324 my $orig_word;
85              
86 30623 50       74892 if($self->use_cache()) {
87 0         0 $orig_word = $word;
88 0         0 my $cached_word = $cache{$word};
89 0 0       0 return $cached_word if $cached_word;
90             }
91              
92 30623         88472 my ($ls, $rs, $wlen, $lslen, $rslen) = getsides($word);
93 30623 100       80094 return $word unless $lslen >= 3;
94              
95             # ### STEP 1
96             # only need to refresh wlen each time we change the word.
97 30133         50378 foreach my $ending (@endings) {
98 978971         1191911 my $endinglen = length $ending; # do this once.
99              
100             # only continue if the word has this ending at all.
101 978971 100       2537921 if(substr($rs, $rslen - $endinglen, $rslen) eq $ending) {
102 19151 100       46835 if($ending eq 's') { # b)
103             # check if it has a valid "s ending"...
104 1225         2194 my $valid_s_ending = 0;
105 1225 100       3359 if($rslen == 1) {
106 184         1322 my $wmr1 = substr($word, 0, $wlen - $rslen);
107 184 100       1416 if($wmr1 =~ /[$s_ending]$/o) {
108 151         284 $valid_s_ending = 1;
109             }
110             }
111             else {
112 1041 100       6574 if(substr($rs, $rslen - 2, $rslen - 1) =~ /[$s_ending]/o) {
113 831         1244 $valid_s_ending = 1;
114             }
115             }
116 1225 100       3637 if($valid_s_ending) {
117             # ...delete the last character (which is a s)
118 982         2326 $word = substr($word, 0, $wlen - 1);
119 982         2441 ($ls, $rs, $wlen, $lslen, $rslen) = getsides($word);
120 982         11304 last;
121             }
122             }
123             else { # a)
124             # delete the ending.
125 17926         64626 $word = substr($word, 0, $wlen - $endinglen);
126 17926         34951 ($ls, $rs, $wlen, $lslen, $rslen) = getsides($word);
127 17926         42683 last;
128             }
129             }
130             }
131 30133 50       80476 return $word unless $lslen >= 3;
132              
133             # ### STEP 2
134 30133         71714 my $ending = substr($rs, $rslen - 2, $rslen);
135 30133 100       120459 if($ending =~ /dd|gd|nn|dt|gt|kt|tt/) {
136 2116         4546 $word = substr($word, 0, $wlen - 1);
137 2116         5882 ($ls, $rs, $wlen, $lslen, $rslen) = getsides($word);
138             }
139 30133 50       68516 return $word unless $lslen >= 3;
140              
141             # ### STEP 3
142 30133         61405 foreach my $ending (@endings2) {
143 148835 100       1901903 if($rs =~ /\Q$ending\E$/)
144             {
145 2418 100       14562 if($ending =~ /lig|ig|els/)
146             {
147 2348         6095 my $res = substr($word, 0, $wlen - length($ending));
148 2348         3922 $word = $res; #if length $res > 2;
149             last
150 2348         4905 }
151 70 100       342 if($ending eq "l${oe}st")
152             {
153 39         127 my $res = substr($word, 0, $wlen - 1);
154 39         79 $word = $res; #if length $res > 2;
155             last
156 39         86 }
157 31 50       105 if($ending eq 'fullt')
158             {
159 31         87 my $res = substr($word, 0, $wlen - 1);
160 31         43 $word = $res; #if length $res > 2;
161             last
162 31         95 }
163             }
164             }
165              
166 30133 50       93310 if($self->use_cache()) {
167 0         0 $cache{$orig_word} = $word;
168             }
169            
170 30133         120259 return $word;
171             }
172              
173             sub getsides {
174 51647     51647 0 105575 my $word = shift;
175 2     2   11 no warnings;
  2         5  
  2         496  
176 51647         90130 my $wlen = length $word;
177              
178 51647         86688 my($ls, $rs) = (undef, undef); # left side and right side.
179            
180             # ###
181             # find the first vowel with a non-vowel after it.
182 51647         84968 my($found_vowel, $nonv_position, $curpos) = (-1, -1, 0);
183 51647         233985 foreach(split//, $word) {
184 159259 100       342097 if($found_vowel> 0) {
185 52233 100       209054 if(/[^$vowels]/o) {
186 51221 50       120276 if($curpos > 0) {
187 51221         65964 $nonv_position = $curpos + 1;
188 51221         87188 last;
189             }
190             }
191             }
192 108038 100       374158 if(/[$vowels]/o) {
193 52526         78823 $found_vowel = 1;
194             }
195 108038         173137 $curpos++;
196             }
197              
198             # got nothing: return false
199 51647 100       157264 return undef if $nonv_position < 0;
200              
201             # ###
202             # length of the left side must be atleast 3 chars.
203 51221         98455 my $leftlen = $wlen - ($wlen - $nonv_position);
204 51221 100       109406 if($leftlen < 3) {
205 9421         24384 $ls = substr($word, 0, 3);
206 9421         18394 $rs = substr($word, 3, $wlen);
207             }
208             else {
209 41800         73776 $ls = substr($word, 0, $leftlen);
210 41800         64911 $rs = substr($word, $nonv_position, $wlen);
211             }
212 51221         272832 return($ls, $rs, $wlen, length $ls, length $rs);
213             }
214              
215             1;
216              
217             __END__