File Coverage

blib/lib/Lingua/Stem/Snowball/Se.pm

Criterion	Covered	Total	%
statement	91	97	93.8
branch	40	50	80.0
condition	1	3	33.3
subroutine	9	9	100.0
pod	0	4	0.0
total	141	163	86.5

line	stmt	bran	cond	sub	pod	time	code
1							package Lingua::Stem::Snowball::Se;
2	2			2		209987	use strict;
	2					6
	2					74
3	2			2		1049	use bytes;
	2					12
	2					13
4							# -+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
5							# Lingua::Stem::Snowball::Se - Swedish stemmer
6							# :: based upon the swedish stemmer algorithm at snowball.tartarus.org.
7							# by Martin Porter.
8							# (c) 2001-2007 Ask Solem Hoel
9							#
10							# This program is free software; you can redistribute it and/or modify
11							# it under the terms of the GNU General Public License version 2,
12							# NOT "earlier versions", as published by the Free Software Foundation.
13							#
14							# This program is distributed in the hope that it will be useful,
15							# but WITHOUT ANY WARRANTY; without even the implied warranty of
16							# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17							# GNU General Public License for more details.
18							#
19							# You should have received a copy of the GNU General Public License
20							# along with this program; if not, write to the Free Software
21							# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22							# -+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
23							#####
24
25
26	2			2		72	use vars qw($VERSION);
	2					2
	2					611
27							$VERSION = 1.2;
28
29							# special characters
30							my $ae = chr(0xe4); # a"
31							my $ao = chr(0xe5); # ao
32							my $oe = chr(0xf6); # o"
33
34							# delete the s if a "s ending" is preceeded by one
35							# of these characters.
36							my $s_ending = "bcdfghjklmnoprtvy";
37
38							# swedish vowels.
39							my $vowels = "aeiouy$ae$ao$oe";
40
41							# ####
42							# the endings in step 1
43							# XXX: these must be sorted by length
44							# to save time we've done it already, you can do it like this:
45							# my $bylength = sub {
46							# length $a <=> length $b;
47							# }
48							# @endings = reverse sort $bylength @endings;
49							my @endings = qw/
50							heterna hetens ornas ernas andet heter arnas arens heten andes
51							anden orna erna erns arna ades aren aste arne ande ast het ens
52							ern ade are er at es ad as en or ar e a s
53							/;
54
55							# the endings in step 2
56							# XXX: these must be sorted by length, like @endings in step 1.
57							my @endings2 = ('fullt', "l${oe}st", 'els', 'lig', 'ig');
58
59							my %cache = ( );
60
61							sub new {
62	1			1	0	1237	my $pkg = shift;
63	1		33			7	$pkg = ref $pkg \|\| $pkg;
64	1					4	my %arg = @_;
65	1					2	my $self = {};
66	1					3	bless $self, $pkg;
67	1	50				4	if($arg{use_cache}) {
68	0					0	$self->use_cache(1);
69							}
70	1					3	return $self;
71							}
72
73							sub use_cache {
74	60756			60756	0	104016	my($self, $use_cache) = @_;
75	60756	50				177750	if($use_cache) {
76	0					0	$self->{USE_CACHE} = 1;
77							}
78	60756					208144	return $self->{USE_CACHE};
79							}
80
81							sub stem {
82	30623			30623	0	19899551	my ($self, $word) = @_;
83	2			2		12	no warnings;
	2					3
	2					1182
84	30623					42324	my $orig_word;
85
86	30623	50				74892	if($self->use_cache()) {
87	0					0	$orig_word = $word;
88	0					0	my $cached_word = $cache{$word};
89	0	0				0	return $cached_word if $cached_word;
90							}
91
92	30623					88472	my ($ls, $rs, $wlen, $lslen, $rslen) = getsides($word);
93	30623	100				80094	return $word unless $lslen >= 3;
94
95							# ### STEP 1
96							# only need to refresh wlen each time we change the word.
97	30133					50378	foreach my $ending (@endings) {
98	978971					1191911	my $endinglen = length $ending; # do this once.
99
100							# only continue if the word has this ending at all.
101	978971	100				2537921	if(substr($rs, $rslen - $endinglen, $rslen) eq $ending) {
102	19151	100				46835	if($ending eq 's') { # b)
103							# check if it has a valid "s ending"...
104	1225					2194	my $valid_s_ending = 0;
105	1225	100				3359	if($rslen == 1) {
106	184					1322	my $wmr1 = substr($word, 0, $wlen - $rslen);
107	184	100				1416	if($wmr1 =~ /[$s_ending]$/o) {
108	151					284	$valid_s_ending = 1;
109							}
110							}
111							else {
112	1041	100				6574	if(substr($rs, $rslen - 2, $rslen - 1) =~ /[$s_ending]/o) {
113	831					1244	$valid_s_ending = 1;
114							}
115							}
116	1225	100				3637	if($valid_s_ending) {
117							# ...delete the last character (which is a s)
118	982					2326	$word = substr($word, 0, $wlen - 1);
119	982					2441	($ls, $rs, $wlen, $lslen, $rslen) = getsides($word);
120	982					11304	last;
121							}
122							}
123							else { # a)
124							# delete the ending.
125	17926					64626	$word = substr($word, 0, $wlen - $endinglen);
126	17926					34951	($ls, $rs, $wlen, $lslen, $rslen) = getsides($word);
127	17926					42683	last;
128							}
129							}
130							}
131	30133	50				80476	return $word unless $lslen >= 3;
132
133							# ### STEP 2
134	30133					71714	my $ending = substr($rs, $rslen - 2, $rslen);
135	30133	100				120459	if($ending =~ /dd\|gd\|nn\|dt\|gt\|kt\|tt/) {
136	2116					4546	$word = substr($word, 0, $wlen - 1);
137	2116					5882	($ls, $rs, $wlen, $lslen, $rslen) = getsides($word);
138							}
139	30133	50				68516	return $word unless $lslen >= 3;
140
141							# ### STEP 3
142	30133					61405	foreach my $ending (@endings2) {
143	148835	100				1901903	if($rs =~ /\Q$ending\E$/)
144							{
145	2418	100				14562	if($ending =~ /lig\|ig\|els/)
146							{
147	2348					6095	my $res = substr($word, 0, $wlen - length($ending));
148	2348					3922	$word = $res; #if length $res > 2;
149							last
150	2348					4905	}
151	70	100				342	if($ending eq "l${oe}st")
152							{
153	39					127	my $res = substr($word, 0, $wlen - 1);
154	39					79	$word = $res; #if length $res > 2;
155							last
156	39					86	}
157	31	50				105	if($ending eq 'fullt')
158							{
159	31					87	my $res = substr($word, 0, $wlen - 1);
160	31					43	$word = $res; #if length $res > 2;
161							last
162	31					95	}
163							}
164							}
165
166	30133	50				93310	if($self->use_cache()) {
167	0					0	$cache{$orig_word} = $word;
168							}
169
170	30133					120259	return $word;
171							}
172
173							sub getsides {
174	51647			51647	0	105575	my $word = shift;
175	2			2		11	no warnings;
	2					5
	2					496
176	51647					90130	my $wlen = length $word;
177
178	51647					86688	my($ls, $rs) = (undef, undef); # left side and right side.
179
180							# ###
181							# find the first vowel with a non-vowel after it.
182	51647					84968	my($found_vowel, $nonv_position, $curpos) = (-1, -1, 0);
183	51647					233985	foreach(split//, $word) {
184	159259	100				342097	if($found_vowel> 0) {
185	52233	100				209054	if(/[^$vowels]/o) {
186	51221	50				120276	if($curpos > 0) {
187	51221					65964	$nonv_position = $curpos + 1;
188	51221					87188	last;
189							}
190							}
191							}
192	108038	100				374158	if(/[$vowels]/o) {
193	52526					78823	$found_vowel = 1;
194							}
195	108038					173137	$curpos++;
196							}
197
198							# got nothing: return false
199	51647	100				157264	return undef if $nonv_position < 0;
200
201							# ###
202							# length of the left side must be atleast 3 chars.
203	51221					98455	my $leftlen = $wlen - ($wlen - $nonv_position);
204	51221	100				109406	if($leftlen < 3) {
205	9421					24384	$ls = substr($word, 0, 3);
206	9421					18394	$rs = substr($word, 3, $wlen);
207							}
208							else {
209	41800					73776	$ls = substr($word, 0, $leftlen);
210	41800					64911	$rs = substr($word, $nonv_position, $wlen);
211							}
212	51221					272832	return($ls, $rs, $wlen, length $ls, length $rs);
213							}
214
215							1;
216
217							__END__