File Coverage

blib/lib/Lingua/Stem/Snowball/Da.pm

Criterion	Covered	Total	%
statement	93	96	96.8
branch	38	46	82.6
condition	4	8	50.0
subroutine	8	8	100.0
pod	0	4	0.0
total	143	162	88.2

line	stmt	bran	cond	sub	pod	time	code
1							package Lingua::Stem::Snowball::Da;
2	1			1		8300	use strict;
	1					3
	1					38
3	1			1		1257	use bytes;
	1					9
	1					5
4							# -+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
5							# This program is free software; you can redistribute it and/or modify
6							# it under the terms of the GNU General Public License version 2,
7							# NOT "earlier versions", as published by the Free Software Foundation.
8							#
9							# This program is distributed in the hope that it will be useful,
10							# but WITHOUT ANY WARRANTY; without even the implied warranty of
11							# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12							# GNU General Public License for more details.
13							#
14							# You should have received a copy of the GNU General Public License
15							# along with this program; if not, write to the Free Software
16							# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17							# -+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
18							#####
19
20	1			1		33	use constant DEBUG=>0;
	1					7
	1					76
21
22	1			1		4	use vars qw(%cache $VERSION);
	1					2
	1					1852
23							$Lingua::Stem::Snowball::Da::VERSION = 1.01;
24
25							# special characters
26							my $aa = chr(229); # å
27							my $ae = chr(230); # æ
28							my $oe = chr(248); # &oring;
29
30							# delete the s if a "s ending" is preceeded by one
31							# of these characters.
32							my %s_ending = (
33							a => 1,
34							b => 1,
35							c => 1,
36							d => 1,
37							f => 1,
38							g => 1,
39							h => 1,
40							j => 1,
41							k => 1,
42							l => 1,
43							"m" => 1,
44							n => 1,
45							o => 1,
46							p => 1,
47							r => 1,
48							t => 1,
49							v => 1,
50							"y" => 1,
51							z => 1,
52							$aa => 1,
53							);
54
55							# danish vowels.
56							my $vowels = "aeiouy$ae$aa$oe";
57							my %vowels = (
58							a=>1,
59							e=>1,
60							i=>1,
61							o=>1,
62							u=>1,
63							"y"=>1,
64							$ae=>1,
65							$aa=>1,
66							$oe=>1,
67							);
68
69							# ####
70							# the endings in step 1
71							# XXX: these must be sorted by length
72							# to save time we've done it already.
73							my @endings = (
74							["erendes"],
75							["erende", "hedens"],
76							["erens", "endes", "heden", "ethed", "ernes", "erets", "heder", "erede"],
77							["ende", "enes", "ered", "eren", "erer", "eres", "eret", "erne", "heds"],
78							["hed", "ers", "ene", "ere", "ens", "ets"],
79							["er", "es", "et", "en"],
80							["e"],
81							);
82
83							%Lingua::Stem::Snowball::Da::cache = ();
84
85							sub new {
86	1			1	0	14	my $pkg = shift;
87	1		33			7	$pkg = ref $pkg \|\| $pkg;
88	1					3	my %arg = @_;
89	1					2	my $self = {};
90	1					3	bless $self, $pkg;
91	1		50			9	$self->{USE_CACHE} = $arg{use_cache} \|\| 0;
92	1					3	return $self;
93							}
94
95							sub step1 {
96	23577			23577	0	28603	my ($rs, $word) = @_;
97							# ### STEP 1
98	23577					23388	my $endinglen = 8;
99	23577					31410	foreach (@endings) {
100	149853					140782	$endinglen--;
101	149853					182679	my $endingw = substr($rs, -$endinglen); # do this once.
102
103	149853					189035	foreach (@$_) {
104							# only continue if the word has this ending at all.
105	672937	100				1263575	next unless $endingw eq $_;
106	14433					13780	warn "matched $_ in $word" if DEBUG;
107							# a) delete the ending.
108	14433					37679	return substr($word, 0, -$endinglen);
109							}
110							}
111
112	9144	100				18842	if (substr($rs, -1) eq 's') { # b)
113							# check if it has a valid "s ending"...
114	1538	100				5701	if ((length $rs == 1) ?
		100
115							exists $s_ending{substr($word, -2, -1)} :
116							exists $s_ending{substr($rs, -2, -1)}) {
117	1250					1211	warn "Valid s eding $word" if DEBUG;
118							# ...delete the last character (which is a s)
119	1250					2885	return substr($word, 0, -1);
120							}
121							}
122	7894					14102	return $word;
123							}
124
125							sub stem {
126	23829			23829	0	197974	my ($self, $word) = @_;
127	23829					23304	my $orig_word;
128	23829					19671	warn " --- start : $word ---" if DEBUG;
129
130	23829	50				48699	if ($self->{USE_CACHE}) {
131	0					0	$orig_word = $word;
132	0	0				0	return $cache{$word} if defined $cache{$word};
133							}
134
135	23829					37554	my ($rs, $lslen, $rslen) = getsides($word);
136	23829	100				51538	return $word unless $lslen >= 3;
137
138	23577					36607	$word = step1($rs, $word);
139
140							# ### STEP 2
141	23577					24157	warn "Step 2" if DEBUG;
142	23577					36434	($rs, $lslen, $rslen) = getsides($word);
143	23577	50				51196	return $word unless $lslen >= 3;
144
145	23577	100				68752	if (substr($rs, -2) =~ /gd\|dt\|gt\|kt/) {
146	867					958	warn "delete last letter $word in step 2" if DEBUG;
147	867					1201	$word = substr($word, 0, - 1);
148	867					1300	($rs, $lslen, $rslen) = getsides($word);
149	867	50				2013	return $word unless $lslen >= 3;
150							}
151
152							# ### STEP 3
153	23577	100				45025	if (substr($rs, -4) eq "igst") {
154	16					37	warn "st as in igst deleted in $word" if DEBUG;
155	16					23	$word = substr($word, 0, -2);
156	16					31	($rs, $lslen, $rslen) = getsides($word);
157	16	50				50	return $word unless $lslen >= 3;
158							}
159	23577	100				49612	if (substr($rs, -4) eq "l${oe}st") {
160	26					30	warn "t as in l${oe}st deleted in $word" if DEBUG;
161	26					45	$word = substr($word, 0, -1);
162	26					47	($rs, $lslen, $rslen) = getsides($word);
163	26	50				65	return $word unless $lslen >= 3;
164							}
165	23577					37500	for (qw/elig lig els ig/) {
166	92528					87603	my $len = length;
167	92528	100				195853	if (substr($rs, -$len) eq $_) {
168	1504					1247	warn "delete $_ in $word" if DEBUG;
169	1504					2032	$word = substr($word, 0, -$len);
170	1504					2327	($rs) = getsides($word);
171	1504	100				4619	if (substr($rs, -2) =~ /gd\|dt\|gt\|kt/) {
172	65					62	warn "delete last letter $word in step 2 again" if DEBUG;
173	65					123	$word = substr($word, 0, - 1);
174	65					103	($rs, $lslen, $rslen) = getsides($word);
175							}
176	1504					1685	last;
177							}
178							}
179
180	23577	100	66			103566	return $word unless $lslen >= 3 && length $word > 3;
181
182							# ### STEP 4
183	21330	100				73032	if ($word =~ /([^$vowels])\1$/o) {
184	2029					1822	warn "delete double konsonant in $word" if DEBUG;
185	2029					3533	$word = substr($word, 0, -1);
186							}
187
188	21330	50				45065	if ($self->{USE_CACHE}) {
189	0					0	$cache{$orig_word} = $word;
190							}
191
192	21330					17025	warn " --- end : $word ---" if DEBUG;
193	21330					52400	return $word;
194							}
195
196							sub getsides {
197	49884			49884	0	67988	my $word = shift;
198							# ###
199							# find the first vowel with a non-vowel after it.
200	49884					65380	my($found_vowel, $nonv_position, $curpos) = (0, -1, 0);
201							#$found_vowel = 1 if exists $vowels{substr($word,0,1)};
202	49884					142136	foreach (split//, $word) {
203	151303					140520	$curpos++;
204	151303	100				334225	if (exists $vowels{$_}) {
		100
205	51011					49030	$found_vowel = 1;
206	51011					61275	next;
207							} elsif ($found_vowel) {
208	49679					47567	$nonv_position = $curpos;
209	49679					52511	last;
210							}
211							}
212
213
214							# got nothing: return false
215	49884	100				117235	return undef if $nonv_position == -1;
216
217	49679					45164	my($rs, $lslen); # left side and right side.
218							# ###
219							# length of the left side must be atleast 3 chars.
220	49679	100				74944	if ($nonv_position < 3) {
221	9776					11873	$lslen = length substr($word, 0, 3);
222	9776					12585	$rs = substr($word, 3);
223							} else {
224	39903					38349	$lslen = $nonv_position;
225	39903					59919	$rs = substr($word, $nonv_position);
226							}
227	49679					121963	return($rs, $lslen, length $rs);
228							}
229
230							1;
231
232							__END__