File Coverage

blib/lib/Lingua/Stem/Snowball/No.pm

Criterion	Covered	Total	%
statement	88	94	93.6
branch	37	46	80.4
condition	6	6	100.0
subroutine	9	9	100.0
pod	0	4	0.0
total	140	159	88.0

line	stmt	bran	cond	sub	pod	time	code
1							package Lingua::Stem::Snowball::No;
2	2			2		28518	use strict;
	2					4
	2					57
3	2			2		867	use bytes;
	2					11
	2					11
4							# $Id: No.pm,v 1.1 2007/05/07 11:35:26 ask Exp $
5							# $Source: /opt/CVS/NoSnowball/lib/Lingua/Stem/Snowball/No.pm,v $
6							# $Author: ask $
7							# $HeadURL$
8							# $Revision: 1.1 $
9							# $Date: 2007/05/07 11:35:26 $
10							# -+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
11							# Lingua::Stem::Snowball::No - Norwegian stemmer
12							# :: based upon the norwegian stemmer algorithm at snowball.tartarus.org
13							# by Martin Porter.
14							# (c) 2001-2007 Ask Solem Hoel
15							#
16							# This program is free software; you can redistribute it and/or modify
17							# it under the terms of the GNU General Public License version 2,
18							# NOT "earlier versions", as published by the Free Software Foundation.
19							#
20							# This program is distributed in the hope that it will be useful,
21							# but WITHOUT ANY WARRANTY; without even the implied warranty of
22							# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23							# GNU General Public License for more details.
24							#
25							# You should have received a copy of the GNU General Public License
26							# along with this program; if not, write to the Free Software
27							# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
28							# -+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
29							#####
30
31
32	2			2		65	use vars qw($VERSION);
	2					2
	2					489
33							$VERSION = 1.2;
34
35							my %cache = ( );
36
37							# special characters
38							my $ae = "\xE6";
39							my $ao = "\xE5";
40							my $oe = "\xF8";
41
42							# delete the s if a "s ending" is preceeded by one
43							# of these characters.
44							my $s_ending = "bcdfghjklmnoprtvyz";
45
46							# norwegian vowels.
47							my $vowels = "aeiouy$ae$ao$oe";
48
49							# ####
50							# the endings in step 1
51							# XXX: these must be sorted by length
52							# to save time we've done it already, you can do it like this:
53							# my $bylength = sub {
54							# length $a <=> length $b;
55							# }
56							# @endings = reverse sort $bylength @endings;
57							my @endings = qw/
58							hetenes hetens hetene endes heter heten enes edes ende erte
59							ande ast het ets ers ert ens ene ane ede et es as er ar en
60							e a s
61							/;
62
63							# the endings in step 2
64							# XXX: these must be sorted by length, like @endings in step 1.
65							my @endings2 = qw/
66							hetslov slov elov elig eleg els lig eig lov leg ig
67							/;
68
69							sub new {
70	1			1	0	750	my $class = shift;
71	1					4	my %arg = @_;
72	1					2	my $self = { };
73
74	1					4	bless $self, $class;
75	1	50				4	if ($arg{use_cache}) {
76	0					0	$self->use_cache(1);
77							}
78
79	1					4	return $self;
80							}
81
82							sub use_cache {
83	40935			40935	0	60908	my ($self, $use_cache) = @_;
84	40935	50				76852	if ($use_cache) {
85	0					0	$self->{USE_CACHE} = 1;
86							}
87	40935					108733	return $self->{USE_CACHE};
88							}
89
90							sub stem {
91	20628			20628	0	11055064	my ($self, $word) = @_;
92	2			2		10	no warnings;
	2					3
	2					1008
93	20628					47406	$word = lc $word;
94	20628					30668	$word =~ y/\xC6/\xE6/;
95	20628					26078	$word =~ y/\xD8/\xF8/;
96	20628					37861	$word =~ y/\xC5/\xE5/;
97	20628					24043	my $orig_word;
98
99	20628	50				42725	if ($self->use_cache( )) {
100	0					0	$orig_word = $word;
101	0					0	my $cached_word = $cache{$word};
102	0	0				0	return $cached_word if $cached_word;
103							}
104
105	20628					42403	my ($ls, $rs, $wlen, $lslen, $rslen) = getsides($word);
106	20628	100				49445	return $word unless $lslen >= 3;
107
108							# ### STEP 1
109							# only need to refresh wlen each time we change the word.
110	20307					31177	foreach my $ending (@endings) {
111	509916					548278	my $endinglen = length $ending; # do this once.
112
113							# only continue if the word has this ending at all.
114	509916	100				1127878	if(substr($rs, $rslen - $endinglen, $rslen) eq $ending) {
115							# replace erte and ert with er
116	13909	100	100			79749	if($ending eq 'erte' \|\| $ending eq 'ert') { # c)
		100
117	278					595	$word = substr($word, 0, $wlen - $endinglen);
118	278					422	$word .= "er";
119	278					605	($ls, $rs, $wlen, $lslen, $rslen) = getsides($word);
120	278					578	last;
121							}
122							elsif($ending eq 's') { # b)
123							# check if it has a valid "s ending"...
124	409					1125	my $valid_s_ending = 0;
125	409	100				1161	if($rslen == 1) {
126	79					447	my $wmr1 = substr($word, 0, $wlen - $rslen);
127	79	100				431	if($wmr1 =~ /[$s_ending]$/o) {
128	56					115	$valid_s_ending = 1;
129							}
130							}
131							else {
132	330	100				1914	if(substr($rs, $rslen - 2, $rslen - 1) =~ /[$s_ending]/o) {
133	196					377	$valid_s_ending = 1;
134							}
135							}
136	409	100				1110	if($valid_s_ending) {
137							# ...delete the last character (which is a s)
138	252					649	$word = substr($word, 0, $wlen - 1);
139	252					554	($ls, $rs, $wlen, $lslen, $rslen) = getsides($word);
140	252					558	last;
141							}
142							}
143							else { # a)
144							# delete the ending.
145	13222					22941	$word = substr($word, 0, $wlen - $endinglen);
146	13222					22132	($ls, $rs, $wlen, $lslen, $rslen) = getsides($word);
147	13222					32348	last;
148							}
149							}
150							}
151	20307	50				72132	return $word unless $lslen >= 3;
152
153							# ### STEP 2
154	20307					31242	my $ending = substr($rs, $rslen - 2, $rslen);
155	20307	100	100			91328	if($ending eq 'dt' \|\| $ending eq 'vt') {
156	76					150	$word = substr($word, 0, $wlen - 1);
157	76					181	($ls, $rs, $wlen, $lslen, $rslen) = getsides($word);
158							}
159	20307	50				37153	return $word unless $lslen >= 3;
160
161							# ### STEP 3
162	20307					29606	foreach my $ending (@endings2) {
163	219275	100				2177417	if($rs =~ /\Q$ending\E$/) {
164	1188					2865	$word = substr($word, 0, $wlen - length($ending));
165	1188					2075	last;
166							}
167							}
168
169	20307	50				58051	if($self->use_cache()) {
170	0					0	$cache{$orig_word} = $word;
171							}
172
173	20307					67144	return $word;
174							}
175
176							sub getsides {
177	34456			34456	0	44756	my $word = shift;
178	2			2		9	no warnings;
	2					2
	2					523
179	34456					40098	my $wlen = length $word;
180
181	34456					49383	my($ls, $rs) = (undef, undef); # left side and right side.
182
183							# ###
184							# find the first vowel with a non-vowel after it.
185	34456					51887	my($found_vowel, $nonv_position, $curpos) = (-1, -1, 0);
186	34456					141296	foreach(split//, $word) {
187	105132	100				195559	if($found_vowel> 0) {
188	35943	100				107642	if(/[^$vowels]/o) {
189	34171	50				58917	if($curpos > 0) {
190	34171					38543	$nonv_position = $curpos + 1;
191	34171					48535	last;
192							}
193							}
194							}
195	70961	100				211284	if(/[$vowels]/o) {
196	36149					43082	$found_vowel = 1;
197							}
198	70961					89562	$curpos++;
199							}
200
201							# got nothing: return false
202	34456	100				98478	return undef if $nonv_position < 0;
203
204							# ###
205							# length of the left side must be atleast 3 chars.
206	34171					48403	my $leftlen = $wlen - ($wlen - $nonv_position);
207	34171	100				61034	if($leftlen < 3) {
208	6970					10827	$ls = substr($word, 0, 3);
209	6970					9643	$rs = substr($word, 3, $wlen);
210							}
211							else {
212	27201					47189	$ls = substr($word, 0, $leftlen);
213	27201					43414	$rs = substr($word, $nonv_position, $wlen);
214							}
215	34171					125047	return($ls, $rs, $wlen, length $ls, length $rs);
216							}
217
218							1;
219
220							__END__