File Coverage

blib/lib/Lingua/Stem/Snowball/No.pm
Criterion Covered Total %
statement 88 94 93.6
branch 37 46 80.4
condition 6 6 100.0
subroutine 9 9 100.0
pod 0 4 0.0
total 140 159 88.0


line stmt bran cond sub pod time code
1             package Lingua::Stem::Snowball::No;
2 2     2   28518 use strict;
  2         4  
  2         57  
3 2     2   867 use bytes;
  2         11  
  2         11  
4             # $Id: No.pm,v 1.1 2007/05/07 11:35:26 ask Exp $
5             # $Source: /opt/CVS/NoSnowball/lib/Lingua/Stem/Snowball/No.pm,v $
6             # $Author: ask $
7             # $HeadURL$
8             # $Revision: 1.1 $
9             # $Date: 2007/05/07 11:35:26 $
10             # -+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
11             # Lingua::Stem::Snowball::No - Norwegian stemmer
12             # :: based upon the norwegian stemmer algorithm at snowball.tartarus.org
13             # by Martin Porter.
14             # (c) 2001-2007 Ask Solem Hoel
15             #
16             # This program is free software; you can redistribute it and/or modify
17             # it under the terms of the GNU General Public License version 2,
18             # *NOT* "earlier versions", as published by the Free Software Foundation.
19             #
20             # This program is distributed in the hope that it will be useful,
21             # but WITHOUT ANY WARRANTY; without even the implied warranty of
22             # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23             # GNU General Public License for more details.
24             #
25             # You should have received a copy of the GNU General Public License
26             # along with this program; if not, write to the Free Software
27             # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
28             # -+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
29             #####
30              
31              
32 2     2   65 use vars qw($VERSION);
  2         2  
  2         489  
33             $VERSION = 1.2;
34              
35             my %cache = ( );
36              
37             # special characters
38             my $ae = "\xE6";
39             my $ao = "\xE5";
40             my $oe = "\xF8";
41              
42             # delete the s if a "s ending" is preceeded by one
43             # of these characters.
44             my $s_ending = "bcdfghjklmnoprtvyz";
45              
46             # norwegian vowels.
47             my $vowels = "aeiouy$ae$ao$oe";
48              
49             # ####
50             # the endings in step 1
51             # XXX: these must be sorted by length
52             # to save time we've done it already, you can do it like this:
53             # my $bylength = sub {
54             # length $a <=> length $b;
55             # }
56             # @endings = reverse sort $bylength @endings;
57             my @endings = qw/
58             hetenes hetens hetene endes heter heten enes edes ende erte
59             ande ast het ets ers ert ens ene ane ede et es as er ar en
60             e a s
61             /;
62              
63             # the endings in step 2
64             # XXX: these must be sorted by length, like @endings in step 1.
65             my @endings2 = qw/
66             hetslov slov elov elig eleg els lig eig lov leg ig
67             /;
68              
69             sub new {
70 1     1 0 750 my $class = shift;
71 1         4 my %arg = @_;
72 1         2 my $self = { };
73              
74 1         4 bless $self, $class;
75 1 50       4 if ($arg{use_cache}) {
76 0         0 $self->use_cache(1);
77             }
78              
79 1         4 return $self;
80             }
81              
82             sub use_cache {
83 40935     40935 0 60908 my ($self, $use_cache) = @_;
84 40935 50       76852 if ($use_cache) {
85 0         0 $self->{USE_CACHE} = 1;
86             }
87 40935         108733 return $self->{USE_CACHE};
88             }
89              
90             sub stem {
91 20628     20628 0 11055064 my ($self, $word) = @_;
92 2     2   10 no warnings;
  2         3  
  2         1008  
93 20628         47406 $word = lc $word;
94 20628         30668 $word =~ y/\xC6/\xE6/;
95 20628         26078 $word =~ y/\xD8/\xF8/;
96 20628         37861 $word =~ y/\xC5/\xE5/;
97 20628         24043 my $orig_word;
98              
99 20628 50       42725 if ($self->use_cache( )) {
100 0         0 $orig_word = $word;
101 0         0 my $cached_word = $cache{$word};
102 0 0       0 return $cached_word if $cached_word;
103             }
104              
105 20628         42403 my ($ls, $rs, $wlen, $lslen, $rslen) = getsides($word);
106 20628 100       49445 return $word unless $lslen >= 3;
107              
108             # ### STEP 1
109             # only need to refresh wlen each time we change the word.
110 20307         31177 foreach my $ending (@endings) {
111 509916         548278 my $endinglen = length $ending; # do this once.
112              
113             # only continue if the word has this ending at all.
114 509916 100       1127878 if(substr($rs, $rslen - $endinglen, $rslen) eq $ending) {
115             # replace erte and ert with er
116 13909 100 100     79749 if($ending eq 'erte' || $ending eq 'ert') { # c)
    100          
117 278         595 $word = substr($word, 0, $wlen - $endinglen);
118 278         422 $word .= "er";
119 278         605 ($ls, $rs, $wlen, $lslen, $rslen) = getsides($word);
120 278         578 last;
121             }
122             elsif($ending eq 's') { # b)
123             # check if it has a valid "s ending"...
124 409         1125 my $valid_s_ending = 0;
125 409 100       1161 if($rslen == 1) {
126 79         447 my $wmr1 = substr($word, 0, $wlen - $rslen);
127 79 100       431 if($wmr1 =~ /[$s_ending]$/o) {
128 56         115 $valid_s_ending = 1;
129             }
130             }
131             else {
132 330 100       1914 if(substr($rs, $rslen - 2, $rslen - 1) =~ /[$s_ending]/o) {
133 196         377 $valid_s_ending = 1;
134             }
135             }
136 409 100       1110 if($valid_s_ending) {
137             # ...delete the last character (which is a s)
138 252         649 $word = substr($word, 0, $wlen - 1);
139 252         554 ($ls, $rs, $wlen, $lslen, $rslen) = getsides($word);
140 252         558 last;
141             }
142             }
143             else { # a)
144             # delete the ending.
145 13222         22941 $word = substr($word, 0, $wlen - $endinglen);
146 13222         22132 ($ls, $rs, $wlen, $lslen, $rslen) = getsides($word);
147 13222         32348 last;
148             }
149             }
150             }
151 20307 50       72132 return $word unless $lslen >= 3;
152              
153             # ### STEP 2
154 20307         31242 my $ending = substr($rs, $rslen - 2, $rslen);
155 20307 100 100     91328 if($ending eq 'dt' || $ending eq 'vt') {
156 76         150 $word = substr($word, 0, $wlen - 1);
157 76         181 ($ls, $rs, $wlen, $lslen, $rslen) = getsides($word);
158             }
159 20307 50       37153 return $word unless $lslen >= 3;
160              
161             # ### STEP 3
162 20307         29606 foreach my $ending (@endings2) {
163 219275 100       2177417 if($rs =~ /\Q$ending\E$/) {
164 1188         2865 $word = substr($word, 0, $wlen - length($ending));
165 1188         2075 last;
166             }
167             }
168              
169 20307 50       58051 if($self->use_cache()) {
170 0         0 $cache{$orig_word} = $word;
171             }
172            
173 20307         67144 return $word;
174             }
175              
176             sub getsides {
177 34456     34456 0 44756 my $word = shift;
178 2     2   9 no warnings;
  2         2  
  2         523  
179 34456         40098 my $wlen = length $word;
180              
181 34456         49383 my($ls, $rs) = (undef, undef); # left side and right side.
182            
183             # ###
184             # find the first vowel with a non-vowel after it.
185 34456         51887 my($found_vowel, $nonv_position, $curpos) = (-1, -1, 0);
186 34456         141296 foreach(split//, $word) {
187 105132 100       195559 if($found_vowel> 0) {
188 35943 100       107642 if(/[^$vowels]/o) {
189 34171 50       58917 if($curpos > 0) {
190 34171         38543 $nonv_position = $curpos + 1;
191 34171         48535 last;
192             }
193             }
194             }
195 70961 100       211284 if(/[$vowels]/o) {
196 36149         43082 $found_vowel = 1;
197             }
198 70961         89562 $curpos++;
199             }
200              
201             # got nothing: return false
202 34456 100       98478 return undef if $nonv_position < 0;
203              
204             # ###
205             # length of the left side must be atleast 3 chars.
206 34171         48403 my $leftlen = $wlen - ($wlen - $nonv_position);
207 34171 100       61034 if($leftlen < 3) {
208 6970         10827 $ls = substr($word, 0, 3);
209 6970         9643 $rs = substr($word, 3, $wlen);
210             }
211             else {
212 27201         47189 $ls = substr($word, 0, $leftlen);
213 27201         43414 $rs = substr($word, $nonv_position, $wlen);
214             }
215 34171         125047 return($ls, $rs, $wlen, length $ls, length $rs);
216             }
217              
218             1;
219              
220             __END__