File Coverage

blib/lib/Lingua/Stem/It.pm

Criterion	Covered	Total	%
statement	100	123	81.3
branch	27	46	58.7
condition	5	15	33.3
subroutine	9	11	81.8
pod	4	5	80.0
total	145	200	72.5

line	stmt	bran	cond	sub	pod	time	code
1							package Lingua::Stem::It;
2
3	1			1		1511	use strict;
	1					3
	1					43
4
5
6	1			1		7	use Exporter;
	1					2
	1					49
7	1			1		17	use Carp;
	1					2
	1					84
8	1			1		6	use vars qw (@ISA @EXPORT_OK @EXPORT %EXPORT_TAGS $VERSION);
	1					2
	1					137
9							BEGIN {
10	1			1		18	@ISA = qw (Exporter);
11	1					3	@EXPORT = ();
12	1					4	@EXPORT_OK = qw (stem stem_word clear_stem_cache stem_caching);
13	1					1908	%EXPORT_TAGS = ();
14							}
15							$VERSION = "0.02";
16
17							my $Stem_Caching = 0;
18							my $Stem_Cache = {};
19
20
21							sub stem {
22	2	50		2	1	754	return [] if ($#_ == -1);
23	2					5	my $parm_ref;
24	2	50				7	if (ref $_[0]) {
25	2					5	$parm_ref = shift;
26							} else {
27	0					0	$parm_ref = { @_ };
28							}
29
30	2					4	my $words = [];
31	2					4	my $locale = 'it';
32	2					4	my $exceptions = {};
33	2					8	foreach (keys %$parm_ref) {
34	2					5	my $key = lc ($_);
35	2	50				9	if ($key eq '-words') {
		0
		0
36	2					2	@$words = @{$parm_ref->{$key}};
	2					10
37							} elsif ($key eq '-exceptions') {
38	0					0	$exceptions = $parm_ref->{$key};
39							} elsif ($key eq '-locale') {
40	0					0	$locale = $parm_ref->{$key};
41							} else {
42	0					0	croak (__PACKAGE__ . "::stem() - Unknown parameter '$key' with value '$parm_ref->{$key}'\n");
43							}
44							}
45
46	2					4	local( $_ );
47	2					5	foreach (@$words) {
48							# Flatten case
49	8					16	$_ = lc $_;
50
51							# Check against exceptions list
52	8	50				23	if (exists $exceptions->{$_}) {
53	0					0	$_ = $exceptions->{$_};
54	0					0	next;
55							}
56
57							# Check against cache of stemmed words
58	8					11	my $original_word = $_;
59	8	50	33			21	if ($Stem_Caching && exists $Stem_Cache->{$original_word}) {
60	0					0	$_ = $Stem_Cache->{$original_word};
61	0					0	next;
62							}
63
64	8					19	$_ = stem_word($_);
65
66	8	50				23	$Stem_Cache->{$original_word} = $_ if $Stem_Caching;
67							}
68	2	50				27	$Stem_Cache = {} if ($Stem_Caching < 2);
69
70	2					14	return $words;
71
72							}
73
74							sub stem_word {
75
76	103			103	1	19684	our($word) = @_;
77	103					159	my @suffix;
78
79	103					192	$word = lc $word;
80
81							# Check against cache of stemmed words
82	103	50	33			301	if ($Stem_Caching && exists $Stem_Cache->{$word}) {
83	0					0	return $Stem_Cache->{$word};
84							}
85
86	103					125	our($RV, $R1, $R2);
87
88							#### First, replace all acute accents by grave accents.
89	103					181	$word =~ s/�/�/g;
90
91							### put u after q, and u, i between vowels into upper case.
92	103					282	$word =~ s/([a�e�i�o�u�])([ui])([a�e�i�o�u�])/$1.uc($2).$3/eg;
	0					0
93
94							#### RV is defined as follows
95	103					169	$RV = $word;
96
97							#### If the second letter is a consonant,
98	103	100				348	if($word =~ /^.[^a�e�i�o�u�]/) {
		50
99
100							#### RV is the region after the next following vowel
101	94					339	$RV =~ s/^..[^a�e�i�o�u�]*[a�e�i�o�u�]//;
102
103							#### or if the first two letters are vowels
104							} elsif ($word =~ /^[a�e�i�o�u�][^a�e�i�o�u�]/) {
105
106							#### RV is the region after the next consonant
107	0					0	$RV =~ s/^..[a�e�i�o�u�]*[^a�e�i�o�u�]//;
108
109							#### and otherwise (consonant-vowel case)
110							} else {
111
112							#### RV is the region after the third letter
113	9					28	$RV =~ s/^...//;
114							}
115
116							#print "RV=$RV\n";
117
118							#### Defining R1 and R2
119	103					166	$R1 = $word;
120
121							#### R1 is the region after the first non-vowel following a
122							#### vowel, or is the null region at the end of the word if
123							#### there is no such non-vowel.
124
125	103	100				657	unless($R1 =~ s/^.*?[a�e�i�o�u�][^a�e�i�o�u�]//) {
126	2					5	$R1 = "";
127							}
128
129							#print "R1=$R1\n";
130
131							#### R2 is the region after the first non-vowel following a
132							#### vowel in R1, or is the null region at the end of the
133							#### word if there is no such non-vowel.
134
135	103					210	$R2 = $R1;
136
137	103	100				209	if($R2) {
138	101	100				420	unless($R2 =~ s/^.*?[a�e�i�o�u�][^a�e�i�o�u�]//) {
139	12					20	$R2 = "";
140							}
141							}
142
143							#print "R2=$R2\n";
144
145							#### Step 0: Attached pronoun
146							##### Search for the longest among the following suffixes
147	103					627	my @pronoun = qw(
148							ci gli la le li lo mi ne si ti vi
149							sene gliela gliele glieli glielo gliene
150							mela mele meli melo mene
151							tela tele teli telo tene
152							cela cele celi celo cene
153							vela vele veli velo vene
154							);
155
156							#### following one of
157							#### (a) ando endo
158							#### (b) ar er ir
159							#### in RV.
160							#### In case of (a) the suffix is deleted,
161							#### in case (b) it is replace by e
162
163	103	100				251	stem_killer( $RV, "[ae]ndo", "", @pronoun )
164							or stem_killer( $RV, "[aei]r", "e", @pronoun );
165
166
167							#### Step 1: Standard suffix removal
168
169	103					141	my $step1 = 0;
170
171							#### Search for the longest among the following suffixes,
172							#### and perform the action indicated
173
174	103					480	@suffix = qw(
175							anza anze
176							ico ici ica ice iche ichi
177							ismo ismi
178							abile abili ibile ibili
179							ista iste isti ist� ist� ist�
180							oso osi osa ose
181							mente
182							atrice atrici
183							);
184
185							#### delete if in R2
186	103					240	$step1 += stem_killer( $R2, "", "", @suffix );
187
188	103					387	@suffix = qw(
189							icazione icazioni icatore icatori
190							azione azioni atore atori
191							);
192
193							#### delete if in R2
194							#### if preceded by ic, delete if in R2
195	103					236	$step1 += stem_killer( $R2, "", "", @suffix );
196
197	103					238	@suffix = qw(
198							logia logie
199							);
200
201							#### replace with log if in R2
202	103					220	$step1 += stem_killer( $R2, "", "log", @suffix );
203
204	103					248	@suffix = qw(
205							uzione uzioni usione usioni
206							);
207
208							#### replace with u if in R2
209	103					216	$step1 += stem_killer( $R2, "", "u", @suffix );
210
211	103					204	@suffix = qw(
212							enza enze
213							);
214
215							#### replace with ente if in R2
216	103					198	$step1 += stem_killer( $R2, "", "ente", @suffix );
217
218	103					236	@suffix = qw(
219							amento amenti imento imenti
220							);
221
222							#### delete if in RV
223	103					243	$step1 += stem_killer( $RV, "", "", @suffix );
224
225	103					201	@suffix = qw(
226							amente
227							);
228
229							#### delete if in R1
230							#### if preceded by iv, delete if in R2
231							#### (and if further preceded by at, delete if in R2), otherwise,
232							#### if preceded by os, ic or abil, delete if in R2
233	103		33			208	$step1 += stem_killer( $R2, "ativ", "", @suffix )
234							\|\| stem_killer( $R2, "iv", "", @suffix )
235							\|\| stem_killer( $R2, "(os\|ic\|abil)", "", @suffix )
236							\|\| stem_killer( $R1, "", "", @suffix );
237
238	103					211	@suffix = qw(
239							it�
240							);
241
242							#### delete if in R2
243							#### if preceded by abil, ic or iv, delete if in R2
244	103		33			195	$step1 += stem_killer( $R2, "(abil\|ic\|iv)", "", @suffix )
245							\|\| stem_killer( $R2, "", "", @suffix );
246
247
248	103					249	@suffix = qw(
249							ivo ivi iva ive
250							);
251
252							#### delete if in R2
253							#### if preceded by at, delete if in R2
254							#### (and if further preceded by ic, delete if in R2)
255	103		33			219	$step1 += stem_killer( $R2, "icat", "", @suffix)
256							\|\| stem_killer( $R2, "at", "", @suffix)
257							\|\| stem_killer( $R2, "", "", @suffix);
258
259
260							#### Step 2: Verb suffixes
261
262							#### Do step 2 if no ending was removed by step 1.
263	103	100				301	if($step1 == 0) {
264
265							#### Search for the longest among the following suffixes in RV,
266							#### and if found, delete.
267	95					439	stem_killer( $RV, "", "", qw(
268							ammo ando ano are arono asse assero assi assimo
269							ata ate ati ato ava avamo avano avate avi avo
270							emmo enda ende endi endo er� erai eranno ere
271							erebbe erebbero erei eremmo eremo ereste eresti
272							erete er� erono essero ete eva evamo evano evate
273							evi evo Yamo iamo immo ir� irai iranno ire
274							irebbe irebbero irei iremmo iremo ireste iresti
275							irete ir� irono isca iscano isce isci isco iscono
276							issero ita ite iti ito iva ivamo ivano ivate
277							ivi ivo ono uta ute uti uto ar er
278							));
279							}
280
281							#### Step 3a
282							#### Delete a final a, e, i, o, �, �, � or � if it is in RV,
283							#### and a preceding i if it is in RV
284	103	100				496	if($RV =~ s/i?[aeio��]$//) {
285	50					160	$word =~ s/i?[aeio��]$//;
286							#} else {
287							# if($RV =~ s/[aeio��]$//) {
288							# $word =~ s/[aeio��]$//;
289							# }
290							}
291
292							#### Step 3b
293							#### Replace final ch (or gh) with c (or g) if in RV
294	103	100				290	if($RV =~ s/([cg])h$/$1/) {
295	1					4	$word =~ s/([cg])h$/$1/;
296							}
297
298							#### Finally,
299							#### turn I and U back into lower case
300	103					170	$word =~ s/([IU])/lc($1)/eg;
	0					0
301
302	103					536	return $word;
303
304							}
305
306							sub stem_killer {
307	1843			1843	0	5669	my($where, $pre, $with, @list) = @_;
308	1			1		6	use vars qw($RV $R1 $R2 $word);
	1					2
	1					381
309	1843					2066	my $done = 0;
310	1843					3778	foreach my $P (sort { length($b) <=> length($a) } @list) {
	66819					71507
311	20851	100				212621	if($where =~ /$pre$P$/) {
312	64					732	$R2 =~ s/$pre$P$/$with/;
313	64					712	$R1 =~ s/$pre$P$/$with/;
314	64					688	$RV =~ s/$pre$P$/$with/;
315	64					744	$word =~ s/$pre$P$/$with/;
316	64					180	$done = 1;
317	64					158	last;
318							}
319							}
320	1843					7706	return $done;
321							}
322
323							sub stem_caching {
324	0			0	1		my $parm_ref;
325	0	0					if (ref $_[0]) {
326	0						$parm_ref = shift;
327							} else {
328	0						$parm_ref = { @_ };
329							}
330	0						my $caching_level = $parm_ref->{-level};
331	0	0					if (defined $caching_level) {
332	0	0					if ($caching_level !~ m/^[012]$/) {
333	0						croak(__PACKAGE__ . "::stem_caching() - Legal values are '0','1' or '2'. '$caching_level' is not a legal value");
334							}
335	0						$Stem_Caching = $caching_level;
336							}
337	0						return $Stem_Caching;
338							}
339
340							sub clear_stem_cache {
341	0			0	1		$Stem_Cache = {};
342							}
343
344
345							1;
346							__END__