File Coverage

blib/lib/Lingua/Stem/Es.pm

Criterion	Covered	Total	%
statement	139	159	87.4
branch	93	140	66.4
condition	11	16	68.7
subroutine	9	11	81.8
pod	4	7	57.1
total	256	333	76.8

line	stmt	bran	cond	sub	pod	time	code
1							package Lingua::Stem::Es;
2
3	2			2		86877	use Carp;
	2					5
	2					230
4
5	2			2		10	use warnings;
	2					5
	2					53
6	2			2		10	use strict;
	2					9
	2					68
7
8	2			2		2486	use utf8;
	2					22
	2					13
9
10							require Exporter;
11
12							our @ISA = qw(Exporter);
13
14							our %EXPORT_TAGS = ();
15							our @EXPORT_OK = qw (stem stem_word clear_stem_cache stem_caching);
16							our @EXPORT = ();
17
18							our $VERSION = '0.04';
19
20							our $DEBUG = 0;
21
22							my $Stem_Caching = 0;
23							my $Stem_Cache = {};
24
25							my $vowels = 'aeiouáéíóúü';
26							my $consonants = 'bcdfghjklmnñpqrstvwxyz';
27
28							my $revowel = qr/[$vowels]/;
29							my $reconsonants = qr/[$consonants]/;
30
31							sub stem {
32	2	50		2	1	133117	return [] if ( $#_ == -1 );
33	2					7	my $parm_ref;
34	2	50				12	if ( ref $_[0] ) {
35	0					0	$parm_ref = shift;
36							}
37							else {
38	2					11	$parm_ref = {@_};
39							}
40
41	2					6	my $words = [];
42	2					6	my $locale = 'es';
43	2					7	my $exceptions = {};
44	2					13	foreach ( keys %$parm_ref ) {
45	2					9	my $key = lc($_);
46	2	50				13	if ( $key eq '-words' ) {
		0
		0
47	2					4	@$words = @{ $parm_ref->{$key} };
	2					11640
48							}
49							elsif ( $key eq '-exceptions' ) {
50	0					0	$exceptions = $parm_ref->{$key};
51							}
52							elsif ( $key eq '-locale' ) {
53	0					0	$locale = $parm_ref->{$key};
54							}
55							else {
56	0					0	croak( __PACKAGE__
57							. "::stem() - Unknown parameter '$key' with value '$parm_ref->{$key}'\n"
58							);
59							}
60							}
61
62	2					10	local ($_);
63	2					12	foreach (@$words) {
64
65							# Check against exceptions list
66	56780	50				139614	if ( exists $exceptions->{$_} ) {
67	0					0	$_ = $exceptions->{$_};
68	0					0	next;
69							}
70
71							# Cache and stem
72	56780					74006	my $original_word = $_;
73	56780					106873	$_ = stem_word($_);
74	56780	50				164802	$Stem_Cache->{$original_word} = $_ if $Stem_Caching;
75							}
76	2	50				15	$Stem_Cache = {} if ( $Stem_Caching < 2 );
77
78	2					20	return $words;
79							}
80
81							sub stem_word {
82	56780			56780	1	69179	my $word = shift;
83
84	56780	50				141050	print "*****************\nOriginal: $word\n" if $DEBUG;
85
86							# Flatten case
87	56780					92791	$word =~ s/Á/á/g;
88	56780					68847	$word =~ s/É/é/g;
89	56780					69207	$word =~ s/Í/í/g;
90	56780					89819	$word =~ s/Ó/ó/g;
91	56780					67722	$word =~ s/Ú/ú/g;
92	56780					77124	$word =~ s/Ü/ü/g;
93	56780					70381	$word =~ s/Ñ/ñ/g;
94	56780					99380	$word = lc $word;
95	56780	50				124284	print "Flatened word: $word\n" if $DEBUG;
96
97							# Check against cache of stemmed words
98	56780	50	33			127313	if ( $Stem_Caching && exists $Stem_Cache->{$word} ) {
99	0					0	return $Stem_Cache->{$word};
100							}
101
102							# Remove punctuation
103	56780					249594	$word =~ s/[^$vowels$consonants]//g;
104	56780	100				123274	return '' unless $word;
105	56754	50				103684	print "Removed punctuation: $word\n" if $DEBUG;
106
107	56754					98711	my $RV = define_RV($word);
108	56754					80836	my $suffix;
109
110							############################################################
111							########### Step 0 ###########
112							############################################################
113							# Attached pronoun
114							# Search for the longest among the following suffixes:
115							# me se sela selo selas selos la le lo las les los nos
116							# and delete it, if it comes after one of
117							# a) iéndo ándo ár ér ír
118							# b) ando iendo ar er ir
119							# c) yendo following u
120							# in RV. In the case of c), yendo must lie in RV, but the preceding u can
121							# be outside it.
122							# In the case of a), deletion is followed by removing the acute accent.
123							# Always do step 0
124
125	56754	100				114436	if ($RV) {
126	55420					199787	my $pronoun =
127							qr/(selas\|selos\|sela\|selo\|las\|les\|los\|nos\|me\|se\|la\|le\|lo)$/;
128
129	55420	100	33			799690	if ( ($suffix) = $RV =~ /(?:ándo\|iéndo\|ár\|ér\|ír)($pronoun)$/ ) {
		100
		50
130
131							# Case a)
132	234					2963	$word =~ s/$suffix$//;
133	234					840	$word =~ s/á/a/;
134	234					541	$word =~ s/é/e/;
135	234					454	$word =~ s/í/i/;
136	234					439	$word =~ s/ó/o/;
137	234					404	$word =~ s/ú/u/;
138	234					391	$word =~ s/ü/u/;
139	234	50				863	print "Step 0 case a: $word\n" if $DEBUG;
140							}
141							elsif ( ($suffix) = $RV =~ /(?:ando\|iendo\|ar\|er\|ir)($pronoun)$/ ) {
142
143							# Case b)
144	1684					23267	$word =~ s/$suffix$//;
145	1684	50				7031	print "Step 0 case b: $word\n" if $DEBUG;
146							}
147							elsif ( ($suffix) =
148							$word =~ /uyendo($pronoun)$/ and $RV =~ /yendo$pronoun$/ )
149							{
150
151							# Case c)
152	0					0	$word =~ s/$suffix$//;
153	0	0				0	print "Step 0 case c: $word\n" if $DEBUG;
154							}
155							}
156
157							############################################################
158							########### Step 1 ###########
159							############################################################
160							# Standard suffix removal
161							# Search for the longest among the following suffixes, and perform the
162							# action indicated.
163							# Always do step 1
164
165	56754					118644	$RV = define_RV($word);
166	56754					121383	my $R1 = define_R1($word);
167	56754					114220	my $R2 = define_R2($word);
168
169	56754	100	100			868733	if (
		100
		100
		100
		100
		100
		100
		100
		100
		100
		100
		100
170							($suffix) = $R2 =~
171							/(amientos\|imientos\|amiento\|imiento\|anzas\|ismos\|ables\|ibles\|istas\|
172							anza\|icos\|icas\|ismo\|able\|ible\|ista\|osos\|osas\|ico\|ica\|oso\|osa)$/x
173							)
174							{
175
176							# anza anzas ico ica icos icas ismo ismos able ables ible ibles ista istas
177							# oso osa osos osas amiento amientos imiento imientos
178							# delete if in R2
179	2346					30433	$word =~ s/$suffix$//;
180	2346	50				6755	print "Step 1 case 1: $word\n" if $DEBUG;
181							}
182							elsif ( ($suffix) =
183							$R2 =~ /(aciones\|adores\|adoras\|adora\|antes?\|ancias?\|ación\|ador)$/ )
184							{
185
186							# adora ador ación adoras adores aciones
187							# delete if in R2
188							# if preceded by ic, delete if in R2
189	1766	100				35967	if ( $R2 =~ /ic$suffix$/ ) {
190	86					1063	$word =~ s/ic$suffix$//;
191							}
192							else {
193	1680					19765	$word =~ s/$suffix$//;
194							}
195	1766	50				5402	print "Step 1 case 2: $word\n" if $DEBUG;
196							}
197							elsif ( ($suffix) = $R2 =~ /(logías?)$/ ) {
198
199							# logía logías
200							# replace with log if in R2
201	32					277	$word =~ s/$suffix$/log/;
202	32	50				167	print "Step 1 case 3: $word\n" if $DEBUG;
203							}
204							elsif ( ($suffix) = $R2 =~ /uci(ones\|ón)$/ ) {
205
206							# ución uciones
207							# replace with u if in R2
208	42					491	$word =~ s/uci$suffix$/u/;
209	42	50				131	print "Step 1 case 4: $word\n" if $DEBUG;
210							}
211							elsif ( ($suffix) = $R2 =~ /(encias?)$/ ) {
212
213							# encia encias
214							# replace with ente if in R2
215	246					2301	$word =~ s/$suffix$/ente/;
216	246	50				804	print "Step 1 case 5: $word\n" if $DEBUG;
217							}
218							elsif ( $R1 =~ /amente$/ ) {
219
220							# delete if in R1
221							# if preceded by iv, delete if in R2 (and if further preceded by at, delete if in R2)
222							# otherwise,
223							# if preceded by os, ic or ad, delete if in R2
224	412	100				2361	if ( ($suffix) = $R2 =~ /(os\|ic\|ad)amente$/ ) {
		100
225	120					1553	$word =~ s/($suffix)amente$//;
226							}
227							elsif ( ($suffix) = $R2 =~ /((?:at(?=iv))?(?:iv))amente$/ ) {
228	38					453	$word =~ s/($suffix)amente$//;
229							}
230							else {
231	254					1244	$word =~ s/amente$//;
232							}
233	412	50				1116	print "Step 1 case 6: $word\n" if $DEBUG;
234							}
235							elsif ( $R2 =~ /mente$/ ) {
236
237							# mente
238							# delete if in R2
239							# if preceded by able, ante or ible, delete if in R2
240	256	100				985	if ( ($suffix) = $R2 =~ /([ai]ble\|ante)mente$/ ) {
241	38					541	$word =~ s/($suffix)mente$//;
242							}
243							else {
244	218					934	$word =~ s/mente$//;
245							}
246	256	50				629	print "Step 1 case 7: $word\n" if $DEBUG;
247							}
248							elsif ( $R2 =~ /idad(es)?$/ ) {
249
250							# idad idades
251							# delete if in R2
252							# if preceded by abil, ic or iv, delete if in R2
253	436	100				1900	if ( ($suffix) = $R2 =~ /(abil\|ic\|iv)idad(es)?$/ ) {
254	56					322	$word =~ s/(abil\|ic\|iv)idad(es)?$//;
255							}
256							else {
257	380					2121	$word =~ s/idad(es)?$//;
258							}
259	436	50				1223	print "Step 1 case 8: $word\n" if $DEBUG;
260							}
261							elsif ( ($suffix) = $R2 =~ /(iv[ao]s?)$/ ) {
262
263							# iva ivo ivas ivos
264							# delete if in R2
265							# if preceded by at, delete if in R2
266	500	100				11843	$R2 =~ /at$suffix$/ ? $word =~ s/at$suffix$// : $word =~ s/$suffix$//;
267	500	50				2123	print "Step 1 case 9: $word\n" if $DEBUG;
268							}
269
270							############################################################
271							########### Step 2a ###########
272							############################################################
273							# Verb suffixes beginning 'y'
274							# Search for the longest among the following suffixes in RV, and
275							# if found, delete if preceded by u. (Note that the preceding u
276							# need not be in RV).
277							# ya ye yan yen yeron yendo yo yó yas yes yais yamos
278							# Do step 2a if no ending was removed by step 1
279							elsif ($word =~ /u(yeron\|yendo\|yamos\|yais\|ya[ns]?\|ye[ns]?\|yo\|yó)$/
280							&& $RV =~ /(yeron\|yendo\|yamos\|yais\|ya[ns]?\|ye[ns]?\|yo\|yó)$/ )
281							{
282	114					547	$word =~ s/u(yeron\|yendo\|yamos\|yais\|ya[ns]?\|ye[ns]?\|yo\|yó)$/u/;
283	114	50				292	print "Step 2a: $word\n" if $DEBUG;
284							}
285
286							############################################################
287							########### Step 2b ###########
288							############################################################
289							# Other verb suffixes
290							# Search for the longest among the following suffixes in RV, and
291							# perform the action indicated.
292							# Do step 2b if step 2a was done but failed to remove a suffix.
293
294							elsif (
295							($suffix) =
296							$RV =~ /(iésemos\|iéramos\|iríamos\|eríamos\|aríamos\|ásemos\|
297							áramos\|ábamos\|isteis\|asteis\|ieseis\|ierais\|iremos\|iríais\|eremos\|eríais\|aremos\|
298							aríais\|aseis\|arais\|abais\|ieses\|ieras\|iendo\|ieron\|iesen\|ieran\|iréis\|irías\|irían\| #ancias?\|
299							eréis\|erías\|erían\|aréis\|arías\|arían\|íamos\|imos\|amos\|idos\|ados\|íais\|ases\|aras\|idas\| #antes?\|
300							adas\|abas\|ando\|aron\|asen\|aran\|aban\|iste\|aste\|iese\|iera\|iría\|irás\|irán\|ería\|erás\|erán\|
301							aría\|arás\|arán\|áis\|ías\|ido\|ado\|ían\|ase\|ara\|ida\|ada\|aba\|iré\|irá\|eré\|erá\|aré\|
302							ará\|ís\|as\|ir\|er\|ar\|ió\|an\|id\|ed\|ad\|ía)$/x
303							)
304							{
305
306							# delete
307	20092					276671	$word =~ s/$suffix$//;
308	20092	50				60969	print "Step 2b1: $word\n" if $DEBUG;
309							}
310							elsif ( ($suffix) = $RV =~ /(emos\|éis\|en\|es)$/ ) {
311
312							# en es éis emos
313							# delete, and if preceded by gu delete the u (the gu need not be in RV)
314	3790	100				51383	$word =~ /gu$suffix$/
315							? $word =~ s/gu$suffix$/g/
316							: $word =~ s/$suffix$//;
317	3790	50				11877	print "Step 2b2: $word\n" if $DEBUG;
318							}
319
320							############################################################
321							########### Step 3 ###########
322							############################################################
323							# Residual suffix
324							# Search for the longest among the following suffixes in RV, and
325							# perform the action indicated.
326							# Always do step 3.
327
328	56754					111442	$RV = define_RV($word);
329
330	56754	100				288612	if ( ($suffix) = $RV =~ /(os\|[aoáíó])$/ ) {
		100
331
332							# os a o á í ó
333							# delete if in RV
334	15954					189923	$word =~ s/$suffix$//;
335	15954	50				59374	print "Step 3a: $word\n" if $DEBUG;
336							}
337							elsif ( $RV =~ /[eé]$/ ) {
338
339							# e é
340							# delete if in RV, and if preceded by gu with the u in RV, delete the u.
341	3900	100	66			13511	if ( $word =~ /gu[eé]$/ && $RV =~ /u[eé]$/ ) {
342	60					270	$word =~ s/gu[eé]$/g/;
343							}
344							else {
345	3840					13165	$word =~ s/[eé]$//;
346							}
347	3900	50				15943	print "Step 3b: $word\n" if $DEBUG;
348							}
349	56754	50				112771	print "Before step 4: $word\n" if $DEBUG;
350							############################################################
351							########### Step 4 ###########
352							############################################################
353							# Remove the acute accents
354	56754					89426	$word =~ s/á/a/g;
355	56754					93048	$word =~ s/é/e/g;
356	56754					75199	$word =~ s/í/i/g;
357	56754					65396	$word =~ s/ó/o/g;
358	56754					65502	$word =~ s/ú/u/g;
359	56754	50				103784	print "Step 4: $word\n" if $DEBUG;
360
361	56754					151749	return $word;
362							}
363
364							sub define_R1 {
365							############################################
366							######## Find R1 ###########
367							############################################
368							# R1 is the region after the first non-vowel following a vowel,
369							# or is the null region at the end of the word if there is
370							# no such non-vowel.
371	56754			56754	0	76834	my $word = shift;
372	56754					57065	my $R1;
373	56754					382077	($R1) = $word =~ /^.?$revowel$reconsonants(.)$/;
374	56754		100			148369	$R1 \|\|= '';
375	56754	50				107759	print "R1: $R1\n" if $DEBUG;
376	56754					126293	return $R1;
377							}
378
379							sub define_R2 {
380							############################################
381							######## Find R2 ###########
382							############################################
383							# R2 is the region after the second non-vowel following a vowel,
384							# or is the null region at the end of the word if there is
385							# no such non-vowel.
386	56754			56754	0	76139	my $word = shift;
387	56754					54140	my $R2;
388	56754					469891	($R2) = $word =~ /^.?$revowel$reconsonants.?$revowel$reconsonants(.*)$/;
389	56754		100			161918	$R2 \|\|= '';
390	56754	50				114282	print "R2: $R2\n" if $DEBUG;
391	56754					149626	return $R2;
392							}
393
394							sub define_RV {
395							############################################
396							######## Find RV ###########
397							############################################
398							# RV is defined as follows:
399							# If the second letter is a consonant, RV is the region
400							# after the next following vowel.
401							# If the first two letters are vowels, RV is the region
402							# after the next consonant
403							# If the first letter is a consonant and the second a vowel,
404							# RV is the region after the third letter
405							# RV is the end of the word if these positions cannot be found.
406	170262			170262	0	244709	my $word = shift;
407	170262					193728	my $RV;
408	170262	100				1804568	if ( $word =~ /^.$reconsonants.?$revowel(.)$/ ) {
		100
		100
409	60844					114218	$RV = $1;
410	60844	50				137546	print "$word -- RV: Case 1 '$RV'\n" if $DEBUG;
411							}
412							elsif ( $word =~ /^$revowel{2,}$reconsonants(.*)$/ ) {
413	1542					2805	$RV = $1;
414	1542	50				3181	print "$word -- RV: Case 2 '$RV'\n" if $DEBUG;
415							}
416							elsif ( $word =~ /^$reconsonants$revowel.(.*)$/ ) {
417	106422					205956	$RV = $1;
418	106422	50				218792	print "$word -- RV: Case 3 '$RV'\n" if $DEBUG;
419							}
420							else {
421	1454					1847	$RV = '';
422	1454	50				2785	print "$word -- RV: Case 4 '$RV'\n" if $DEBUG;
423							}
424	170262					539420	return $RV;
425							}
426
427							sub stem_caching {
428	0			0	1		my $parm_ref;
429	0	0					if ( ref $_[0] ) {
430	0						$parm_ref = shift;
431							}
432							else {
433	0						$parm_ref = {@_};
434							}
435	0						my $caching_level = $parm_ref->{-level};
436	0	0					if ( defined $caching_level ) {
437	0	0					if ( $caching_level !~ m/^[012]$/ ) {
438	0						croak( __PACKAGE__
439							. q{::stem_caching() - Legal values are '0','1' or '2'.}
440							. qq{ '$caching_level' is not a legal value) } );
441							}
442	0						$Stem_Caching = $caching_level;
443							}
444	0						return $Stem_Caching;
445							}
446
447							sub clear_stem_cache {
448	0			0	1		$Stem_Cache = {};
449							}
450
451							1;
452							__END__