File Coverage

blib/lib/Lingua/Stem/Cistem.pm

Criterion	Covered	Total	%
statement	124	124	100.0
branch	32	32	100.0
condition	50	59	84.7
subroutine	10	10	100.0
pod	4	4	100.0
total	220	229	96.0

line	stmt	bran	cond	sub	pod	time	code
1							package Lingua::Stem::Cistem;
2
3	2			2		120401	use strict;
	2					12
	2					45
4	2			2		10	use warnings;
	2					3
	2					48
5
6	2			2		9	use utf8;
	2					2
	2					10
7
8	2			2		83	use 5.010001;
	2					6
9
10							require Exporter;
11
12							BEGIN {
13	2			2		6	$Lingua::Stem::Cistem::VERSION = '0.07';
14	2					26	@Lingua::Stem::Cistem::ISA = qw(Exporter);
15	2					6	@Lingua::Stem::Cistem::EXPORT = qw();
16	2					4	@Lingua::Stem::Cistem::EXPORT_OK = qw(stem segment stem_robust segment_robust);
17	2					203	%Lingua::Stem::Cistem::EXPORT_TAGS = (
18							'all' => [qw(stem segment stem_robust segment_robust)],
19							'orig' => [qw(stem segment)],
20							'robust' => [qw(stem_robust segment_robust)],
21							);
22							}
23
24							sub stem {
25	43		50	43	1	28874	my $word = shift // '';
26	43					61	my $case_insensitive = shift;
27
28	43					78	my $upper = (ucfirst $word eq $word);
29
30	43					69	$word = lc($word);
31	2			2		16	$word =~ tr/äöü/aou/;
	2					5
	2					23
	43					104
32	43					96	$word =~ s/ß/ss/g;
33
34	43					98	$word =~ s/^ge(.{4,})/$1/;
35
36	43					64	$word =~ s/sch/\$/g;
37	43					52	$word =~ s/ei/\%/g;
38	43					56	$word =~ s/ie/\&/g;
39	43					123	$word =~ s/(.)\1/$1*/g;
40
41	43					112	while(length($word)>3) {
42	28	100	66			241	if( length($word)>5 && ($word =~ s/e[mr]$// \|\| $word =~ s/nd$//) ) {}
		100	66
		100	100
			100
43							elsif( (!($upper) \|\| $case_insensitive) && $word =~ s/t$//) {}
44							elsif( $word =~ s/[esn]$//) {}
45	17					26	else { last; }
46							}
47
48	43					78	$word =~ s/(.)\*/$1$1/g;
49	43					56	$word =~ s/\$/sch/g;
50	43					61	$word =~ s/\%/ei/g;
51	43					64	$word =~ s/\&/ie/g;
52
53	43					176	return $word;
54							}
55
56							sub segment {
57	37		50	37	1	15664	my $word = shift // '';
58	37					57	my $case_insensitive = shift;
59
60	37					62	my $upper = (ucfirst $word eq $word);
61
62	37					58	$word = lc($word);
63
64	37					43	my $original = $word;
65
66	37					71	$word =~ s/sch/\$/g;
67	37					52	$word =~ s/ei/\%/g;
68	37					45	$word =~ s/ie/\&/g;
69	37					109	$word =~ s/(.)\1/$1*/g;
70
71	37					45	my $suffix_length = 0;
72
73	37					71	while(length($word)>3){
74	21	100	66			142	if( length($word)>5 && ($word =~ s/(e[mr])$// \|\| $word =~ s/(nd)$//) ) {
		100	66
		100	100
			66
75	3					6	$suffix_length += 2;
76							}
77							elsif( (!($upper) \|\| $case_insensitive) && $word =~ s/t$//) {
78	5					13	$suffix_length++;
79							}
80							elsif( $word =~ s/([esn])$//) {
81	3					7	$suffix_length++;
82							}
83	10					17	else{ last; }
84							}
85
86	37					61	$word =~ s/(.)\*/$1$1/g;
87
88	37					51	$word =~ s/\$/sch/g;
89	37					43	$word =~ s/\%/ei/g;
90	37					45	$word =~ s/\&/ie/g;
91
92	37					43	my $suffix = '';
93
94	37	100				64	if( $suffix_length ) {
95	11					18	$suffix = substr($original, - $suffix_length);
96							}
97
98	37					169	return ($word, $suffix);
99							}
100
101							sub stem_robust {
102	51		50	51	1	24389	my $word = shift // '';
103	51					82	my $case_insensitive = shift;
104	51					60	my $keep_ge_prefix = shift;
105
106	51					100	my $ucfirst = (ucfirst $word eq $word);
107
108	51					100	$word = lc($word);
109	51					17356	$word =~ tr/äöü/aou/;
110	51					163	$word =~ s/([aou])\N{U+0308}/$1/g; # remove U+0308 COMBINING DIAERESIS
111	51					71	$word =~ s/ß/ss/g;
112
113	51	100				109	$word =~ s/^ge(.{4,})/$1/ unless $keep_ge_prefix;
114
115	51					110	$word =~ s/sch/\N{U+0006}/g; # \N{U+0006} ACK
116	51					67	$word =~ s/ei/\N{U+0007}/g; # \N{U+0007} BEL
117	51					65	$word =~ s/ie/\N{U+0008}/g; # \N{U+0008} BS
118
119	51					189	$word =~ s/(.)\1/$1*/g;
120
121	51					247	my @graphemes = $word =~ m/\X/g;
122	51					73	my $length = scalar @graphemes;
123							#my $length = scalar (($word =~ m/\X/g)); # does not work
124
125	51					102	while($length > 3) {
126	32	100	100			262	if( $length>5 && ($word =~ s/e[mr]$// \|\| $word =~ s/nd$//) ) {$length -= 2;}
	3	100	100			8
		100	100
			100
127	5					14	elsif( (!($ucfirst) \|\| $case_insensitive) && $word =~ s/t$//) {$length--;}
128	5					15	elsif( $word =~ s/[esn]$//) {$length--;}
129	19					29	else { last; }
130							}
131
132	51					79	$word =~ s/(.)\*/$1$1/g;
133
134	51					66	$word =~ s/\N{U+0006}/sch/g; # \N{U+0006} ACK
135	51					60	$word =~ s/\N{U+0007}/ei/g; # \N{U+0007} BEL
136	51					61	$word =~ s/\N{U+0008}/ie/g; # \N{U+0008} BS
137
138	51					207	return $word;
139							}
140
141
142							sub segment_robust {
143	51		50	51	1	40064	my $word = shift // '';
144	51					79	my $case_insensitive = shift;
145	51					61	my $keep_ge_prefix = shift;
146
147	51					101	my $ucfirst = (ucfirst $word eq $word);
148
149	51					89	$word = lc($word);
150	51					145	$word =~ tr/äöü/aou/;
151	51					122	$word =~ s/([aou])\N{U+0308}/$1/g; # remove U+0308 COMBINING DIAERESIS
152	51					65	$word =~ s/ß/ss/g;
153
154	51					68	my $prefix = '';
155	51	100	100			190	if (!$keep_ge_prefix && $word =~ s/^ge(.{4,})/$1/) {
156	2					4	$prefix = 'ge';
157							}
158
159	51					67	my $original = $word;
160
161	51					68	$word =~ s/sch/\N{U+0006}/g; # \N{U+0006} ACK
162	51					65	$word =~ s/ei/\N{U+0007}/g; # \N{U+0007} BEL
163	51					77	$word =~ s/ie/\N{U+0008}/g; # \N{U+0008} BS
164
165	51					357	$word =~ s/(.)\1/$1*/g;
166
167	51					1122	my @graphemes = $word =~ m/\X/g;
168	51					79	my $length = scalar @graphemes;
169
170	51					62	my $suffix_length = 0;
171
172	51					96	while($length > 3){
173	32	100	100			245	if( $length > 5 && ($word =~ s/(e[mr])$// \|\| $word =~ s/(nd)$//) ) {
		100	100
		100	100
			100
174	3					6	$suffix_length += 2;
175	3					6	$length -= 2;
176							}
177							elsif( (!($ucfirst) \|\| $case_insensitive) && $word =~ s/t$//) {
178	5					6	$suffix_length++;
179	5					13	$length--;
180							}
181							elsif( $word =~ s/([esn])$//) {
182	5					9	$suffix_length++;
183	5					10	$length--;
184							}
185	19					36	else{ last; }
186							}
187
188	51					86	$word =~ s/(.)\*/$1$1/g;
189
190	51					64	$word =~ s/\N{U+0006}/sch/g; # \N{U+0006} ACK
191	51					64	$word =~ s/\N{U+0007}/ei/g; # \N{U+0007} BEL
192	51					57	$word =~ s/\N{U+0008}/ie/g; # \N{U+0008} BS
193
194	51					85	my $suffix = '';
195
196	51	100				79	if( $suffix_length ) {
197	13					31	$suffix = substr($original, - $suffix_length);
198							}
199
200	51					972	return ($prefix, $word, $suffix);
201							}
202
203							1;
204							__END__