line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Text::Affixes; |
2
|
|
|
|
|
|
|
|
3
|
3
|
|
|
3
|
|
81809
|
use 5.006; |
|
3
|
|
|
|
|
12
|
|
4
|
3
|
|
|
3
|
|
15
|
use strict; |
|
3
|
|
|
|
|
6
|
|
|
3
|
|
|
|
|
79
|
|
5
|
3
|
|
|
3
|
|
15
|
use warnings; |
|
3
|
|
|
|
|
10
|
|
|
3
|
|
|
|
|
1820
|
|
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
require Exporter; |
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
our @ISA = qw(Exporter); |
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
our %EXPORT_TAGS = ( 'all' => [ qw( |
12
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
) ] ); |
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } ); |
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
our @EXPORT = qw( |
18
|
|
|
|
|
|
|
get_prefixes |
19
|
|
|
|
|
|
|
get_suffixes |
20
|
|
|
|
|
|
|
); |
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
our $VERSION = '0.09'; |
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
=head1 NAME |
25
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
Text::Affixes - Prefixes and suffixes analysis of text |
27
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
=head1 SYNOPSIS |
29
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
use Text::Affixes; |
31
|
|
|
|
|
|
|
my $text = "Hello, world. Hello, big world."; |
32
|
|
|
|
|
|
|
my $prefixes = get_prefixes($text); |
33
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
# $prefixes now holds |
35
|
|
|
|
|
|
|
# { |
36
|
|
|
|
|
|
|
# 3 => { |
37
|
|
|
|
|
|
|
# 'Hel' => 2, |
38
|
|
|
|
|
|
|
# 'wor' => 2, |
39
|
|
|
|
|
|
|
# } |
40
|
|
|
|
|
|
|
# } |
41
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
# or |
43
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
$prefixes = get_prefixes({min => 1, max => 2},$text); |
45
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
# $prefixes now holds |
47
|
|
|
|
|
|
|
# { |
48
|
|
|
|
|
|
|
# 1 => { |
49
|
|
|
|
|
|
|
# 'H' => 2, |
50
|
|
|
|
|
|
|
# 'w' => 2, |
51
|
|
|
|
|
|
|
# 'b' => 1, |
52
|
|
|
|
|
|
|
# }, |
53
|
|
|
|
|
|
|
# 2 => { |
54
|
|
|
|
|
|
|
# 'He' => 2, |
55
|
|
|
|
|
|
|
# 'wo' => 2, |
56
|
|
|
|
|
|
|
# 'bi' => 1, |
57
|
|
|
|
|
|
|
# } |
58
|
|
|
|
|
|
|
# } |
59
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
# the use for get_suffixes is similar |
61
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
=head1 DESCRIPTION |
63
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
Provides methods for prefix and suffix analysis of text. |
65
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
=head1 METHODS |
67
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
=head2 get_prefixes |
69
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
Extracts prefixes from text. You can specify the minimum and maximum |
71
|
|
|
|
|
|
|
number of characters of prefixes you want. |
72
|
|
|
|
|
|
|
|
73
|
|
|
|
|
|
|
Returns a reference to a hash, where the specified limits are mapped |
74
|
|
|
|
|
|
|
in hashes; each of those hashes maps every prefix in the text into the |
75
|
|
|
|
|
|
|
number of times it was found. |
76
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
By default, both minimum and maximum limits are 3. If the minimum |
78
|
|
|
|
|
|
|
limit is greater than the lower one, an empty hash is returned. |
79
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
A prefix is considered to be a sequence of word characters (\w) in |
81
|
|
|
|
|
|
|
the beginning of a word (that is, after a word boundary) that does not |
82
|
|
|
|
|
|
|
reach the end of the word ("regular expressionly", a prefix is the $1 |
83
|
|
|
|
|
|
|
of /\b(\w+)\w/). |
84
|
|
|
|
|
|
|
|
85
|
|
|
|
|
|
|
# extracting prefixes of size 3 |
86
|
|
|
|
|
|
|
$prefixes = get_prefixes( $text ); |
87
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
# extracting prefixes of sizes 2 and 3 |
89
|
|
|
|
|
|
|
$prefixes = get_prefixes( {min => 2}, $text ); |
90
|
|
|
|
|
|
|
|
91
|
|
|
|
|
|
|
# extracting prefixes of sizes 3 and 4 |
92
|
|
|
|
|
|
|
$prefixes = get_prefixes( {max => 4}, $text ); |
93
|
|
|
|
|
|
|
|
94
|
|
|
|
|
|
|
# extracting prefixes of sizes 2, 3 and 4 |
95
|
|
|
|
|
|
|
$prefixes = get_prefixes( {min => 2, max=> 4}, $text); |
96
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
=cut |
98
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
sub get_prefixes { |
100
|
11
|
|
|
11
|
1
|
1815
|
return _get_elements(1,@_); |
101
|
|
|
|
|
|
|
} |
102
|
|
|
|
|
|
|
|
103
|
|
|
|
|
|
|
=head2 get_suffixes |
104
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
The get_suffixes function is similar to the get_prefixes one. You |
106
|
|
|
|
|
|
|
should read the documentation for that one and than come back to this |
107
|
|
|
|
|
|
|
point. |
108
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
A suffix is considered to be a sequence of word characters (\w) in |
110
|
|
|
|
|
|
|
the end of a word (that is, before a word boundary) that does not start |
111
|
|
|
|
|
|
|
at the beginning of the word ("regular expressionly" speaking, a |
112
|
|
|
|
|
|
|
suffix is the $1 of /\w(\w+)\b/). |
113
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
# extracting suffixes of size 3 |
115
|
|
|
|
|
|
|
$suffixes = get_suffixes( $text ); |
116
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
# extracting suffixes of sizes 2 and 3 |
118
|
|
|
|
|
|
|
$suffixes = get_suffixes( {min => 2}, $text ); |
119
|
|
|
|
|
|
|
|
120
|
|
|
|
|
|
|
# extracting suffixes of sizes 3 and 4 |
121
|
|
|
|
|
|
|
$suffixes = get_suffixes( {max => 4}, $text ); |
122
|
|
|
|
|
|
|
|
123
|
|
|
|
|
|
|
# extracting suffixes of sizes 2, 3 and 4 |
124
|
|
|
|
|
|
|
$suffixes = get_suffixes( {min => 2, max=> 4}, $text); |
125
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
=cut |
127
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
sub get_suffixes { |
129
|
7
|
|
|
7
|
1
|
30
|
return _get_elements(0,@_); |
130
|
|
|
|
|
|
|
} |
131
|
|
|
|
|
|
|
|
132
|
|
|
|
|
|
|
sub _get_elements { |
133
|
18
|
|
|
18
|
|
30
|
my $task = shift; |
134
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
=head1 OPTIONS |
136
|
|
|
|
|
|
|
|
137
|
|
|
|
|
|
|
Apart from deciding on a minimum and maximum size for prefixes or suffixes, you |
138
|
|
|
|
|
|
|
can also decide on some configuration options. |
139
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
=cut |
141
|
|
|
|
|
|
|
|
142
|
|
|
|
|
|
|
# configuration |
143
|
18
|
|
|
|
|
84
|
my %conf = ( min => 3, |
144
|
|
|
|
|
|
|
max => 3, |
145
|
|
|
|
|
|
|
exclude_numbers => 1, |
146
|
|
|
|
|
|
|
lowercase => 0, |
147
|
|
|
|
|
|
|
); |
148
|
18
|
100
|
|
|
|
61
|
if (ref $_[0] eq 'HASH') { |
149
|
15
|
|
|
|
|
39
|
%conf = (%conf, %{+shift}); |
|
15
|
|
|
|
|
80
|
|
150
|
|
|
|
|
|
|
} |
151
|
18
|
100
|
|
|
|
64
|
return {} if $conf{max} < $conf{min}; |
152
|
|
|
|
|
|
|
|
153
|
|
|
|
|
|
|
# get the elements |
154
|
17
|
|
|
|
|
23
|
my %elements; |
155
|
17
|
|
100
|
|
|
49
|
my $text = shift || return undef; |
156
|
16
|
100
|
|
|
|
39
|
$conf{min} = 1 if $conf{min} < 1; |
157
|
16
|
|
|
|
|
53
|
for ($conf{min} .. $conf{max}) { |
158
|
|
|
|
|
|
|
|
159
|
17
|
100
|
|
|
|
163
|
my $regex = $task ? qr/\b(\w{$_})\w/ : # prefixes |
160
|
|
|
|
|
|
|
qr/\w(\w{$_})\b/ ; # suffixes |
161
|
|
|
|
|
|
|
|
162
|
17
|
|
|
|
|
132
|
while ($text =~ /$regex/g) { |
163
|
52
|
|
|
|
|
386
|
$elements{$_}{$1}++; |
164
|
|
|
|
|
|
|
} |
165
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
} |
167
|
|
|
|
|
|
|
|
168
|
|
|
|
|
|
|
=head2 exclude_numbers |
169
|
|
|
|
|
|
|
|
170
|
|
|
|
|
|
|
Set to 0 if you consider numbers as part of words. Default value is 1. |
171
|
|
|
|
|
|
|
|
172
|
|
|
|
|
|
|
# this |
173
|
|
|
|
|
|
|
get_suffixes( {min => 1, max => 1, exclude_numbers => 0}, "Hello, but w8" ); |
174
|
|
|
|
|
|
|
|
175
|
|
|
|
|
|
|
# returns this: |
176
|
|
|
|
|
|
|
{ |
177
|
|
|
|
|
|
|
1 => { |
178
|
|
|
|
|
|
|
'o' => 1, |
179
|
|
|
|
|
|
|
't' => 1, |
180
|
|
|
|
|
|
|
'8' => 1 |
181
|
|
|
|
|
|
|
} |
182
|
|
|
|
|
|
|
} |
183
|
|
|
|
|
|
|
|
184
|
|
|
|
|
|
|
=cut |
185
|
|
|
|
|
|
|
|
186
|
|
|
|
|
|
|
# exclude elements containing numbers |
187
|
16
|
100
|
|
|
|
45
|
if ($conf{exclude_numbers}) { |
188
|
14
|
|
|
|
|
39
|
for my $s (keys %elements) { |
189
|
15
|
|
|
|
|
20
|
for (keys %{$elements{$s}}) { |
|
15
|
|
|
|
|
48
|
|
190
|
34
|
100
|
|
|
|
120
|
delete ${$elements{$s}}{$_} if /\d/; |
|
4
|
|
|
|
|
12
|
|
191
|
|
|
|
|
|
|
} |
192
|
|
|
|
|
|
|
} |
193
|
|
|
|
|
|
|
} |
194
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
=head2 lowercase |
196
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
Set to 1 to extract all prefixes in lowercase mode. Default value is 0. |
198
|
|
|
|
|
|
|
|
199
|
|
|
|
|
|
|
ATTENTION: This does not mean that prefixes with uppercased characters won't be |
200
|
|
|
|
|
|
|
extracted. It means they will be extracted after being lowercased. |
201
|
|
|
|
|
|
|
|
202
|
|
|
|
|
|
|
# this... |
203
|
|
|
|
|
|
|
get_prefixes( {min => 2, max => 2, lowercase => 1}, "Hello, hello"); |
204
|
|
|
|
|
|
|
|
205
|
|
|
|
|
|
|
# returns this: |
206
|
|
|
|
|
|
|
{ |
207
|
|
|
|
|
|
|
2 => { |
208
|
|
|
|
|
|
|
'he' => 2 |
209
|
|
|
|
|
|
|
} |
210
|
|
|
|
|
|
|
} |
211
|
|
|
|
|
|
|
|
212
|
|
|
|
|
|
|
=cut |
213
|
|
|
|
|
|
|
|
214
|
|
|
|
|
|
|
# elements containing uppercased characters become lowercased ones |
215
|
16
|
100
|
|
|
|
47
|
if ($conf{lowercase}) { |
216
|
4
|
|
|
|
|
10
|
for my $s (keys %elements) { |
217
|
4
|
|
|
|
|
6
|
for (keys %{$elements{$s}}) { |
|
4
|
|
|
|
|
11
|
|
218
|
6
|
100
|
|
|
|
23
|
if (/[[:upper:]]/) { |
219
|
4
|
|
|
|
|
11
|
${$elements{$s}}{lc $_} += |
220
|
4
|
|
|
|
|
5
|
delete ${$elements{$s}}{$_}; |
|
4
|
|
|
|
|
13
|
|
221
|
|
|
|
|
|
|
} |
222
|
|
|
|
|
|
|
} |
223
|
|
|
|
|
|
|
} |
224
|
|
|
|
|
|
|
} |
225
|
|
|
|
|
|
|
|
226
|
16
|
|
|
|
|
133
|
return \%elements; |
227
|
|
|
|
|
|
|
} |
228
|
|
|
|
|
|
|
|
229
|
|
|
|
|
|
|
1; |
230
|
|
|
|
|
|
|
__END__ |