File Coverage

blib/lib/Text/Affixes.pm

Criterion	Covered	Total	%
statement	38	38	100.0
branch	16	16	100.0
condition	2	2	100.0
subroutine	6	6	100.0
pod	2	2	100.0
total	64	64	100.0

line	stmt	bran	cond	sub	pod	time	code
1							package Text::Affixes;
2
3	3			3		81809	use 5.006;
	3					12
4	3			3		15	use strict;
	3					6
	3					79
5	3			3		15	use warnings;
	3					10
	3					1820
6
7							require Exporter;
8
9							our @ISA = qw(Exporter);
10
11							our %EXPORT_TAGS = ( 'all' => [ qw(
12
13							) ] );
14
15							our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } );
16
17							our @EXPORT = qw(
18							get_prefixes
19							get_suffixes
20							);
21
22							our $VERSION = '0.09';
23
24							=head1 NAME
25
26							Text::Affixes - Prefixes and suffixes analysis of text
27
28							=head1 SYNOPSIS
29
30							use Text::Affixes;
31							my $text = "Hello, world. Hello, big world.";
32							my $prefixes = get_prefixes($text);
33
34							# $prefixes now holds
35							# {
36							# 3 => {
37							# 'Hel' => 2,
38							# 'wor' => 2,
39							# }
40							# }
41
42							# or
43
44							$prefixes = get_prefixes({min => 1, max => 2},$text);
45
46							# $prefixes now holds
47							# {
48							# 1 => {
49							# 'H' => 2,
50							# 'w' => 2,
51							# 'b' => 1,
52							# },
53							# 2 => {
54							# 'He' => 2,
55							# 'wo' => 2,
56							# 'bi' => 1,
57							# }
58							# }
59
60							# the use for get_suffixes is similar
61
62							=head1 DESCRIPTION
63
64							Provides methods for prefix and suffix analysis of text.
65
66							=head1 METHODS
67
68							=head2 get_prefixes
69
70							Extracts prefixes from text. You can specify the minimum and maximum
71							number of characters of prefixes you want.
72
73							Returns a reference to a hash, where the specified limits are mapped
74							in hashes; each of those hashes maps every prefix in the text into the
75							number of times it was found.
76
77							By default, both minimum and maximum limits are 3. If the minimum
78							limit is greater than the lower one, an empty hash is returned.
79
80							A prefix is considered to be a sequence of word characters (\w) in
81							the beginning of a word (that is, after a word boundary) that does not
82							reach the end of the word ("regular expressionly", a prefix is the $1
83							of /\b(\w+)\w/).
84
85							# extracting prefixes of size 3
86							$prefixes = get_prefixes( $text );
87
88							# extracting prefixes of sizes 2 and 3
89							$prefixes = get_prefixes( {min => 2}, $text );
90
91							# extracting prefixes of sizes 3 and 4
92							$prefixes = get_prefixes( {max => 4}, $text );
93
94							# extracting prefixes of sizes 2, 3 and 4
95							$prefixes = get_prefixes( {min => 2, max=> 4}, $text);
96
97							=cut
98
99							sub get_prefixes {
100	11			11	1	1815	return _get_elements(1,@_);
101							}
102
103							=head2 get_suffixes
104
105							The get_suffixes function is similar to the get_prefixes one. You
106							should read the documentation for that one and than come back to this
107							point.
108
109							A suffix is considered to be a sequence of word characters (\w) in
110							the end of a word (that is, before a word boundary) that does not start
111							at the beginning of the word ("regular expressionly" speaking, a
112							suffix is the $1 of /\w(\w+)\b/).
113
114							# extracting suffixes of size 3
115							$suffixes = get_suffixes( $text );
116
117							# extracting suffixes of sizes 2 and 3
118							$suffixes = get_suffixes( {min => 2}, $text );
119
120							# extracting suffixes of sizes 3 and 4
121							$suffixes = get_suffixes( {max => 4}, $text );
122
123							# extracting suffixes of sizes 2, 3 and 4
124							$suffixes = get_suffixes( {min => 2, max=> 4}, $text);
125
126							=cut
127
128							sub get_suffixes {
129	7			7	1	30	return _get_elements(0,@_);
130							}
131
132							sub _get_elements {
133	18			18		30	my $task = shift;
134
135							=head1 OPTIONS
136
137							Apart from deciding on a minimum and maximum size for prefixes or suffixes, you
138							can also decide on some configuration options.
139
140							=cut
141
142							# configuration
143	18					84	my %conf = ( min => 3,
144							max => 3,
145							exclude_numbers => 1,
146							lowercase => 0,
147							);
148	18	100				61	if (ref $_[0] eq 'HASH') {
149	15					39	%conf = (%conf, %{+shift});
	15					80
150							}
151	18	100				64	return {} if $conf{max} < $conf{min};
152
153							# get the elements
154	17					23	my %elements;
155	17		100			49	my $text = shift \|\| return undef;
156	16	100				39	$conf{min} = 1 if $conf{min} < 1;
157	16					53	for ($conf{min} .. $conf{max}) {
158
159	17	100				163	my $regex = $task ? qr/\b(\w{$_})\w/ : # prefixes
160							qr/\w(\w{$_})\b/ ; # suffixes
161
162	17					132	while ($text =~ /$regex/g) {
163	52					386	$elements{$_}{$1}++;
164							}
165
166							}
167
168							=head2 exclude_numbers
169
170							Set to 0 if you consider numbers as part of words. Default value is 1.
171
172							# this
173							get_suffixes( {min => 1, max => 1, exclude_numbers => 0}, "Hello, but w8" );
174
175							# returns this:
176							{
177							1 => {
178							'o' => 1,
179							't' => 1,
180							'8' => 1
181							}
182							}
183
184							=cut
185
186							# exclude elements containing numbers
187	16	100				45	if ($conf{exclude_numbers}) {
188	14					39	for my $s (keys %elements) {
189	15					20	for (keys %{$elements{$s}}) {
	15					48
190	34	100				120	delete ${$elements{$s}}{$_} if /\d/;
	4					12
191							}
192							}
193							}
194
195							=head2 lowercase
196
197							Set to 1 to extract all prefixes in lowercase mode. Default value is 0.
198
199							ATTENTION: This does not mean that prefixes with uppercased characters won't be
200							extracted. It means they will be extracted after being lowercased.
201
202							# this...
203							get_prefixes( {min => 2, max => 2, lowercase => 1}, "Hello, hello");
204
205							# returns this:
206							{
207							2 => {
208							'he' => 2
209							}
210							}
211
212							=cut
213
214							# elements containing uppercased characters become lowercased ones
215	16	100				47	if ($conf{lowercase}) {
216	4					10	for my $s (keys %elements) {
217	4					6	for (keys %{$elements{$s}}) {
	4					11
218	6	100				23	if (/[[:upper:]]/) {
219	4					11	${$elements{$s}}{lc $_} +=
220	4					5	delete ${$elements{$s}}{$_};
	4					13
221							}
222							}
223							}
224							}
225
226	16					133	return \%elements;
227							}
228
229							1;
230							__END__