File Coverage

blib/lib/String/ToIdentifier/EN.pm

Criterion	Covered	Total	%
statement	76	76	100.0
branch	27	28	96.4
condition			n/a
subroutine	13	13	100.0
pod	2	2	100.0
total	118	119	99.1

line	stmt	bran	sub	pod	time	code
1						package String::ToIdentifier::EN;
2						BEGIN {
3	3		3		144419	$String::ToIdentifier::EN::AUTHORITY = 'cpan:RKITOVER';
4						}
5						{
6						$String::ToIdentifier::EN::VERSION = '0.11';
7						}
8
9	3		3		84	use 5.008001;
	3				9
	3				99
10	3		3		14	use strict;
	3				8
	3				98
11	3		3		13	use warnings;
	3				6
	3				76
12	3		3		2576	use Text::Unidecode 'unidecode';
	3				8542
	3				218
13	3		3		2900	use Lingua::EN::Inflect::Phrase 'to_PL';
	3				392589
	3				196
14	3		3		4446	use Unicode::UCD 'charinfo';
	3				104620
	3				346
15	3		3		31	use namespace::clean;
	3				8
	3				25
16	3		3		529	use Exporter 'import';
	3				6
	3				2630
17
18						=head1 NAME
19
20						String::ToIdentifier::EN - Convert Strings to English Program Identifiers
21
22						=head1 SYNOPSIS
23
24						use utf8;
25						use String::ToIdentifier::EN 'to_identifier';
26
27						to_identifier 'foo-bar'; # fooDashBar
28						to_identifier 'foo-bar', '_'; # foo_dash_bar
29						to_identifier 'foo.bar', '_'; # foo_dot_bar
30						to_identifier "foo\x{4EB0}bar"; # fooJingBar
31						to_identifier "foo\x00bar"; # fooNullCharBar
32						to_identifier "foo\x00\x00bar"; # foo2NullCharsBar
33						to_identifier "foo\x00\x00bar", '_'; # foo_2_null_chars_bar
34
35						{
36						no utf8;
37						to_identifier "foo\xFF\xFFbar.baz"; # foo_2_0xFF_BarDotBaz
38						to_identifier "foo\xFF\xFFbar.baz", '_'; # foo_2_0xFF_bar_dot_baz
39						}
40
41						=head1 DESCRIPTION
42
43						This module provides a utility method, L for converting an
44						arbitrary string into a readable representation using the ASCII subset of C<\w>
45						for use as an identifier in a computer program. The intent is to make unique
46						identifier names from which the content of the original string can be easily
47						inferred by a human just by reading the identifier.
48
49						If you need the full set of C<\w> including Unicode, see
50						the subclass L.
51
52						Currently, this process is one way only, and will likely remain this way.
53
54						The default is to create camelCase identifiers, or you may pass in a separator
55						char of your choice such as C<_>.
56
57						Binary char groups will be separated by C<_> even in camelCase identifiers to
58						make them easier to read, e.g.: C.
59
60						=head1 EXPORT
61
62						Optionally exports the L function.
63
64						=cut
65
66						our @EXPORT_OK = qw/to_identifier/;
67
68						=head1 SUBROUTINES
69
70						=cut
71
72						our %ASCII_MAP = (
73						0x00 => ['null'],
74						0x01 => ['start', 'of', 'heading'],
75						0x02 => ['start', 'of', 'text'],
76						0x03 => ['end', 'of', 'text'],
77						0x04 => ['end', 'of', 'transmission'],
78						0x05 => ['enquiry', 'char'],
79						0x06 => ['ack'],
80						0x07 => ['bell', 'char'],
81						0x08 => ['backspace'],
82						0x09 => ['tab', 'char'],
83						0x0A => ['newline'],
84						0x0B => ['vertical', 'tab'],
85						0x0C => ['form', 'feed'],
86						0x0D => ['carriage', 'return'],
87						0x0E => ['shift', 'out'],
88						0x0F => ['shift', 'in'],
89						0x10 => ['data', 'link', 'escape'],
90						0x11 => ['device', 'control1'],
91						0x12 => ['device', 'control2'],
92						0x13 => ['device', 'control3'],
93						0x14 => ['device', 'control4'],
94						0x15 => ['negative', 'ack'],
95						0x16 => ['synchronous', 'idle'],
96						0x17 => ['end', 'of', 'transmission', 'block'],
97						0x18 => ['cancel', 'char'],
98						0x19 => ['end', 'of', 'medium'],
99						0x1A => ['substitute', 'char'],
100						0x1B => ['escape', 'char'],
101						0x1C => ['file', 'separator'],
102						0x1D => ['group', 'separator'],
103						0x1E => ['record', 'separator'],
104						0x1F => ['unit', 'separator'],
105						0x20 => ['space', 'char'],
106						0x21 => ['exclamation', 'mark'],
107						0x22 => ['double', 'quote'],
108						0x23 => ['hash', 'mark'],
109						0x24 => ['dollar', 'sign'],
110						0x25 => ['percent', 'sign'],
111						0x26 => ['ampersand'],
112						0x27 => ['single', 'quote'],
113						0x28 => ['left', 'paren'],
114						0x29 => ['right', 'paren'],
115						0x2A => ['asterisk'],
116						0x2B => ['plus', 'sign'],
117						0x2C => ['comma'],
118						0x2D => ['dash'],
119						0x2E => ['dot'],
120						0x2F => ['slash'],
121						0x3A => ['colon'],
122						0x3B => ['semicolon'],
123						0x3C => ['left', 'angle', 'bracket'],
124						0x3D => ['equals', 'sign'],
125						0x3E => ['right', 'angle', 'bracket'],
126						0x3F => ['question', 'mark'],
127						0x40 => ['at', 'sign'],
128						0x5B => ['left', 'bracket'],
129						0x5C => ['backslash'],
130						0x5D => ['right', 'bracket'],
131						0x5E => ['caret'],
132						0x60 => ['backtick'],
133						0x7B => ['left', 'brace'],
134						0x7C => ['pipe', 'char'],
135						0x7D => ['right', 'brace'],
136						0x7E => ['tilde'],
137						0x7F => ['delete', 'char'],
138						);
139
140						# fixup for perl <= 5.8.3
141						$ASCII_MAP{0} = ['null'];
142
143						=head2 to_identifier
144
145						Takes the string to be converted to an identifier, and optionally a separator
146						char such as C<_>. If a separator char is not provided, a camelCase identifier
147						will be returned.
148
149						=cut
150
151						sub to_identifier {
152	404		404	1	214816	return __PACKAGE__->string_to_identifier(@_);
153						}
154
155						# Override some pluralizations Lingua::EN::Inflect::Phrase gets wrong here, if
156						# needed.
157						sub _pluralize_phrase {
158	146		146		220	my ($self, $phrase) = @_;
159
160	146				488	return to_PL($phrase);
161						}
162
163						# for overriding in ::Unicode
164						sub _non_identifier_char {
165	406		406		1543	return qr/[^0-9a-zA-Z_]/;
166						}
167
168						=head1 METHODS
169
170						=head2 string_to_identifier
171
172						The class method version of L, if you want to use the object
173						oriented interface.
174
175						=cut
176
177						sub string_to_identifier {
178	809		809	1	1455	my ($self, $str, $sep_char) = @_;
179
180	809				1653	my $is_utf8 = utf8::is_utf8($str);
181
182	809				2439	my $char_to_match = $self->_non_identifier_char;
183
184	809				1154	my $phrase_at_start = 0;
185
186	809				19097	while ($str =~ /((${char_to_match})\2*)/sg) {
187	445				892	my $to_replace = $1;
188	445				987	my $pos = $-[1];
189
190	445				715	my $count = length $to_replace;
191	445				760	my $char = substr $to_replace, 0, 1;
192
193	445				419	my $replacement_phrase;
194	445				485	my $use_underscore = 0;
195
196	445	100			912	if (ord $char < 128) {
		100
197	412				463	$replacement_phrase = join ' ', @{ $ASCII_MAP{ord $char} };
	412				1480
198						}
199						elsif ($is_utf8) {
200	17				60	my $decoded = lcfirst unidecode $char;
201
202	17				10640	$decoded =~ s/^\s+//;
203	17				60	$decoded =~ s/\s+\z//;
204
205	17				42	(my $decoded_without_spaces = $decoded) =~ s/\s+//g;
206
207	17				69	my $bad_chars =()= $decoded_without_spaces =~ /$char_to_match/sg;
208
209						# If Text::Unidecode gives us non-identifier chars, we use
210						# either it or the UCD charname, whichever has fewer
211						# non-identifier chars, after recursively passing the strings
212						# through ->string_to_identifier.
213	17	100			54	if ($bad_chars) {
214	2				14	my $charname = lc charinfo(ord $char)->{name};
215
216	2				272619	$charname =~ s/^\s+//;
217	2				9	$charname =~ s/\s+\z//;
218
219	2				11	(my $charname_without_spaces = $charname) =~ s/\s+//g;
220
221	2				16	my $charname_bad_chars =()=
222						$charname_without_spaces =~ /$char_to_match/sg;
223
224	2	50			10	$decoded = $charname if $charname_bad_chars < $bad_chars;
225
226	2				28	$decoded =
227						join ' ',
228						map $self->string_to_identifier($_),
229						split /\s+/, $decoded;
230						}
231
232	17				33	$replacement_phrase = $decoded;
233						}
234						else { # binary
235	16				50	$replacement_phrase = sprintf '0x%X', ord $char;
236	16				24	$use_underscore = 1;
237						}
238
239						# For single char replacements, no separation or camelcasing is
240						# necessary.
241	445	100			837	if (length $replacement_phrase > 1) {
242	438	100			928	$phrase_at_start = 1 if $pos == 0;
243
244	438	100			1289	$replacement_phrase = $self->_pluralize_phrase("$count $replacement_phrase")
245						if $count > 1;
246
247						{
248	438	100			847683	my $sep_char = $use_underscore ? '_' : $sep_char;
	438				965
249
250	438	100			600	if ($sep_char) {
251	290				1132	$replacement_phrase =
252						join($sep_char, split /\s+/, $replacement_phrase);
253
254	290	100			794	$replacement_phrase = $sep_char . $replacement_phrase
255						unless $pos == 0;
256
257						# Insert sep_char at the end of replacement text unless
258						# position is at the end of the string.
259	290	100			803	$replacement_phrase .= $sep_char
260						unless $pos + length($to_replace) == length($str);
261						}
262						else {
263	148				735	$replacement_phrase =
264						join '', map "\u$_", split /\s+/, $replacement_phrase;
265						}
266						}
267
268						# titlecase the following text for camelCase identifiers
269	438	100			1146	substr($str, $pos + length($to_replace), 1) =
270						ucfirst substr($str, $pos + length($to_replace), 1)
271						if not $sep_char;
272						}
273						else {
274						# For single char replacements we want to match the case.
275	7	100			39	if (substr($str, $pos, 1) =~ /^\p{Lu}\z/) {
276	2				6	$replacement_phrase = ucfirst $replacement_phrase;
277						}
278						else {
279	5				9	$replacement_phrase = lcfirst $replacement_phrase;
280						}
281						}
282
283	445				2771	substr($str, $pos, length($to_replace)) = $replacement_phrase;
284						}
285
286	809	100			1812	$str = lcfirst $str if $phrase_at_start;
287
288	809				4144	return $str;
289						}
290
291						=head1 SEE ALSO
292
293						L,
294						L,
295						L
296
297						=head1 AUTHOR
298
299						Rafael Kitover, C<< >>
300
301						=head1 BUGS
302
303						Please report any bugs or feature requests to C
304						rt.cpan.org>, or through the web interface at
305						L. I
306						will be notified, and then you'll automatically be notified of progress on your
307						bug as I make changes.
308
309						=head1 REPOSITORY
310
311						L
312
313						=head1 SUPPORT
314
315						More information on this module is available at:
316
317						=over 4
318
319						=item * RT: CPAN's request tracker (report bugs here)
320
321						L
322
323						=item * AnnoCPAN: Annotated CPAN documentation
324
325						L
326
327						=item * CPAN Ratings
328
329						L
330
331						=item * MetaCPAN
332
333						L
334
335						=item * Search CPAN
336
337						L
338
339						=back
340
341						=head1 LICENSE AND COPYRIGHT
342
343						Copyright (c) 2011 Rafael Kitover .
344
345						This program is free software; you can redistribute it and/or modify it
346						under the terms of either: the GNU General Public License as published
347						by the Free Software Foundation; or the Artistic License.
348
349						See http://dev.perl.org/licenses/ for more information.
350
351						=cut
352
353						1; # End of String::ToIdentifier::EN