File Coverage

blib/lib/String/ToIdentifier/EN.pm

Criterion	Covered	Total	%
statement	74	74	100.0
branch	27	28	96.4
condition			n/a
subroutine	12	12	100.0
pod	2	2	100.0
total	115	116	99.1

line	stmt	bran	sub	pod	time	code
1						package String::ToIdentifier::EN;
2						our $AUTHORITY = 'cpan:AVAR';
3						$String::ToIdentifier::EN::VERSION = '0.12';
4	3		3		205361	use 5.008001;
	3				18
5	3		3		12	use strict;
	3				4
	3				55
6	3		3		12	use warnings;
	3				5
	3				62
7	3		3		1193	use Text::Unidecode 'unidecode';
	3				5938
	3				140
8	3		3		1245	use Lingua::EN::Inflect::Phrase 'to_PL';
	3				247026
	3				169
9	3		3		2327	use Unicode::UCD 'charinfo';
	3				69486
	3				210
10	3		3		22	use namespace::clean;
	3				6
	3				23
11	3		3		643	use Exporter 'import';
	3				5
	3				2197
12
13						=head1 NAME
14
15						String::ToIdentifier::EN - Convert Strings to English Program Identifiers
16
17						=head1 SYNOPSIS
18
19						use utf8;
20						use String::ToIdentifier::EN 'to_identifier';
21
22						to_identifier 'foo-bar'; # fooDashBar
23						to_identifier 'foo-bar', '_'; # foo_dash_bar
24						to_identifier 'foo.bar', '_'; # foo_dot_bar
25						to_identifier "foo\x{4EB0}bar"; # fooJingBar
26						to_identifier "foo\x00bar"; # fooNullCharBar
27						to_identifier "foo\x00\x00bar"; # foo2NullCharsBar
28						to_identifier "foo\x00\x00bar", '_'; # foo_2_null_chars_bar
29
30						{
31						no utf8;
32						to_identifier "foo\xFF\xFFbar.baz"; # foo_2_0xFF_BarDotBaz
33						to_identifier "foo\xFF\xFFbar.baz", '_'; # foo_2_0xFF_bar_dot_baz
34						}
35
36						=head1 DESCRIPTION
37
38						This module provides a utility method, L for converting an
39						arbitrary string into a readable representation using the ASCII subset of C<\w>
40						for use as an identifier in a computer program. The intent is to make unique
41						identifier names from which the content of the original string can be easily
42						inferred by a human just by reading the identifier.
43
44						If you need the full set of C<\w> including Unicode, see
45						the subclass L.
46
47						Currently, this process is one way only, and will likely remain this way.
48
49						The default is to create camelCase identifiers, or you may pass in a separator
50						char of your choice such as C<_>.
51
52						Binary char groups will be separated by C<_> even in camelCase identifiers to
53						make them easier to read, e.g.: C.
54
55						=head1 EXPORT
56
57						Optionally exports the L function.
58
59						=cut
60
61						our @EXPORT_OK = qw/to_identifier/;
62
63						=head1 SUBROUTINES
64
65						=cut
66
67						our %ASCII_MAP = (
68						0x00 => ['null'],
69						0x01 => ['start', 'of', 'heading'],
70						0x02 => ['start', 'of', 'text'],
71						0x03 => ['end', 'of', 'text'],
72						0x04 => ['end', 'of', 'transmission'],
73						0x05 => ['enquiry', 'char'],
74						0x06 => ['ack'],
75						0x07 => ['bell', 'char'],
76						0x08 => ['backspace'],
77						0x09 => ['tab', 'char'],
78						0x0A => ['newline'],
79						0x0B => ['vertical', 'tab'],
80						0x0C => ['form', 'feed'],
81						0x0D => ['carriage', 'return'],
82						0x0E => ['shift', 'out'],
83						0x0F => ['shift', 'in'],
84						0x10 => ['data', 'link', 'escape'],
85						0x11 => ['device', 'control1'],
86						0x12 => ['device', 'control2'],
87						0x13 => ['device', 'control3'],
88						0x14 => ['device', 'control4'],
89						0x15 => ['negative', 'ack'],
90						0x16 => ['synchronous', 'idle'],
91						0x17 => ['end', 'of', 'transmission', 'block'],
92						0x18 => ['cancel', 'char'],
93						0x19 => ['end', 'of', 'medium'],
94						0x1A => ['substitute', 'char'],
95						0x1B => ['escape', 'char'],
96						0x1C => ['file', 'separator'],
97						0x1D => ['group', 'separator'],
98						0x1E => ['record', 'separator'],
99						0x1F => ['unit', 'separator'],
100						0x20 => ['space', 'char'],
101						0x21 => ['exclamation', 'mark'],
102						0x22 => ['double', 'quote'],
103						0x23 => ['hash', 'mark'],
104						0x24 => ['dollar', 'sign'],
105						0x25 => ['percent', 'sign'],
106						0x26 => ['ampersand'],
107						0x27 => ['single', 'quote'],
108						0x28 => ['left', 'paren'],
109						0x29 => ['right', 'paren'],
110						0x2A => ['asterisk'],
111						0x2B => ['plus', 'sign'],
112						0x2C => ['comma'],
113						0x2D => ['dash'],
114						0x2E => ['dot'],
115						0x2F => ['slash'],
116						0x3A => ['colon'],
117						0x3B => ['semicolon'],
118						0x3C => ['left', 'angle', 'bracket'],
119						0x3D => ['equals', 'sign'],
120						0x3E => ['right', 'angle', 'bracket'],
121						0x3F => ['question', 'mark'],
122						0x40 => ['at', 'sign'],
123						0x5B => ['left', 'bracket'],
124						0x5C => ['backslash'],
125						0x5D => ['right', 'bracket'],
126						0x5E => ['caret'],
127						0x60 => ['backtick'],
128						0x7B => ['left', 'brace'],
129						0x7C => ['pipe', 'char'],
130						0x7D => ['right', 'brace'],
131						0x7E => ['tilde'],
132						0x7F => ['delete', 'char'],
133						);
134
135						# fixup for perl <= 5.8.3
136						$ASCII_MAP{0} = ['null'];
137
138						=head2 to_identifier
139
140						Takes the string to be converted to an identifier, and optionally a separator
141						char such as C<_>. If a separator char is not provided, a camelCase identifier
142						will be returned.
143
144						=cut
145
146						sub to_identifier {
147	404		404	1	170216	return __PACKAGE__->string_to_identifier(@_);
148						}
149
150						# Override some pluralizations Lingua::EN::Inflect::Phrase gets wrong here, if
151						# needed.
152						sub _pluralize_phrase {
153	146		146		262	my ($self, $phrase) = @_;
154
155	146				352	return to_PL($phrase);
156						}
157
158						# for overriding in ::Unicode
159						sub _non_identifier_char {
160	406		406		1068	return qr/[^0-9a-zA-Z_]/;
161						}
162
163						=head1 METHODS
164
165						=head2 string_to_identifier
166
167						The class method version of L, if you want to use the object
168						oriented interface.
169
170						=cut
171
172						sub string_to_identifier {
173	809		809	1	1450	my ($self, $str, $sep_char) = @_;
174
175	809				1402	my $is_utf8 = utf8::is_utf8($str);
176
177	809				1406	my $char_to_match = $self->_non_identifier_char;
178
179	809				1129	my $phrase_at_start = 0;
180
181	809				12901	while ($str =~ /((${char_to_match})\2*)/sg) {
182	445				1140	my $to_replace = $1;
183	445				946	my $pos = $-[1];
184
185	445				702	my $count = length $to_replace;
186	445				715	my $char = substr $to_replace, 0, 1;
187
188	445				493	my $replacement_phrase;
189	445				532	my $use_underscore = 0;
190
191	445	100			905	if (ord $char < 128) {
		100
192	412				455	$replacement_phrase = join ' ', @{ $ASCII_MAP{ord $char} };
	412				1136
193						}
194						elsif ($is_utf8) {
195	17				48	my $decoded = lcfirst unidecode $char;
196
197	17				6573	$decoded =~ s/^\s+//;
198	17				54	$decoded =~ s/\s+\z//;
199
200	17				33	(my $decoded_without_spaces = $decoded) =~ s/\s+//g;
201
202	17				74	my $bad_chars =()= $decoded_without_spaces =~ /$char_to_match/sg;
203
204						# If Text::Unidecode gives us non-identifier chars, we use
205						# either it or the UCD charname, whichever has fewer
206						# non-identifier chars, after recursively passing the strings
207						# through ->string_to_identifier.
208	17	100			37	if ($bad_chars) {
209	2				11	my $charname = lc charinfo(ord $char)->{name};
210
211	2				222619	$charname =~ s/^\s+//;
212	2				6	$charname =~ s/\s+\z//;
213
214	2				10	(my $charname_without_spaces = $charname) =~ s/\s+//g;
215
216	2				12	my $charname_bad_chars =()=
217						$charname_without_spaces =~ /$char_to_match/sg;
218
219	2	50			9	$decoded = $charname if $charname_bad_chars < $bad_chars;
220
221	2				22	$decoded =
222						join ' ',
223						map $self->string_to_identifier($_),
224						split /\s+/, $decoded;
225						}
226
227	17				43	$replacement_phrase = $decoded;
228						}
229						else { # binary
230	16				44	$replacement_phrase = sprintf '0x%X', ord $char;
231	16				21	$use_underscore = 1;
232						}
233
234						# For single char replacements, no separation or camelcasing is
235						# necessary.
236	445	100			833	if (length $replacement_phrase > 1) {
237	438	100			710	$phrase_at_start = 1 if $pos == 0;
238
239	438	100			973	$replacement_phrase = $self->_pluralize_phrase("$count $replacement_phrase")
240						if $count > 1;
241
242						{
243	438	100			599146	my $sep_char = $use_underscore ? '_' : $sep_char;
	438				752
244
245	438	100			625	if ($sep_char) {
246	290				1049	$replacement_phrase =
247						join($sep_char, split /\s+/, $replacement_phrase);
248
249	290	100			678	$replacement_phrase = $sep_char . $replacement_phrase
250						unless $pos == 0;
251
252						# Insert sep_char at the end of replacement text unless
253						# position is at the end of the string.
254	290	100			649	$replacement_phrase .= $sep_char
255						unless $pos + length($to_replace) == length($str);
256						}
257						else {
258	148				838	$replacement_phrase =
259						join '', map "\u$_", split /\s+/, $replacement_phrase;
260						}
261						}
262
263						# titlecase the following text for camelCase identifiers
264	438	100			1025	substr($str, $pos + length($to_replace), 1) =
265						ucfirst substr($str, $pos + length($to_replace), 1)
266						if not $sep_char;
267						}
268						else {
269						# For single char replacements we want to match the case.
270	7	100			33	if (substr($str, $pos, 1) =~ /^\p{Lu}\z/) {
271	2				6	$replacement_phrase = ucfirst $replacement_phrase;
272						}
273						else {
274	5				11	$replacement_phrase = lcfirst $replacement_phrase;
275						}
276						}
277
278	445				2097	substr($str, $pos, length($to_replace)) = $replacement_phrase;
279						}
280
281	809	100			1770	$str = lcfirst $str if $phrase_at_start;
282
283	809				3225	return $str;
284						}
285
286						=head1 SEE ALSO
287
288						L,
289						L,
290						L
291
292						=head1 AUTHOR
293
294						Rafael Kitover, C<< >>
295
296						=head1 REPOSITORY
297
298						L
299
300						=head1 LICENSE AND COPYRIGHT
301
302						Copyright (c) 2018 Rafael Kitover .
303
304						This program is free software; you can redistribute it and/or modify it
305						under the terms of either: the GNU General Public License as published
306						by the Free Software Foundation; or the Artistic License.
307
308						See http://dev.perl.org/licenses/ for more information.
309
310						=cut
311
312						1; # End of String::ToIdentifier::EN