File Coverage

blib/lib/Perl/ToPerl6/Transformer/BasicTypes/Strings/Interpolation.pm

Criterion	Covered	Total	%
statement	30	120	25.0
branch	0	70	0.0
condition	0	12	0.0
subroutine	13	17	76.4
pod	3	8	37.5
total	46	227	20.2

line	stmt	bran	cond	sub	pod	time	code
1							package Perl::ToPerl6::Transformer::BasicTypes::Strings::Interpolation;
2
3	17			17		11291	use 5.006001;
	17					48
4	17			17		74	use strict;
	17					24
	17					356
5	17			17		68	use warnings;
	17					24
	17					2671
6	17			17		62	use Readonly;
	17					29
	17					801
7	17			17		78	use List::Util qw( max );
	17					22
	17					1021
8	17			17		21610301	use Text::Balanced qw( extract_variable );
	17					3541418
	17					1781
9
10	17			17		166	use Perl::ToPerl6::Utils qw{ :characters :severities };
	17					28
	17					1310
11	17			17		5675	use Perl::ToPerl6::Utils::PPI qw{ set_string };
	17					36
	17					986
12
13	17			17		100	use base 'Perl::ToPerl6::Transformer';
	17					30
	17					32454
14
15							our $VERSION = '0.03';
16
17							#-----------------------------------------------------------------------------
18
19							Readonly::Scalar my $DESC => q{Rewrite interpolated strings};
20							Readonly::Scalar my $EXPL => q{Rewrite interpolated strings};
21
22							#-----------------------------------------------------------------------------
23
24	40			40	0	1478	sub supported_parameters { return () }
25	28			28	1	125	sub default_severity { return $SEVERITY_HIGHEST }
26	25			25	1	88	sub default_themes { return qw(core bugs) }
27							sub applies_to {
28	4			4	1	13	return 'PPI::Token::Quote::Interpolate',
29							'PPI::Token::Quote::Double'
30							}
31
32							#-----------------------------------------------------------------------------
33							#
34							# A fairly comprehensive list of edge case handling:
35							#
36							# "\o" --> "o". It's illegal in Perl5, but may be encountered.
37
38							# "\o" is illegal in Perl5, so it shouldn't be encountered.
39							# "\o1" is illegal in Perl5, so it shouldn't be encountered.
40							# "\0" is legal though.
41							# "\c" is illegal in Perl5, so it shouldn't be encountered.
42							#
43							#-----------------------------------------------------------------------------
44
45							# Tokenizer II: Electric Boogaloo.
46							#
47							# Since it'll eventually be needed...
48
49							sub tokenize {
50	0			0	0		my ( $str ) = @_;
51	0						my @c = split //, $str;
52	0						my @token;
53
54	0						for ( my $i = 0; $i < @c; $i++ ) {
55	0						my ( $v, $la1 ) = @c[ $i, $i + 1 ];
56
57	0	0					if ( $v eq '\\' ) {
		0
		0
		0
		0
		0
		0
		0
		0
		0
58	0	0					if ( $la1 eq 'c' ) {
		0
		0
		0
		0
		0
		0
		0
59							}
60							elsif ( $la1 eq 'l' ) {
61							}
62							elsif ( $la1 eq 'u' ) {
63							}
64							elsif ( $la1 eq 'E' ) {
65							}
66							elsif ( $la1 eq 'F' ) {
67							}
68							elsif ( $la1 eq 'L' ) {
69							}
70							elsif ( $la1 eq 'Q' ) {
71							}
72							elsif ( $la1 eq 'U' ) {
73							}
74							else {
75							}
76							}
77							elsif ( $v eq '$' ) {
78							}
79							elsif ( $v eq '@' ) {
80							}
81							elsif ( $v eq '%' ) {
82							}
83							elsif ( $v eq '{' ) {
84							}
85							elsif ( $v eq '}' ) {
86							}
87							elsif ( $v eq '(' ) {
88							}
89							elsif ( $v eq ')' ) {
90							}
91							elsif ( $v eq '<' ) {
92							}
93							elsif ( $v eq '>' ) {
94							}
95							else {
96							}
97							}
98
99	0						return @token;
100							}
101
102							#-----------------------------------------------------------------------------
103							#
104							# Some more cases get folded away here (hee.)
105							# \U foo \L\E bar \E - 'bar' will get altered here.
106							# \U foo \L$x\E bar \E - 'bar' will not get altered here,
107							# even if $x is empty.
108							# \U foo \Lxxx\E bar \E - 'bar' will not get altered here,
109							#
110							# So, \L..\E affects the rest of the string only if the contents
111							# are empty. So it's effectively as if it never was there.
112							# Get rid of it.
113							#
114							sub casefold {
115	0			0	0		my ($self, $residue) = @_;
116
117	0						my @tokens;
118	0						my @split = grep { $_ ne '' } split /( \\[luEFLQU] )/x, $residue;
	0
119	0						for ( my $i = 0; $i < @split; $i++ ) {
120	0						my ($v, $la1) = @split[$i,$i+1];
121	0	0	0				if ( $v =~ m< ^ \\[FLU] $ >x and
			0
122							$la1 and $la1 eq '\\E' ) {
123	0						$i+=2;
124							}
125							else {
126	0						push @tokens, $v;
127							}
128							}
129	0						return @tokens;
130							}
131
132							sub tokenize_variables {
133	0			0	0		my ($self, $elem, $string) = @_;
134	0						my $full_string = $string;
135
136	0						my @tokens;
137	0						my $iter = 100;
138	0						while ( $string ) {
139	0	0					unless ( --$iter ) {
140	0						my $line_number = $elem->line_number;
141	0						die "Congratulations, you've broken string interpolation. Please report this message, along with the test file you were using to the author: <<$full_string>> on line $line_number\n";
142							}
143	0						my $residue;
144
145							# '${foo}', '@{foo}' is an interpolated value.
146							#
147	0	0	0				if ( $string =~ s< ^ ( [\$\@] \{ [^}]+ \} ) ><>x ) {
		0
		0
		0
148	0						push @tokens, $1;
149							}
150
151							# '\c' is a token on its own.
152							# '$\', '@\', '$ ', '@ ' is also its own token. Feels buggy though.
153							#
154							elsif ( $string =~ s< ^ ( \\ c . ) ><>x or
155							$string =~ s< ^ ( [\$\@] (?: \\ \| \s ) ) ><>x ) {
156	0	0					if ( @tokens ) {
157	0						$tokens[-1] .= $1;
158							}
159							else {
160	0						push @tokens, $1;
161							}
162							}
163
164							# Anything that does not start with '$' or '@' is its own token,
165							# at least up until the next '$' or '@' encountered.
166							#
167							elsif ( $string =~ s< ^ ( [^\$\@]+ ) ><>x ) {
168	0						$residue .= $1;
169
170	0		0				while ( $residue and $residue =~ m< \\ $ >x ) {
171	0	0					if ( $string =~ s< ^ ( \$\@ ) ><>x ) {
172	0						$residue .= $1;
173							}
174	0						$string =~ s< (.) ><>x;
175	0	0					$residue .= $1 if $1;
176	0	0					if ( $string =~ s< ^ ( [^\$\@]+ ) ><>x ) {
177	0						$residue .= $1;
178							}
179							else {
180	0						last;
181							}
182							}
183	0						push @tokens, $self->casefold($residue);
184							}
185
186							# Anything else starting with a '$' or '@' is fair game.
187							#
188							elsif ( $string =~ m< ^ [\$\@] >x ) {
189	0						my ( $var_name, $remainder, $prefix ) =
190							extract_variable( $string );
191	0	0					if ( $var_name ) {
192	0						push @tokens, $var_name;
193	0						$string = $remainder;
194							}
195							#
196							# XXX I"m betting that extract_variable() doesn't quite catch $] etc.
197							#
198							else {
199	0						$string =~ s< ^ ( [\$\@] [^\$\@]* ) ><>x;
200	0	0					if ( @tokens ) {
201	0						$tokens[-1] .= $1;
202							}
203							else {
204	0						push @tokens, $1;
205							}
206							}
207							}
208							else {
209	0						warn "XXX failed\n";
210							}
211							}
212
213	0						return grep { $_ ne '' } @tokens;
	0
214							}
215
216							sub transform {
217	0			0	0		my ($self, $elem, $doc) = @_;
218
219	0						my $old_string = $elem->string;
220	0						my $new_string;
221
222	0	0					if ( index( $old_string, '@{[' ) >= 0 ) {
223	0						warn "Interpolating perl code.";
224	0						return;
225							}
226
227							# Save the delimiters for later. Since they surrounded the original Perl5
228							# string, we can be certain that if we use these for the Perl6 equivalent
229							# they'll be valid.
230							# Yes, even in the case of:
231							#
232							# "\lfoo" -> "{lcfirst("f")}oo".
233							#
234							# Perl6 is smart enough to know which segment of the braid it's on, and
235							# interprets the {..} boundary as a new Perl6 block.
236							#
237	0						my ( $start_delimiter ) =
238							$elem->content =~ m{ ^ ( qq[ ]. \| qq. \| q[ ]. \| q. \| . ) }x;
239	0						my $end_delimiter = substr( $elem->content, -1, 1 );
240
241							# \l or \u followed directly by any \l or \u modifier simply ignores
242							# the remaining \l or \u modifiers.
243							#
244	0						$old_string =~ s{ (\\[lu]) (\\[lu])+ }{$1}gx;
245
246							# \F, \L or \U followed directly by any \F, \L or \U modifier is a
247							# syntax error in Perl5, and can be reduced to a single \F, \L or \U.
248							#
249	0						$old_string =~ s{ (\\[FLU]) (\\[FLU])+ }{$1}gx;
250
251							# \Q followed by anything is still a legal sequence.
252
253							# \t is unchanged in Perl6.
254							# \n is unchanged in Perl6.
255							# \r is unchanged in Perl6.
256							# \f is unchanged in Perl6.
257							# \b is unchanged in Perl6.
258							# \a is unchanged in Perl6.
259							# \e is unchanged in Perl6.
260
261							# \v is deprecated
262							#
263	0						$old_string =~ s{ \\v }{v}mgx;
264
265							# \x{263a} now looks like \x[263a].
266							#
267	0						$old_string =~ s{ \\x \{ ([0-9a-fA-F]+) \} }{\\x[$1]}mgx;
268
269							# \x12 is unaltered.
270							# \x1L is unaltered.
271							#
272							# '..\x' is illegal in Perl6.
273							#
274	0						$old_string =~ s{ \\x $ }{}mx;
275
276							# \N{U+1234} is now \x[1234].
277							# Variants with whitespace are illegal in Perl5, so don't worry about 'em
278							#
279	0						$old_string =~ s{ \\N \{ U \+ ([0-9a-fA-F]+) \} }{\\x[$1]}mgx;
280
281							# \N{LATIN CAPITAL LETTER X} is now \c[LATIN CAPITAL LETTER X]
282							#
283	0						$old_string =~ s{ \\N \{ ([^\}]+) \} }{\\c[$1]}mgx;
284
285							# \o{2637} now looks like \o[2637].
286							#
287							# \o12 is unaltered.
288							#
289	0						$old_string =~ s{ \\o \{ ([0-7]*) \) }{\\o[$1]}mgx;
290
291							# \0123 is now \o[123].
292							#
293	0						$old_string =~ s{ \\0 \{ ([0-7]*) \) }{\\o[$1]}mgx;
294
295							# \oL is now illegal, and in perl5 it generated nothing.
296							#
297							# "...\o" is a syntax error in both languages, so don't worry about it.
298							#
299	0						$old_string =~ s{ \\o \{ ([^\}]*) \} }{\\o[$1]}mgx;
300
301							# \c. is unchanged. Or at least I'll treat it as such.
302
303							# At this point, you'll notice that practically every '{' and '}'
304							# character is out of our target string, with trivial exceptions like
305							# 'c{', which is illegal anyway.
306							#
307							# This is important for two main reasons. The first is that anything inside
308							# {} in Perl6 is considered valid Perl6 code, which is also why a trick
309							# we use later works.
310							#
311							# So, unless {} are part of a variable that can be interpolated, we have
312							# to escape it. And we can't do that if there are constructs like \x{123}
313							# hanging around in the string, because those would get messed up.
314							#
315
316	0						my @tokens = $self->tokenize_variables($elem,$old_string);
317
318							# Now on to rewriting \l, \u, \E, \F, \L, \Q, \U in Perl6.
319							#
320							# \F, \L, \Q and \U are "sort of" nested.
321							#
322							# You can see this by running C<print "Start \L lOWER \U Upper Me \E mE \E">
323							# > Start lower UPPER ME mE
324							#
325							# Note how 'lOWER' is case-flattend, but after the \U..E, 'mE' isn't?
326							#
327							# So, rather than having to retain case settings, we can simply stop the
328							# lc(..) block after the first...
329							#
330	0						my $new_content;
331	0						for ( my $i = 0; $i < @tokens; $i++ ) {
332	0						my ( $v, $la1 ) = @tokens[$i,$i+1];
333
334	0	0					if ( $v =~ m< ^ ( \$ \| \@ ) >x ) {
335	0	0					if ( $v =~ s< ^ \$ \@ ><\\\$\@>x ) {
		0
336							}
337							elsif ( $v =~ s< ^ ( \$ \| \@ ) \{ ([^\}]+) \} ><{$1$2}>sx ) {
338							}
339							else {
340	0						$v =~ s< [-][\>] ><.>gx;
341	0						$v =~ s< \{ (\w+) (\s*) \} >< '{' .
	0
342							$start_delimiter . $1 .
343							$end_delimiter . $2 .
344							'}'>segx;
345							}
346
347	0						$v =~ s< ^ ( [(<>)] ) ><\\$1>sgx;
348	0						$v =~ s< ( [^\\] ) ( [(<>)] ) ><$1\\$2>sgx;
349							}
350							else {
351							# < > is now a pointy block, { } is now a code block, ( ) is also
352							# used.
353							#
354	0						$v =~ s< ^ ( [{(<>)}] ) ><\\$1>sgx;
355	0						$v =~ s< ( [^\\] ) ( [{(<>)}] ) ><$1\\$2>sgx;
356							}
357	0						$new_content .= $v;
358							}
359
360							# if ( $v eq '\\l' ) {
361							# }
362							# elsif ( $v eq '\\u' ) {
363							# }
364							#
365							# elsif ( $v eq '\\F' or $v eq '\\L' ) {
366							# $collected .= '{' if @manip == 0;
367							# if ( @manip == 0 ) {
368							# $collected .= 'lc(' . $start_delimiter;
369							# }
370							# else {
371							# $collected .= $end_delimiter . ')~lc(' . $start_delimiter;
372							# }
373							# push @manip, $v;
374							# }
375							# elsif ( $v eq '\\Q' ) {
376							# $collected .= '{' if @manip == 0;
377							# if ( @manip == 0 ) {
378							# $collected .= 'quotemeta(' . $start_delimiter;
379							# }
380							# else {
381							# $collected .= $end_delimiter . ')~quotemeta(' . $start_delimiter;
382							# }
383							# push @manip, $v;
384							# }
385							# elsif ( $v eq '\\U' ) {
386							# $collected .= '{' if @manip == 0;
387							# if ( @manip == 0 ) {
388							# $collected .= 'tc(' . $start_delimiter;
389							# }
390							# else {
391							# $collected .= $end_delimiter . ')~tc(' . $start_delimiter;
392							# }
393							# push @manip, $v;
394							# }
395							# elsif ( $v eq '\\E' ) {
396							# pop @manip;
397							# if ( @manip == 0 ) {
398							# $collected .= $end_delimiter . ')}';
399							# }
400							# else {
401							# $collected .= $end_delimiter . ')';
402							# }
403							# }
404							# elsif ( $v =~ / ^ ( \$ \| \@ ) /x ) {
405							# $collected .= $v;
406							# }
407
408	0						eval {
409	0						set_string($elem,$new_content);
410							};
411	0	0					if ( $@ ) {
412	0						die "set_string broke! Please report this: ".Dump($elem);
413							}
414
415	0						return $self->transformation( $DESC, $EXPL, $elem );
416							}
417
418							1;
419
420							#-----------------------------------------------------------------------------
421
422							__END__
423
424							=pod
425
426							=head1 NAME
427
428							Perl::ToPerl6::Transformer::BasicTypes::Strings::Interpolation - Format C<${x}> correctly
429
430
431							=head1 AFFILIATION
432
433							This Transformer is part of the core L<Perl::ToPerl6\|Perl::ToPerl6>
434							distribution.
435
436
437							=head1 DESCRIPTION
438
439							In Perl6, contents inside {} are now executable code. That means that inside interpolated strings, C<"${x}"> will be parsed as C<"${x()}"> and throw an exception if C<x()> is not defined. As such, this transforms C<"${x}"> into C<"{$x}">:
440
441							"The $x bit" --> "The $x bit"
442							"The $x-30 bit" --> "The $x\-30 bit"
443							"\N{FOO}" --> "\c{FOO}"
444							"The \l$x bit" --> "The {lc $x} bit"
445							"The \v bit" --> "The bit"
446							"The ${x}rd bit" --> "The {$x}rd bit"
447							"The \${x}rd bit" --> "The \$\{x\}rd bit"
448
449							Transforms only interpolated strings outside of comments, heredocs and POD.
450
451							=head1 CONFIGURATION
452
453							This Transformer is not configurable except for the standard options.
454
455							=head1 AUTHOR
456
457							Jeffrey Goff <drforr@pobox.com>
458
459							=head1 COPYRIGHT
460
461							Copyright (c) 2015 Jeffrey Goff
462
463							This program is free software; you can redistribute it and/or modify
464							it under the same terms as Perl itself.
465
466							=cut
467
468							##############################################################################
469							# Local Variables:
470							# mode: cperl
471							# cperl-indent-level: 4
472							# fill-column: 78
473							# indent-tabs-mode: nil
474							# c-indentation-style: bsd
475							# End:
476							# ex: set ts=8 sts=4 sw=4 tw=78 ft=perl expandtab shiftround :