File Coverage

blib/lib/Perl/ToPerl6/Transformer/BasicTypes/Strings/Interpolation.pm

Criterion	Covered	Total	%
statement	27	91	29.6
branch	0	68	0.0
condition	0	15	0.0
subroutine	10	17	58.8
pod			n/a
total	37	191	19.3

line	stmt	bran	cond	sub	time	code
1						package Perl::ToPerl6::Transformer::BasicTypes::Strings::Interpolation;
2
3	1			1	842	use 5.006001;
	1				3
4	1			1	4	use strict;
	1				1
	1				19
5	1			1	4	use warnings;
	1				2
	1				25
6	1			1	4	use Readonly;
	1				2
	1				41
7	1			1	5	use List::Util qw( max );
	1				2
	1				73
8	1			1	15438	use Text::Balanced qw( extract_variable );
	1				20245
	1				104
9
10	1			1	10	use Perl::ToPerl6::Utils qw{ :severities };
	1				4
	1				83
11	1			1	149	use Perl::ToPerl6::Utils::PPI qw{ set_string };
	1				2
	1				57
12
13	1			1	10	use base 'Perl::ToPerl6::Transformer';
	1				2
	1				2060
14
15						#-----------------------------------------------------------------------------
16
17						Readonly::Scalar my $DESC => q{Rewrite interpolated strings};
18						Readonly::Scalar my $EXPL => q{Rewrite interpolated strings};
19
20						#-----------------------------------------------------------------------------
21
22	0			0		sub supported_parameters { return () }
23	0			0		sub default_necessity { return $NECESSITY_HIGHEST }
24	0			0		sub default_themes { return qw( core ) }
25						sub applies_to {
26	0			0		return 'PPI::Token::Quote::Interpolate',
27						'PPI::Token::Quote::Double'
28						}
29
30						#-----------------------------------------------------------------------------
31						#
32						# A fairly comprehensive list of edge case handling:
33						#
34						# "\o" --> "o". It's illegal in Perl5, but may be encountered.
35
36						# "\o" is illegal in Perl5, so it shouldn't be encountered.
37						# "\o1" is illegal in Perl5, so it shouldn't be encountered.
38						# "\0" is legal though.
39						# "\c" is illegal in Perl5, so it shouldn't be encountered.
40						#
41						#-----------------------------------------------------------------------------
42
43						# Tokenizer II: Electric Boogaloo.
44						#
45						# Since it'll eventually be needed...
46
47						sub tokenize {
48	0			0		my ( $str ) = @_;
49	0					my @c = split //, $str;
50	0					my @token;
51
52	0					for ( my $i = 0; $i < @c; $i++ ) {
53	0					my ( $v, $la1 ) = @c[ $i, $i + 1 ];
54
55	0	0				if ( $v eq '\\' ) {
		0
		0
		0
		0
		0
		0
		0
		0
		0
56	0	0				if ( $la1 eq 'c' ) {
		0
		0
		0
		0
		0
		0
		0
57						}
58						elsif ( $la1 eq 'l' ) {
59						}
60						elsif ( $la1 eq 'u' ) {
61						}
62						elsif ( $la1 eq 'E' ) {
63						}
64						elsif ( $la1 eq 'F' ) {
65						}
66						elsif ( $la1 eq 'L' ) {
67						}
68						elsif ( $la1 eq 'Q' ) {
69						}
70						elsif ( $la1 eq 'U' ) {
71						}
72						else {
73						}
74						}
75						elsif ( $v eq '$' ) {
76						}
77						elsif ( $v eq '@' ) {
78						}
79						elsif ( $v eq '%' ) {
80						}
81						elsif ( $v eq '{' ) {
82						}
83						elsif ( $v eq '}' ) {
84						}
85						elsif ( $v eq '(' ) {
86						}
87						elsif ( $v eq ')' ) {
88						}
89						elsif ( $v eq '<' ) {
90						}
91						elsif ( $v eq '>' ) {
92						}
93						else {
94						}
95						}
96
97	0					return @token;
98						}
99
100						#-----------------------------------------------------------------------------
101						#
102						# Some more cases get folded away here (hee.)
103						# \U foo \L\E bar \E - 'bar' will get altered here.
104						# \U foo \L$x\E bar \E - 'bar' will not get altered here,
105						# even if $x is empty.
106						# \U foo \Lxxx\E bar \E - 'bar' will not get altered here,
107						#
108						# So, \L..\E affects the rest of the string only if the contents
109						# are empty. So it's effectively as if it never was there.
110						# Get rid of it.
111						#
112						sub casefold {
113	0			0		my ($self, $text) = @_;
114	0					my @split = grep { $_ ne '' } split /( \\[luEFLQU] )/x, $text;
	0
115
116	0					my @token;
117	0					for ( my $i = 0; $i < @split; $i++ ) {
118	0					my ($v, $la1) = @split[$i,$i+1];
119	0	0	0			if ( $v =~ m< ^ \\[FLU] $ >x and
		0	0
		0
120						$la1 and $la1 eq '\\E' ) {
121	0					$i+=2;
122						}
123						elsif ( $v eq '\\Q' ) {
124	0					push @token, { type => 'quotemeta', content => $v };
125						}
126						elsif ( $v =~ m{ \\[luEFLQU] }x ) {
127	0					push @token, { type => 'casefold', content => $v };
128						}
129						else {
130	0					push @token, { type => 'uninterpolated', content => $v };
131						}
132						}
133	0					return @token;
134						}
135
136						# At the end of this process, ideally we should only have these types of tokens:
137						#
138						# Disambiguated variable - '${foo}' (needs separate handling)
139						# Variable - '$foo', '$foo[32]', '$foo{blah}'
140						# Case folding - '\l', '\E'
141						# Quotemeta - '\Q'
142						# Uninterpolated content - Anything that's not one of the above.
143						#
144						sub tokenize_variables {
145	0			0		my ($self, $elem, $string) = @_;
146	0					my $full_string = $string;
147
148	0					my @token;
149
150	0					my $iter = 100;
151	0					while ( $string ) {
152	0	0				unless ( --$iter ) {
153	0					my $line_number = $elem->line_number;
154	0					die "Congratulations, you've broken string interpolation. Please report this message, along with the test file you were using to the author: <<$full_string>> on line $line_number\n";
155						}
156
157						# '${foo}', '@{foo}' is an interpolated value.
158						#
159	0	0				if ( $string =~ s< ^ ( [\$\@] ) \{ ( [^}]+ ) \} ><>x ) {
		0
		0
		0
160	0					push @token, {
161						type => 'disambiguated variable',
162						sigil => $1,
163						content => $2
164						};
165						}
166
167						# extract_variable() doesn't handle most 'special' Perl variables,
168						# so handle them specially.
169						#
170						elsif ( $string =~ s< ^ ( [\$\@] ) ( \s \| [^a-zA-Z] ) }><>x ) {
171	0					my ( $sigil, $content ) = ( $1, $2 );
172	0		0			while ( $string =~ s< ^ ( \[ [^\]]+ \] ) ><>sx or
173						$string =~ s< ^ ( \{ [^\}]+ \} ) ><>sx ) {
174	0					$content .= $1;
175						}
176	0					push @token, {
177						type => 'variable',
178						sigil => $sigil,
179						content => $content
180						};
181						}
182
183						# Anything else starting with a '$' or '@' is fair game.
184						#
185						elsif ( $string =~ m< ^ [\$\@] >x ) {
186	0					my ( $var_name, $remainder, $prefix ) =
187						extract_variable( $string );
188	0	0				if ( $var_name ) {
		0
189	0					push @token, { type => 'variable', content => $var_name };
190	0					$string = $remainder;
191						}
192						#
193						# XXX I"m betting that extract_variable() doesn't catch $] etc.
194						#
195						elsif ( $string =~ s< ^ ( [\$\@] [^\$\@]* ) ><>x ) {
196	0	0				if ( @token ) {
197	0					$token[-1]{content} .= $1;
198						}
199						else {
200	0					push @token, { type => 'variable', content => $1 };
201						}
202						}
203						else {
204	0					die "XXX String interpolation (leading variable) failed on '$string'! Please report this to the author.\n";
205						}
206						}
207
208						# Anything that does not start with '$' or '@' is its own token,
209						# at least up until the next '$' or '@' encountered.
210						#
211						elsif ( $string =~ s< ^ ( [^\$\@]+ ) ><>x ) {
212	0					my $residue = $1;
213
214	0		0			while ( $residue and $residue =~ m< \\ $ >x ) {
215	0	0				if ( $string =~ s< ^ ( \$\@ ) ><>x ) {
216	0					$residue .= $1;
217						}
218	0					$string =~ s< (.) ><>x;
219	0	0				$residue .= $1 if $1;
220	0	0				if ( $string =~ s< ^ ( [^\$\@]+ ) ><>x ) {
221	0					$residue .= $1;
222						}
223						else {
224	0					last;
225						}
226						}
227
228						# Merge the first element of the residue with the last token if
229						# possible.
230						#
231	0					my @result = $self->casefold($residue);
232	0	0	0			if ( @token and $token[-1]{type} eq 'uninterpolated' ) {
233	0					$token[-1]{content} .= shift(@result)->{content};
234						}
235	0	0				push @token, @result if @result;
236						}
237
238						else {
239	0					die "XXX String interpolation failed on '$string'! Please report this to the author.\n";
240						}
241						}
242
243	0					return grep { $_ ne '' } @token;
	0
244						}
245
246						sub transform {
247						my ($self, $elem, $doc) = @_;
248
249						my $old_string = $elem->string;
250						my $new_string;
251
252						if ( index( $old_string, '@{[' ) >= 0 ) {
253						warn "Interpolating perl code.";
254						return;
255						}
256
257						# Save the delimiters for later. Since they surrounded the original Perl5
258						# string, we can be certain that if we use these for the Perl6 equivalent
259						# they'll be valid.
260						# Yes, even in the case of:
261						#
262						# "\lfoo" -> "{lcfirst("f")}oo".
263						#
264						# Perl6 is smart enough to know which segment of the braid it's on, and
265						# interprets the {..} boundary as a new Perl6 block.
266						#
267						my ( $start_delimiter ) =
268						$elem->content =~ m{ ^ ( qq[ ]. \| qq. \| q[ ]. \| q. \| . ) }x;
269						my $end_delimiter = substr( $elem->content, -1, 1 );
270
271						# \l or \u followed directly by any \l or \u modifier simply ignores
272						# the remaining \l or \u modifiers.
273						#
274						$old_string =~ s{ (\\[lu]) (\\[lu])+ }{$1}gx;
275
276						# \F, \L or \U followed directly by any \F, \L or \U modifier is a
277						# syntax error in Perl5, and can be reduced to a single \F, \L or \U.
278						#
279						$old_string =~ s{ (\\[FLU]) (\\[FLU])+ }{$1}gx;
280
281						# \Q followed by anything is still a legal sequence.
282
283						# \t is unchanged in Perl6.
284						# \n is unchanged in Perl6.
285						# \r is unchanged in Perl6.
286						# \f is unchanged in Perl6.
287						# \b is unchanged in Perl6.
288						# \a is unchanged in Perl6.
289						# \e is unchanged in Perl6.
290
291						# \v is deprecated
292						#
293						$old_string =~ s{ \\v }{v}mgx;
294
295						# \x{263a} now looks like \x[263a].
296						#
297						$old_string =~ s{ \\x \{ ([0-9a-fA-F]+) \} }{\\x[$1]}mgx;
298
299						# \x12 is unaltered.
300						# \x1L is unaltered.
301						#
302						# '..\x' is illegal in Perl6.
303						#
304						$old_string =~ s{ \\x $ }{}mx;
305
306						# \N{U+1234} is now \x[1234].
307						# Variants with whitespace are illegal in Perl5, so don't worry about 'em
308						#
309						$old_string =~ s{ \\N \{ U \+ ([0-9a-fA-F]+) \} }{\\x[$1]}mgx;
310
311						# \N{LATIN CAPITAL LETTER X} is now \c[LATIN CAPITAL LETTER X]
312						#
313						$old_string =~ s{ \\N \{ ([^\}]+) \} }{\\c[$1]}mgx;
314
315						# \o{2637} now looks like \o[2637].
316						#
317						# \o12 is unaltered.
318						#
319						$old_string =~ s{ \\o \{ ([0-7]*) \) }{\\o[$1]}mgx;
320
321						# \0123 is now \o[123].
322						#
323						$old_string =~ s{ \\0 \{ ([0-7]*) \) }{\\o[$1]}mgx;
324
325						# \oL is now illegal, and in perl5 it generated nothing.
326						#
327						# "...\o" is a syntax error in both languages, so don't worry about it.
328						#
329						$old_string =~ s{ \\o \{ ([^\}]*) \} }{\\o[$1]}mgx;
330
331						# \c. is unchanged. Or at least I'll treat it as such.
332
333						# At this point, you'll notice that practically every '{' and '}'
334						# character is out of our target string, with trivial exceptions like
335						# 'c{', which is illegal anyway.
336						#
337						# This is important for two main reasons. The first is that anything inside
338						# {} in Perl6 is considered valid Perl6 code, which is also why a trick
339						# we use later works.
340						#
341						# So, unless {} are part of a variable that can be interpolated, we have
342						# to escape it. And we can't do that if there are constructs like \x{123}
343						# hanging around in the string, because those would get messed up.
344						#
345
346						my @token = $self->tokenize_variables($elem,$old_string);
347						#use YAML;warn Dump grep { $_->{type} eq 'a' } @token;
348
349						# Now on to rewriting \l, \u, \E, \F, \L, \Q, \U in Perl6.
350						#
351						# \F, \L, \Q and \U are "sort of" nested.
352						#
353						# You can see this by running C<print "Start \L lOWER \U Upper Me \E mE \E">
354						# > Start lower UPPER ME mE
355						#
356						# Note how 'lOWER' is case-flattend, but after the \U..E, 'mE' isn't?
357						#
358						# So, rather than having to retain case settings, we can simply stop the
359						# lc(..) block after the first...
360						#
361						my $new_content;
362						for ( my $i = 0; $i < @token; $i++ ) {
363						my ( $v, $la1 ) = @token[$i,$i+1];
364
365						# '${a}' is mapped to '{$a}'.
366						# '${$a}' is a varvar in Perl5, needs other techniques in perl6
367						#
368						if ( $v->{type} eq 'disambiguated variable' ) {
369						if ( $v->{content} =~ m{ ^ ( \$ \| \@ ) }x ) {
370						warn "Use of varvar in string, not translating.\n";
371						$v->{content} = $v->{sigil} . '{' . $v->{content} . '}';
372						}
373						else {
374						$v->{content} =~ s< [-][\>] ><.>gx;
375						$v->{content} =~ s< \{ (\w+) (\s*) \} >< '{' .
376						$start_delimiter . $1 .
377						$end_delimiter . $2 .
378						'}'>segx;
379						$v->{content} = '{' . $v->{sigil} . $v->{content} . '}';
380						}
381						}
382
383						# All the other variables, including those with {} and [] indices,
384						# are grouped in this category.
385						#
386						elsif ( $v->{type} eq 'variable' ) {
387						$v->{content} =~ s< [-][\>] ><.>gx;
388						$v->{content} =~ s< \{ (\w+) (\s*) \} >< '{' .
389						$start_delimiter . $1 .
390						$end_delimiter . $2 .
391						'}'>segx;
392
393						$v->{content} =~ s< ^ ( [(<>)] ) ><\\$1>sgx;
394						$v->{content} =~ s< ( [^\\] ) ( [(<>)] ) ><$1\\$2>sgx;
395						}
396
397						# Non-variables are handled down here.
398						#
399						else {
400						# < > is now a pointy block, { } is now a code block, ( ) is also
401						# used.
402						#
403						$v->{content} =~ s< ^ ( [{(<>)}] ) ><\\$1>sgx;
404						$v->{content} =~ s< ( [^\\] ) ( [{(<>)}] ) ><$1\\$2>sgx;
405						}
406						$new_content .= $v->{content};
407						}
408
409						# elsif ( $v eq '\\F' or $v eq '\\L' ) {
410						# $collected .= '{' if @manip == 0;
411						# if ( @manip == 0 ) {
412						# $collected .= 'lc(' . $start_delimiter;
413						# }
414						# else {
415						# $collected .= $end_delimiter . ')~lc(' . $start_delimiter;
416						# }
417						# push @manip, $v;
418						# }
419						# elsif ( $v eq '\\Q' ) {
420						# $collected .= '{' if @manip == 0;
421						# if ( @manip == 0 ) {
422						# $collected .= 'quotemeta(' . $start_delimiter;
423						# }
424						# else {
425						# $collected .= $end_delimiter . ')~quotemeta(' . $start_delimiter;
426						# }
427						# push @manip, $v;
428						# }
429						# elsif ( $v eq '\\U' ) {
430						# $collected .= '{' if @manip == 0;
431						# if ( @manip == 0 ) {
432						# $collected .= 'tc(' . $start_delimiter;
433						# }
434						# else {
435						# $collected .= $end_delimiter . ')~tc(' . $start_delimiter;
436						# }
437						# push @manip, $v;
438						# }
439						# elsif ( $v eq '\\E' ) {
440						# pop @manip;
441						# if ( @manip == 0 ) {
442						# $collected .= $end_delimiter . ')}';
443						# }
444						# else {
445						# $collected .= $end_delimiter . ')';
446						# }
447						# }
448
449						eval {
450						set_string($elem,$new_content);
451						};
452						if ( $@ ) {
453	1			1	565	use YAML;die "set_string broke! Please report this: ".Dump($elem);
	0
	0
454						}
455
456						return $self->transformation( $DESC, $EXPL, $elem );
457						}
458
459						1;
460
461						#-----------------------------------------------------------------------------
462
463						__END__
464
465						=pod
466
467						=head1 NAME
468
469						Perl::ToPerl6::Transformer::BasicTypes::Strings::Interpolation - Format C<${x}> correctly
470
471
472						=head1 AFFILIATION
473
474						This Transformer is part of the core L<Perl::ToPerl6\|Perl::ToPerl6>
475						distribution.
476
477
478						=head1 DESCRIPTION
479
480						In Perl6, contents inside {} are now executable code. That means that inside interpolated strings, C<"${x}"> will be parsed as C<"${x()}"> and throw an exception if C<x()> is not defined. As such, this transforms C<"${x}"> into C<"{$x}">:
481
482						"The $x bit" --> "The $x bit"
483						"The $x-30 bit" --> "The $x\-30 bit"
484						"\N{FOO}" --> "\c[FOO]"
485						"The \l$x bit" --> "The {lc $x} bit"
486						"The \v bit" --> "The bit"
487						"The ${x}rd bit" --> "The {$x}rd bit"
488						"The \${x}rd bit" --> "The \$\{x\}rd bit"
489
490						Many other transforms are performed in this module, see the code for a better
491						idea of how complex this transformation really is.
492
493						Transforms only interpolated strings outside of comments, heredocs and POD.
494
495						=head1 CONFIGURATION
496
497						This Transformer is not configurable except for the standard options.
498
499						=head1 AUTHOR
500
501						Jeffrey Goff <drforr@pobox.com>
502
503						=head1 COPYRIGHT
504
505						Copyright (c) 2015 Jeffrey Goff
506
507						This program is free software; you can redistribute it and/or modify
508						it under the same terms as Perl itself.
509
510						=cut
511
512						##############################################################################
513						# Local Variables:
514						# mode: cperl
515						# cperl-indent-level: 4
516						# fill-column: 78
517						# indent-tabs-mode: nil
518						# c-indentation-style: bsd
519						# End:
520						# ex: set ts=8 sts=4 sw=4 tw=78 ft=perl expandtab shiftround :