File Coverage

blib/lib/Perl/ToPerl6/Transformer/BasicTypes/Strings/Interpolation.pm
Criterion Covered Total %
statement 27 91 29.6
branch 0 68 0.0
condition 0 15 0.0
subroutine 10 17 58.8
pod n/a
total 37 191 19.3


line stmt bran cond sub pod time code
1             package Perl::ToPerl6::Transformer::BasicTypes::Strings::Interpolation;
2              
3 1     1   842 use 5.006001;
  1         3  
4 1     1   4 use strict;
  1         1  
  1         19  
5 1     1   4 use warnings;
  1         2  
  1         25  
6 1     1   4 use Readonly;
  1         2  
  1         41  
7 1     1   5 use List::Util qw( max );
  1         2  
  1         73  
8 1     1   15438 use Text::Balanced qw( extract_variable );
  1         20245  
  1         104  
9              
10 1     1   10 use Perl::ToPerl6::Utils qw{ :severities };
  1         4  
  1         83  
11 1     1   149 use Perl::ToPerl6::Utils::PPI qw{ set_string };
  1         2  
  1         57  
12              
13 1     1   10 use base 'Perl::ToPerl6::Transformer';
  1         2  
  1         2060  
14              
15             #-----------------------------------------------------------------------------
16              
17             Readonly::Scalar my $DESC => q{Rewrite interpolated strings};
18             Readonly::Scalar my $EXPL => q{Rewrite interpolated strings};
19              
20             #-----------------------------------------------------------------------------
21              
22 0     0     sub supported_parameters { return () }
23 0     0     sub default_necessity { return $NECESSITY_HIGHEST }
24 0     0     sub default_themes { return qw( core ) }
25             sub applies_to {
26 0     0     return 'PPI::Token::Quote::Interpolate',
27             'PPI::Token::Quote::Double'
28             }
29              
30             #-----------------------------------------------------------------------------
31             #
32             # A fairly comprehensive list of edge case handling:
33             #
34             # "\o" --> "o". It's illegal in Perl5, but may be encountered.
35              
36             # "\o" is illegal in Perl5, so it shouldn't be encountered.
37             # "\o1" is illegal in Perl5, so it shouldn't be encountered.
38             # "\0" is legal though.
39             # "\c" is illegal in Perl5, so it shouldn't be encountered.
40             #
41             #-----------------------------------------------------------------------------
42              
43             # Tokenizer II: Electric Boogaloo.
44             #
45             # Since it'll eventually be needed...
46              
47             sub tokenize {
48 0     0     my ( $str ) = @_;
49 0           my @c = split //, $str;
50 0           my @token;
51              
52 0           for ( my $i = 0; $i < @c; $i++ ) {
53 0           my ( $v, $la1 ) = @c[ $i, $i + 1 ];
54              
55 0 0         if ( $v eq '\\' ) {
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
56 0 0         if ( $la1 eq 'c' ) {
    0          
    0          
    0          
    0          
    0          
    0          
    0          
57             }
58             elsif ( $la1 eq 'l' ) {
59             }
60             elsif ( $la1 eq 'u' ) {
61             }
62             elsif ( $la1 eq 'E' ) {
63             }
64             elsif ( $la1 eq 'F' ) {
65             }
66             elsif ( $la1 eq 'L' ) {
67             }
68             elsif ( $la1 eq 'Q' ) {
69             }
70             elsif ( $la1 eq 'U' ) {
71             }
72             else {
73             }
74             }
75             elsif ( $v eq '$' ) {
76             }
77             elsif ( $v eq '@' ) {
78             }
79             elsif ( $v eq '%' ) {
80             }
81             elsif ( $v eq '{' ) {
82             }
83             elsif ( $v eq '}' ) {
84             }
85             elsif ( $v eq '(' ) {
86             }
87             elsif ( $v eq ')' ) {
88             }
89             elsif ( $v eq '<' ) {
90             }
91             elsif ( $v eq '>' ) {
92             }
93             else {
94             }
95             }
96              
97 0           return @token;
98             }
99              
100             #-----------------------------------------------------------------------------
101             #
102             # Some more cases get folded away here (hee.)
103             # \U foo \L\E bar \E - 'bar' will get altered here.
104             # \U foo \L$x\E bar \E - 'bar' will *not* get altered here,
105             # even if $x is empty.
106             # \U foo \Lxxx\E bar \E - 'bar' will *not* get altered here,
107             #
108             # So, \L..\E affects the rest of the string only if the contents
109             # are empty. So it's effectively as if it never was there.
110             # Get rid of it.
111             #
112             sub casefold {
113 0     0     my ($self, $text) = @_;
114 0           my @split = grep { $_ ne '' } split /( \\[luEFLQU] )/x, $text;
  0            
115              
116 0           my @token;
117 0           for ( my $i = 0; $i < @split; $i++ ) {
118 0           my ($v, $la1) = @split[$i,$i+1];
119 0 0 0       if ( $v =~ m< ^ \\[FLU] $ >x and
    0 0        
    0          
120             $la1 and $la1 eq '\\E' ) {
121 0           $i+=2;
122             }
123             elsif ( $v eq '\\Q' ) {
124 0           push @token, { type => 'quotemeta', content => $v };
125             }
126             elsif ( $v =~ m{ \\[luEFLQU] }x ) {
127 0           push @token, { type => 'casefold', content => $v };
128             }
129             else {
130 0           push @token, { type => 'uninterpolated', content => $v };
131             }
132             }
133 0           return @token;
134             }
135              
136             # At the end of this process, ideally we should only have these types of tokens:
137             #
138             # Disambiguated variable - '${foo}' (needs separate handling)
139             # Variable - '$foo', '$foo[32]', '$foo{blah}'
140             # Case folding - '\l', '\E'
141             # Quotemeta - '\Q'
142             # Uninterpolated content - Anything that's not one of the above.
143             #
144             sub tokenize_variables {
145 0     0     my ($self, $elem, $string) = @_;
146 0           my $full_string = $string;
147              
148 0           my @token;
149              
150 0           my $iter = 100;
151 0           while ( $string ) {
152 0 0         unless ( --$iter ) {
153 0           my $line_number = $elem->line_number;
154 0           die "Congratulations, you've broken string interpolation. Please report this message, along with the test file you were using to the author: <<$full_string>> on line $line_number\n";
155             }
156              
157             # '${foo}', '@{foo}' is an interpolated value.
158             #
159 0 0         if ( $string =~ s< ^ ( [\$\@] ) \{ ( [^}]+ ) \} ><>x ) {
    0          
    0          
    0          
160 0           push @token, {
161             type => 'disambiguated variable',
162             sigil => $1,
163             content => $2
164             };
165             }
166              
167             # extract_variable() doesn't handle most 'special' Perl variables,
168             # so handle them specially.
169             #
170             elsif ( $string =~ s< ^ ( [\$\@] ) ( \s | [^a-zA-Z] ) }><>x ) {
171 0           my ( $sigil, $content ) = ( $1, $2 );
172 0   0       while ( $string =~ s< ^ ( \[ [^\]]+ \] ) ><>sx or
173             $string =~ s< ^ ( \{ [^\}]+ \} ) ><>sx ) {
174 0           $content .= $1;
175             }
176 0           push @token, {
177             type => 'variable',
178             sigil => $sigil,
179             content => $content
180             };
181             }
182              
183             # Anything else starting with a '$' or '@' is fair game.
184             #
185             elsif ( $string =~ m< ^ [\$\@] >x ) {
186 0           my ( $var_name, $remainder, $prefix ) =
187             extract_variable( $string );
188 0 0         if ( $var_name ) {
    0          
189 0           push @token, { type => 'variable', content => $var_name };
190 0           $string = $remainder;
191             }
192             #
193             # XXX I"m betting that extract_variable() doesn't catch $] etc.
194             #
195             elsif ( $string =~ s< ^ ( [\$\@] [^\$\@]* ) ><>x ) {
196 0 0         if ( @token ) {
197 0           $token[-1]{content} .= $1;
198             }
199             else {
200 0           push @token, { type => 'variable', content => $1 };
201             }
202             }
203             else {
204 0           die "XXX String interpolation (leading variable) failed on '$string'! Please report this to the author.\n";
205             }
206             }
207              
208             # Anything that does *not* start with '$' or '@' is its own token,
209             # at least up until the next '$' or '@' encountered.
210             #
211             elsif ( $string =~ s< ^ ( [^\$\@]+ ) ><>x ) {
212 0           my $residue = $1;
213              
214 0   0       while ( $residue and $residue =~ m< \\ $ >x ) {
215 0 0         if ( $string =~ s< ^ ( \$\@ ) ><>x ) {
216 0           $residue .= $1;
217             }
218 0           $string =~ s< (.) ><>x;
219 0 0         $residue .= $1 if $1;
220 0 0         if ( $string =~ s< ^ ( [^\$\@]+ ) ><>x ) {
221 0           $residue .= $1;
222             }
223             else {
224 0           last;
225             }
226             }
227              
228             # Merge the first element of the residue with the last token if
229             # possible.
230             #
231 0           my @result = $self->casefold($residue);
232 0 0 0       if ( @token and $token[-1]{type} eq 'uninterpolated' ) {
233 0           $token[-1]{content} .= shift(@result)->{content};
234             }
235 0 0         push @token, @result if @result;
236             }
237              
238             else {
239 0           die "XXX String interpolation failed on '$string'! Please report this to the author.\n";
240             }
241             }
242              
243 0           return grep { $_ ne '' } @token;
  0            
244             }
245              
246             sub transform {
247             my ($self, $elem, $doc) = @_;
248              
249             my $old_string = $elem->string;
250             my $new_string;
251              
252             if ( index( $old_string, '@{[' ) >= 0 ) {
253             warn "Interpolating perl code.";
254             return;
255             }
256              
257             # Save the delimiters for later. Since they surrounded the original Perl5
258             # string, we can be certain that if we use these for the Perl6 equivalent
259             # they'll be valid.
260             # Yes, even in the case of:
261             #
262             # "\lfoo" -> "{lcfirst("f")}oo".
263             #
264             # Perl6 is smart enough to know which segment of the braid it's on, and
265             # interprets the {..} boundary as a new Perl6 block.
266             #
267             my ( $start_delimiter ) =
268             $elem->content =~ m{ ^ ( qq[ ]. | qq. | q[ ]. | q. | . ) }x;
269             my $end_delimiter = substr( $elem->content, -1, 1 );
270              
271             # \l or \u followed *directly* by any \l or \u modifier simply ignores
272             # the remaining \l or \u modifiers.
273             #
274             $old_string =~ s{ (\\[lu]) (\\[lu])+ }{$1}gx;
275              
276             # \F, \L or \U followed *directly* by any \F, \L or \U modifier is a
277             # syntax error in Perl5, and can be reduced to a single \F, \L or \U.
278             #
279             $old_string =~ s{ (\\[FLU]) (\\[FLU])+ }{$1}gx;
280              
281             # \Q followed by anything is still a legal sequence.
282              
283             # \t is unchanged in Perl6.
284             # \n is unchanged in Perl6.
285             # \r is unchanged in Perl6.
286             # \f is unchanged in Perl6.
287             # \b is unchanged in Perl6.
288             # \a is unchanged in Perl6.
289             # \e is unchanged in Perl6.
290              
291             # \v is deprecated
292             #
293             $old_string =~ s{ \\v }{v}mgx;
294              
295             # \x{263a} now looks like \x[263a].
296             #
297             $old_string =~ s{ \\x \{ ([0-9a-fA-F]+) \} }{\\x[$1]}mgx;
298              
299             # \x12 is unaltered.
300             # \x1L is unaltered.
301             #
302             # '..\x' is illegal in Perl6.
303             #
304             $old_string =~ s{ \\x $ }{}mx;
305              
306             # \N{U+1234} is now \x[1234].
307             # Variants with whitespace are illegal in Perl5, so don't worry about 'em
308             #
309             $old_string =~ s{ \\N \{ U \+ ([0-9a-fA-F]+) \} }{\\x[$1]}mgx;
310              
311             # \N{LATIN CAPITAL LETTER X} is now \c[LATIN CAPITAL LETTER X]
312             #
313             $old_string =~ s{ \\N \{ ([^\}]+) \} }{\\c[$1]}mgx;
314              
315             # \o{2637} now looks like \o[2637].
316             #
317             # \o12 is unaltered.
318             #
319             $old_string =~ s{ \\o \{ ([0-7]*) \) }{\\o[$1]}mgx;
320              
321             # \0123 is now \o[123].
322             #
323             $old_string =~ s{ \\0 \{ ([0-7]*) \) }{\\o[$1]}mgx;
324              
325             # \oL is now illegal, and in perl5 it generated nothing.
326             #
327             # "...\o" is a syntax error in both languages, so don't worry about it.
328             #
329             $old_string =~ s{ \\o \{ ([^\}]*) \} }{\\o[$1]}mgx;
330              
331             # \c. is unchanged. Or at least I'll treat it as such.
332              
333             # At this point, you'll notice that practically every '{' and '}'
334             # character is out of our target string, with trivial exceptions like
335             # 'c{', which is illegal anyway.
336             #
337             # This is important for two main reasons. The first is that anything inside
338             # {} in Perl6 is considered valid Perl6 code, which is also why a trick
339             # we use later works.
340             #
341             # So, unless {} are part of a variable that can be interpolated, we have
342             # to escape it. And we can't do that if there are constructs like \x{123}
343             # hanging around in the string, because those would get messed up.
344             #
345              
346             my @token = $self->tokenize_variables($elem,$old_string);
347             #use YAML;warn Dump grep { $_->{type} eq 'a' } @token;
348              
349             # Now on to rewriting \l, \u, \E, \F, \L, \Q, \U in Perl6.
350             #
351             # \F, \L, \Q and \U are "sort of" nested.
352             #
353             # You can see this by running C<print "Start \L lOWER \U Upper Me \E mE \E">
354             # > Start lower UPPER ME mE
355             #
356             # Note how 'lOWER' is case-flattend, but after the \U..E, 'mE' isn't?
357             #
358             # So, rather than having to retain case settings, we can simply stop the
359             # lc(..) block after the first...
360             #
361             my $new_content;
362             for ( my $i = 0; $i < @token; $i++ ) {
363             my ( $v, $la1 ) = @token[$i,$i+1];
364              
365             # '${a}' is mapped to '{$a}'.
366             # '${$a}' is a varvar in Perl5, needs other techniques in perl6
367             #
368             if ( $v->{type} eq 'disambiguated variable' ) {
369             if ( $v->{content} =~ m{ ^ ( \$ | \@ ) }x ) {
370             warn "Use of varvar in string, not translating.\n";
371             $v->{content} = $v->{sigil} . '{' . $v->{content} . '}';
372             }
373             else {
374             $v->{content} =~ s< [-][\>] ><.>gx;
375             $v->{content} =~ s< \{ (\w+) (\s*) \} >< '{' .
376             $start_delimiter . $1 .
377             $end_delimiter . $2 .
378             '}'>segx;
379             $v->{content} = '{' . $v->{sigil} . $v->{content} . '}';
380             }
381             }
382              
383             # All the other variables, including those with {} and [] indices,
384             # are grouped in this category.
385             #
386             elsif ( $v->{type} eq 'variable' ) {
387             $v->{content} =~ s< [-][\>] ><.>gx;
388             $v->{content} =~ s< \{ (\w+) (\s*) \} >< '{' .
389             $start_delimiter . $1 .
390             $end_delimiter . $2 .
391             '}'>segx;
392              
393             $v->{content} =~ s< ^ ( [(<>)] ) ><\\$1>sgx;
394             $v->{content} =~ s< ( [^\\] ) ( [(<>)] ) ><$1\\$2>sgx;
395             }
396              
397             # Non-variables are handled down here.
398             #
399             else {
400             # < > is now a pointy block, { } is now a code block, ( ) is also
401             # used.
402             #
403             $v->{content} =~ s< ^ ( [{(<>)}] ) ><\\$1>sgx;
404             $v->{content} =~ s< ( [^\\] ) ( [{(<>)}] ) ><$1\\$2>sgx;
405             }
406             $new_content .= $v->{content};
407             }
408              
409             # elsif ( $v eq '\\F' or $v eq '\\L' ) {
410             # $collected .= '{' if @manip == 0;
411             # if ( @manip == 0 ) {
412             # $collected .= 'lc(' . $start_delimiter;
413             # }
414             # else {
415             # $collected .= $end_delimiter . ')~lc(' . $start_delimiter;
416             # }
417             # push @manip, $v;
418             # }
419             # elsif ( $v eq '\\Q' ) {
420             # $collected .= '{' if @manip == 0;
421             # if ( @manip == 0 ) {
422             # $collected .= 'quotemeta(' . $start_delimiter;
423             # }
424             # else {
425             # $collected .= $end_delimiter . ')~quotemeta(' . $start_delimiter;
426             # }
427             # push @manip, $v;
428             # }
429             # elsif ( $v eq '\\U' ) {
430             # $collected .= '{' if @manip == 0;
431             # if ( @manip == 0 ) {
432             # $collected .= 'tc(' . $start_delimiter;
433             # }
434             # else {
435             # $collected .= $end_delimiter . ')~tc(' . $start_delimiter;
436             # }
437             # push @manip, $v;
438             # }
439             # elsif ( $v eq '\\E' ) {
440             # pop @manip;
441             # if ( @manip == 0 ) {
442             # $collected .= $end_delimiter . ')}';
443             # }
444             # else {
445             # $collected .= $end_delimiter . ')';
446             # }
447             # }
448              
449             eval {
450             set_string($elem,$new_content);
451             };
452             if ( $@ ) {
453 1     1   565 use YAML;die "set_string broke! Please report this: ".Dump($elem);
  0            
  0            
454             }
455              
456             return $self->transformation( $DESC, $EXPL, $elem );
457             }
458              
459             1;
460              
461             #-----------------------------------------------------------------------------
462              
463             __END__
464              
465             =pod
466              
467             =head1 NAME
468              
469             Perl::ToPerl6::Transformer::BasicTypes::Strings::Interpolation - Format C<${x}> correctly
470              
471              
472             =head1 AFFILIATION
473              
474             This Transformer is part of the core L<Perl::ToPerl6|Perl::ToPerl6>
475             distribution.
476              
477              
478             =head1 DESCRIPTION
479              
480             In Perl6, contents inside {} are now executable code. That means that inside interpolated strings, C<"${x}"> will be parsed as C<"${x()}"> and throw an exception if C<x()> is not defined. As such, this transforms C<"${x}"> into C<"{$x}">:
481              
482             "The $x bit" --> "The $x bit"
483             "The $x-30 bit" --> "The $x\-30 bit"
484             "\N{FOO}" --> "\c[FOO]"
485             "The \l$x bit" --> "The {lc $x} bit"
486             "The \v bit" --> "The bit"
487             "The ${x}rd bit" --> "The {$x}rd bit"
488             "The \${x}rd bit" --> "The \$\{x\}rd bit"
489              
490             Many other transforms are performed in this module, see the code for a better
491             idea of how complex this transformation really is.
492              
493             Transforms only interpolated strings outside of comments, heredocs and POD.
494              
495             =head1 CONFIGURATION
496              
497             This Transformer is not configurable except for the standard options.
498              
499             =head1 AUTHOR
500              
501             Jeffrey Goff <drforr@pobox.com>
502              
503             =head1 COPYRIGHT
504              
505             Copyright (c) 2015 Jeffrey Goff
506              
507             This program is free software; you can redistribute it and/or modify
508             it under the same terms as Perl itself.
509              
510             =cut
511              
512             ##############################################################################
513             # Local Variables:
514             # mode: cperl
515             # cperl-indent-level: 4
516             # fill-column: 78
517             # indent-tabs-mode: nil
518             # c-indentation-style: bsd
519             # End:
520             # ex: set ts=8 sts=4 sw=4 tw=78 ft=perl expandtab shiftround :