File Coverage

blib/lib/Perl/ToPerl6/Transformer/BasicTypes/Strings/Interpolation.pm
Criterion Covered Total %
statement 30 120 25.0
branch 0 70 0.0
condition 0 12 0.0
subroutine 13 17 76.4
pod 3 8 37.5
total 46 227 20.2


line stmt bran cond sub pod time code
1             package Perl::ToPerl6::Transformer::BasicTypes::Strings::Interpolation;
2              
3 17     17   11291 use 5.006001;
  17         48  
4 17     17   74 use strict;
  17         24  
  17         356  
5 17     17   68 use warnings;
  17         24  
  17         2671  
6 17     17   62 use Readonly;
  17         29  
  17         801  
7 17     17   78 use List::Util qw( max );
  17         22  
  17         1021  
8 17     17   21610301 use Text::Balanced qw( extract_variable );
  17         3541418  
  17         1781  
9              
10 17     17   166 use Perl::ToPerl6::Utils qw{ :characters :severities };
  17         28  
  17         1310  
11 17     17   5675 use Perl::ToPerl6::Utils::PPI qw{ set_string };
  17         36  
  17         986  
12              
13 17     17   100 use base 'Perl::ToPerl6::Transformer';
  17         30  
  17         32454  
14              
15             our $VERSION = '0.03';
16              
17             #-----------------------------------------------------------------------------
18              
19             Readonly::Scalar my $DESC => q{Rewrite interpolated strings};
20             Readonly::Scalar my $EXPL => q{Rewrite interpolated strings};
21              
22             #-----------------------------------------------------------------------------
23              
24 40     40 0 1478 sub supported_parameters { return () }
25 28     28 1 125 sub default_severity { return $SEVERITY_HIGHEST }
26 25     25 1 88 sub default_themes { return qw(core bugs) }
27             sub applies_to {
28 4     4 1 13 return 'PPI::Token::Quote::Interpolate',
29             'PPI::Token::Quote::Double'
30             }
31              
32             #-----------------------------------------------------------------------------
33             #
34             # A fairly comprehensive list of edge case handling:
35             #
36             # "\o" --> "o". It's illegal in Perl5, but may be encountered.
37              
38             # "\o" is illegal in Perl5, so it shouldn't be encountered.
39             # "\o1" is illegal in Perl5, so it shouldn't be encountered.
40             # "\0" is legal though.
41             # "\c" is illegal in Perl5, so it shouldn't be encountered.
42             #
43             #-----------------------------------------------------------------------------
44              
45             # Tokenizer II: Electric Boogaloo.
46             #
47             # Since it'll eventually be needed...
48              
49             sub tokenize {
50 0     0 0   my ( $str ) = @_;
51 0           my @c = split //, $str;
52 0           my @token;
53              
54 0           for ( my $i = 0; $i < @c; $i++ ) {
55 0           my ( $v, $la1 ) = @c[ $i, $i + 1 ];
56              
57 0 0         if ( $v eq '\\' ) {
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
58 0 0         if ( $la1 eq 'c' ) {
    0          
    0          
    0          
    0          
    0          
    0          
    0          
59             }
60             elsif ( $la1 eq 'l' ) {
61             }
62             elsif ( $la1 eq 'u' ) {
63             }
64             elsif ( $la1 eq 'E' ) {
65             }
66             elsif ( $la1 eq 'F' ) {
67             }
68             elsif ( $la1 eq 'L' ) {
69             }
70             elsif ( $la1 eq 'Q' ) {
71             }
72             elsif ( $la1 eq 'U' ) {
73             }
74             else {
75             }
76             }
77             elsif ( $v eq '$' ) {
78             }
79             elsif ( $v eq '@' ) {
80             }
81             elsif ( $v eq '%' ) {
82             }
83             elsif ( $v eq '{' ) {
84             }
85             elsif ( $v eq '}' ) {
86             }
87             elsif ( $v eq '(' ) {
88             }
89             elsif ( $v eq ')' ) {
90             }
91             elsif ( $v eq '<' ) {
92             }
93             elsif ( $v eq '>' ) {
94             }
95             else {
96             }
97             }
98              
99 0           return @token;
100             }
101              
102             #-----------------------------------------------------------------------------
103             #
104             # Some more cases get folded away here (hee.)
105             # \U foo \L\E bar \E - 'bar' will get altered here.
106             # \U foo \L$x\E bar \E - 'bar' will *not* get altered here,
107             # even if $x is empty.
108             # \U foo \Lxxx\E bar \E - 'bar' will *not* get altered here,
109             #
110             # So, \L..\E affects the rest of the string only if the contents
111             # are empty. So it's effectively as if it never was there.
112             # Get rid of it.
113             #
114             sub casefold {
115 0     0 0   my ($self, $residue) = @_;
116              
117 0           my @tokens;
118 0           my @split = grep { $_ ne '' } split /( \\[luEFLQU] )/x, $residue;
  0            
119 0           for ( my $i = 0; $i < @split; $i++ ) {
120 0           my ($v, $la1) = @split[$i,$i+1];
121 0 0 0       if ( $v =~ m< ^ \\[FLU] $ >x and
      0        
122             $la1 and $la1 eq '\\E' ) {
123 0           $i+=2;
124             }
125             else {
126 0           push @tokens, $v;
127             }
128             }
129 0           return @tokens;
130             }
131              
132             sub tokenize_variables {
133 0     0 0   my ($self, $elem, $string) = @_;
134 0           my $full_string = $string;
135              
136 0           my @tokens;
137 0           my $iter = 100;
138 0           while ( $string ) {
139 0 0         unless ( --$iter ) {
140 0           my $line_number = $elem->line_number;
141 0           die "Congratulations, you've broken string interpolation. Please report this message, along with the test file you were using to the author: <<$full_string>> on line $line_number\n";
142             }
143 0           my $residue;
144              
145             # '${foo}', '@{foo}' is an interpolated value.
146             #
147 0 0 0       if ( $string =~ s< ^ ( [\$\@] \{ [^}]+ \} ) ><>x ) {
    0          
    0          
    0          
148 0           push @tokens, $1;
149             }
150              
151             # '\c' is a token on its own.
152             # '$\', '@\', '$ ', '@ ' is also its own token. Feels buggy though.
153             #
154             elsif ( $string =~ s< ^ ( \\ c . ) ><>x or
155             $string =~ s< ^ ( [\$\@] (?: \\ | \s ) ) ><>x ) {
156 0 0         if ( @tokens ) {
157 0           $tokens[-1] .= $1;
158             }
159             else {
160 0           push @tokens, $1;
161             }
162             }
163              
164             # Anything that does *not* start with '$' or '@' is its own token,
165             # at least up until the next '$' or '@' encountered.
166             #
167             elsif ( $string =~ s< ^ ( [^\$\@]+ ) ><>x ) {
168 0           $residue .= $1;
169              
170 0   0       while ( $residue and $residue =~ m< \\ $ >x ) {
171 0 0         if ( $string =~ s< ^ ( \$\@ ) ><>x ) {
172 0           $residue .= $1;
173             }
174 0           $string =~ s< (.) ><>x;
175 0 0         $residue .= $1 if $1;
176 0 0         if ( $string =~ s< ^ ( [^\$\@]+ ) ><>x ) {
177 0           $residue .= $1;
178             }
179             else {
180 0           last;
181             }
182             }
183 0           push @tokens, $self->casefold($residue);
184             }
185              
186             # Anything else starting with a '$' or '@' is fair game.
187             #
188             elsif ( $string =~ m< ^ [\$\@] >x ) {
189 0           my ( $var_name, $remainder, $prefix ) =
190             extract_variable( $string );
191 0 0         if ( $var_name ) {
192 0           push @tokens, $var_name;
193 0           $string = $remainder;
194             }
195             #
196             # XXX I"m betting that extract_variable() doesn't quite catch $] etc.
197             #
198             else {
199 0           $string =~ s< ^ ( [\$\@] [^\$\@]* ) ><>x;
200 0 0         if ( @tokens ) {
201 0           $tokens[-1] .= $1;
202             }
203             else {
204 0           push @tokens, $1;
205             }
206             }
207             }
208             else {
209 0           warn "XXX failed\n";
210             }
211             }
212              
213 0           return grep { $_ ne '' } @tokens;
  0            
214             }
215              
216             sub transform {
217 0     0 0   my ($self, $elem, $doc) = @_;
218              
219 0           my $old_string = $elem->string;
220 0           my $new_string;
221              
222 0 0         if ( index( $old_string, '@{[' ) >= 0 ) {
223 0           warn "Interpolating perl code.";
224 0           return;
225             }
226              
227             # Save the delimiters for later. Since they surrounded the original Perl5
228             # string, we can be certain that if we use these for the Perl6 equivalent
229             # they'll be valid.
230             # Yes, even in the case of:
231             #
232             # "\lfoo" -> "{lcfirst("f")}oo".
233             #
234             # Perl6 is smart enough to know which segment of the braid it's on, and
235             # interprets the {..} boundary as a new Perl6 block.
236             #
237 0           my ( $start_delimiter ) =
238             $elem->content =~ m{ ^ ( qq[ ]. | qq. | q[ ]. | q. | . ) }x;
239 0           my $end_delimiter = substr( $elem->content, -1, 1 );
240              
241             # \l or \u followed *directly* by any \l or \u modifier simply ignores
242             # the remaining \l or \u modifiers.
243             #
244 0           $old_string =~ s{ (\\[lu]) (\\[lu])+ }{$1}gx;
245              
246             # \F, \L or \U followed *directly* by any \F, \L or \U modifier is a
247             # syntax error in Perl5, and can be reduced to a single \F, \L or \U.
248             #
249 0           $old_string =~ s{ (\\[FLU]) (\\[FLU])+ }{$1}gx;
250              
251             # \Q followed by anything is still a legal sequence.
252              
253             # \t is unchanged in Perl6.
254             # \n is unchanged in Perl6.
255             # \r is unchanged in Perl6.
256             # \f is unchanged in Perl6.
257             # \b is unchanged in Perl6.
258             # \a is unchanged in Perl6.
259             # \e is unchanged in Perl6.
260              
261             # \v is deprecated
262             #
263 0           $old_string =~ s{ \\v }{v}mgx;
264              
265             # \x{263a} now looks like \x[263a].
266             #
267 0           $old_string =~ s{ \\x \{ ([0-9a-fA-F]+) \} }{\\x[$1]}mgx;
268              
269             # \x12 is unaltered.
270             # \x1L is unaltered.
271             #
272             # '..\x' is illegal in Perl6.
273             #
274 0           $old_string =~ s{ \\x $ }{}mx;
275              
276             # \N{U+1234} is now \x[1234].
277             # Variants with whitespace are illegal in Perl5, so don't worry about 'em
278             #
279 0           $old_string =~ s{ \\N \{ U \+ ([0-9a-fA-F]+) \} }{\\x[$1]}mgx;
280              
281             # \N{LATIN CAPITAL LETTER X} is now \c[LATIN CAPITAL LETTER X]
282             #
283 0           $old_string =~ s{ \\N \{ ([^\}]+) \} }{\\c[$1]}mgx;
284              
285             # \o{2637} now looks like \o[2637].
286             #
287             # \o12 is unaltered.
288             #
289 0           $old_string =~ s{ \\o \{ ([0-7]*) \) }{\\o[$1]}mgx;
290              
291             # \0123 is now \o[123].
292             #
293 0           $old_string =~ s{ \\0 \{ ([0-7]*) \) }{\\o[$1]}mgx;
294              
295             # \oL is now illegal, and in perl5 it generated nothing.
296             #
297             # "...\o" is a syntax error in both languages, so don't worry about it.
298             #
299 0           $old_string =~ s{ \\o \{ ([^\}]*) \} }{\\o[$1]}mgx;
300              
301             # \c. is unchanged. Or at least I'll treat it as such.
302              
303             # At this point, you'll notice that practically every '{' and '}'
304             # character is out of our target string, with trivial exceptions like
305             # 'c{', which is illegal anyway.
306             #
307             # This is important for two main reasons. The first is that anything inside
308             # {} in Perl6 is considered valid Perl6 code, which is also why a trick
309             # we use later works.
310             #
311             # So, unless {} are part of a variable that can be interpolated, we have
312             # to escape it. And we can't do that if there are constructs like \x{123}
313             # hanging around in the string, because those would get messed up.
314             #
315              
316 0           my @tokens = $self->tokenize_variables($elem,$old_string);
317              
318             # Now on to rewriting \l, \u, \E, \F, \L, \Q, \U in Perl6.
319             #
320             # \F, \L, \Q and \U are "sort of" nested.
321             #
322             # You can see this by running C<print "Start \L lOWER \U Upper Me \E mE \E">
323             # > Start lower UPPER ME mE
324             #
325             # Note how 'lOWER' is case-flattend, but after the \U..E, 'mE' isn't?
326             #
327             # So, rather than having to retain case settings, we can simply stop the
328             # lc(..) block after the first...
329             #
330 0           my $new_content;
331 0           for ( my $i = 0; $i < @tokens; $i++ ) {
332 0           my ( $v, $la1 ) = @tokens[$i,$i+1];
333              
334 0 0         if ( $v =~ m< ^ ( \$ | \@ ) >x ) {
335 0 0         if ( $v =~ s< ^ \$ \@ ><\\\$\@>x ) {
    0          
336             }
337             elsif ( $v =~ s< ^ ( \$ | \@ ) \{ ([^\}]+) \} ><{$1$2}>sx ) {
338             }
339             else {
340 0           $v =~ s< [-][\>] ><.>gx;
341 0           $v =~ s< \{ (\w+) (\s*) \} >< '{' .
  0            
342             $start_delimiter . $1 .
343             $end_delimiter . $2 .
344             '}'>segx;
345             }
346              
347 0           $v =~ s< ^ ( [(<>)] ) ><\\$1>sgx;
348 0           $v =~ s< ( [^\\] ) ( [(<>)] ) ><$1\\$2>sgx;
349             }
350             else {
351             # < > is now a pointy block, { } is now a code block, ( ) is also
352             # used.
353             #
354 0           $v =~ s< ^ ( [{(<>)}] ) ><\\$1>sgx;
355 0           $v =~ s< ( [^\\] ) ( [{(<>)}] ) ><$1\\$2>sgx;
356             }
357 0           $new_content .= $v;
358             }
359              
360             # if ( $v eq '\\l' ) {
361             # }
362             # elsif ( $v eq '\\u' ) {
363             # }
364             #
365             # elsif ( $v eq '\\F' or $v eq '\\L' ) {
366             # $collected .= '{' if @manip == 0;
367             # if ( @manip == 0 ) {
368             # $collected .= 'lc(' . $start_delimiter;
369             # }
370             # else {
371             # $collected .= $end_delimiter . ')~lc(' . $start_delimiter;
372             # }
373             # push @manip, $v;
374             # }
375             # elsif ( $v eq '\\Q' ) {
376             # $collected .= '{' if @manip == 0;
377             # if ( @manip == 0 ) {
378             # $collected .= 'quotemeta(' . $start_delimiter;
379             # }
380             # else {
381             # $collected .= $end_delimiter . ')~quotemeta(' . $start_delimiter;
382             # }
383             # push @manip, $v;
384             # }
385             # elsif ( $v eq '\\U' ) {
386             # $collected .= '{' if @manip == 0;
387             # if ( @manip == 0 ) {
388             # $collected .= 'tc(' . $start_delimiter;
389             # }
390             # else {
391             # $collected .= $end_delimiter . ')~tc(' . $start_delimiter;
392             # }
393             # push @manip, $v;
394             # }
395             # elsif ( $v eq '\\E' ) {
396             # pop @manip;
397             # if ( @manip == 0 ) {
398             # $collected .= $end_delimiter . ')}';
399             # }
400             # else {
401             # $collected .= $end_delimiter . ')';
402             # }
403             # }
404             # elsif ( $v =~ / ^ ( \$ | \@ ) /x ) {
405             # $collected .= $v;
406             # }
407              
408 0           eval {
409 0           set_string($elem,$new_content);
410             };
411 0 0         if ( $@ ) {
412 0           die "set_string broke! Please report this: ".Dump($elem);
413             }
414              
415 0           return $self->transformation( $DESC, $EXPL, $elem );
416             }
417              
418             1;
419              
420             #-----------------------------------------------------------------------------
421              
422             __END__
423              
424             =pod
425              
426             =head1 NAME
427              
428             Perl::ToPerl6::Transformer::BasicTypes::Strings::Interpolation - Format C<${x}> correctly
429              
430              
431             =head1 AFFILIATION
432              
433             This Transformer is part of the core L<Perl::ToPerl6|Perl::ToPerl6>
434             distribution.
435              
436              
437             =head1 DESCRIPTION
438              
439             In Perl6, contents inside {} are now executable code. That means that inside interpolated strings, C<"${x}"> will be parsed as C<"${x()}"> and throw an exception if C<x()> is not defined. As such, this transforms C<"${x}"> into C<"{$x}">:
440              
441             "The $x bit" --> "The $x bit"
442             "The $x-30 bit" --> "The $x\-30 bit"
443             "\N{FOO}" --> "\c{FOO}"
444             "The \l$x bit" --> "The {lc $x} bit"
445             "The \v bit" --> "The bit"
446             "The ${x}rd bit" --> "The {$x}rd bit"
447             "The \${x}rd bit" --> "The \$\{x\}rd bit"
448              
449             Transforms only interpolated strings outside of comments, heredocs and POD.
450              
451             =head1 CONFIGURATION
452              
453             This Transformer is not configurable except for the standard options.
454              
455             =head1 AUTHOR
456              
457             Jeffrey Goff <drforr@pobox.com>
458              
459             =head1 COPYRIGHT
460              
461             Copyright (c) 2015 Jeffrey Goff
462              
463             This program is free software; you can redistribute it and/or modify
464             it under the same terms as Perl itself.
465              
466             =cut
467              
468             ##############################################################################
469             # Local Variables:
470             # mode: cperl
471             # cperl-indent-level: 4
472             # fill-column: 78
473             # indent-tabs-mode: nil
474             # c-indentation-style: bsd
475             # End:
476             # ex: set ts=8 sts=4 sw=4 tw=78 ft=perl expandtab shiftround :