File Coverage

blib/lib/PPIx/Regexp/Token/Literal.pm
Criterion Covered Total %
statement 98 104 94.2
branch 87 100 87.0
condition 18 23 78.2
subroutine 18 19 94.7
pod 5 5 100.0
total 226 251 90.0


line stmt bran cond sub pod time code
1             =head1 NAME
2              
3             PPIx::Regexp::Token::Literal - Represent a literal character
4              
5             =head1 SYNOPSIS
6              
7             use PPIx::Regexp::Dumper;
8             PPIx::Regexp::Dumper->new( 'qr{foo}smx' )
9             ->print();
10              
11             =head1 INHERITANCE
12              
13             C is a
14             L.
15              
16             C has no descendants.
17              
18             =head1 DESCRIPTION
19              
20             This class represents a literal character, no matter how specified.
21              
22             =head1 METHODS
23              
24             This class provides the following public methods. Methods not documented
25             here are private, and unsupported in the sense that the author reserves
26             the right to change or remove them without notice.
27              
28             =cut
29              
30             package PPIx::Regexp::Token::Literal;
31              
32 9     9   66 use strict;
  9         44  
  9         293  
33 9     9   67 use warnings;
  9         18  
  9         315  
34              
35 9     9   53 use base qw{ PPIx::Regexp::Token };
  9         19  
  9         878  
36              
37 9         1144 use PPIx::Regexp::Constant qw{
38             COOKIE_CLASS COOKIE_REGEX_SET
39             LITERAL_LEFT_CURLY_ALLOWED
40             LITERAL_LEFT_CURLY_REMOVED_PHASE_1
41             LITERAL_LEFT_CURLY_REMOVED_PHASE_2
42             LITERAL_LEFT_CURLY_REMOVED_PHASE_3
43             MINIMUM_PERL MSG_PROHIBITED_BY_STRICT
44             TOKEN_UNKNOWN
45             @CARP_NOT
46 9     9   74 };
  9         20  
47              
48 9     9   80 use PPIx::Regexp::Util qw{ :width_one };
  9         19  
  9         26644  
49              
50             our $VERSION = '0.087_01';
51              
52             sub __new {
53 1275     1275   4757 my ( $class, $content, %arg ) = @_;
54              
55 1275 50       4963 my $self = $class->SUPER::__new( $content, %arg )
56             or return;
57              
58             defined $arg{ordinal}
59 1275 100       3632 and $self->{ordinal} = $arg{ordinal};
60              
61 1275         3675 return $self;
62             }
63              
64             # Return true if the token can be quantified, and false otherwise
65             # sub can_be_quantified { return };
66              
67             sub explain {
68 1     1 1 9 return 'Literal character';
69             }
70              
71             =head2 is_matcher
72              
73             This method returns a true value because a literal matches itself.
74              
75             =cut
76              
77 8     8 1 30 sub is_matcher { return 1; }
78              
79             sub perl_version_introduced {
80 258     258 1 8780 my ( $self ) = @_;
81             exists $self->{perl_version_introduced}
82 258 100       855 and return $self->{perl_version_introduced};
83 176         419 my $content = $self->content();
84 176         546 my $main = $self->main_structure();
85             $main
86             and $content =~ m/ \A \\ N \{ /smx
87             and not $main->interpolates()
88 176 100 100     749 and return ( $self->{perl_version_introduced} = '5.029010' );
      66        
89             $content =~ m/ \A \\ o /smx
90 175 100       403 and return ( $self->{perl_version_introduced} = '5.013003' );
91             $content =~ m/ \A \\ N [{] U [+] /smx
92 174 100       352 and return ( $self->{perl_version_introduced} = '5.008' );
93             $content =~ m/ \A \\ x [{] /smx # }
94 173 100       383 and return ( $self->{perl_version_introduced} = '5.006' );
95             $content =~ m/ \A \\ N /smx
96 172 100       350 and return ( $self->{perl_version_introduced} = '5.006001' );
97 171         522 return ( $self->{perl_version_introduced} = MINIMUM_PERL );
98             }
99              
100             {
101             my %removed = (
102             q<{> => sub {
103             my ( $self ) = @_;
104              
105             my $prev;
106              
107             if ( $prev = $self->sprevious_sibling() ) {
108             # When an unescaped left curly follows something else in
109             # the same structure, the logic on whether it is allowed
110             # lives, for better or worse, on the sibling.
111             return $prev->__following_literal_left_curly_disallowed_in();
112             } elsif ( $prev = $self->sprevious_element() ) {
113             # Perl 5.27.8 deprecated unescaped literal left curlys
114             # after a left paren that introduces a group. Therefore
115             # the left curly has no previous sibling. But the curly
116             # is still legal at the beginning of a regex, even one
117             # delimited by parens, so we can not return when we find
118             # a PPIx::Regexp::Token::Delimiter, which is a subclass
119             # of PPIx::Regexp::Token::Structure.
120             $prev->isa( 'PPIx::Regexp::Token::Structure' )
121             and q<(> eq $prev->content()
122             and not $prev->isa( 'PPIx::Regexp::Token::Delimiter' )
123             and return LITERAL_LEFT_CURLY_REMOVED_PHASE_3;
124             }
125             # When this mess started, the idea was to always allow
126             # unescaped literal left curlies that started a regex or a
127             # group
128             return LITERAL_LEFT_CURLY_ALLOWED;
129             },
130             );
131              
132             sub perl_version_removed {
133 227     227 1 15158 my ( $self ) = @_;
134             exists $self->{perl_version_removed}
135 227 100       878 and return $self->{perl_version_removed};
136 145         226 my $code;
137             return ( $self->{perl_version_removed} =
138 145 100       359 ( $code = $removed{$self->content()} ) ?
139             scalar $code->( $self ) : undef
140             );
141             }
142             }
143              
144             # Some characters may or may not be literals depending on whether we are
145             # inside a character class. The following hash identifies those
146             # characters and says what we should return when outside (index 0) or
147             # inside (index 1) a character class, as judged by the presence of the
148             # relevant cookie.
149             my %double_agent = (
150             '.' => [ undef, 1 ],
151             '*' => [ undef, 1 ],
152             '?' => [ undef, 1 ],
153             '+' => [ undef, 1 ],
154             '-' => [ 1, undef ],
155             '|' => [ undef, 1 ],
156             );
157              
158             # These are the characters that other external tokenizers need to see,
159             # or at least that we need to take a closer look at. All others can be
160             # unconditionally made into single-character literals.
161             my %extra_ordinary = map { $_ => 1 }
162             split qr{}smx, '$@*+?.\\(){}[]^|-#';
163             # $ -> Token::Interpolation, Token::Assertion
164             # @ -> Token::Interpolation
165             # * -> Token::Quantifier
166             # + ? -> Token::Quantifier, Token::Greediness
167             # . -> Token::CharClass::Simple
168             # \ -> Token::Control, Token::CharClass::Simple, Token::Assertion,
169             # Token::Backreference
170             # ( ) { } [ ] -> Token::Structure
171             # ^ -> Token::Assertion
172             # | - -> Token::Operator
173              
174             my %regex_set_operator = map { $_ => 1 } qw{ & + | - ^ ! };
175              
176             # The regex for the extended white space available under regex sets in
177             # Perl 5.17.8 and in general in perl 5.17.9. I have been unable to get
178             # this to work under Perl 5.6.2, so for that we fall back to ASCII white
179             # space. The stringy eval is because I have been unable to get
180             # satisfaction out of either interpolated characters (in general) or
181             # eval-ed "\N{U+...}" (under 5.6.2) or \x{...} (ditto).
182             #
183             # See PPIx::Regexp::Structure::RegexSet for the documentation of this
184             # mess.
185             # my $white_space_re = $] >= 5.008 ?
186             # 'qr< \\A [\\s\\N{U+0085}\\N{U+200E}\\N{U+200F}\\N{U+2028}\\N{U+2029}]+ >smx' :
187             # 'qr< \\A \\s+ >smx';
188             #
189             # RT #91798
190             # The above turns out to be wrong, because \s matches too many
191             # characters. We need the following to get the right match. Note that
192             # \cK was added experimentally in 5.17.0 and made it into 5.18. The \N{}
193             # characters were NOT added (as I originally thought) but were simply
194             # made characters that generated warnings when escaped, in preparation
195             # for adding them. When they actually get added, I will have to add back
196             # the trinary operator. Sigh.
197             # my $white_space_re = 'qr< \A [\t\n\cK\f\r ] >smx';
198             #
199             # The extended white space characters came back in Perl 5.21.1.
200             my $white_space_re = $] >= 5.008 ?
201             'qr< \\A [\\t\\n\\cK\\f\\r \\N{U+0085}\\N{U+200E}\\N{U+200F}\\N{U+2028}\\N{U+2029}]+ >smx' :
202             'qr< \\A [\\t\\n\\cK\\f\\r ]+ >smx';
203             $white_space_re = eval $white_space_re; ## no critic (ProhibitStringyEval)
204              
205             my %regex_pass_on = map { $_ => 1 } qw{ [ ] ( ) $ \ };
206              
207             sub __PPIX_TOKENIZER__regexp {
208 2824     2824   6392 my ( undef, $tokenizer, $character ) = @_; # Invocant, $char_type unused
209              
210 2824 100       7883 if ( $tokenizer->cookie( COOKIE_REGEX_SET ) ) {
211             # If we're inside a regex set no literals are allowed, but not
212             # all characters that get here are seen as literals.
213              
214 105 100       347 $regex_set_operator{$character}
215             and return $tokenizer->make_token(
216             length $character, 'PPIx::Regexp::Token::Operator' );
217              
218 92         152 my $accept;
219              
220             # As of 5.23.4, only space and horizontal tab are legal white
221             # space inside a bracketed class inside an extended character
222             # class
223 92 100       209 $accept = $tokenizer->find_regexp(
    100          
224             $tokenizer->cookie( COOKIE_CLASS ) ?
225             qr{ \A [ \t] }smx :
226             $white_space_re
227             )
228             and return $tokenizer->make_token(
229             $accept, 'PPIx::Regexp::Token::Whitespace' );
230              
231 59 50       198 $accept = _escaped( $tokenizer, $character )
232             and return $accept;
233              
234 59 100       292 $regex_pass_on{$character}
235             and return;
236              
237             # At this point we have a single character which is poised to be
238             # interpreted as a literal. These are not legal in a regex set
239             # except when also in a bracketed class.
240 8 50       47 return $tokenizer->cookie( COOKIE_CLASS ) ?
241             length $character :
242             $tokenizer->make_token(
243             length $character, TOKEN_UNKNOWN, {
244             error => 'Literal not valid in Regex set',
245             },
246             );
247              
248             } else {
249              
250             # Otherwise handle the characters that may or may not be
251             # literals depending on whether or not we are in a character
252             # class.
253 2719 100       8100 if ( my $class = $double_agent{$character} ) {
254 178 100       741 my $inx = $tokenizer->cookie( COOKIE_CLASS ) ? 1 : 0;
255 178         676 return $class->[$inx];
256             }
257             }
258              
259             # If /x is in effect _and_ we are not inside a character class, \s
260             # is whitespace, and '#' introduces a comment. Otherwise they are
261             # both literals.
262 2541 100 100     6690 if ( $tokenizer->modifier( 'x*' ) &&
    100 66        
263             ! $tokenizer->cookie( COOKIE_CLASS ) ) {
264 276         433 my $accept;
265 276 100       736 $accept = $tokenizer->find_regexp( $white_space_re )
266             and return $tokenizer->make_token(
267             $accept, 'PPIx::Regexp::Token::Whitespace' );
268 181 100       904 $accept = $tokenizer->find_regexp(
269             qr{ \A \# [^\n]* (?: \n | \z) }smx )
270             and return $tokenizer->make_token(
271             $accept, 'PPIx::Regexp::Token::Comment' );
272             } elsif ( $tokenizer->modifier( 'xx' ) &&
273             $tokenizer->cookie( COOKIE_CLASS ) ) {
274 10         22 my $accept;
275 10 100       50 $accept = $tokenizer->find_regexp( qr{ \A [ \t] }smx )
276             and return $tokenizer->make_token(
277             $accept, 'PPIx::Regexp::Token::Whitespace',
278             { perl_version_introduced => '5.025009' },
279             );
280             } else {
281 2255 100 100     10899 ( $character eq '#' || $character =~ m/ \A \s \z /smx )
282             and return 1;
283             }
284              
285 2382         4428 my $accept;
286 2382 100       5519 $accept = _escaped( $tokenizer, $character )
287             and return $accept;
288              
289             # All other characters which are not extra ordinary get accepted.
290 2305 100       7778 $extra_ordinary{$character} or return 1;
291              
292 1233         3208 return;
293             }
294              
295             =begin comment
296              
297             The following is from perlop:
298              
299             The character following "\c" is mapped to some other character by
300             converting letters to upper case and then (on ASCII systems) by
301             inverting the 7th bit (0x40). The most interesting range is from '@' to
302             '_' (0x40 through 0x5F), resulting in a control character from 0x00
303             through 0x1F. A '?' maps to the DEL character. On EBCDIC systems only
304             '@', the letters, '[', '\', ']', '^', '_' and '?' will work, resulting
305             in 0x00 through 0x1F and 0x7F.
306              
307             =end comment
308              
309             =cut
310              
311             # Recognize all the escaped constructions that generate literal
312             # characters in one gigantic regexp. Technically \1.. through \7.. are
313             # octal literals too, but we can not disambiguate these from back
314             # references until we know how many there are. So the lexer gets another
315             # dirty job.
316              
317             {
318             my %special = (
319             '\\N{}' => sub {
320             my ( $tokenizer, $accept ) = @_;
321              
322             =begin comment
323              
324             $tokenizer->strict()
325             or return $tokenizer->make_token( $accept,
326             'PPIx::Regexp::Token::NoOp', {
327             perl_version_removed => '5.027001',
328             },
329             );
330             return $tokenizer->make_token( $accept, TOKEN_UNKNOWN, {
331             error => join( ' ',
332             'Empty Unicode character name',
333             MSG_PROHIBITED_BY_STRICT ),
334             perl_version_introduced => '5.023008',
335             perl_version_removed => '5.027001',
336             },
337             );
338              
339             =end comment
340              
341             =cut
342              
343             return $tokenizer->make_token( $accept, TOKEN_UNKNOWN, {
344             error => 'Empty Unicode character name',
345             perl_version_introduced => '5.023008',
346             perl_version_removed => '5.027001',
347             },
348             );
349             },
350              
351             '\\o{}' => sub {
352             my ( $tokenizer, $accept ) = @_;
353             return $tokenizer->make_token( $accept, TOKEN_UNKNOWN, {
354             error => q,
355             },
356             );
357             },
358              
359             '\\x{}' => sub {
360             my ( $tokenizer, $accept ) = @_;
361             $tokenizer->strict()
362             and return $tokenizer->make_token( $accept,
363             TOKEN_UNKNOWN, {
364             error =>
365             q,
366             },
367             );
368             return $accept;
369             },
370             );
371              
372             sub _escaped {
373 2441     2441   5031 my ( $tokenizer, $character ) = @_;
374              
375 2441 100       7565 $character eq '\\'
376             or return;
377              
378 304 100       1831 if ( my $accept = $tokenizer->find_regexp( # {
379             qr< \A \\ ( [ox] ) [{] ( [^}]* ) [}] >smx
380             ) ) {
381 8         58 my $match = $tokenizer->match();
382 8         18 my $code;
383 8 100       35 $code = $special{$match}
384             and return $code->( $tokenizer, $accept );
385 5         19 my ( $kind, $value ) = $tokenizer->capture();
386             my $invalid = {
387             o => qr<[^0-7]>smx,
388             x => qr<[[:^xdigit:]]>smx,
389 5         53 }->{$kind};
390 5 100       151 $value =~ m/ $invalid /smxg # /g for pos()
391             or return $accept;
392             $tokenizer->strict()
393             and return $tokenizer->make_token( $accept,
394             TOKEN_UNKNOWN, {
395             error => sprintf(
396             'Non-%s character in \\%s{...}',
397             {
398             o => 'octal',
399             x => 'hex',
400 2 100       126 }->{$kind},
401             $kind,
402             ),
403             },
404             );
405             return $tokenizer->make_token( $accept, __PACKAGE__, {
406             ordinal => {
407 1     1   7 o => sub { oct $_[0] },
408 0     0   0 x => sub { hex $_[0] },
409 1   50     18 }->{$kind}->( substr( $value, 0, pos $value ) || 0 ),
410             },
411             );
412             }
413              
414 296 100       1441 if ( my $accept = $tokenizer->find_regexp(
415             qr< \A \\ (?:
416             [^\w\s] | # delimiters/metas
417             [tnrfae] | # C-style escapes
418             0 [01234567]{0,2} | # octal
419             # [01234567]{1,3} | # made from backref by lexer
420             c [][\@[:alpha:]\\^_?] | # control characters
421             ## x (?: \{ [[:xdigit:]]* \} | [[:xdigit:]]{0,2} ) | # hex
422             ## o [{] [01234567]+ [}] | # octal as of 5.13.3
423             x [[:xdigit:]]{0,2} | # hex - brackets handled above
424             ## N (?: \{ (?: [[:alpha:]] [\w\s:()-]* | # must begin w/ alpha
425             ## U [+] [[:xdigit:]]+ ) \} ) | # unicode
426             N (?: [{] (?= [^0-9] ) [^\}]* [}] ) # unicode
427             ) >smx ) ) {
428 69         330 my $match = $tokenizer->match();
429 69         125 my $code;
430 69 100       264 $code = $special{$match}
431             and return $code->( $tokenizer, $accept );
432 63         361 return $accept;
433             }
434              
435 227         759 return;
436             }
437             }
438              
439             =head2 ordinal
440              
441             print 'The ordinal of ', $token->content(),
442             ' is ', $token->ordinal(), "\n";
443              
444             This method returns the ordinal of the literal if it can figure it out.
445             It is analogous to the C built-in.
446              
447             It will not attempt to determine the ordinal of a unicode name
448             (C<\N{...}>) unless L has been loaded, and supports
449             the L)> function.
450             Instead, it will return C. Users of Perl 5.6.2 and older may be
451             out of luck here.
452              
453             Unicode code points (e.g. C<\N{U+abcd}>) should work independently of
454             L, and just return the value of C.
455              
456             It will never attempt to return the ordinal of an octet (C<\C{...}>)
457             because I don't understand the syntax.
458              
459             =cut
460              
461             {
462              
463             my %escapes = (
464             '\\t' => ord "\t",
465             '\\n' => ord "\n",
466             '\\r' => ord "\r",
467             '\\f' => ord "\f",
468             '\\a' => ord "\a",
469             '\\b' => ord "\b",
470             '\\e' => ord "\e",
471             '\\c?' => ord "\c?",
472             '\\c@' => ord "\c@",
473             '\\cA' => ord "\cA",
474             '\\ca' => ord "\cA",
475             '\\cB' => ord "\cB",
476             '\\cb' => ord "\cB",
477             '\\cC' => ord "\cC",
478             '\\cc' => ord "\cC",
479             '\\cD' => ord "\cD",
480             '\\cd' => ord "\cD",
481             '\\cE' => ord "\cE",
482             '\\ce' => ord "\cE",
483             '\\cF' => ord "\cF",
484             '\\cf' => ord "\cF",
485             '\\cG' => ord "\cG",
486             '\\cg' => ord "\cG",
487             '\\cH' => ord "\cH",
488             '\\ch' => ord "\cH",
489             '\\cI' => ord "\cI",
490             '\\ci' => ord "\cI",
491             '\\cJ' => ord "\cJ",
492             '\\cj' => ord "\cJ",
493             '\\cK' => ord "\cK",
494             '\\ck' => ord "\cK",
495             '\\cL' => ord "\cL",
496             '\\cl' => ord "\cL",
497             '\\cM' => ord "\cM",
498             '\\cm' => ord "\cM",
499             '\\cN' => ord "\cN",
500             '\\cn' => ord "\cN",
501             '\\cO' => ord "\cO",
502             '\\co' => ord "\cO",
503             '\\cP' => ord "\cP",
504             '\\cp' => ord "\cP",
505             '\\cQ' => ord "\cQ",
506             '\\cq' => ord "\cQ",
507             '\\cR' => ord "\cR",
508             '\\cr' => ord "\cR",
509             '\\cS' => ord "\cS",
510             '\\cs' => ord "\cS",
511             '\\cT' => ord "\cT",
512             '\\ct' => ord "\cT",
513             '\\cU' => ord "\cU",
514             '\\cu' => ord "\cU",
515             '\\cV' => ord "\cV",
516             '\\cv' => ord "\cV",
517             '\\cW' => ord "\cW",
518             '\\cw' => ord "\cW",
519             '\\cX' => ord "\cX",
520             '\\cx' => ord "\cX",
521             '\\cY' => ord "\cY",
522             '\\cy' => ord "\cY",
523             '\\cZ' => ord "\cZ",
524             '\\cz' => ord "\cZ",
525             '\\c[' => ord "\c[",
526             '\\c\\\\' => ord "\c\\", # " # Get Vim's head straight.
527             '\\c]' => ord "\c]",
528             '\\c^' => ord "\c^",
529             '\\c_' => ord "\c_",
530             );
531              
532             sub ordinal {
533 17     17 1 41 my ( $self ) = @_;
534 17 100       56 exists $self->{ordinal} and return $self->{ordinal};
535 16         37 return ( $self->{ordinal} = $self->_ordinal() );
536             }
537              
538             my %octal = map {; "$_" => 1 } ( 0 .. 7 );
539              
540             sub _ordinal {
541 16     16   30 my ( $self ) = @_;
542 16         50 my $content = $self->content();
543              
544 16 100       89 $content =~ m/ \A \\ /smx or return ord $content;
545              
546 8 100       34 exists $escapes{$content} and return $escapes{$content};
547              
548 6         20 my $indicator = substr $content, 1, 1;
549              
550 6 100       25 $octal{$indicator} and return oct substr $content, 1;
551              
552 5 100       13 if ( $indicator eq 'x' ) {
553 3 100       29 $content =~ m/ \A \\ x \{ ( [[:xdigit:]]* ) /smx
554             and return hex "0$1";
555 1 50       10 $content =~ m/ \A \\ x ( [[:xdigit:]]{0,2} ) \z /smx
556             and return hex $1;
557 0         0 return;
558             }
559              
560 2 50       7 if ( $indicator eq 'o' ) {
561 0 0       0 $content =~ m/ \A \\ o [{] ( [01234567]* ) \z /smx
562             and return oct "0$1";
563 0         0 return; # Shouldn't happen, but ...
564             }
565              
566 2 50       7 if ( $indicator eq 'N' ) {
567 2 100       14 $content =~ m/ \A \\ N \{ U [+] ( [[:xdigit:]]+ ) \} \z /smx
568             and return hex $1;
569 1 50       11 $content =~ m/ \A \\ N [{] ( .+ ) [}] \z /smx
    50          
570             and return (
571             _have_charnames_vianame() ?
572             charnames::vianame( $1 ) :
573             undef
574             );
575 0         0 return; # Shouldn't happen, but ...
576             }
577              
578 0         0 return ord $indicator;
579             }
580              
581             }
582              
583             sub __following_literal_left_curly_disallowed_in {
584 2     2   10 return LITERAL_LEFT_CURLY_REMOVED_PHASE_2;
585             }
586              
587             {
588             my $have_charnames_vianame;
589              
590             sub _have_charnames_vianame {
591 1 50   1   5 defined $have_charnames_vianame
592             and return $have_charnames_vianame;
593             return (
594 1 50       18 $have_charnames_vianame =
595             charnames->can( 'vianame' ) ? 1 : 0
596             );
597              
598             }
599             }
600              
601             sub __perl_requirements_setup {
602 4     4   10 my ( $self ) = @_;
603 4         5 my $prev;
604 4 50 66     10 q<{> eq $self->content() # }
      66        
605             and $prev = $self->sprevious_sibling()
606             and $prev->isa( 'PPIx::Regexp::Token::Literal' )
607             or return $self->SUPER::__perl_requirements_setup();
608             return (
609             {
610 1         11 introduced => MINIMUM_PERL,
611             removed => LITERAL_LEFT_CURLY_REMOVED_PHASE_1,
612             },
613             # TODO the following will be needed if this construction is
614             # re-allowed in 5.26.1:
615             # {
616             # introduced => '5.026001',
617             # removed => '6.027000',
618             # },
619             {
620             introduced => '5.027001',
621             removed => LITERAL_LEFT_CURLY_REMOVED_PHASE_2,
622             },
623             );
624             }
625              
626             *__PPIX_TOKENIZER__repl = \&__PPIX_TOKENIZER__regexp;
627              
628             1;
629              
630             __END__