File Coverage

blib/lib/PPIx/Regexp/Token/Literal.pm
Criterion Covered Total %
statement 98 104 94.2
branch 87 100 87.0
condition 18 23 78.2
subroutine 18 19 94.7
pod 5 5 100.0
total 226 251 90.0


line stmt bran cond sub pod time code
1             =head1 NAME
2              
3             PPIx::Regexp::Token::Literal - Represent a literal character
4              
5             =head1 SYNOPSIS
6              
7             use PPIx::Regexp::Dumper;
8             PPIx::Regexp::Dumper->new( 'qr{foo}smx' )
9             ->print();
10              
11             =head1 INHERITANCE
12              
13             C is a
14             L.
15              
16             C has no descendants.
17              
18             =head1 DESCRIPTION
19              
20             This class represents a literal character, no matter how specified.
21              
22             =head1 METHODS
23              
24             This class provides the following public methods. Methods not documented
25             here are private, and unsupported in the sense that the author reserves
26             the right to change or remove them without notice.
27              
28             =cut
29              
30             package PPIx::Regexp::Token::Literal;
31              
32 9     9   67 use strict;
  9         32  
  9         264  
33 9     9   49 use warnings;
  9         20  
  9         247  
34              
35 9     9   47 use base qw{ PPIx::Regexp::Token };
  9         23  
  9         801  
36              
37 9         1096 use PPIx::Regexp::Constant qw{
38             COOKIE_CLASS COOKIE_REGEX_SET
39             LITERAL_LEFT_CURLY_ALLOWED
40             LITERAL_LEFT_CURLY_REMOVED_PHASE_1
41             LITERAL_LEFT_CURLY_REMOVED_PHASE_2
42             LITERAL_LEFT_CURLY_REMOVED_PHASE_3
43             MINIMUM_PERL MSG_PROHIBITED_BY_STRICT
44             TOKEN_UNKNOWN
45             @CARP_NOT
46 9     9   63 };
  9         39  
47              
48 9     9   70 use PPIx::Regexp::Util qw{ :width_one };
  9         18  
  9         25657  
49              
50             our $VERSION = '0.088';
51              
52             sub __new {
53 1275     1275   4425 my ( $class, $content, %arg ) = @_;
54              
55 1275 50       4795 my $self = $class->SUPER::__new( $content, %arg )
56             or return;
57              
58             defined $arg{ordinal}
59 1275 100       3430 and $self->{ordinal} = $arg{ordinal};
60              
61 1275         3917 return $self;
62             }
63              
64             # Return true if the token can be quantified, and false otherwise
65             # sub can_be_quantified { return };
66              
67             sub explain {
68 1     1 1 4 return 'Literal character';
69             }
70              
71             =head2 is_matcher
72              
73             This method returns a true value because a literal matches itself.
74              
75             =cut
76              
77 8     8 1 22 sub is_matcher { return 1; }
78              
79             sub perl_version_introduced {
80 258     258 1 8940 my ( $self ) = @_;
81             exists $self->{perl_version_introduced}
82 258 100       869 and return $self->{perl_version_introduced};
83 176         456 my $content = $self->content();
84 176         567 my $main = $self->main_structure();
85             $main
86             and $content =~ m/ \A \\ N \{ /smx
87             and not $main->interpolates()
88 176 100 100     751 and return ( $self->{perl_version_introduced} = '5.029010' );
      66        
89             $content =~ m/ \A \\ o /smx
90 175 100       397 and return ( $self->{perl_version_introduced} = '5.013003' );
91             $content =~ m/ \A \\ N [{] U [+] /smx
92 174 100       374 and return ( $self->{perl_version_introduced} = '5.008' );
93             $content =~ m/ \A \\ x [{] /smx # }
94 173 100       368 and return ( $self->{perl_version_introduced} = '5.006' );
95             $content =~ m/ \A \\ N /smx
96 172 100       412 and return ( $self->{perl_version_introduced} = '5.006001' );
97 171         620 return ( $self->{perl_version_introduced} = MINIMUM_PERL );
98             }
99              
100             {
101             my %removed = (
102             q<{> => sub {
103             my ( $self ) = @_;
104              
105             my $prev;
106              
107             if ( $prev = $self->sprevious_sibling() ) {
108             # When an unescaped left curly follows something else in
109             # the same structure, the logic on whether it is allowed
110             # lives, for better or worse, on the sibling.
111             return $prev->__following_literal_left_curly_disallowed_in();
112             } elsif ( $prev = $self->sprevious_element() ) {
113             # Perl 5.27.8 deprecated unescaped literal left curlys
114             # after a left paren that introduces a group. Therefore
115             # the left curly has no previous sibling. But the curly
116             # is still legal at the beginning of a regex, even one
117             # delimited by parens, so we can not return when we find
118             # a PPIx::Regexp::Token::Delimiter, which is a subclass
119             # of PPIx::Regexp::Token::Structure.
120             $prev->isa( 'PPIx::Regexp::Token::Structure' )
121             and q<(> eq $prev->content()
122             and not $prev->isa( 'PPIx::Regexp::Token::Delimiter' )
123             and return LITERAL_LEFT_CURLY_REMOVED_PHASE_3;
124             }
125             # When this mess started, the idea was to always allow
126             # unescaped literal left curlies that started a regex or a
127             # group
128             return LITERAL_LEFT_CURLY_ALLOWED;
129             },
130             );
131              
132             sub perl_version_removed {
133 227     227 1 14971 my ( $self ) = @_;
134             exists $self->{perl_version_removed}
135 227 100       799 and return $self->{perl_version_removed};
136 145         238 my $code;
137             return ( $self->{perl_version_removed} =
138 145 100       341 ( $code = $removed{$self->content()} ) ?
139             scalar $code->( $self ) : undef
140             );
141             }
142             }
143              
144             # Some characters may or may not be literals depending on whether we are
145             # inside a character class. The following hash identifies those
146             # characters and says what we should return when outside (index 0) or
147             # inside (index 1) a character class, as judged by the presence of the
148             # relevant cookie.
149             my %double_agent = (
150             '.' => [ undef, 1 ],
151             '*' => [ undef, 1 ],
152             '?' => [ undef, 1 ],
153             '+' => [ undef, 1 ],
154             '-' => [ 1, undef ],
155             '|' => [ undef, 1 ],
156             );
157              
158             # These are the characters that other external tokenizers need to see,
159             # or at least that we need to take a closer look at. All others can be
160             # unconditionally made into single-character literals.
161             my %extra_ordinary = map { $_ => 1 }
162             split qr{}smx, '$@*+?.\\(){}[]^|-#';
163             # $ -> Token::Interpolation, Token::Assertion
164             # @ -> Token::Interpolation
165             # * -> Token::Quantifier
166             # + ? -> Token::Quantifier, Token::Greediness
167             # . -> Token::CharClass::Simple
168             # \ -> Token::Control, Token::CharClass::Simple, Token::Assertion,
169             # Token::Backreference
170             # ( ) { } [ ] -> Token::Structure
171             # ^ -> Token::Assertion
172             # | - -> Token::Operator
173              
174             my %regex_set_operator = map { $_ => 1 } qw{ & + | - ^ ! };
175              
176             # The regex for the extended white space available under regex sets in
177             # Perl 5.17.8 and in general in perl 5.17.9. I have been unable to get
178             # this to work under Perl 5.6.2, so for that we fall back to ASCII white
179             # space. The stringy eval is because I have been unable to get
180             # satisfaction out of either interpolated characters (in general) or
181             # eval-ed "\N{U+...}" (under 5.6.2) or \x{...} (ditto).
182             #
183             # See PPIx::Regexp::Structure::RegexSet for the documentation of this
184             # mess.
185             # my $white_space_re = $] >= 5.008 ?
186             # 'qr< \\A [\\s\\N{U+0085}\\N{U+200E}\\N{U+200F}\\N{U+2028}\\N{U+2029}]+ >smx' :
187             # 'qr< \\A \\s+ >smx';
188             #
189             # RT #91798
190             # The above turns out to be wrong, because \s matches too many
191             # characters. We need the following to get the right match. Note that
192             # \cK was added experimentally in 5.17.0 and made it into 5.18. The \N{}
193             # characters were NOT added (as I originally thought) but were simply
194             # made characters that generated warnings when escaped, in preparation
195             # for adding them. When they actually get added, I will have to add back
196             # the trinary operator. Sigh.
197             # my $white_space_re = 'qr< \A [\t\n\cK\f\r ] >smx';
198             #
199             # The extended white space characters came back in Perl 5.21.1.
200             my $white_space_re = $] >= 5.008 ?
201             'qr< \\A [\\t\\n\\cK\\f\\r \\N{U+0085}\\N{U+200E}\\N{U+200F}\\N{U+2028}\\N{U+2029}]+ >smx' :
202             'qr< \\A [\\t\\n\\cK\\f\\r ]+ >smx';
203             $white_space_re = eval $white_space_re; ## no critic (ProhibitStringyEval)
204              
205             my %regex_pass_on = map { $_ => 1 } qw{ [ ] ( ) $ \ };
206              
207             sub __PPIX_TOKENIZER__regexp {
208 2824     2824   6394 my ( undef, $tokenizer, $character ) = @_; # Invocant, $char_type unused
209              
210 2824 100       7458 if ( $tokenizer->cookie( COOKIE_REGEX_SET ) ) {
211             # If we're inside a regex set no literals are allowed, but not
212             # all characters that get here are seen as literals.
213              
214 105 100       363 $regex_set_operator{$character}
215             and return $tokenizer->make_token(
216             length $character, 'PPIx::Regexp::Token::Operator' );
217              
218 92         142 my $accept;
219              
220             # As of 5.23.4, only space and horizontal tab are legal white
221             # space inside a bracketed class inside an extended character
222             # class
223 92 100       208 $accept = $tokenizer->find_regexp(
    100          
224             $tokenizer->cookie( COOKIE_CLASS ) ?
225             qr{ \A [ \t] }smx :
226             $white_space_re
227             )
228             and return $tokenizer->make_token(
229             $accept, 'PPIx::Regexp::Token::Whitespace' );
230              
231 59 50       179 $accept = _escaped( $tokenizer, $character )
232             and return $accept;
233              
234 59 100       217 $regex_pass_on{$character}
235             and return;
236              
237             # At this point we have a single character which is poised to be
238             # interpreted as a literal. These are not legal in a regex set
239             # except when also in a bracketed class.
240 8 50       50 return $tokenizer->cookie( COOKIE_CLASS ) ?
241             length $character :
242             $tokenizer->make_token(
243             length $character, TOKEN_UNKNOWN, {
244             error => 'Literal not valid in Regex set',
245             },
246             );
247              
248             } else {
249              
250             # Otherwise handle the characters that may or may not be
251             # literals depending on whether or not we are in a character
252             # class.
253 2719 100       8372 if ( my $class = $double_agent{$character} ) {
254 178 100       660 my $inx = $tokenizer->cookie( COOKIE_CLASS ) ? 1 : 0;
255 178         738 return $class->[$inx];
256             }
257             }
258              
259             # If /x is in effect _and_ we are not inside a character class, \s
260             # is whitespace, and '#' introduces a comment. Otherwise they are
261             # both literals.
262 2541 100 100     7235 if ( $tokenizer->modifier( 'x*' ) &&
    100 66        
263             ! $tokenizer->cookie( COOKIE_CLASS ) ) {
264 276         500 my $accept;
265 276 100       791 $accept = $tokenizer->find_regexp( $white_space_re )
266             and return $tokenizer->make_token(
267             $accept, 'PPIx::Regexp::Token::Whitespace' );
268 181 100       783 $accept = $tokenizer->find_regexp(
269             qr{ \A \# [^\n]* (?: \n | \z) }smx )
270             and return $tokenizer->make_token(
271             $accept, 'PPIx::Regexp::Token::Comment' );
272             } elsif ( $tokenizer->modifier( 'xx' ) &&
273             $tokenizer->cookie( COOKIE_CLASS ) ) {
274 10         21 my $accept;
275 10 100       47 $accept = $tokenizer->find_regexp( qr{ \A [ \t] }smx )
276             and return $tokenizer->make_token(
277             $accept, 'PPIx::Regexp::Token::Whitespace',
278             { perl_version_introduced => '5.025009' },
279             );
280             } else {
281 2255 100 100     10672 ( $character eq '#' || $character =~ m/ \A \s \z /smx )
282             and return 1;
283             }
284              
285 2382         4319 my $accept;
286 2382 100       5957 $accept = _escaped( $tokenizer, $character )
287             and return $accept;
288              
289             # All other characters which are not extra ordinary get accepted.
290 2305 100       7619 $extra_ordinary{$character} or return 1;
291              
292 1233         2960 return;
293             }
294              
295             =begin comment
296              
297             The following is from perlop:
298              
299             The character following "\c" is mapped to some other character by
300             converting letters to upper case and then (on ASCII systems) by
301             inverting the 7th bit (0x40). The most interesting range is from '@' to
302             '_' (0x40 through 0x5F), resulting in a control character from 0x00
303             through 0x1F. A '?' maps to the DEL character. On EBCDIC systems only
304             '@', the letters, '[', '\', ']', '^', '_' and '?' will work, resulting
305             in 0x00 through 0x1F and 0x7F.
306              
307             =end comment
308              
309             =cut
310              
311             # Recognize all the escaped constructions that generate literal
312             # characters in one gigantic regexp. Technically \1.. through \7.. are
313             # octal literals too, but we can not disambiguate these from back
314             # references until we know how many there are. So the lexer gets another
315             # dirty job.
316              
317             {
318             my %special = (
319             '\\N{}' => sub {
320             my ( $tokenizer, $accept ) = @_;
321              
322             =begin comment
323              
324             $tokenizer->strict()
325             or return $tokenizer->make_token( $accept,
326             'PPIx::Regexp::Token::NoOp', {
327             perl_version_removed => '5.027001',
328             },
329             );
330             return $tokenizer->make_token( $accept, TOKEN_UNKNOWN, {
331             error => join( ' ',
332             'Empty Unicode character name',
333             MSG_PROHIBITED_BY_STRICT ),
334             perl_version_introduced => '5.023008',
335             perl_version_removed => '5.027001',
336             },
337             );
338              
339             =end comment
340              
341             =cut
342              
343             return $tokenizer->make_token( $accept, TOKEN_UNKNOWN, {
344             error => 'Empty Unicode character name',
345             perl_version_introduced => '5.023008',
346             perl_version_removed => '5.027001',
347             },
348             );
349             },
350              
351             '\\o{}' => sub {
352             my ( $tokenizer, $accept ) = @_;
353             return $tokenizer->make_token( $accept, TOKEN_UNKNOWN, {
354             error => q,
355             },
356             );
357             },
358              
359             '\\x{}' => sub {
360             my ( $tokenizer, $accept ) = @_;
361             $tokenizer->strict()
362             and return $tokenizer->make_token( $accept,
363             TOKEN_UNKNOWN, {
364             error =>
365             q,
366             },
367             );
368             return $accept;
369             },
370             );
371              
372             sub _escaped {
373 2441     2441   4573 my ( $tokenizer, $character ) = @_;
374              
375 2441 100       7371 $character eq '\\'
376             or return;
377              
378 304 100       1660 if ( my $accept = $tokenizer->find_regexp( # {
379             qr< \A \\ ( [ox] ) [{] ( [^}]* ) [}] >smx
380             ) ) {
381 8         39 my $match = $tokenizer->match();
382 8         19 my $code;
383 8 100       49 $code = $special{$match}
384             and return $code->( $tokenizer, $accept );
385 5         20 my ( $kind, $value ) = $tokenizer->capture();
386             my $invalid = {
387             o => qr<[^0-7]>smx,
388             x => qr<[[:^xdigit:]]>smx,
389 5         61 }->{$kind};
390 5 100       143 $value =~ m/ $invalid /smxg # /g for pos()
391             or return $accept;
392             $tokenizer->strict()
393             and return $tokenizer->make_token( $accept,
394             TOKEN_UNKNOWN, {
395             error => sprintf(
396             'Non-%s character in \\%s{...}',
397             {
398             o => 'octal',
399             x => 'hex',
400 2 100       140 }->{$kind},
401             $kind,
402             ),
403             },
404             );
405             return $tokenizer->make_token( $accept, __PACKAGE__, {
406             ordinal => {
407 1     1   8 o => sub { oct $_[0] },
408 0     0   0 x => sub { hex $_[0] },
409 1   50     18 }->{$kind}->( substr( $value, 0, pos $value ) || 0 ),
410             },
411             );
412             }
413              
414 296 100       1392 if ( my $accept = $tokenizer->find_regexp(
415             qr< \A \\ (?:
416             [^\w\s] | # delimiters/metas
417             [tnrfae] | # C-style escapes
418             0 [01234567]{0,2} | # octal
419             # [01234567]{1,3} | # made from backref by lexer
420             c [][\@[:alpha:]\\^_?] | # control characters
421             ## x (?: \{ [[:xdigit:]]* \} | [[:xdigit:]]{0,2} ) | # hex
422             ## o [{] [01234567]+ [}] | # octal as of 5.13.3
423             x [[:xdigit:]]{0,2} | # hex - brackets handled above
424             ## N (?: \{ (?: [[:alpha:]] [\w\s:()-]* | # must begin w/ alpha
425             ## U [+] [[:xdigit:]]+ ) \} ) | # unicode
426             N (?: [{] (?= [^0-9] ) [^\}]* [}] ) # unicode
427             ) >smx ) ) {
428 69         298 my $match = $tokenizer->match();
429 69         139 my $code;
430 69 100       266 $code = $special{$match}
431             and return $code->( $tokenizer, $accept );
432 63         348 return $accept;
433             }
434              
435 227         799 return;
436             }
437             }
438              
439             =head2 ordinal
440              
441             print 'The ordinal of ', $token->content(),
442             ' is ', $token->ordinal(), "\n";
443              
444             This method returns the ordinal of the literal if it can figure it out.
445             It is analogous to the C built-in.
446              
447             It will not attempt to determine the ordinal of a unicode name
448             (C<\N{...}>) unless L has been loaded, and supports
449             the L)> function.
450             Instead, it will return C. Users of Perl 5.6.2 and older may be
451             out of luck here.
452              
453             Unicode code points (e.g. C<\N{U+abcd}>) should work independently of
454             L, and just return the value of C.
455              
456             It will never attempt to return the ordinal of an octet (C<\C{...}>)
457             because I don't understand the syntax.
458              
459             =cut
460              
461             {
462              
463             my %escapes = (
464             '\\t' => ord "\t",
465             '\\n' => ord "\n",
466             '\\r' => ord "\r",
467             '\\f' => ord "\f",
468             '\\a' => ord "\a",
469             '\\b' => ord "\b",
470             '\\e' => ord "\e",
471             '\\c?' => ord "\c?",
472             '\\c@' => ord "\c@",
473             '\\cA' => ord "\cA",
474             '\\ca' => ord "\cA",
475             '\\cB' => ord "\cB",
476             '\\cb' => ord "\cB",
477             '\\cC' => ord "\cC",
478             '\\cc' => ord "\cC",
479             '\\cD' => ord "\cD",
480             '\\cd' => ord "\cD",
481             '\\cE' => ord "\cE",
482             '\\ce' => ord "\cE",
483             '\\cF' => ord "\cF",
484             '\\cf' => ord "\cF",
485             '\\cG' => ord "\cG",
486             '\\cg' => ord "\cG",
487             '\\cH' => ord "\cH",
488             '\\ch' => ord "\cH",
489             '\\cI' => ord "\cI",
490             '\\ci' => ord "\cI",
491             '\\cJ' => ord "\cJ",
492             '\\cj' => ord "\cJ",
493             '\\cK' => ord "\cK",
494             '\\ck' => ord "\cK",
495             '\\cL' => ord "\cL",
496             '\\cl' => ord "\cL",
497             '\\cM' => ord "\cM",
498             '\\cm' => ord "\cM",
499             '\\cN' => ord "\cN",
500             '\\cn' => ord "\cN",
501             '\\cO' => ord "\cO",
502             '\\co' => ord "\cO",
503             '\\cP' => ord "\cP",
504             '\\cp' => ord "\cP",
505             '\\cQ' => ord "\cQ",
506             '\\cq' => ord "\cQ",
507             '\\cR' => ord "\cR",
508             '\\cr' => ord "\cR",
509             '\\cS' => ord "\cS",
510             '\\cs' => ord "\cS",
511             '\\cT' => ord "\cT",
512             '\\ct' => ord "\cT",
513             '\\cU' => ord "\cU",
514             '\\cu' => ord "\cU",
515             '\\cV' => ord "\cV",
516             '\\cv' => ord "\cV",
517             '\\cW' => ord "\cW",
518             '\\cw' => ord "\cW",
519             '\\cX' => ord "\cX",
520             '\\cx' => ord "\cX",
521             '\\cY' => ord "\cY",
522             '\\cy' => ord "\cY",
523             '\\cZ' => ord "\cZ",
524             '\\cz' => ord "\cZ",
525             '\\c[' => ord "\c[",
526             '\\c\\\\' => ord "\c\\", # " # Get Vim's head straight.
527             '\\c]' => ord "\c]",
528             '\\c^' => ord "\c^",
529             '\\c_' => ord "\c_",
530             );
531              
532             sub ordinal {
533 17     17 1 47 my ( $self ) = @_;
534 17 100       50 exists $self->{ordinal} and return $self->{ordinal};
535 16         57 return ( $self->{ordinal} = $self->_ordinal() );
536             }
537              
538             my %octal = map {; "$_" => 1 } ( 0 .. 7 );
539              
540             sub _ordinal {
541 16     16   31 my ( $self ) = @_;
542 16         46 my $content = $self->content();
543              
544 16 100       89 $content =~ m/ \A \\ /smx or return ord $content;
545              
546 8 100       33 exists $escapes{$content} and return $escapes{$content};
547              
548 6         19 my $indicator = substr $content, 1, 1;
549              
550 6 100       30 $octal{$indicator} and return oct substr $content, 1;
551              
552 5 100       15 if ( $indicator eq 'x' ) {
553 3 100       27 $content =~ m/ \A \\ x \{ ( [[:xdigit:]]* ) /smx
554             and return hex "0$1";
555 1 50       12 $content =~ m/ \A \\ x ( [[:xdigit:]]{0,2} ) \z /smx
556             and return hex $1;
557 0         0 return;
558             }
559              
560 2 50       7 if ( $indicator eq 'o' ) {
561 0 0       0 $content =~ m/ \A \\ o [{] ( [01234567]* ) \z /smx
562             and return oct "0$1";
563 0         0 return; # Shouldn't happen, but ...
564             }
565              
566 2 50       6 if ( $indicator eq 'N' ) {
567 2 100       14 $content =~ m/ \A \\ N \{ U [+] ( [[:xdigit:]]+ ) \} \z /smx
568             and return hex $1;
569 1 50       9 $content =~ m/ \A \\ N [{] ( .+ ) [}] \z /smx
    50          
570             and return (
571             _have_charnames_vianame() ?
572             charnames::vianame( $1 ) :
573             undef
574             );
575 0         0 return; # Shouldn't happen, but ...
576             }
577              
578 0         0 return ord $indicator;
579             }
580              
581             }
582              
583             sub __following_literal_left_curly_disallowed_in {
584 2     2   11 return LITERAL_LEFT_CURLY_REMOVED_PHASE_2;
585             }
586              
587             {
588             my $have_charnames_vianame;
589              
590             sub _have_charnames_vianame {
591 1 50   1   4 defined $have_charnames_vianame
592             and return $have_charnames_vianame;
593             return (
594 1 50       17 $have_charnames_vianame =
595             charnames->can( 'vianame' ) ? 1 : 0
596             );
597              
598             }
599             }
600              
601             sub __perl_requirements_setup {
602 4     4   11 my ( $self ) = @_;
603 4         7 my $prev;
604 4 50 66     9 q<{> eq $self->content() # }
      66        
605             and $prev = $self->sprevious_sibling()
606             and $prev->isa( 'PPIx::Regexp::Token::Literal' )
607             or return $self->SUPER::__perl_requirements_setup();
608             return (
609             {
610 1         10 introduced => MINIMUM_PERL,
611             removed => LITERAL_LEFT_CURLY_REMOVED_PHASE_1,
612             },
613             # TODO the following will be needed if this construction is
614             # re-allowed in 5.26.1:
615             # {
616             # introduced => '5.026001',
617             # removed => '6.027000',
618             # },
619             {
620             introduced => '5.027001',
621             removed => LITERAL_LEFT_CURLY_REMOVED_PHASE_2,
622             },
623             );
624             }
625              
626             *__PPIX_TOKENIZER__repl = \&__PPIX_TOKENIZER__regexp;
627              
628             1;
629              
630             __END__