File Coverage

blib/lib/PPIx/Regexp/Token/Literal.pm

Criterion	Covered	Total	%
statement	98	104	94.2
branch	87	100	87.0
condition	18	23	78.2
subroutine	18	19	94.7
pod	5	5	100.0
total	226	251	90.0

line	stmt	bran	cond	sub	pod	time	code
1							=head1 NAME
2
3							PPIx::Regexp::Token::Literal - Represent a literal character
4
5							=head1 SYNOPSIS
6
7							use PPIx::Regexp::Dumper;
8							PPIx::Regexp::Dumper->new( 'qr{foo}smx' )
9							->print();
10
11							=head1 INHERITANCE
12
13							C is a
14							L.
15
16							C has no descendants.
17
18							=head1 DESCRIPTION
19
20							This class represents a literal character, no matter how specified.
21
22							=head1 METHODS
23
24							This class provides the following public methods. Methods not documented
25							here are private, and unsupported in the sense that the author reserves
26							the right to change or remove them without notice.
27
28							=cut
29
30							package PPIx::Regexp::Token::Literal;
31
32	9			9		67	use strict;
	9					32
	9					264
33	9			9		49	use warnings;
	9					20
	9					247
34
35	9			9		47	use base qw{ PPIx::Regexp::Token };
	9					23
	9					801
36
37	9					1096	use PPIx::Regexp::Constant qw{
38							COOKIE_CLASS COOKIE_REGEX_SET
39							LITERAL_LEFT_CURLY_ALLOWED
40							LITERAL_LEFT_CURLY_REMOVED_PHASE_1
41							LITERAL_LEFT_CURLY_REMOVED_PHASE_2
42							LITERAL_LEFT_CURLY_REMOVED_PHASE_3
43							MINIMUM_PERL MSG_PROHIBITED_BY_STRICT
44							TOKEN_UNKNOWN
45							@CARP_NOT
46	9			9		63	};
	9					39
47
48	9			9		70	use PPIx::Regexp::Util qw{ :width_one };
	9					18
	9					25657
49
50							our $VERSION = '0.088';
51
52							sub __new {
53	1275			1275		4425	my ( $class, $content, %arg ) = @_;
54
55	1275	50				4795	my $self = $class->SUPER::__new( $content, %arg )
56							or return;
57
58							defined $arg{ordinal}
59	1275	100				3430	and $self->{ordinal} = $arg{ordinal};
60
61	1275					3917	return $self;
62							}
63
64							# Return true if the token can be quantified, and false otherwise
65							# sub can_be_quantified { return };
66
67							sub explain {
68	1			1	1	4	return 'Literal character';
69							}
70
71							=head2 is_matcher
72
73							This method returns a true value because a literal matches itself.
74
75							=cut
76
77	8			8	1	22	sub is_matcher { return 1; }
78
79							sub perl_version_introduced {
80	258			258	1	8940	my ( $self ) = @_;
81							exists $self->{perl_version_introduced}
82	258	100				869	and return $self->{perl_version_introduced};
83	176					456	my $content = $self->content();
84	176					567	my $main = $self->main_structure();
85							$main
86							and $content =~ m/ \A \\ N \{ /smx
87							and not $main->interpolates()
88	176	100	100			751	and return ( $self->{perl_version_introduced} = '5.029010' );
			66
89							$content =~ m/ \A \\ o /smx
90	175	100				397	and return ( $self->{perl_version_introduced} = '5.013003' );
91							$content =~ m/ \A \\ N [{] U [+] /smx
92	174	100				374	and return ( $self->{perl_version_introduced} = '5.008' );
93							$content =~ m/ \A \\ x [{] /smx # }
94	173	100				368	and return ( $self->{perl_version_introduced} = '5.006' );
95							$content =~ m/ \A \\ N /smx
96	172	100				412	and return ( $self->{perl_version_introduced} = '5.006001' );
97	171					620	return ( $self->{perl_version_introduced} = MINIMUM_PERL );
98							}
99
100							{
101							my %removed = (
102							q<{> => sub {
103							my ( $self ) = @_;
104
105							my $prev;
106
107							if ( $prev = $self->sprevious_sibling() ) {
108							# When an unescaped left curly follows something else in
109							# the same structure, the logic on whether it is allowed
110							# lives, for better or worse, on the sibling.
111							return $prev->__following_literal_left_curly_disallowed_in();
112							} elsif ( $prev = $self->sprevious_element() ) {
113							# Perl 5.27.8 deprecated unescaped literal left curlys
114							# after a left paren that introduces a group. Therefore
115							# the left curly has no previous sibling. But the curly
116							# is still legal at the beginning of a regex, even one
117							# delimited by parens, so we can not return when we find
118							# a PPIx::Regexp::Token::Delimiter, which is a subclass
119							# of PPIx::Regexp::Token::Structure.
120							$prev->isa( 'PPIx::Regexp::Token::Structure' )
121							and q<(> eq $prev->content()
122							and not $prev->isa( 'PPIx::Regexp::Token::Delimiter' )
123							and return LITERAL_LEFT_CURLY_REMOVED_PHASE_3;
124							}
125							# When this mess started, the idea was to always allow
126							# unescaped literal left curlies that started a regex or a
127							# group
128							return LITERAL_LEFT_CURLY_ALLOWED;
129							},
130							);
131
132							sub perl_version_removed {
133	227			227	1	14971	my ( $self ) = @_;
134							exists $self->{perl_version_removed}
135	227	100				799	and return $self->{perl_version_removed};
136	145					238	my $code;
137							return ( $self->{perl_version_removed} =
138	145	100				341	( $code = $removed{$self->content()} ) ?
139							scalar $code->( $self ) : undef
140							);
141							}
142							}
143
144							# Some characters may or may not be literals depending on whether we are
145							# inside a character class. The following hash identifies those
146							# characters and says what we should return when outside (index 0) or
147							# inside (index 1) a character class, as judged by the presence of the
148							# relevant cookie.
149							my %double_agent = (
150							'.' => [ undef, 1 ],
151							'*' => [ undef, 1 ],
152							'?' => [ undef, 1 ],
153							'+' => [ undef, 1 ],
154							'-' => [ 1, undef ],
155							'\|' => [ undef, 1 ],
156							);
157
158							# These are the characters that other external tokenizers need to see,
159							# or at least that we need to take a closer look at. All others can be
160							# unconditionally made into single-character literals.
161							my %extra_ordinary = map { $_ => 1 }
162							split qr{}smx, '$@*+?.\\(){}[]^\|-#';
163							# $ -> Token::Interpolation, Token::Assertion
164							# @ -> Token::Interpolation
165							# * -> Token::Quantifier
166							# + ? -> Token::Quantifier, Token::Greediness
167							# . -> Token::CharClass::Simple
168							# \ -> Token::Control, Token::CharClass::Simple, Token::Assertion,
169							# Token::Backreference
170							# ( ) { } [ ] -> Token::Structure
171							# ^ -> Token::Assertion
172							# \| - -> Token::Operator
173
174							my %regex_set_operator = map { $_ => 1 } qw{ & + \| - ^ ! };
175
176							# The regex for the extended white space available under regex sets in
177							# Perl 5.17.8 and in general in perl 5.17.9. I have been unable to get
178							# this to work under Perl 5.6.2, so for that we fall back to ASCII white
179							# space. The stringy eval is because I have been unable to get
180							# satisfaction out of either interpolated characters (in general) or
181							# eval-ed "\N{U+...}" (under 5.6.2) or \x{...} (ditto).
182							#
183							# See PPIx::Regexp::Structure::RegexSet for the documentation of this
184							# mess.
185							# my $white_space_re = $] >= 5.008 ?
186							# 'qr< \\A [\\s\\N{U+0085}\\N{U+200E}\\N{U+200F}\\N{U+2028}\\N{U+2029}]+ >smx' :
187							# 'qr< \\A \\s+ >smx';
188							#
189							# RT #91798
190							# The above turns out to be wrong, because \s matches too many
191							# characters. We need the following to get the right match. Note that
192							# \cK was added experimentally in 5.17.0 and made it into 5.18. The \N{}
193							# characters were NOT added (as I originally thought) but were simply
194							# made characters that generated warnings when escaped, in preparation
195							# for adding them. When they actually get added, I will have to add back
196							# the trinary operator. Sigh.
197							# my $white_space_re = 'qr< \A [\t\n\cK\f\r ] >smx';
198							#
199							# The extended white space characters came back in Perl 5.21.1.
200							my $white_space_re = $] >= 5.008 ?
201							'qr< \\A [\\t\\n\\cK\\f\\r \\N{U+0085}\\N{U+200E}\\N{U+200F}\\N{U+2028}\\N{U+2029}]+ >smx' :
202							'qr< \\A [\\t\\n\\cK\\f\\r ]+ >smx';
203							$white_space_re = eval $white_space_re; ## no critic (ProhibitStringyEval)
204
205							my %regex_pass_on = map { $_ => 1 } qw{ [ ] ( ) $ \ };
206
207							sub __PPIX_TOKENIZER__regexp {
208	2824			2824		6394	my ( undef, $tokenizer, $character ) = @_; # Invocant, $char_type unused
209
210	2824	100				7458	if ( $tokenizer->cookie( COOKIE_REGEX_SET ) ) {
211							# If we're inside a regex set no literals are allowed, but not
212							# all characters that get here are seen as literals.
213
214	105	100				363	$regex_set_operator{$character}
215							and return $tokenizer->make_token(
216							length $character, 'PPIx::Regexp::Token::Operator' );
217
218	92					142	my $accept;
219
220							# As of 5.23.4, only space and horizontal tab are legal white
221							# space inside a bracketed class inside an extended character
222							# class
223	92	100				208	$accept = $tokenizer->find_regexp(
		100
224							$tokenizer->cookie( COOKIE_CLASS ) ?
225							qr{ \A [ \t] }smx :
226							$white_space_re
227							)
228							and return $tokenizer->make_token(
229							$accept, 'PPIx::Regexp::Token::Whitespace' );
230
231	59	50				179	$accept = _escaped( $tokenizer, $character )
232							and return $accept;
233
234	59	100				217	$regex_pass_on{$character}
235							and return;
236
237							# At this point we have a single character which is poised to be
238							# interpreted as a literal. These are not legal in a regex set
239							# except when also in a bracketed class.
240	8	50				50	return $tokenizer->cookie( COOKIE_CLASS ) ?
241							length $character :
242							$tokenizer->make_token(
243							length $character, TOKEN_UNKNOWN, {
244							error => 'Literal not valid in Regex set',
245							},
246							);
247
248							} else {
249
250							# Otherwise handle the characters that may or may not be
251							# literals depending on whether or not we are in a character
252							# class.
253	2719	100				8372	if ( my $class = $double_agent{$character} ) {
254	178	100				660	my $inx = $tokenizer->cookie( COOKIE_CLASS ) ? 1 : 0;
255	178					738	return $class->[$inx];
256							}
257							}
258
259							# If /x is in effect _and_ we are not inside a character class, \s
260							# is whitespace, and '#' introduces a comment. Otherwise they are
261							# both literals.
262	2541	100	100			7235	if ( $tokenizer->modifier( 'x*' ) &&
		100	66
263							! $tokenizer->cookie( COOKIE_CLASS ) ) {
264	276					500	my $accept;
265	276	100				791	$accept = $tokenizer->find_regexp( $white_space_re )
266							and return $tokenizer->make_token(
267							$accept, 'PPIx::Regexp::Token::Whitespace' );
268	181	100				783	$accept = $tokenizer->find_regexp(
269							qr{ \A \# [^\n]* (?: \n \| \z) }smx )
270							and return $tokenizer->make_token(
271							$accept, 'PPIx::Regexp::Token::Comment' );
272							} elsif ( $tokenizer->modifier( 'xx' ) &&
273							$tokenizer->cookie( COOKIE_CLASS ) ) {
274	10					21	my $accept;
275	10	100				47	$accept = $tokenizer->find_regexp( qr{ \A [ \t] }smx )
276							and return $tokenizer->make_token(
277							$accept, 'PPIx::Regexp::Token::Whitespace',
278							{ perl_version_introduced => '5.025009' },
279							);
280							} else {
281	2255	100	100			10672	( $character eq '#' \|\| $character =~ m/ \A \s \z /smx )
282							and return 1;
283							}
284
285	2382					4319	my $accept;
286	2382	100				5957	$accept = _escaped( $tokenizer, $character )
287							and return $accept;
288
289							# All other characters which are not extra ordinary get accepted.
290	2305	100				7619	$extra_ordinary{$character} or return 1;
291
292	1233					2960	return;
293							}
294
295							=begin comment
296
297							The following is from perlop:
298
299							The character following "\c" is mapped to some other character by
300							converting letters to upper case and then (on ASCII systems) by
301							inverting the 7th bit (0x40). The most interesting range is from '@' to
302							'_' (0x40 through 0x5F), resulting in a control character from 0x00
303							through 0x1F. A '?' maps to the DEL character. On EBCDIC systems only
304							'@', the letters, '[', '\', ']', '^', '_' and '?' will work, resulting
305							in 0x00 through 0x1F and 0x7F.
306
307							=end comment
308
309							=cut
310
311							# Recognize all the escaped constructions that generate literal
312							# characters in one gigantic regexp. Technically \1.. through \7.. are
313							# octal literals too, but we can not disambiguate these from back
314							# references until we know how many there are. So the lexer gets another
315							# dirty job.
316
317							{
318							my %special = (
319							'\\N{}' => sub {
320							my ( $tokenizer, $accept ) = @_;
321
322							=begin comment
323
324							$tokenizer->strict()
325							or return $tokenizer->make_token( $accept,
326							'PPIx::Regexp::Token::NoOp', {
327							perl_version_removed => '5.027001',
328							},
329							);
330							return $tokenizer->make_token( $accept, TOKEN_UNKNOWN, {
331							error => join( ' ',
332							'Empty Unicode character name',
333							MSG_PROHIBITED_BY_STRICT ),
334							perl_version_introduced => '5.023008',
335							perl_version_removed => '5.027001',
336							},
337							);
338
339							=end comment
340
341							=cut
342
343							return $tokenizer->make_token( $accept, TOKEN_UNKNOWN, {
344							error => 'Empty Unicode character name',
345							perl_version_introduced => '5.023008',
346							perl_version_removed => '5.027001',
347							},
348							);
349							},
350
351							'\\o{}' => sub {
352							my ( $tokenizer, $accept ) = @_;
353							return $tokenizer->make_token( $accept, TOKEN_UNKNOWN, {
354							error => q,
355							},
356							);
357							},
358
359							'\\x{}' => sub {
360							my ( $tokenizer, $accept ) = @_;
361							$tokenizer->strict()
362							and return $tokenizer->make_token( $accept,
363							TOKEN_UNKNOWN, {
364							error =>
365							q,
366							},
367							);
368							return $accept;
369							},
370							);
371
372							sub _escaped {
373	2441			2441		4573	my ( $tokenizer, $character ) = @_;
374
375	2441	100				7371	$character eq '\\'
376							or return;
377
378	304	100				1660	if ( my $accept = $tokenizer->find_regexp( # {
379							qr< \A \\ ( [ox] ) [{] ( [^}]* ) [}] >smx
380							) ) {
381	8					39	my $match = $tokenizer->match();
382	8					19	my $code;
383	8	100				49	$code = $special{$match}
384							and return $code->( $tokenizer, $accept );
385	5					20	my ( $kind, $value ) = $tokenizer->capture();
386							my $invalid = {
387							o => qr<[^0-7]>smx,
388							x => qr<[[:^xdigit:]]>smx,
389	5					61	}->{$kind};
390	5	100				143	$value =~ m/ $invalid /smxg # /g for pos()
391							or return $accept;
392							$tokenizer->strict()
393							and return $tokenizer->make_token( $accept,
394							TOKEN_UNKNOWN, {
395							error => sprintf(
396							'Non-%s character in \\%s{...}',
397							{
398							o => 'octal',
399							x => 'hex',
400	2	100				140	}->{$kind},
401							$kind,
402							),
403							},
404							);
405							return $tokenizer->make_token( $accept, __PACKAGE__, {
406							ordinal => {
407	1			1		8	o => sub { oct $_[0] },
408	0			0		0	x => sub { hex $_[0] },
409	1		50			18	}->{$kind}->( substr( $value, 0, pos $value ) \|\| 0 ),
410							},
411							);
412							}
413
414	296	100				1392	if ( my $accept = $tokenizer->find_regexp(
415							qr< \A \\ (?:
416							[^\w\s] \| # delimiters/metas
417							[tnrfae] \| # C-style escapes
418							0 [01234567]{0,2} \| # octal
419							# [01234567]{1,3} \| # made from backref by lexer
420							c [][\@[:alpha:]\\^_?] \| # control characters
421							## x (?: \{ [[:xdigit:]]* \} \| [[:xdigit:]]{0,2} ) \| # hex
422							## o [{] [01234567]+ [}] \| # octal as of 5.13.3
423							x [[:xdigit:]]{0,2} \| # hex - brackets handled above
424							## N (?: \{ (?: [[:alpha:]] [\w\s:()-]* \| # must begin w/ alpha
425							## U [+] [[:xdigit:]]+ ) \} ) \| # unicode
426							N (?: [{] (?= [^0-9] ) [^\}]* [}] ) # unicode
427							) >smx ) ) {
428	69					298	my $match = $tokenizer->match();
429	69					139	my $code;
430	69	100				266	$code = $special{$match}
431							and return $code->( $tokenizer, $accept );
432	63					348	return $accept;
433							}
434
435	227					799	return;
436							}
437							}
438
439							=head2 ordinal
440
441							print 'The ordinal of ', $token->content(),
442							' is ', $token->ordinal(), "\n";
443
444							This method returns the ordinal of the literal if it can figure it out.
445							It is analogous to the C built-in.
446
447							It will not attempt to determine the ordinal of a unicode name
448							(C<\N{...}>) unless L has been loaded, and supports
449							the L)> function.
450							Instead, it will return C. Users of Perl 5.6.2 and older may be
451							out of luck here.
452
453							Unicode code points (e.g. C<\N{U+abcd}>) should work independently of
454							L, and just return the value of C.
455
456							It will never attempt to return the ordinal of an octet (C<\C{...}>)
457							because I don't understand the syntax.
458
459							=cut
460
461							{
462
463							my %escapes = (
464							'\\t' => ord "\t",
465							'\\n' => ord "\n",
466							'\\r' => ord "\r",
467							'\\f' => ord "\f",
468							'\\a' => ord "\a",
469							'\\b' => ord "\b",
470							'\\e' => ord "\e",
471							'\\c?' => ord "\c?",
472							'\\c@' => ord "\c@",
473							'\\cA' => ord "\cA",
474							'\\ca' => ord "\cA",
475							'\\cB' => ord "\cB",
476							'\\cb' => ord "\cB",
477							'\\cC' => ord "\cC",
478							'\\cc' => ord "\cC",
479							'\\cD' => ord "\cD",
480							'\\cd' => ord "\cD",
481							'\\cE' => ord "\cE",
482							'\\ce' => ord "\cE",
483							'\\cF' => ord "\cF",
484							'\\cf' => ord "\cF",
485							'\\cG' => ord "\cG",
486							'\\cg' => ord "\cG",
487							'\\cH' => ord "\cH",
488							'\\ch' => ord "\cH",
489							'\\cI' => ord "\cI",
490							'\\ci' => ord "\cI",
491							'\\cJ' => ord "\cJ",
492							'\\cj' => ord "\cJ",
493							'\\cK' => ord "\cK",
494							'\\ck' => ord "\cK",
495							'\\cL' => ord "\cL",
496							'\\cl' => ord "\cL",
497							'\\cM' => ord "\cM",
498							'\\cm' => ord "\cM",
499							'\\cN' => ord "\cN",
500							'\\cn' => ord "\cN",
501							'\\cO' => ord "\cO",
502							'\\co' => ord "\cO",
503							'\\cP' => ord "\cP",
504							'\\cp' => ord "\cP",
505							'\\cQ' => ord "\cQ",
506							'\\cq' => ord "\cQ",
507							'\\cR' => ord "\cR",
508							'\\cr' => ord "\cR",
509							'\\cS' => ord "\cS",
510							'\\cs' => ord "\cS",
511							'\\cT' => ord "\cT",
512							'\\ct' => ord "\cT",
513							'\\cU' => ord "\cU",
514							'\\cu' => ord "\cU",
515							'\\cV' => ord "\cV",
516							'\\cv' => ord "\cV",
517							'\\cW' => ord "\cW",
518							'\\cw' => ord "\cW",
519							'\\cX' => ord "\cX",
520							'\\cx' => ord "\cX",
521							'\\cY' => ord "\cY",
522							'\\cy' => ord "\cY",
523							'\\cZ' => ord "\cZ",
524							'\\cz' => ord "\cZ",
525							'\\c[' => ord "\c[",
526							'\\c\\\\' => ord "\c\\", # " # Get Vim's head straight.
527							'\\c]' => ord "\c]",
528							'\\c^' => ord "\c^",
529							'\\c_' => ord "\c_",
530							);
531
532							sub ordinal {
533	17			17	1	47	my ( $self ) = @_;
534	17	100				50	exists $self->{ordinal} and return $self->{ordinal};
535	16					57	return ( $self->{ordinal} = $self->_ordinal() );
536							}
537
538							my %octal = map {; "$_" => 1 } ( 0 .. 7 );
539
540							sub _ordinal {
541	16			16		31	my ( $self ) = @_;
542	16					46	my $content = $self->content();
543
544	16	100				89	$content =~ m/ \A \\ /smx or return ord $content;
545
546	8	100				33	exists $escapes{$content} and return $escapes{$content};
547
548	6					19	my $indicator = substr $content, 1, 1;
549
550	6	100				30	$octal{$indicator} and return oct substr $content, 1;
551
552	5	100				15	if ( $indicator eq 'x' ) {
553	3	100				27	$content =~ m/ \A \\ x \{ ( [[:xdigit:]]* ) /smx
554							and return hex "0$1";
555	1	50				12	$content =~ m/ \A \\ x ( [[:xdigit:]]{0,2} ) \z /smx
556							and return hex $1;
557	0					0	return;
558							}
559
560	2	50				7	if ( $indicator eq 'o' ) {
561	0	0				0	$content =~ m/ \A \\ o [{] ( [01234567]* ) \z /smx
562							and return oct "0$1";
563	0					0	return; # Shouldn't happen, but ...
564							}
565
566	2	50				6	if ( $indicator eq 'N' ) {
567	2	100				14	$content =~ m/ \A \\ N \{ U [+] ( [[:xdigit:]]+ ) \} \z /smx
568							and return hex $1;
569	1	50				9	$content =~ m/ \A \\ N [{] ( .+ ) [}] \z /smx
		50
570							and return (
571							_have_charnames_vianame() ?
572							charnames::vianame( $1 ) :
573							undef
574							);
575	0					0	return; # Shouldn't happen, but ...
576							}
577
578	0					0	return ord $indicator;
579							}
580
581							}
582
583							sub __following_literal_left_curly_disallowed_in {
584	2			2		11	return LITERAL_LEFT_CURLY_REMOVED_PHASE_2;
585							}
586
587							{
588							my $have_charnames_vianame;
589
590							sub _have_charnames_vianame {
591	1	50		1		4	defined $have_charnames_vianame
592							and return $have_charnames_vianame;
593							return (
594	1	50				17	$have_charnames_vianame =
595							charnames->can( 'vianame' ) ? 1 : 0
596							);
597
598							}
599							}
600
601							sub __perl_requirements_setup {
602	4			4		11	my ( $self ) = @_;
603	4					7	my $prev;
604	4	50	66			9	q<{> eq $self->content() # }
			66
605							and $prev = $self->sprevious_sibling()
606							and $prev->isa( 'PPIx::Regexp::Token::Literal' )
607							or return $self->SUPER::__perl_requirements_setup();
608							return (
609							{
610	1					10	introduced => MINIMUM_PERL,
611							removed => LITERAL_LEFT_CURLY_REMOVED_PHASE_1,
612							},
613							# TODO the following will be needed if this construction is
614							# re-allowed in 5.26.1:
615							# {
616							# introduced => '5.026001',
617							# removed => '6.027000',
618							# },
619							{
620							introduced => '5.027001',
621							removed => LITERAL_LEFT_CURLY_REMOVED_PHASE_2,
622							},
623							);
624							}
625
626							*__PPIX_TOKENIZER__repl = \&__PPIX_TOKENIZER__regexp;
627
628							1;
629
630							__END__