File Coverage

blib/lib/PPIx/Regexp/Token/CharClass/Simple.pm

Criterion	Covered	Total	%
statement	73	73	100.0
branch	29	34	85.2
condition			n/a
subroutine	15	15	100.0
pod	3	3	100.0
total	120	125	96.0

line	stmt	bran	sub	pod	time	code
1						=head1 NAME
2
3						PPIx::Regexp::Token::CharClass::Simple - This class represents a simple character class
4
5						=head1 SYNOPSIS
6
7						use PPIx::Regexp::Dumper;
8						PPIx::Regexp::Dumper->new( 'qr{\w}smx' )
9						->print();
10
11						=head1 INHERITANCE
12
13						C is a
14						L.
15
16						C has no descendants.
17
18						=head1 DESCRIPTION
19
20						This class represents one of the simple character classes that can occur
21						anywhere in a regular expression. This includes not only the truly
22						simple things like \w, but also Unicode properties, including properties
23						with wildcard values.
24
25						=head1 METHODS
26
27						This class provides no public methods beyond those provided by its
28						superclass.
29
30						=cut
31
32						package PPIx::Regexp::Token::CharClass::Simple;
33
34	9		9		68	use strict;
	9				26
	9				257
35	9		9		45	use warnings;
	9				28
	9				251
36
37	9		9		42	use base qw{ PPIx::Regexp::Token::CharClass };
	9				21
	9				797
38
39	9				2218	use PPIx::Regexp::Constant qw{
40						COOKIE_CLASS
41						LITERAL_LEFT_CURLY_REMOVED_PHASE_1
42						LITERAL_LEFT_CURLY_REMOVED_PHASE_2
43						MINIMUM_PERL
44						TOKEN_LITERAL
45						TOKEN_UNKNOWN
46						@CARP_NOT
47	9		9		60	};
	9				17
48
49						our $VERSION = '0.087_01';
50
51	9				730	use constant UNICODE_PROPERTY_LITERAL_VALUE => qr/
52						\{ \s* \^? \w [\w:=\s-]* \} \|
53						[CLMNPSZ] # perluniprops for 5.26.1
54	9		9		70	/smx;
	9				17
55	9				19	use constant UNICODE_PROPERTY_LITERAL =>
56						qr/ \A \\ [Pp] (?:
57	9				2201	@{[ UNICODE_PROPERTY_LITERAL_VALUE ]}
58	9		9		61	) /smx;
	9				20
59
60						# CAVEAT: The following regular expression, despite its name, matches
61						# ALL unicode property values. To actually match a wildcard property you
62						# must first eliminate anything that matches UNICODE_PROPERTY_LITERAL or
63						# UNICODE_PROPERTY_NAME_MATCH
64	9				780	use constant UNICODE_PROPERTY_WILDCARD =>
65	9		9		71	qr/ \A \\ [Pp] \{ \s* [\w\s-]+ [:=] [^}]+ \} /smx;
	9				22
66
67	9				695	use constant UNICODE_PROPERTY_NAME_MATCH =>
68	9		9		55	qr< \A \\ [Pp] \{ \s* na (?: me? )? [:=] / [^/]+ / \} >smx;
	9				20
69
70	9				24	use constant UNICODE_PROPERTY =>
71	9				24	qr/ @{[ UNICODE_PROPERTY_LITERAL ]} \|
72	9				19	@{[ UNICODE_PROPERTY_NAME_MATCH ]} \|
73	9		9		54	@{[ UNICODE_PROPERTY_WILDCARD ]} /smx;
	9				27
	9				9224
74
75						{
76
77						my %kind_of_match = (
78						p => 'with',
79						P => 'without',
80						);
81
82						my %explanation = (
83						'.' => 'Match any character',
84						'\\C' => 'Match a single octet (removed in 5.23.0)',
85						'\\D' => 'Match any character but a decimal digit',
86						'\\H' => 'Match a non-horizontal-white-space character',
87						'\\N' => 'Match any character but a new-line character',
88						'\\R' => 'Match a generic new-line character',
89						'\\S' => 'Match non-white-space character',
90						'\\V' => 'Match a non-vertical-white-space character',
91						'\\W' => 'Match non-word character',
92						'\\X' => 'Match a Unicode extended grapheme cluster',
93						'\\d' => 'Match decimal digit',
94						'\\h' => 'Match a horizontal-white-space character',
95						'\\s' => 'Match white-space character',
96						'\\v' => 'Match a vertical-white-space character',
97						'\\w' => 'Match word character',
98						);
99
100						sub __explanation {
101	15		15		48	return \%explanation;
102						}
103
104						sub explain {
105	18		18	1	51	my ( $self ) = @_;
106	18	100			57	if ( $self->content() =~ m/ \A \\ ( [Pp] ) ( [{] .* [}] \| . ) \z /smx ) {
107	3				14	my ( $kind, $prop ) = ( $1, $2 );
108
109	3				7	my $literal = ( $prop =~
110	3				151	m/ \A @{[ UNICODE_PROPERTY_LITERAL_VALUE ]} \z /smx );
111
112	3	50			14	if ( 1 < length $prop ) {
113	3				11	$prop =~ s/ \A [{] //smx;
114	3				14	$prop =~ s/ [}] \z //smx;
115						}
116
117						$literal
118						and return sprintf
119						q,
120	3	100			23	$kind_of_match{$kind}, $prop;
121
122						return sprintf
123						q,
124	1				11	$kind_of_match{$kind}, $prop;
125						}
126	15				55	return $self->SUPER::explain();
127						}
128
129						}
130
131						##=head2 is_case_sensitive
132						##
133						##This override of the superclass method returns true for Unicode
134						##properties that specify case, and false (but defined) for all
135						##other character classes.
136						##
137						##The classes that specify case are documented in
138						##L.
139						##
140						##B This method returns false (but defined) for user-defined
141						##Unicode properties. It should return C. This bug B be fixed
142						##if I find a way to identify all system-defined Unicode properties.
143						##
144						##=cut
145						##
146						##sub is_case_sensitive {
147						## my ( $self ) = @_;
148						## exists $self->{is_case_sensitive}
149						## and return $self->{is_case_sensitive};
150						## return ( $self->{is_case_sensitive} = $self->_is_case_sensitive() );
151						##}
152
153						##{
154						## my %case_sensitive = map { $_ => 1 } qw{
155						## generalcategory=lowercaseletter generalcategory=ll
156						## gc=lowercaseletter gc=ll
157						## generalcategory=titlecaseletter generalcategory=lt
158						## gc=titlecaseletter gc=lt
159						## generalcategory=uppercaseletter generalcategory=lu
160						## gc=uppercaseletter gc=lu
161						## lowercaseletter lowercase lower ll
162						## titlecaseletter titlecase title lt
163						## uppercaseletter uppercase upper lu
164						## lowercase=y lower=y lowercase=n lower=n
165						## titlecase=y title=y titlecase=n title=n
166						## uppercase=y upper=y uppercase=n upper=n
167						## };
168						##
169						## sub _is_case_sensitive {
170						## my ( $self ) = @_;
171						## my $content = $self->content();
172						## $content =~ m/ \A \\ p [{] ( .* ) [}] /smxi
173						## or return 0;
174						## $content = lc $1;
175						## $content =~ s/ \A ^ //smx;
176						## $content =~ s/ [\s_-] //smxg;
177						## $content =~ s/ \A is //smx;
178						## $content =~ s/ : /=/smxg;
179						## $content =~ s/ = (?: yes \| t \| true ) \b /=y/smxg;
180						## $content =~ s/ = (?: no \| f \| false ) \b /=n/smxg;
181						## return $case_sensitive{$content} \|\| 0;
182						## }
183						##
184						##}
185
186						{
187
188						my %introduced = (
189						'\\h' => '5.009005', # Before this, parsed as 'h'
190						'\\v' => '5.009005', # Before this, parsed as 'v'
191						'\\H' => '5.009005', # Before this, parsed as 'H'
192						'\\N' => '5.011', # Before this, an error.
193						'\\V' => '5.009005', # Before this, parsed as 'V'
194						'\\R' => '5.009005',
195						'\\C' => '5.006',
196						'\\X' => '5.006',
197						);
198
199						sub perl_version_introduced {
200	31		31	1	5549	my ( $self ) = @_;
201	31				104	my $content = $self->content();
202	31	100			135	if ( defined( my $minver = $introduced{$content} ) ) {
203	9				27	return $minver;
204						}
205						# I must have read perl5113delta and thought this
206						# represented the change they were talking about, but I sure
207						# don't see it now. So, until things become clearer ...
208						# $content =~ m/ \G .*? [\s=-] /smxgc
209						# and return '5.011003';
210	22	100			121	$content =~ UNICODE_PROPERTY_LITERAL
211						and return '5.006001';
212	19	50			62	$content =~ UNICODE_PROPERTY_NAME_MATCH
213						and return '5.031010';
214	19	100			58	$content =~ UNICODE_PROPERTY_WILDCARD
215						and return '5.029009';
216	18				50	return MINIMUM_PERL;
217						}
218
219						}
220
221						{
222						my %removed = (
223						'\\C' => '5.023', # Before this, matched an octet
224						);
225
226						sub perl_version_removed {
227	31		31	1	11686	my ( $self ) = @_;
228	31				100	return $removed{ $self->content() };
229						}
230						}
231
232						# This is one of the larger complications of
233						# https://rt.perl.org/Public/Bug/Display.html?id=128213
234						# where it transpired that un-escaped literal left curlies were not
235						# giving warnings/errors in /.{/, /\p{...}{/, and /\P{...}{/, but were
236						# for all the others that bin into this class (e.g. /\s{/).
237						# Note that the perldelta for 5.25.1 and 5.26.0 do not acknowledge tha
238						# phased deprecation, and pretend that everything was done on the phase
239						# 1 schedule. This appears to be deliberate per
240						# https://rt.perl.org/Ticket/Display.html?id=131352
241						sub __following_literal_left_curly_disallowed_in {
242	4		4		10	my ( $self ) = @_;
243	4	100			11	q<.> eq ( my $content = $self->content() )
244						and return LITERAL_LEFT_CURLY_REMOVED_PHASE_2;
245	3	100			20	$content =~ m/ \A \\ p \{ /smxi
246						and return LITERAL_LEFT_CURLY_REMOVED_PHASE_2;
247	1				9	return LITERAL_LEFT_CURLY_REMOVED_PHASE_1;
248						}
249
250						sub __PPIX_TOKENIZER__regexp {
251	1286		1286		2803	my ( undef, $tokenizer, $character ) = @_;
252
253	1286				3302	my $in_class = $tokenizer->cookie( COOKIE_CLASS );
254
255	1286	100			3196	if ( $character eq '.' ) {
256	18	50			70	$in_class
257						and return $tokenizer->make_token( 1, TOKEN_LITERAL );
258	18				46	return 1;
259						}
260
261	1268	100			6114	if ( my $accept = $tokenizer->find_regexp(
262						qr{ \A \\ [wWsSdDvVhHXRNC] }smx
263						) ) {
264	83	100			334	if ( $in_class ) {
265	9				48	my $match = $tokenizer->match();
266						# As of Perl 5.11.5, [\N] is a fatal error.
267	9	50			39	'\\N' eq $match
268						and return $tokenizer->make_token(
269						$accept, TOKEN_UNKNOWN, {
270						error => '\\N invalid inside character class',
271						},
272						);
273						# \R is not recognized inside a character class. It
274						# eventually ends up as a literal.
275	9	50			46	'\\R' eq $match and return;
276						}
277	83				279	return $accept;
278						}
279
280	1185	100			3988	if ( my $accept = $tokenizer->find_regexp( UNICODE_PROPERTY ) ) {
281	13				65	return $accept;
282						}
283
284	1172	100			4598	if ( my $accept = $tokenizer->find_regexp( qr< \A \\ p [{] \s* [}] >smx )
285						) {
286	1				7	return $tokenizer->make_token( $accept, TOKEN_UNKNOWN, {
287						error => 'Empty \\p{} is an error',
288						},
289						);
290						}
291
292	1171				3521	return;
293						}
294
295						1;
296
297						__END__