File Coverage

blib/lib/PPIx/Regexp/Token/CharClass/Simple.pm
Criterion Covered Total %
statement 73 73 100.0
branch 29 34 85.2
condition n/a
subroutine 15 15 100.0
pod 3 3 100.0
total 120 125 96.0


line stmt bran cond sub pod time code
1             =head1 NAME
2              
3             PPIx::Regexp::Token::CharClass::Simple - This class represents a simple character class
4              
5             =head1 SYNOPSIS
6              
7             use PPIx::Regexp::Dumper;
8             PPIx::Regexp::Dumper->new( 'qr{\w}smx' )
9             ->print();
10              
11             =head1 INHERITANCE
12              
13             C is a
14             L.
15              
16             C has no descendants.
17              
18             =head1 DESCRIPTION
19              
20             This class represents one of the simple character classes that can occur
21             anywhere in a regular expression. This includes not only the truly
22             simple things like \w, but also Unicode properties, including properties
23             with wildcard values.
24              
25             =head1 METHODS
26              
27             This class provides no public methods beyond those provided by its
28             superclass.
29              
30             =cut
31              
32             package PPIx::Regexp::Token::CharClass::Simple;
33              
34 9     9   68 use strict;
  9         26  
  9         257  
35 9     9   45 use warnings;
  9         28  
  9         251  
36              
37 9     9   42 use base qw{ PPIx::Regexp::Token::CharClass };
  9         21  
  9         797  
38              
39 9         2218 use PPIx::Regexp::Constant qw{
40             COOKIE_CLASS
41             LITERAL_LEFT_CURLY_REMOVED_PHASE_1
42             LITERAL_LEFT_CURLY_REMOVED_PHASE_2
43             MINIMUM_PERL
44             TOKEN_LITERAL
45             TOKEN_UNKNOWN
46             @CARP_NOT
47 9     9   60 };
  9         17  
48              
49             our $VERSION = '0.087_01';
50              
51 9         730 use constant UNICODE_PROPERTY_LITERAL_VALUE => qr/
52             \{ \s* \^? \w [\w:=\s-]* \} |
53             [CLMNPSZ] # perluniprops for 5.26.1
54 9     9   70 /smx;
  9         17  
55 9         19 use constant UNICODE_PROPERTY_LITERAL =>
56             qr/ \A \\ [Pp] (?:
57 9         2201 @{[ UNICODE_PROPERTY_LITERAL_VALUE ]}
58 9     9   61 ) /smx;
  9         20  
59              
60             # CAVEAT: The following regular expression, despite its name, matches
61             # ALL unicode property values. To actually match a wildcard property you
62             # must first eliminate anything that matches UNICODE_PROPERTY_LITERAL or
63             # UNICODE_PROPERTY_NAME_MATCH
64 9         780 use constant UNICODE_PROPERTY_WILDCARD =>
65 9     9   71 qr/ \A \\ [Pp] \{ \s* [\w\s-]+ [:=] [^}]+ \} /smx;
  9         22  
66              
67 9         695 use constant UNICODE_PROPERTY_NAME_MATCH =>
68 9     9   55 qr< \A \\ [Pp] \{ \s* na (?: me? )? [:=] / [^/]+ / \} >smx;
  9         20  
69              
70 9         24 use constant UNICODE_PROPERTY =>
71 9         24 qr/ @{[ UNICODE_PROPERTY_LITERAL ]} |
72 9         19 @{[ UNICODE_PROPERTY_NAME_MATCH ]} |
73 9     9   54 @{[ UNICODE_PROPERTY_WILDCARD ]} /smx;
  9         27  
  9         9224  
74              
75             {
76              
77             my %kind_of_match = (
78             p => 'with',
79             P => 'without',
80             );
81              
82             my %explanation = (
83             '.' => 'Match any character',
84             '\\C' => 'Match a single octet (removed in 5.23.0)',
85             '\\D' => 'Match any character but a decimal digit',
86             '\\H' => 'Match a non-horizontal-white-space character',
87             '\\N' => 'Match any character but a new-line character',
88             '\\R' => 'Match a generic new-line character',
89             '\\S' => 'Match non-white-space character',
90             '\\V' => 'Match a non-vertical-white-space character',
91             '\\W' => 'Match non-word character',
92             '\\X' => 'Match a Unicode extended grapheme cluster',
93             '\\d' => 'Match decimal digit',
94             '\\h' => 'Match a horizontal-white-space character',
95             '\\s' => 'Match white-space character',
96             '\\v' => 'Match a vertical-white-space character',
97             '\\w' => 'Match word character',
98             );
99              
100             sub __explanation {
101 15     15   48 return \%explanation;
102             }
103              
104             sub explain {
105 18     18 1 51 my ( $self ) = @_;
106 18 100       57 if ( $self->content() =~ m/ \A \\ ( [Pp] ) ( [{] .* [}] | . ) \z /smx ) {
107 3         14 my ( $kind, $prop ) = ( $1, $2 );
108              
109 3         7 my $literal = ( $prop =~
110 3         151 m/ \A @{[ UNICODE_PROPERTY_LITERAL_VALUE ]} \z /smx );
111              
112 3 50       14 if ( 1 < length $prop ) {
113 3         11 $prop =~ s/ \A [{] //smx;
114 3         14 $prop =~ s/ [}] \z //smx;
115             }
116              
117             $literal
118             and return sprintf
119             q,
120 3 100       23 $kind_of_match{$kind}, $prop;
121              
122             return sprintf
123             q,
124 1         11 $kind_of_match{$kind}, $prop;
125             }
126 15         55 return $self->SUPER::explain();
127             }
128              
129             }
130              
131             ##=head2 is_case_sensitive
132             ##
133             ##This override of the superclass method returns true for Unicode
134             ##properties that specify case, and false (but defined) for all
135             ##other character classes.
136             ##
137             ##The classes that specify case are documented in
138             ##L.
139             ##
140             ##B This method returns false (but defined) for user-defined
141             ##Unicode properties. It should return C. This bug B be fixed
142             ##if I find a way to identify all system-defined Unicode properties.
143             ##
144             ##=cut
145             ##
146             ##sub is_case_sensitive {
147             ## my ( $self ) = @_;
148             ## exists $self->{is_case_sensitive}
149             ## and return $self->{is_case_sensitive};
150             ## return ( $self->{is_case_sensitive} = $self->_is_case_sensitive() );
151             ##}
152              
153             ##{
154             ## my %case_sensitive = map { $_ => 1 } qw{
155             ## generalcategory=lowercaseletter generalcategory=ll
156             ## gc=lowercaseletter gc=ll
157             ## generalcategory=titlecaseletter generalcategory=lt
158             ## gc=titlecaseletter gc=lt
159             ## generalcategory=uppercaseletter generalcategory=lu
160             ## gc=uppercaseletter gc=lu
161             ## lowercaseletter lowercase lower ll
162             ## titlecaseletter titlecase title lt
163             ## uppercaseletter uppercase upper lu
164             ## lowercase=y lower=y lowercase=n lower=n
165             ## titlecase=y title=y titlecase=n title=n
166             ## uppercase=y upper=y uppercase=n upper=n
167             ## };
168             ##
169             ## sub _is_case_sensitive {
170             ## my ( $self ) = @_;
171             ## my $content = $self->content();
172             ## $content =~ m/ \A \\ p [{] ( .* ) [}] /smxi
173             ## or return 0;
174             ## $content = lc $1;
175             ## $content =~ s/ \A ^ //smx;
176             ## $content =~ s/ [\s_-] //smxg;
177             ## $content =~ s/ \A is //smx;
178             ## $content =~ s/ : /=/smxg;
179             ## $content =~ s/ = (?: yes | t | true ) \b /=y/smxg;
180             ## $content =~ s/ = (?: no | f | false ) \b /=n/smxg;
181             ## return $case_sensitive{$content} || 0;
182             ## }
183             ##
184             ##}
185              
186             {
187              
188             my %introduced = (
189             '\\h' => '5.009005', # Before this, parsed as 'h'
190             '\\v' => '5.009005', # Before this, parsed as 'v'
191             '\\H' => '5.009005', # Before this, parsed as 'H'
192             '\\N' => '5.011', # Before this, an error.
193             '\\V' => '5.009005', # Before this, parsed as 'V'
194             '\\R' => '5.009005',
195             '\\C' => '5.006',
196             '\\X' => '5.006',
197             );
198              
199             sub perl_version_introduced {
200 31     31 1 5549 my ( $self ) = @_;
201 31         104 my $content = $self->content();
202 31 100       135 if ( defined( my $minver = $introduced{$content} ) ) {
203 9         27 return $minver;
204             }
205             # I must have read perl5113delta and thought this
206             # represented the change they were talking about, but I sure
207             # don't see it now. So, until things become clearer ...
208             # $content =~ m/ \G .*? [\s=-] /smxgc
209             # and return '5.011003';
210 22 100       121 $content =~ UNICODE_PROPERTY_LITERAL
211             and return '5.006001';
212 19 50       62 $content =~ UNICODE_PROPERTY_NAME_MATCH
213             and return '5.031010';
214 19 100       58 $content =~ UNICODE_PROPERTY_WILDCARD
215             and return '5.029009';
216 18         50 return MINIMUM_PERL;
217             }
218              
219             }
220              
221             {
222             my %removed = (
223             '\\C' => '5.023', # Before this, matched an octet
224             );
225              
226             sub perl_version_removed {
227 31     31 1 11686 my ( $self ) = @_;
228 31         100 return $removed{ $self->content() };
229             }
230             }
231              
232             # This is one of the larger complications of
233             # https://rt.perl.org/Public/Bug/Display.html?id=128213
234             # where it transpired that un-escaped literal left curlies were not
235             # giving warnings/errors in /.{/, /\p{...}{/, and /\P{...}{/, but were
236             # for all the others that bin into this class (e.g. /\s{/).
237             # Note that the perldelta for 5.25.1 and 5.26.0 do not acknowledge tha
238             # phased deprecation, and pretend that everything was done on the phase
239             # 1 schedule. This appears to be deliberate per
240             # https://rt.perl.org/Ticket/Display.html?id=131352
241             sub __following_literal_left_curly_disallowed_in {
242 4     4   10 my ( $self ) = @_;
243 4 100       11 q<.> eq ( my $content = $self->content() )
244             and return LITERAL_LEFT_CURLY_REMOVED_PHASE_2;
245 3 100       20 $content =~ m/ \A \\ p \{ /smxi
246             and return LITERAL_LEFT_CURLY_REMOVED_PHASE_2;
247 1         9 return LITERAL_LEFT_CURLY_REMOVED_PHASE_1;
248             }
249              
250             sub __PPIX_TOKENIZER__regexp {
251 1286     1286   2803 my ( undef, $tokenizer, $character ) = @_;
252              
253 1286         3302 my $in_class = $tokenizer->cookie( COOKIE_CLASS );
254              
255 1286 100       3196 if ( $character eq '.' ) {
256 18 50       70 $in_class
257             and return $tokenizer->make_token( 1, TOKEN_LITERAL );
258 18         46 return 1;
259             }
260              
261 1268 100       6114 if ( my $accept = $tokenizer->find_regexp(
262             qr{ \A \\ [wWsSdDvVhHXRNC] }smx
263             ) ) {
264 83 100       334 if ( $in_class ) {
265 9         48 my $match = $tokenizer->match();
266             # As of Perl 5.11.5, [\N] is a fatal error.
267 9 50       39 '\\N' eq $match
268             and return $tokenizer->make_token(
269             $accept, TOKEN_UNKNOWN, {
270             error => '\\N invalid inside character class',
271             },
272             );
273             # \R is not recognized inside a character class. It
274             # eventually ends up as a literal.
275 9 50       46 '\\R' eq $match and return;
276             }
277 83         279 return $accept;
278             }
279              
280 1185 100       3988 if ( my $accept = $tokenizer->find_regexp( UNICODE_PROPERTY ) ) {
281 13         65 return $accept;
282             }
283              
284 1172 100       4598 if ( my $accept = $tokenizer->find_regexp( qr< \A \\ p [{] \s* [}] >smx )
285             ) {
286 1         7 return $tokenizer->make_token( $accept, TOKEN_UNKNOWN, {
287             error => 'Empty \\p{} is an error',
288             },
289             );
290             }
291              
292 1171         3521 return;
293             }
294              
295             1;
296              
297             __END__