File Coverage

blib/lib/PPIx/Regexp/Token/Modifier.pm

Criterion	Covered	Total	%
statement	111	120	92.5
branch	51	68	75.0
condition	37	50	74.0
subroutine	20	20	100.0
pod	7	7	100.0
total	226	265	85.2

line	stmt	bran	cond	sub	pod	time	code
1							=head1 NAME
2
3							PPIx::Regexp::Token::Modifier - Represent modifiers.
4
5							=head1 SYNOPSIS
6
7							use PPIx::Regexp::Dumper;
8							PPIx::Regexp::Dumper->new( 'qr{foo}smx' )
9							->print();
10
11							The trailing C will be represented by this class.
12
13							This class also represents the whole of things like C<(?ismx)>. But the
14							modifiers in something like C<(?i:foo)> are represented by a
15							L.
16
17							=head1 INHERITANCE
18
19							C is a
20							L.
21
22							C is the parent of
23							L.
24
25							=head1 DESCRIPTION
26
27							This class represents modifier characters at the end of the regular
28							expression. For example, in C this class would represent
29							the terminal C.
30
31							=head2 The C, C, C, C, and C modifiers
32
33							The C, C, C, C, and C modifiers, introduced starting in
34							Perl 5.13.6, are used to force either Unicode pattern semantics (C),
35							locale semantics (C) default semantics (C the traditional Perl
36							semantics, which can also mean 'dual' since it means Unicode if the
37							string's UTF-8 bit is on, and locale if the UTF-8 bit is off), or
38							restricted default semantics (C). These are mutually exclusive, and
39							only one can be asserted at a time. Asserting any of these overrides
40							the inherited value of any of the others. The C method
41							reports as asserted the last one it sees, or none of them if it has seen
42							none.
43
44							For example, given C C<$elem>
45							representing the invalid regular expression fragment C<(?dul)>,
46							C<< $elem->asserted( 'l' ) >> would return true, but
47							C<< $elem->asserted( 'u' ) >> would return false. Note that
48							C<< $elem->negated( 'u' ) >> would also return false, since C is not
49							explicitly negated.
50
51							If C<$elem> represented regular expression fragment C<(?i)>,
52							C<< $elem->asserted( 'd' ) >> would return false, since even though C
53							represents the default behavior it is not explicitly asserted.
54
55							=head2 The caret (C<^>) modifier
56
57							Calling C<^> a modifier is a bit of a misnomer. The C<(?^...)>
58							construction was introduced in Perl 5.13.6, to prevent the inheritance
59							of modifiers. The documentation calls the caret a shorthand equivalent
60							for C, and that it the way this class handles it.
61
62							For example, given C C<$elem>
63							representing regular expression fragment C<(?^i)>,
64							C<< $elem->asserts( 'd' ) >> would return true, since in the absence of
65							an explicit C or C this class considers the C<^> to explicitly
66							assert C.
67
68							The caret handling is complicated by the fact that the C<'n'> modifier
69							was introduced in 5.21.8, at which point the caret became equivalent to
70							C. I did not feel I could unconditionally add the C<-n> to the
71							expansion of the caret, because that would produce confusing output from
72							methods like L. Nor could I
73							make it conditional on the minimum perl version, because that
74							information is not available early enough in the parse. What I did was
75							to expand the caret into C if and only if C<'n'> was in effect
76							at some point in the scope in which the modifier was parsed.
77
78							Continuing the above example, C<< $elem->asserts( 'n' ) >> and
79							C<< $elem->modifier_asserted( 'n' ) >> would both return false, but
80							C<< $elem->negates( 'n' ) >> would return true if and only if the C
81							modifier has been asserted somewhere before and in-scope from this
82							token. The
83							L
84							method is inherited from L.
85
86							=head1 METHODS
87
88							This class provides the following public methods. Methods not documented
89							here are private, and unsupported in the sense that the author reserves
90							the right to change or remove them without notice.
91
92							=cut
93
94							package PPIx::Regexp::Token::Modifier;
95
96	9			9		67	use strict;
	9					17
	9					248
97	9			9		50	use warnings;
	9					71
	9					238
98
99	9			9		49	use base qw{ PPIx::Regexp::Token };
	9					18
	9					729
100
101	9			9		64	use Carp;
	9					32
	9					636
102	9					1834	use PPIx::Regexp::Constant qw{
103							MINIMUM_PERL
104							MODIFIER_GROUP_MATCH_SEMANTICS
105							@CARP_NOT
106	9			9		62	};
	9					31
107
108							our $VERSION = '0.087_01';
109
110							# Define modifiers that are to be aggregated internally for ease of
111							# computation.
112							my %aggregate = (
113							a => MODIFIER_GROUP_MATCH_SEMANTICS,
114							aa => MODIFIER_GROUP_MATCH_SEMANTICS,
115							d => MODIFIER_GROUP_MATCH_SEMANTICS,
116							l => MODIFIER_GROUP_MATCH_SEMANTICS,
117							u => MODIFIER_GROUP_MATCH_SEMANTICS,
118							);
119							my %de_aggregate;
120							foreach my $value ( values %aggregate ) {
121							$de_aggregate{$value}++;
122							}
123
124							# Note that we do NOT want the /o modifier on regexen that make use of
125							# this, because it is already compiled.
126							my $capture_group_leader = qr{ [?/(] }smx; # );
127
128	9			9		112	use constant TOKENIZER_ARGUMENT_REQUIRED => 1;
	9					32
	9					17289
129
130							sub __new {
131	586			586		2922	my ( $class, $content, %arg ) = @_;
132
133	586	50				2842	my $self = $class->SUPER::__new( $content, %arg )
134							or return;
135
136							$content =~ m{ \A $capture_group_leader* \^ }smx # no /o!
137							and defined $arg{tokenizer}->modifier_seen( 'n' )
138	586	100	100			5075	and $self->{__caret_undoes_n} = 1;
139
140	586					2502	$arg{tokenizer}->modifier_modify( $self->modifiers() );
141
142	586					2185	return $self;
143							}
144
145							=head2 asserts
146
147							$token->asserts( 'i' ) and print "token asserts i";
148							foreach ( $token->asserts() ) { print "token asserts $_\n" }
149
150							This method returns true if the token explicitly asserts the given
151							modifier. The example would return true for the modifier in
152							C<(?i:foo)>, but false for C<(?-i:foo)>.
153
154							Starting with version 0.036_01, if the argument is a
155							single-character modifier followed by an asterisk (intended as a wild
156							card character), the return is the number of times that modifier
157							appears. In this case an exception will be thrown if you specify a
158							multi-character modifier (e.g. C<'ee*'>).
159
160							If called without an argument, or with an undef argument, all modifiers
161							explicitly asserted by this token are returned.
162
163							=cut
164
165							sub asserts {
166	410			410	1	798	my ( $self, $modifier ) = @_;
167	410		33			855	$self->{modifiers} \|\|= $self->_decode();
168	410	50				779	if ( defined $modifier ) {
169	410					719	return __asserts( $self->{modifiers}, $modifier );
170							} else {
171	0	0				0	return ( sort grep { defined $_ && $self->{modifiers}{$_} }
172	0	0				0	map { $de_aggregate{$_} ? $self->{modifiers}{$_} : $_ }
173	0					0	keys %{ $self->{modifiers} } );
	0					0
174							}
175							}
176
177							# This is a kluge for both determining whether the object asserts
178							# modifiers (hence the 'ductype') and determining whether the given
179							# modifier is actually asserted. The signature is the invocant and the
180							# modifier name, which must not be undef. The return is a boolean.
181							*__ducktype_modifier_asserted = \&asserts;
182
183							sub __asserts {
184	5461			5461		9699	my ( $present, $modifier ) = @_;
185	5461					19747	my $wild = $modifier =~ s/ [*] \z //smx;
186	5461	50	66			18469	not $wild
187							or 1 == length $modifier
188							or croak "Can not use wild card on multi-character modifier '$modifier*'";
189	5461	100				11814	if ( my $bin = $aggregate{$modifier} ) {
190	15					38	my $aggr = $present->{$bin};
191	15	50	100			95	$wild
192							or return ( defined $aggr && $modifier eq $aggr );
193	0	0				0	defined $aggr
194							or return 0;
195	0	0				0	$aggr =~ m/ \A ( (?: \Q$modifier\E )* ) \z /smx
196							or return 0;
197	0					0	return length $1;
198							}
199	5446	100				10413	if ( $wild ) {
200	2583		100			16416	return $present->{$modifier} \|\| 0;
201							}
202	2863					4867	my $len = length $modifier;
203	2863					5561	$modifier = substr $modifier, 0, 1;
204	2863		100			12101	return $present->{$modifier} && $len == $present->{$modifier};
205							}
206
207	8			8	1	32	sub can_be_quantified { return };
208
209							{
210							my %explanation = (
211							'm' => 'm: ^ and $ match within string',
212							'-m' => '-m: ^ and $ match only at ends of string',
213							's' => 's: . can match newline',
214							'-s' => '-s: . can not match newline',
215							'i' => 'i: do case-insensitive matching',
216							'-i' => '-i: do case-sensitive matching',
217							'x' => 'x: ignore whitespace and comments',
218							'xx' => 'xx: ignore whitespace even in bracketed character classes',
219							'-x' => '-x: regard whitespace as literal',
220							'p' => 'p: provide ${^PREMATCH} etc (pre 5.20)',
221							'-p' => '-p: no ${^PREMATCH} etc (pre 5.20)',
222							'a' => 'a: restrict non-Unicode classes to ASCII',
223							'aa' => 'aa: restrict non-Unicode classes & ASCII-Unicode matches',
224							'd' => 'd: match using default semantics',
225							'l' => 'l: match using locale semantics',
226							'u' => 'u: match using Unicode semantics',
227							'n' => 'n: parentheses do not capture',
228							'-n' => '-n: parentheses capture',
229							'c' => 'c: preserve current position on match failure',
230							'g' => 'g: match repeatedly',
231							'e' => 'e: substitution string is an expression',
232							'ee' => 'ee: substitution is expression to eval()',
233							'o' => 'o: only interpolate once',
234							'r' => 'r: aubstitution returns modified string',
235							);
236
237							sub explain {
238	4			4	1	16	my ( $self ) = @_;
239	4					8	my @rslt;
240	4					10	my %mods = $self->modifiers();
241	4	50				18	if ( defined( my $val = delete $mods{match_semantics} ) ) {
242	4					17	push @rslt, $explanation{$val};
243							}
244	4					18	foreach my $key ( sort keys %mods ) {
245	14	100				33	if ( my $val = $mods{$key} ) {
246	4					12	push @rslt, $explanation{ $key x $val };
247							} else {
248	10					26	push @rslt, $explanation{ "-$key" };
249							}
250							}
251	4	50				27	return wantarray ? @rslt : join '; ', @rslt;
252							}
253							}
254
255							=head2 match_semantics
256
257							my $sem = $token->match_semantics();
258							defined $sem or $sem = 'undefined';
259							print "This token has $sem match semantics\n";
260
261							This method returns the match semantics asserted by the token, as one of
262							the strings C<'a'>, C<'aa'>, C<'d'>, C<'l'>, or C<'u'>. If no explicit
263							match semantics are asserted, this method returns C.
264
265							=cut
266
267							sub match_semantics {
268	96			96	1	248	my ( $self ) = @_;
269	96		33			284	$self->{modifiers} \|\|= $self->_decode();
270	96					238	return $self->{modifiers}{ MODIFIER_GROUP_MATCH_SEMANTICS() };
271							}
272
273							=head2 modifiers
274
275							my %mods = $token->modifiers();
276
277							Returns all modifiers asserted or negated by this token, and the values
278							set (true for asserted, false for negated). If called in scalar context,
279							returns a reference to a hash containing the values.
280
281							=cut
282
283							sub modifiers {
284	591			591	1	1411	my ( $self ) = @_;
285	591		66			3576	$self->{modifiers} \|\|= $self->_decode();
286	591					970	my %mods = %{ $self->{modifiers} };
	591					2039
287	591					2040	foreach my $bin ( keys %de_aggregate ) {
288	591	100				2252	defined ( my $val = delete $mods{$bin} )
289							or next;
290	30					99	$mods{$bin} = $val;
291							}
292	591	50				3791	return wantarray ? %mods : \%mods;
293							}
294
295							=head2 negates
296
297							$token->negates( 'i' ) and print "token negates i\n";
298							foreach ( $token->negates() ) { print "token negates $_\n" }
299
300							This method returns true if the token explicitly negates the given
301							modifier. The example would return true for the modifier in
302							C<(?-i:foo)>, but false for C<(?i:foo)>.
303
304							If called without an argument, or with an undef argument, all modifiers
305							explicitly negated by this token are returned.
306
307							=cut
308
309							sub negates {
310	5			5	1	13	my ( $self, $modifier ) = @_;
311	5		33			14	$self->{modifiers} \|\|= $self->_decode();
312							# Note that since the values of hash entries that represent
313							# aggregated modifiers will never be false (at least, not unless '0'
314							# becomes a modifier) we need no special logic to handle them.
315							defined $modifier
316	0					0	or return ( sort grep { ! $self->{modifiers}{$_} }
317	5	50				16	keys %{ $self->{modifiers} } );
	0					0
318							return exists $self->{modifiers}{$modifier}
319	5		66			55	&& ! $self->{modifiers}{$modifier};
320							}
321
322							sub perl_version_introduced {
323	132			132	1	9900	my ( $self ) = @_;
324							return ( $self->{perl_version_introduced} \|\|=
325	132		66			706	$self->_perl_version_introduced() );
326							}
327
328							sub _perl_version_introduced {
329	93			93		222	my ( $self ) = @_;
330	93					274	my $content = $self->content();
331	93					343	my $is_statement_modifier = ( $content !~ m/ \A [(]? [?] /smx );
332	93					577	my $match_semantics = $self->match_semantics();
333
334	93	100				459	$self->asserts( 'xx' )
335							and return '5.025009';
336
337							# Disabling capture with /n was introduced in 5.21.8
338	92	100				327	$self->asserts( 'n' )
339							and return '5.021008';
340
341							# Match semantics modifiers became available as regular expression
342							# modifiers in 5.13.10.
343	91	100	100			389	defined $match_semantics
344							and $is_statement_modifier
345							and return '5.013010';
346
347							# /aa was introduced in 5.13.10.
348	85	100	100			334	defined $match_semantics
349							and 'aa' eq $match_semantics
350							and return '5.013010';
351
352							# /a was introduced in 5.13.9, but only in (?...), not as modifier
353							# of the entire regular expression.
354	84	100	66			277	defined $match_semantics
			100
355							and not $is_statement_modifier
356							and 'a' eq $match_semantics
357							and return '5.013009';
358
359							# /d, /l, and /u were introduced in 5.13.6, but only in (?...), not
360							# as modifiers of the entire regular expression.
361	83	100	66			316	defined $match_semantics
362							and not $is_statement_modifier
363							and return '5.013006';
364
365							# The '^' reassert-defaults modifier in embedded modifiers was
366							# introduced in 5.13.6.
367	73	50	66			222	not $is_statement_modifier
368							and $content =~ m/ \^ /smx
369							and return '5.013006';
370
371	73	100				180	$self->asserts( 'r' ) and return '5.013002';
372	70	100				257	$self->asserts( 'p' ) and return '5.009005';
373	68	100				297	$self->content() =~ m/ \A [(]? [?] .* - /smx
374							and return '5.005';
375	66	100				186	$self->asserts( 'c' ) and return '5.004';
376	65					328	return MINIMUM_PERL;
377							}
378
379							# Return true if the token can be quantified, and false otherwise
380							# sub can_be_quantified { return };
381
382							# $present => __aggregate_modifiers( 'modifiers', ... );
383							#
384							# This subroutine is private to the PPIx::Regexp package. It may change
385							# or be retracted without notice. Its purpose is to support defaulted
386							# modifiers.
387							#
388							# Aggregate the given modifiers left-to-right, returning a hash of those
389							# present and their values.
390
391							sub __aggregate_modifiers {
392	1110			1110		2425	my ( @mods ) = @_;
393	1110					1871	my %present;
394	1110					2336	foreach my $content ( @mods ) {
395	1118					4300	$content =~ s{ \A $capture_group_leader+ }{}smxg; # no /o!
396	1118	100				3291	if ( $content =~ m/ \A \^ /smx ) {
397	8					65	@present{ MODIFIER_GROUP_MATCH_SEMANTICS(), qw{ i s m x } }
398							= qw{ d 0 0 0 0 };
399							}
400
401							# Have to do the global match rather than a split, because the
402							# expression modifiers come through here too, and we need to
403							# distinguish between s/.../.../e and s/.../.../ee. But the
404							# modifiers can be randomized (that is, /eie is the same as
405							# /eei), so we reorder the content first.
406
407							# The following line is WRONG because it ignores the
408							# significance of '-'. This bug was introduced in version 0.035,
409							# specifically by the change that handled multi-character
410							# modifiers.
411							# $content = join '', sort split qr{}smx, $content;
412
413							# The following is better because it re-orders the modifiers
414							# separately. It does not recognize multiple dashes as
415							# representing an error (though it could!), and modifiers that
416							# are both asserted and negated (e.g. '(?i-i:foo)') are simply
417							# considered to be negated (as Perl does as of 5.20.0).
418							$content = join '-',
419	1118					6481	map { join '', sort split qr{}smx }
	247					2423
420							split qr{-}smx, $content;
421	1118					2635	my $value = 1;
422	1118					4507	while ( $content =~ m/ ( ( [[:alpha:]-] ) \2* ) /smxg ) {
423	412	100				1697	if ( '-' eq $1 ) {
		100
424	11					63	$value = 0;
425							} elsif ( my $bin = $aggregate{$1} ) {
426							# Yes, technically the match semantics stuff can't be
427							# negated in a regex. But it can in a 'use re', which
428							# also comes through here, so we have to handle it.
429	26	100				148	$present{$bin} = $value ? $1 : undef;
430							} else {
431							# TODO have to think about this, since I need asserts(
432							# 'e' ) to be 2 if we in fact have 'ee'. Is this
433							# correct?
434							# $present{$1} = $value;
435	375					2036	$present{$2} = $value * length $1;
436							}
437							}
438							}
439	1110					3616	return \%present;
440							}
441
442							# This must be implemented by tokens which do not recognize themselves.
443							# The return is a list of list references. Each list reference must
444							# contain a regular expression that recognizes the token, and optionally
445							# a reference to a hash to pass to make_token as the class-specific
446							# arguments. The regular expression MUST be anchored to the beginning of
447							# the string.
448							sub __PPIX_TOKEN__recognize {
449							return (
450	9			9		51	[ qr{ \A [(] [?] [[:lower:]]* -? [[:lower:]]* [)] }smx ],
451							[ qr{ \A [(] [?] \^ [[:lower:]]* [)] }smx ],
452							);
453							}
454
455							{
456
457							# Called by the tokenizer to modify the current modifiers with a new
458							# set. Both are passed as hash references, and a reference to the
459							# new hash is returned.
460							sub __PPIX_TOKENIZER__modifier_modify {
461	592			592		1432	my ( @args ) = @_;
462
463	592					1040	my %merged;
464	592					1276	foreach my $hash ( @args ) {
465	1184					1982	while ( my ( $key, $val ) = each %{ $hash } ) {
	1469					4953
466	285	100				630	if ( $val ) {
467	242					641	$merged{$key} = $val;
468							} else {
469	43					140	delete $merged{$key};
470							}
471							}
472							}
473
474	592					2365	return \%merged;
475
476							}
477
478							# Decode modifiers from the content of the token.
479							sub _decode {
480	586			586		1326	my ( $self ) = @_;
481	586					2086	my $mod = __aggregate_modifiers( $self->content() );
482							$self->{__caret_undoes_n}
483	586	100				1699	and $mod->{n} = 0;
484	586					1835	return $mod;
485							}
486							}
487
488							1;
489
490							__END__