File Coverage

blib/lib/PPIx/Regexp/Token/Modifier.pm
Criterion Covered Total %
statement 111 120 92.5
branch 51 68 75.0
condition 37 50 74.0
subroutine 20 20 100.0
pod 7 7 100.0
total 226 265 85.2


line stmt bran cond sub pod time code
1             =head1 NAME
2              
3             PPIx::Regexp::Token::Modifier - Represent modifiers.
4              
5             =head1 SYNOPSIS
6              
7             use PPIx::Regexp::Dumper;
8             PPIx::Regexp::Dumper->new( 'qr{foo}smx' )
9             ->print();
10              
11             The trailing C will be represented by this class.
12              
13             This class also represents the whole of things like C<(?ismx)>. But the
14             modifiers in something like C<(?i:foo)> are represented by a
15             L.
16              
17             =head1 INHERITANCE
18              
19             C is a
20             L.
21              
22             C is the parent of
23             L.
24              
25             =head1 DESCRIPTION
26              
27             This class represents modifier characters at the end of the regular
28             expression. For example, in C this class would represent
29             the terminal C.
30              
31             =head2 The C, C, C, C, and C modifiers
32              
33             The C, C, C, C, and C modifiers, introduced starting in
34             Perl 5.13.6, are used to force either Unicode pattern semantics (C),
35             locale semantics (C) default semantics (C the traditional Perl
36             semantics, which can also mean 'dual' since it means Unicode if the
37             string's UTF-8 bit is on, and locale if the UTF-8 bit is off), or
38             restricted default semantics (C). These are mutually exclusive, and
39             only one can be asserted at a time. Asserting any of these overrides
40             the inherited value of any of the others. The C method
41             reports as asserted the last one it sees, or none of them if it has seen
42             none.
43              
44             For example, given C C<$elem>
45             representing the invalid regular expression fragment C<(?dul)>,
46             C<< $elem->asserted( 'l' ) >> would return true, but
47             C<< $elem->asserted( 'u' ) >> would return false. Note that
48             C<< $elem->negated( 'u' ) >> would also return false, since C is not
49             explicitly negated.
50              
51             If C<$elem> represented regular expression fragment C<(?i)>,
52             C<< $elem->asserted( 'd' ) >> would return false, since even though C
53             represents the default behavior it is not explicitly asserted.
54              
55             =head2 The caret (C<^>) modifier
56              
57             Calling C<^> a modifier is a bit of a misnomer. The C<(?^...)>
58             construction was introduced in Perl 5.13.6, to prevent the inheritance
59             of modifiers. The documentation calls the caret a shorthand equivalent
60             for C, and that it the way this class handles it.
61              
62             For example, given C C<$elem>
63             representing regular expression fragment C<(?^i)>,
64             C<< $elem->asserts( 'd' ) >> would return true, since in the absence of
65             an explicit C or C this class considers the C<^> to explicitly
66             assert C.
67              
68             The caret handling is complicated by the fact that the C<'n'> modifier
69             was introduced in 5.21.8, at which point the caret became equivalent to
70             C. I did not feel I could unconditionally add the C<-n> to the
71             expansion of the caret, because that would produce confusing output from
72             methods like L. Nor could I
73             make it conditional on the minimum perl version, because that
74             information is not available early enough in the parse. What I did was
75             to expand the caret into C if and only if C<'n'> was in effect
76             at some point in the scope in which the modifier was parsed.
77              
78             Continuing the above example, C<< $elem->asserts( 'n' ) >> and
79             C<< $elem->modifier_asserted( 'n' ) >> would both return false, but
80             C<< $elem->negates( 'n' ) >> would return true if and only if the C
81             modifier has been asserted somewhere before and in-scope from this
82             token. The
83             L
84             method is inherited from L.
85              
86             =head1 METHODS
87              
88             This class provides the following public methods. Methods not documented
89             here are private, and unsupported in the sense that the author reserves
90             the right to change or remove them without notice.
91              
92             =cut
93              
94             package PPIx::Regexp::Token::Modifier;
95              
96 9     9   67 use strict;
  9         17  
  9         248  
97 9     9   50 use warnings;
  9         71  
  9         238  
98              
99 9     9   49 use base qw{ PPIx::Regexp::Token };
  9         18  
  9         729  
100              
101 9     9   64 use Carp;
  9         32  
  9         636  
102 9         1834 use PPIx::Regexp::Constant qw{
103             MINIMUM_PERL
104             MODIFIER_GROUP_MATCH_SEMANTICS
105             @CARP_NOT
106 9     9   62 };
  9         31  
107              
108             our $VERSION = '0.087_01';
109              
110             # Define modifiers that are to be aggregated internally for ease of
111             # computation.
112             my %aggregate = (
113             a => MODIFIER_GROUP_MATCH_SEMANTICS,
114             aa => MODIFIER_GROUP_MATCH_SEMANTICS,
115             d => MODIFIER_GROUP_MATCH_SEMANTICS,
116             l => MODIFIER_GROUP_MATCH_SEMANTICS,
117             u => MODIFIER_GROUP_MATCH_SEMANTICS,
118             );
119             my %de_aggregate;
120             foreach my $value ( values %aggregate ) {
121             $de_aggregate{$value}++;
122             }
123              
124             # Note that we do NOT want the /o modifier on regexen that make use of
125             # this, because it is already compiled.
126             my $capture_group_leader = qr{ [?/(] }smx; # );
127              
128 9     9   112 use constant TOKENIZER_ARGUMENT_REQUIRED => 1;
  9         32  
  9         17289  
129              
130             sub __new {
131 586     586   2922 my ( $class, $content, %arg ) = @_;
132              
133 586 50       2842 my $self = $class->SUPER::__new( $content, %arg )
134             or return;
135              
136             $content =~ m{ \A $capture_group_leader* \^ }smx # no /o!
137             and defined $arg{tokenizer}->modifier_seen( 'n' )
138 586 100 100     5075 and $self->{__caret_undoes_n} = 1;
139              
140 586         2502 $arg{tokenizer}->modifier_modify( $self->modifiers() );
141              
142 586         2185 return $self;
143             }
144              
145             =head2 asserts
146              
147             $token->asserts( 'i' ) and print "token asserts i";
148             foreach ( $token->asserts() ) { print "token asserts $_\n" }
149              
150             This method returns true if the token explicitly asserts the given
151             modifier. The example would return true for the modifier in
152             C<(?i:foo)>, but false for C<(?-i:foo)>.
153              
154             Starting with version 0.036_01, if the argument is a
155             single-character modifier followed by an asterisk (intended as a wild
156             card character), the return is the number of times that modifier
157             appears. In this case an exception will be thrown if you specify a
158             multi-character modifier (e.g. C<'ee*'>).
159              
160             If called without an argument, or with an undef argument, all modifiers
161             explicitly asserted by this token are returned.
162              
163             =cut
164              
165             sub asserts {
166 410     410 1 798 my ( $self, $modifier ) = @_;
167 410   33     855 $self->{modifiers} ||= $self->_decode();
168 410 50       779 if ( defined $modifier ) {
169 410         719 return __asserts( $self->{modifiers}, $modifier );
170             } else {
171 0 0       0 return ( sort grep { defined $_ && $self->{modifiers}{$_} }
172 0 0       0 map { $de_aggregate{$_} ? $self->{modifiers}{$_} : $_ }
173 0         0 keys %{ $self->{modifiers} } );
  0         0  
174             }
175             }
176              
177             # This is a kluge for both determining whether the object asserts
178             # modifiers (hence the 'ductype') and determining whether the given
179             # modifier is actually asserted. The signature is the invocant and the
180             # modifier name, which must not be undef. The return is a boolean.
181             *__ducktype_modifier_asserted = \&asserts;
182              
183             sub __asserts {
184 5461     5461   9699 my ( $present, $modifier ) = @_;
185 5461         19747 my $wild = $modifier =~ s/ [*] \z //smx;
186 5461 50 66     18469 not $wild
187             or 1 == length $modifier
188             or croak "Can not use wild card on multi-character modifier '$modifier*'";
189 5461 100       11814 if ( my $bin = $aggregate{$modifier} ) {
190 15         38 my $aggr = $present->{$bin};
191 15 50 100     95 $wild
192             or return ( defined $aggr && $modifier eq $aggr );
193 0 0       0 defined $aggr
194             or return 0;
195 0 0       0 $aggr =~ m/ \A ( (?: \Q$modifier\E )* ) \z /smx
196             or return 0;
197 0         0 return length $1;
198             }
199 5446 100       10413 if ( $wild ) {
200 2583   100     16416 return $present->{$modifier} || 0;
201             }
202 2863         4867 my $len = length $modifier;
203 2863         5561 $modifier = substr $modifier, 0, 1;
204 2863   100     12101 return $present->{$modifier} && $len == $present->{$modifier};
205             }
206              
207 8     8 1 32 sub can_be_quantified { return };
208              
209             {
210             my %explanation = (
211             'm' => 'm: ^ and $ match within string',
212             '-m' => '-m: ^ and $ match only at ends of string',
213             's' => 's: . can match newline',
214             '-s' => '-s: . can not match newline',
215             'i' => 'i: do case-insensitive matching',
216             '-i' => '-i: do case-sensitive matching',
217             'x' => 'x: ignore whitespace and comments',
218             'xx' => 'xx: ignore whitespace even in bracketed character classes',
219             '-x' => '-x: regard whitespace as literal',
220             'p' => 'p: provide ${^PREMATCH} etc (pre 5.20)',
221             '-p' => '-p: no ${^PREMATCH} etc (pre 5.20)',
222             'a' => 'a: restrict non-Unicode classes to ASCII',
223             'aa' => 'aa: restrict non-Unicode classes & ASCII-Unicode matches',
224             'd' => 'd: match using default semantics',
225             'l' => 'l: match using locale semantics',
226             'u' => 'u: match using Unicode semantics',
227             'n' => 'n: parentheses do not capture',
228             '-n' => '-n: parentheses capture',
229             'c' => 'c: preserve current position on match failure',
230             'g' => 'g: match repeatedly',
231             'e' => 'e: substitution string is an expression',
232             'ee' => 'ee: substitution is expression to eval()',
233             'o' => 'o: only interpolate once',
234             'r' => 'r: aubstitution returns modified string',
235             );
236              
237             sub explain {
238 4     4 1 16 my ( $self ) = @_;
239 4         8 my @rslt;
240 4         10 my %mods = $self->modifiers();
241 4 50       18 if ( defined( my $val = delete $mods{match_semantics} ) ) {
242 4         17 push @rslt, $explanation{$val};
243             }
244 4         18 foreach my $key ( sort keys %mods ) {
245 14 100       33 if ( my $val = $mods{$key} ) {
246 4         12 push @rslt, $explanation{ $key x $val };
247             } else {
248 10         26 push @rslt, $explanation{ "-$key" };
249             }
250             }
251 4 50       27 return wantarray ? @rslt : join '; ', @rslt;
252             }
253             }
254              
255             =head2 match_semantics
256              
257             my $sem = $token->match_semantics();
258             defined $sem or $sem = 'undefined';
259             print "This token has $sem match semantics\n";
260              
261             This method returns the match semantics asserted by the token, as one of
262             the strings C<'a'>, C<'aa'>, C<'d'>, C<'l'>, or C<'u'>. If no explicit
263             match semantics are asserted, this method returns C.
264              
265             =cut
266              
267             sub match_semantics {
268 96     96 1 248 my ( $self ) = @_;
269 96   33     284 $self->{modifiers} ||= $self->_decode();
270 96         238 return $self->{modifiers}{ MODIFIER_GROUP_MATCH_SEMANTICS() };
271             }
272              
273             =head2 modifiers
274              
275             my %mods = $token->modifiers();
276              
277             Returns all modifiers asserted or negated by this token, and the values
278             set (true for asserted, false for negated). If called in scalar context,
279             returns a reference to a hash containing the values.
280              
281             =cut
282              
283             sub modifiers {
284 591     591 1 1411 my ( $self ) = @_;
285 591   66     3576 $self->{modifiers} ||= $self->_decode();
286 591         970 my %mods = %{ $self->{modifiers} };
  591         2039  
287 591         2040 foreach my $bin ( keys %de_aggregate ) {
288 591 100       2252 defined ( my $val = delete $mods{$bin} )
289             or next;
290 30         99 $mods{$bin} = $val;
291             }
292 591 50       3791 return wantarray ? %mods : \%mods;
293             }
294              
295             =head2 negates
296              
297             $token->negates( 'i' ) and print "token negates i\n";
298             foreach ( $token->negates() ) { print "token negates $_\n" }
299              
300             This method returns true if the token explicitly negates the given
301             modifier. The example would return true for the modifier in
302             C<(?-i:foo)>, but false for C<(?i:foo)>.
303              
304             If called without an argument, or with an undef argument, all modifiers
305             explicitly negated by this token are returned.
306              
307             =cut
308              
309             sub negates {
310 5     5 1 13 my ( $self, $modifier ) = @_;
311 5   33     14 $self->{modifiers} ||= $self->_decode();
312             # Note that since the values of hash entries that represent
313             # aggregated modifiers will never be false (at least, not unless '0'
314             # becomes a modifier) we need no special logic to handle them.
315             defined $modifier
316 0         0 or return ( sort grep { ! $self->{modifiers}{$_} }
317 5 50       16 keys %{ $self->{modifiers} } );
  0         0  
318             return exists $self->{modifiers}{$modifier}
319 5   66     55 && ! $self->{modifiers}{$modifier};
320             }
321              
322             sub perl_version_introduced {
323 132     132 1 9900 my ( $self ) = @_;
324             return ( $self->{perl_version_introduced} ||=
325 132   66     706 $self->_perl_version_introduced() );
326             }
327              
328             sub _perl_version_introduced {
329 93     93   222 my ( $self ) = @_;
330 93         274 my $content = $self->content();
331 93         343 my $is_statement_modifier = ( $content !~ m/ \A [(]? [?] /smx );
332 93         577 my $match_semantics = $self->match_semantics();
333              
334 93 100       459 $self->asserts( 'xx' )
335             and return '5.025009';
336              
337             # Disabling capture with /n was introduced in 5.21.8
338 92 100       327 $self->asserts( 'n' )
339             and return '5.021008';
340              
341             # Match semantics modifiers became available as regular expression
342             # modifiers in 5.13.10.
343 91 100 100     389 defined $match_semantics
344             and $is_statement_modifier
345             and return '5.013010';
346              
347             # /aa was introduced in 5.13.10.
348 85 100 100     334 defined $match_semantics
349             and 'aa' eq $match_semantics
350             and return '5.013010';
351              
352             # /a was introduced in 5.13.9, but only in (?...), not as modifier
353             # of the entire regular expression.
354 84 100 66     277 defined $match_semantics
      100        
355             and not $is_statement_modifier
356             and 'a' eq $match_semantics
357             and return '5.013009';
358              
359             # /d, /l, and /u were introduced in 5.13.6, but only in (?...), not
360             # as modifiers of the entire regular expression.
361 83 100 66     316 defined $match_semantics
362             and not $is_statement_modifier
363             and return '5.013006';
364              
365             # The '^' reassert-defaults modifier in embedded modifiers was
366             # introduced in 5.13.6.
367 73 50 66     222 not $is_statement_modifier
368             and $content =~ m/ \^ /smx
369             and return '5.013006';
370              
371 73 100       180 $self->asserts( 'r' ) and return '5.013002';
372 70 100       257 $self->asserts( 'p' ) and return '5.009005';
373 68 100       297 $self->content() =~ m/ \A [(]? [?] .* - /smx
374             and return '5.005';
375 66 100       186 $self->asserts( 'c' ) and return '5.004';
376 65         328 return MINIMUM_PERL;
377             }
378              
379             # Return true if the token can be quantified, and false otherwise
380             # sub can_be_quantified { return };
381              
382             # $present => __aggregate_modifiers( 'modifiers', ... );
383             #
384             # This subroutine is private to the PPIx::Regexp package. It may change
385             # or be retracted without notice. Its purpose is to support defaulted
386             # modifiers.
387             #
388             # Aggregate the given modifiers left-to-right, returning a hash of those
389             # present and their values.
390              
391             sub __aggregate_modifiers {
392 1110     1110   2425 my ( @mods ) = @_;
393 1110         1871 my %present;
394 1110         2336 foreach my $content ( @mods ) {
395 1118         4300 $content =~ s{ \A $capture_group_leader+ }{}smxg; # no /o!
396 1118 100       3291 if ( $content =~ m/ \A \^ /smx ) {
397 8         65 @present{ MODIFIER_GROUP_MATCH_SEMANTICS(), qw{ i s m x } }
398             = qw{ d 0 0 0 0 };
399             }
400              
401             # Have to do the global match rather than a split, because the
402             # expression modifiers come through here too, and we need to
403             # distinguish between s/.../.../e and s/.../.../ee. But the
404             # modifiers can be randomized (that is, /eie is the same as
405             # /eei), so we reorder the content first.
406              
407             # The following line is WRONG because it ignores the
408             # significance of '-'. This bug was introduced in version 0.035,
409             # specifically by the change that handled multi-character
410             # modifiers.
411             # $content = join '', sort split qr{}smx, $content;
412              
413             # The following is better because it re-orders the modifiers
414             # separately. It does not recognize multiple dashes as
415             # representing an error (though it could!), and modifiers that
416             # are both asserted and negated (e.g. '(?i-i:foo)') are simply
417             # considered to be negated (as Perl does as of 5.20.0).
418             $content = join '-',
419 1118         6481 map { join '', sort split qr{}smx }
  247         2423  
420             split qr{-}smx, $content;
421 1118         2635 my $value = 1;
422 1118         4507 while ( $content =~ m/ ( ( [[:alpha:]-] ) \2* ) /smxg ) {
423 412 100       1697 if ( '-' eq $1 ) {
    100          
424 11         63 $value = 0;
425             } elsif ( my $bin = $aggregate{$1} ) {
426             # Yes, technically the match semantics stuff can't be
427             # negated in a regex. But it can in a 'use re', which
428             # also comes through here, so we have to handle it.
429 26 100       148 $present{$bin} = $value ? $1 : undef;
430             } else {
431             # TODO have to think about this, since I need asserts(
432             # 'e' ) to be 2 if we in fact have 'ee'. Is this
433             # correct?
434             # $present{$1} = $value;
435 375         2036 $present{$2} = $value * length $1;
436             }
437             }
438             }
439 1110         3616 return \%present;
440             }
441              
442             # This must be implemented by tokens which do not recognize themselves.
443             # The return is a list of list references. Each list reference must
444             # contain a regular expression that recognizes the token, and optionally
445             # a reference to a hash to pass to make_token as the class-specific
446             # arguments. The regular expression MUST be anchored to the beginning of
447             # the string.
448             sub __PPIX_TOKEN__recognize {
449             return (
450 9     9   51 [ qr{ \A [(] [?] [[:lower:]]* -? [[:lower:]]* [)] }smx ],
451             [ qr{ \A [(] [?] \^ [[:lower:]]* [)] }smx ],
452             );
453             }
454              
455             {
456              
457             # Called by the tokenizer to modify the current modifiers with a new
458             # set. Both are passed as hash references, and a reference to the
459             # new hash is returned.
460             sub __PPIX_TOKENIZER__modifier_modify {
461 592     592   1432 my ( @args ) = @_;
462              
463 592         1040 my %merged;
464 592         1276 foreach my $hash ( @args ) {
465 1184         1982 while ( my ( $key, $val ) = each %{ $hash } ) {
  1469         4953  
466 285 100       630 if ( $val ) {
467 242         641 $merged{$key} = $val;
468             } else {
469 43         140 delete $merged{$key};
470             }
471             }
472             }
473              
474 592         2365 return \%merged;
475              
476             }
477              
478             # Decode modifiers from the content of the token.
479             sub _decode {
480 586     586   1326 my ( $self ) = @_;
481 586         2086 my $mod = __aggregate_modifiers( $self->content() );
482             $self->{__caret_undoes_n}
483 586 100       1699 and $mod->{n} = 0;
484 586         1835 return $mod;
485             }
486             }
487              
488             1;
489              
490             __END__