File Coverage

blib/lib/PPIx/Regexp/Token/Structure.pm
Criterion Covered Total %
statement 107 108 99.0
branch 47 54 87.0
condition 5 5 100.0
subroutine 18 18 100.0
pod 3 3 100.0
total 180 188 95.7


line stmt bran cond sub pod time code
1             =head1 NAME
2              
3             PPIx::Regexp::Token::Structure - Represent structural elements.
4              
5             =head1 SYNOPSIS
6              
7             use PPIx::Regexp::Dumper;
8             PPIx::Regexp::Dumper->new( 'qr{(foo)}smx' )
9             ->print();
10              
11             =head1 INHERITANCE
12              
13             C is a
14             L.
15              
16             C is the parent of
17             L.
18              
19             =head1 DESCRIPTION
20              
21             This class represents things that define the structure of the regular
22             expression. This typically means brackets of various sorts, but to
23             prevent proliferation of token classes the type of the regular
24             expression is stored here.
25              
26             =head1 METHODS
27              
28             This class provides no public methods beyond those provided by its
29             superclass.
30              
31             =cut
32              
33             package PPIx::Regexp::Token::Structure;
34              
35 9     9   73 use strict;
  9         16  
  9         241  
36 9     9   44 use warnings;
  9         29  
  9         245  
37              
38 9     9   49 use base qw{ PPIx::Regexp::Token };
  9         23  
  9         845  
39              
40 9         955 use PPIx::Regexp::Constant qw{
41             COOKIE_CLASS
42             COOKIE_QUANT
43             COOKIE_REGEX_SET
44             MINIMUM_PERL
45             TOKEN_LITERAL
46             @CARP_NOT
47 9     9   72 };
  9         17  
48              
49             # Tokens we are responsible for making, under at least some
50             # circumstances.
51 9     9   64 use PPIx::Regexp::Token::Comment ();
  9         14  
  9         212  
52 9     9   4514 use PPIx::Regexp::Token::Modifier ();
  9         26  
  9         227  
53 9     9   70 use PPIx::Regexp::Token::Backreference ();
  9         17  
  9         137  
54 9     9   48 use PPIx::Regexp::Token::Backtrack ();
  9         23  
  9         117  
55 9     9   4658 use PPIx::Regexp::Token::Recursion ();
  9         25  
  9         10452  
56              
57             our $VERSION = '0.088';
58              
59             # Return true if the token can be quantified, and false otherwise
60              
61             my %quant = map { $_ => 1 } ')', ']';
62             sub can_be_quantified {
63 524     524 1 1147 my ( $self ) = @_;
64 524 50       1443 ref $self or return;
65 524         1590 return $quant{ $self->content() };
66             };
67              
68             {
69              
70             my %explanation = (
71             '' => 'Match regexp',
72             '(' => 'Capture or grouping',
73             '(?[' => 'Extended character class',
74             ')' => 'End capture or grouping',
75             '[' => 'Character class',
76             ']' => 'End character class',
77             '])' => 'End extended character class',
78             'm' => 'Match regexp',
79             'qr' => 'Regexp object definition',
80             's' => 'Replace regexp with string or expression',
81             '{' => 'Explicit quantifier',
82             '}' => 'End explicit quantifier',
83             );
84              
85             sub __explanation {
86 7     7   17 return \%explanation;
87             }
88              
89             }
90              
91             sub is_quantifier {
92 540     540 1 1249 my ( $self ) = @_;
93 540 50       1436 ref $self or return;
94 540         2128 return $self->{is_quantifier};
95             }
96              
97             {
98              
99             # Note that the implementation equivocates on the ::Token::Structure
100             # class, using it both for the initial token that determines the
101             # type of the regex and things like parentheses internal to the
102             # regex. Rather than sort out this equivocation, I have relied on
103             # the currently-true assumption that 'qr' will not satisfy the
104             # ::Token::Structure recognition logic, and the only way this class
105             # can acquire this content is by the brute-force approach used to
106             # generate the initial token object.
107              
108             my %perl_version_introduced = (
109             qr => '5.005',
110             '(?[' => '5.017008',
111             );
112              
113             sub perl_version_introduced {
114 222     222 1 3775 my ( $self ) = @_;
115             return $self->{perl_version_introduced} ||
116 222   100     1029 $perl_version_introduced{ $self->content() } ||
117             MINIMUM_PERL;
118             }
119             }
120            
121             {
122              
123             my %delim = map { $_ => 1 } qw/ ( ) { } [ ] /;
124              
125             # Regular expressions to match various parenthesized tokens, and the
126             # classes to make them into.
127              
128             my @paren_token = map {
129             [ $_ => $_->__PPIX_TOKEN__recognize() ]
130             }
131             'PPIx::Regexp::Token::Comment',
132             'PPIx::Regexp::Token::Modifier',
133             'PPIx::Regexp::Token::Backreference',
134             'PPIx::Regexp::Token::Backtrack',
135             'PPIx::Regexp::Token::Recursion',
136             ;
137              
138             sub __PPIX_TOKENIZER__regexp {
139 1046     1046   2523 my ( undef, $tokenizer, $character ) = @_;
140              
141             # We are not interested in anything but delimiters.
142 1046 100       3126 $delim{$character} or return;
143              
144             # Inside a character class, all the delimiters are normal characters
145             # except for the close square bracket.
146 893 100       2292 if ( $tokenizer->cookie( COOKIE_CLASS ) ) {
147 59 100       251 $character eq ']'
148             or return $tokenizer->make_token( 1, TOKEN_LITERAL );
149 53         173 $tokenizer->cookie( COOKIE_CLASS, undef );
150 53         240 return 1;
151             }
152              
153             # Open parentheses have various interesting possibilities ...
154 834 100       2526 if ( $character eq '(' ) {
155              
156             # Sometimes the whole bunch of parenthesized characters seems
157             # naturally to be a token.
158 347         1055 foreach ( @paren_token ) {
159 1659         2513 my ( $class, @recognize ) = @{ $_ };
  1659         4204  
160 1659         2833 foreach ( @recognize ) {
161 2595         3557 my ( $regexp, $arg ) = @{ $_ };
  2595         4771  
162 2595 100       5336 my $accept = $tokenizer->find_regexp( $regexp ) or next;
163 55         248 return $tokenizer->make_token( $accept, $class, $arg );
164             }
165             }
166              
167             # Modifier changes are local to this parenthesis group
168 292         1231 $tokenizer->modifier_duplicate();
169              
170             # The regex-set functionality introduced with 5.17.8 is most
171             # conveniently handled by treating the initial '(?[' and
172             # final '])' as ::Structure tokens. Fortunately for us,
173             # perl5178delta documents that these may not have interior
174             # spaces.
175              
176 292 100       1451 if ( my $accept = $tokenizer->find_regexp(
177             qr{ \A [(] [?] [[] }smx # ] ) - help for vim
178             )
179             ) {
180 8     105   62 $tokenizer->cookie( COOKIE_REGEX_SET, sub { return 1 } );
  105         291  
181 8         43 $tokenizer->modifier_modify( x => 1 ); # Implicitly /x
182 8         41 return $accept;
183             }
184              
185             # We expect certain tokens only after a left paren.
186             $tokenizer->expect(
187 284         1611 'PPIx::Regexp::Token::GroupType::Modifier',
188             'PPIx::Regexp::Token::GroupType::NamedCapture',
189             'PPIx::Regexp::Token::GroupType::Assertion',
190             'PPIx::Regexp::Token::GroupType::Code',
191             'PPIx::Regexp::Token::GroupType::BranchReset',
192             'PPIx::Regexp::Token::GroupType::Subexpression',
193             'PPIx::Regexp::Token::GroupType::Switch',
194             'PPIx::Regexp::Token::GroupType::Script_Run',
195             'PPIx::Regexp::Token::GroupType::Atomic_Script_Run',
196             );
197              
198             # Accept the parenthesis.
199 284         816 return 1;
200             }
201              
202             # Close parentheses end modifier localization
203 487 100       1360 if ( $character eq ')' ) {
204 288         1200 $tokenizer->modifier_pop();
205 288         756 return 1;
206             }
207              
208             # Open curlys are complicated because they may or may not represent
209             # the beginning of a quantifier, depending on what comes before the
210             # close curly. So we set a cookie to monitor the token stream for
211             # interlopers. If all goes well, the right curly will find the
212             # cookie and know it is supposed to be a quantifier.
213 199 100       719 if ( $character eq '{' ) {
214              
215             # If the prior token can not be quantified, all this is
216             # unnecessary.
217 71 100       225 $tokenizer->prior_significant_token( 'can_be_quantified' )
218             or return 1;
219              
220             # We make our token now, before setting the cookie. Otherwise
221             # the cookie has to deal with this token.
222 61         220 my $token = $tokenizer->make_token( 1 );
223              
224             # A cookie for the next '}'.
225 61         187 my $commas = 0;
226 61         142 my $allow_digit = 1;
227             $tokenizer->cookie( COOKIE_QUANT, sub {
228 115     115   326 my ( $tokenizer, $token ) = @_;
229 115 50       324 $token or return 1;
230              
231             # Code for 5.33.6 and after.
232             # We allow {,...}, and we allow space inside and
233             # adjacent to the curlys, and around the comma if
234             # any. But not interior to the numbers.
235 115 100       414 if ( $token->isa( TOKEN_LITERAL ) ) {
236 105         277 my $character = $token->content();
237 105 100       428 if ( $character =~ m/ \A \s \z /smx ) {
238             # Digits only allowed if the prior
239             # significant was an open curly or a comma.
240 4         12 $allow_digit = $tokenizer->prior_significant_token(
241             'content' ) =~ m/ \A [{,] \z /smx; # }
242 4         14 return 1;
243             }
244 101 100       392 $character eq ','
245             and return( ! $commas++ );
246 68 50       193 $allow_digit
247             or return;
248 68         386 return $character =~ m/ \A [0-9] \z /smx;
249             }
250              
251             # Since we do not know what is in an interpolation, we
252             # trustingly accept it.
253 10 100       56 if ( $token->isa( 'PPIx::Regexp::Token::Interpolation' )
254             ) {
255 5         24 return 1;
256             }
257              
258 5         50 return;
259             },
260 61         464 );
261              
262 61         208 return $token;
263             }
264              
265             # The close curly bracket is a little complicated because if the
266             # cookie posted by the left curly bracket is still around, we are a
267             # quantifier, otherwise not.
268 128 100       520 if ( $character eq '}' ) {
269 63 100       182 $tokenizer->cookie( COOKIE_QUANT, undef )
270             or return 1;
271 50 100       454 $tokenizer->prior_significant_token( 'class' )->isa( __PACKAGE__ )
272             and return 1;
273 47         153 my $token = $tokenizer->make_token( 1 );
274 47         157 $token->{is_quantifier} = 1;
275 47         144 return $token;
276             }
277              
278             # The parse rules are different inside a character class, so we set
279             # another cookie. Sigh. If your tool is a hammer ...
280 65 100       278 if ( $character eq '[' ) {
281              
282             # Set our cookie. Since it always returns 1, it does not matter
283             # where in the following mess we set it.
284 53     226   371 $tokenizer->cookie( COOKIE_CLASS, sub { return 1 } );
  226         691  
285              
286             # Make our token now, since the easiest place to deal with the
287             # beginning-of-character-class strangeness seems to be right
288             # here.
289 53         236 my @tokens = $tokenizer->make_token( 1 );
290              
291             # Get the next character, returning tokens if there is none.
292 53 50       254 defined ( $character = $tokenizer->peek() )
293             or return @tokens;
294              
295             # If we have a caret, it is a negation operator. Make its token
296             # and fetch the next character, returning if none.
297 53 100       227 if ( $character eq '^' ) {
298 5         28 push @tokens, $tokenizer->make_token(
299             1, 'PPIx::Regexp::Token::Operator' );
300 5 50       26 defined ( $character = $tokenizer->peek() )
301             or return @tokens;
302             }
303              
304             # If we have a close square at this point, it is not the end of
305             # the class, but just a literal. Make its token.
306 53 100       161 $character eq ']'
307             and push @tokens, $tokenizer->make_token( 1, TOKEN_LITERAL );
308              
309             # Return all tokens made.
310 53         179 return @tokens;
311             }
312             # per perlop, the metas inside a [] are -]\^$.
313             # per perlop, the metas outside a [] are {}[]()^$.|*+?\
314             # The difference is that {}[().|*+? are not metas in [], but - is.
315              
316             # Close bracket is complicated by the addition of regex sets.
317             # And more complicated by the fact that you can have an
318             # old-style character class inside a regex set. Fortunately they
319             # have not (yet!) permitted nested regex sets.
320 12 50       77 if ( $character eq ']' ) {
321              
322             # If we find '])' and COOKIE_REGEX_SET is present, we have a
323             # regex set. We need to delete the cookie and accept both
324             # characters.
325 12 100 100     60 if ( ( my $accept = $tokenizer->find_regexp(
326             # help vim - ( [
327             qr{ \A []] [)] }smx
328             ) )
329             && $tokenizer->cookie( COOKIE_REGEX_SET )
330              
331             ) {
332 8         42 $tokenizer->cookie( COOKIE_REGEX_SET, undef );
333 8         47 return $accept;
334             }
335              
336             # Otherwise we assume we're in a bracketed character class,
337             # delete the cookie, and accept the close bracket.
338 4         22 $tokenizer->cookie( COOKIE_CLASS, undef );
339 4         16 return 1;
340             }
341              
342 0         0 return 1;
343             }
344              
345             }
346              
347             # Called by the lexer once it has done its worst to all the tokens.
348             # Called as a method with no arguments. The return is the number of
349             # parse failures discovered when finalizing.
350             sub __PPIX_LEXER__finalize {
351 1452     1452   2536 my ( $self ) = @_;
352 1452         2218 delete $self->{is_quantifier};
353 1452         3118 return 0;
354             }
355              
356             1;
357              
358             __END__