File Coverage

blib/lib/PPIx/Regexp/Token/Structure.pm
Criterion Covered Total %
statement 107 108 99.0
branch 47 54 87.0
condition 5 5 100.0
subroutine 18 18 100.0
pod 3 3 100.0
total 180 188 95.7


line stmt bran cond sub pod time code
1             =head1 NAME
2              
3             PPIx::Regexp::Token::Structure - Represent structural elements.
4              
5             =head1 SYNOPSIS
6              
7             use PPIx::Regexp::Dumper;
8             PPIx::Regexp::Dumper->new( 'qr{(foo)}smx' )
9             ->print();
10              
11             =head1 INHERITANCE
12              
13             C is a
14             L.
15              
16             C is the parent of
17             L.
18              
19             =head1 DESCRIPTION
20              
21             This class represents things that define the structure of the regular
22             expression. This typically means brackets of various sorts, but to
23             prevent proliferation of token classes the type of the regular
24             expression is stored here.
25              
26             =head1 METHODS
27              
28             This class provides no public methods beyond those provided by its
29             superclass.
30              
31             =cut
32              
33             package PPIx::Regexp::Token::Structure;
34              
35 9     9   90 use strict;
  9         42  
  9         257  
36 9     9   45 use warnings;
  9         24  
  9         255  
37              
38 9     9   46 use base qw{ PPIx::Regexp::Token };
  9         19  
  9         826  
39              
40 9         1055 use PPIx::Regexp::Constant qw{
41             COOKIE_CLASS
42             COOKIE_QUANT
43             COOKIE_REGEX_SET
44             MINIMUM_PERL
45             TOKEN_LITERAL
46             @CARP_NOT
47 9     9   80 };
  9         20  
48              
49             # Tokens we are responsible for making, under at least some
50             # circumstances.
51 9     9   76 use PPIx::Regexp::Token::Comment ();
  9         23  
  9         152  
52 9     9   4702 use PPIx::Regexp::Token::Modifier ();
  9         28  
  9         215  
53 9     9   105 use PPIx::Regexp::Token::Backreference ();
  9         22  
  9         134  
54 9     9   56 use PPIx::Regexp::Token::Backtrack ();
  9         18  
  9         138  
55 9     9   4973 use PPIx::Regexp::Token::Recursion ();
  9         32  
  9         10611  
56              
57             our $VERSION = '0.087_01';
58              
59             # Return true if the token can be quantified, and false otherwise
60              
61             my %quant = map { $_ => 1 } ')', ']';
62             sub can_be_quantified {
63 524     524 1 1294 my ( $self ) = @_;
64 524 50       1522 ref $self or return;
65 524         1816 return $quant{ $self->content() };
66             };
67              
68             {
69              
70             my %explanation = (
71             '' => 'Match regexp',
72             '(' => 'Capture or grouping',
73             '(?[' => 'Extended character class',
74             ')' => 'End capture or grouping',
75             '[' => 'Character class',
76             ']' => 'End character class',
77             '])' => 'End extended character class',
78             'm' => 'Match regexp',
79             'qr' => 'Regexp object definition',
80             's' => 'Replace regexp with string or expression',
81             '{' => 'Explicit quantifier',
82             '}' => 'End explicit quantifier',
83             );
84              
85             sub __explanation {
86 7     7   16 return \%explanation;
87             }
88              
89             }
90              
91             sub is_quantifier {
92 540     540 1 1264 my ( $self ) = @_;
93 540 50       1623 ref $self or return;
94 540         2006 return $self->{is_quantifier};
95             }
96              
97             {
98              
99             # Note that the implementation equivocates on the ::Token::Structure
100             # class, using it both for the initial token that determines the
101             # type of the regex and things like parentheses internal to the
102             # regex. Rather than sort out this equivocation, I have relied on
103             # the currently-true assumption that 'qr' will not satisfy the
104             # ::Token::Structure recognition logic, and the only way this class
105             # can acquire this content is by the brute-force approach used to
106             # generate the initial token object.
107              
108             my %perl_version_introduced = (
109             qr => '5.005',
110             '(?[' => '5.017008',
111             );
112              
113             sub perl_version_introduced {
114 222     222 1 3772 my ( $self ) = @_;
115             return $self->{perl_version_introduced} ||
116 222   100     891 $perl_version_introduced{ $self->content() } ||
117             MINIMUM_PERL;
118             }
119             }
120            
121             {
122              
123             my %delim = map { $_ => 1 } qw/ ( ) { } [ ] /;
124              
125             # Regular expressions to match various parenthesized tokens, and the
126             # classes to make them into.
127              
128             my @paren_token = map {
129             [ $_ => $_->__PPIX_TOKEN__recognize() ]
130             }
131             'PPIx::Regexp::Token::Comment',
132             'PPIx::Regexp::Token::Modifier',
133             'PPIx::Regexp::Token::Backreference',
134             'PPIx::Regexp::Token::Backtrack',
135             'PPIx::Regexp::Token::Recursion',
136             ;
137              
138             sub __PPIX_TOKENIZER__regexp {
139 1046     1046   2544 my ( undef, $tokenizer, $character ) = @_;
140              
141             # We are not interested in anything but delimiters.
142 1046 100       3329 $delim{$character} or return;
143              
144             # Inside a character class, all the delimiters are normal characters
145             # except for the close square bracket.
146 893 100       2182 if ( $tokenizer->cookie( COOKIE_CLASS ) ) {
147 59 100       289 $character eq ']'
148             or return $tokenizer->make_token( 1, TOKEN_LITERAL );
149 53         205 $tokenizer->cookie( COOKIE_CLASS, undef );
150 53         239 return 1;
151             }
152              
153             # Open parentheses have various interesting possibilities ...
154 834 100       2517 if ( $character eq '(' ) {
155              
156             # Sometimes the whole bunch of parenthesized characters seems
157             # naturally to be a token.
158 347         991 foreach ( @paren_token ) {
159 1659         2642 my ( $class, @recognize ) = @{ $_ };
  1659         4244  
160 1659         2883 foreach ( @recognize ) {
161 2595         3614 my ( $regexp, $arg ) = @{ $_ };
  2595         4833  
162 2595 100       5399 my $accept = $tokenizer->find_regexp( $regexp ) or next;
163 55         278 return $tokenizer->make_token( $accept, $class, $arg );
164             }
165             }
166              
167             # Modifier changes are local to this parenthesis group
168 292         1278 $tokenizer->modifier_duplicate();
169              
170             # The regex-set functionality introduced with 5.17.8 is most
171             # conveniently handled by treating the initial '(?[' and
172             # final '])' as ::Structure tokens. Fortunately for us,
173             # perl5178delta documents that these may not have interior
174             # spaces.
175              
176 292 100       1266 if ( my $accept = $tokenizer->find_regexp(
177             qr{ \A [(] [?] [[] }smx # ] ) - help for vim
178             )
179             ) {
180 8     105   75 $tokenizer->cookie( COOKIE_REGEX_SET, sub { return 1 } );
  105         296  
181 8         71 $tokenizer->modifier_modify( x => 1 ); # Implicitly /x
182 8         38 return $accept;
183             }
184              
185             # We expect certain tokens only after a left paren.
186             $tokenizer->expect(
187 284         1856 'PPIx::Regexp::Token::GroupType::Modifier',
188             'PPIx::Regexp::Token::GroupType::NamedCapture',
189             'PPIx::Regexp::Token::GroupType::Assertion',
190             'PPIx::Regexp::Token::GroupType::Code',
191             'PPIx::Regexp::Token::GroupType::BranchReset',
192             'PPIx::Regexp::Token::GroupType::Subexpression',
193             'PPIx::Regexp::Token::GroupType::Switch',
194             'PPIx::Regexp::Token::GroupType::Script_Run',
195             'PPIx::Regexp::Token::GroupType::Atomic_Script_Run',
196             );
197              
198             # Accept the parenthesis.
199 284         848 return 1;
200             }
201              
202             # Close parentheses end modifier localization
203 487 100       1657 if ( $character eq ')' ) {
204 288         1133 $tokenizer->modifier_pop();
205 288         817 return 1;
206             }
207              
208             # Open curlys are complicated because they may or may not represent
209             # the beginning of a quantifier, depending on what comes before the
210             # close curly. So we set a cookie to monitor the token stream for
211             # interlopers. If all goes well, the right curly will find the
212             # cookie and know it is supposed to be a quantifier.
213 199 100       604 if ( $character eq '{' ) {
214              
215             # If the prior token can not be quantified, all this is
216             # unnecessary.
217 71 100       187 $tokenizer->prior_significant_token( 'can_be_quantified' )
218             or return 1;
219              
220             # We make our token now, before setting the cookie. Otherwise
221             # the cookie has to deal with this token.
222 61         247 my $token = $tokenizer->make_token( 1 );
223              
224             # A cookie for the next '}'.
225 61         212 my $commas = 0;
226 61         136 my $allow_digit = 1;
227             $tokenizer->cookie( COOKIE_QUANT, sub {
228 115     115   452 my ( $tokenizer, $token ) = @_;
229 115 50       300 $token or return 1;
230              
231             # Code for 5.33.6 and after.
232             # We allow {,...}, and we allow space inside and
233             # adjacent to the curlys, and around the comma if
234             # any. But not interior to the numbers.
235 115 100       416 if ( $token->isa( TOKEN_LITERAL ) ) {
236 105         300 my $character = $token->content();
237 105 100       391 if ( $character =~ m/ \A \s \z /smx ) {
238             # Digits only allowed if the prior
239             # significant was an open curly or a comma.
240 4         11 $allow_digit = $tokenizer->prior_significant_token(
241             'content' ) =~ m/ \A [{,] \z /smx; # }
242 4         15 return 1;
243             }
244 101 100       447 $character eq ','
245             and return( ! $commas++ );
246 68 50       176 $allow_digit
247             or return;
248 68         379 return $character =~ m/ \A [0-9] \z /smx;
249             }
250              
251             # Since we do not know what is in an interpolation, we
252             # trustingly accept it.
253 10 100       91 if ( $token->isa( 'PPIx::Regexp::Token::Interpolation' )
254             ) {
255 5         23 return 1;
256             }
257              
258 5         49 return;
259             },
260 61         497 );
261              
262 61         181 return $token;
263             }
264              
265             # The close curly bracket is a little complicated because if the
266             # cookie posted by the left curly bracket is still around, we are a
267             # quantifier, otherwise not.
268 128 100       412 if ( $character eq '}' ) {
269 63 100       191 $tokenizer->cookie( COOKIE_QUANT, undef )
270             or return 1;
271 50 100       540 $tokenizer->prior_significant_token( 'class' )->isa( __PACKAGE__ )
272             and return 1;
273 47         170 my $token = $tokenizer->make_token( 1 );
274 47         200 $token->{is_quantifier} = 1;
275 47         139 return $token;
276             }
277              
278             # The parse rules are different inside a character class, so we set
279             # another cookie. Sigh. If your tool is a hammer ...
280 65 100       230 if ( $character eq '[' ) {
281              
282             # Set our cookie. Since it always returns 1, it does not matter
283             # where in the following mess we set it.
284 53     226   378 $tokenizer->cookie( COOKIE_CLASS, sub { return 1 } );
  226         648  
285              
286             # Make our token now, since the easiest place to deal with the
287             # beginning-of-character-class strangeness seems to be right
288             # here.
289 53         236 my @tokens = $tokenizer->make_token( 1 );
290              
291             # Get the next character, returning tokens if there is none.
292 53 50       242 defined ( $character = $tokenizer->peek() )
293             or return @tokens;
294              
295             # If we have a caret, it is a negation operator. Make its token
296             # and fetch the next character, returning if none.
297 53 100       185 if ( $character eq '^' ) {
298 5         49 push @tokens, $tokenizer->make_token(
299             1, 'PPIx::Regexp::Token::Operator' );
300 5 50       37 defined ( $character = $tokenizer->peek() )
301             or return @tokens;
302             }
303              
304             # If we have a close square at this point, it is not the end of
305             # the class, but just a literal. Make its token.
306 53 100       154 $character eq ']'
307             and push @tokens, $tokenizer->make_token( 1, TOKEN_LITERAL );
308              
309             # Return all tokens made.
310 53         185 return @tokens;
311             }
312             # per perlop, the metas inside a [] are -]\^$.
313             # per perlop, the metas outside a [] are {}[]()^$.|*+?\
314             # The difference is that {}[().|*+? are not metas in [], but - is.
315              
316             # Close bracket is complicated by the addition of regex sets.
317             # And more complicated by the fact that you can have an
318             # old-style character class inside a regex set. Fortunately they
319             # have not (yet!) permitted nested regex sets.
320 12 50       81 if ( $character eq ']' ) {
321              
322             # If we find '])' and COOKIE_REGEX_SET is present, we have a
323             # regex set. We need to delete the cookie and accept both
324             # characters.
325 12 100 100     76 if ( ( my $accept = $tokenizer->find_regexp(
326             # help vim - ( [
327             qr{ \A []] [)] }smx
328             ) )
329             && $tokenizer->cookie( COOKIE_REGEX_SET )
330              
331             ) {
332 8         50 $tokenizer->cookie( COOKIE_REGEX_SET, undef );
333 8         45 return $accept;
334             }
335              
336             # Otherwise we assume we're in a bracketed character class,
337             # delete the cookie, and accept the close bracket.
338 4         20 $tokenizer->cookie( COOKIE_CLASS, undef );
339 4         12 return 1;
340             }
341              
342 0         0 return 1;
343             }
344              
345             }
346              
347             # Called by the lexer once it has done its worst to all the tokens.
348             # Called as a method with no arguments. The return is the number of
349             # parse failures discovered when finalizing.
350             sub __PPIX_LEXER__finalize {
351 1452     1452   2653 my ( $self ) = @_;
352 1452         2200 delete $self->{is_quantifier};
353 1452         3295 return 0;
354             }
355              
356             1;
357              
358             __END__