File Coverage

blib/lib/PPIx/Regexp/Token/Structure.pm

Criterion	Covered	Total	%
statement	107	108	99.0
branch	47	54	87.0
condition	5	5	100.0
subroutine	18	18	100.0
pod	3	3	100.0
total	180	188	95.7

line	stmt	bran	cond	sub	pod	time	code
1							=head1 NAME
2
3							PPIx::Regexp::Token::Structure - Represent structural elements.
4
5							=head1 SYNOPSIS
6
7							use PPIx::Regexp::Dumper;
8							PPIx::Regexp::Dumper->new( 'qr{(foo)}smx' )
9							->print();
10
11							=head1 INHERITANCE
12
13							C is a
14							L.
15
16							C is the parent of
17							L.
18
19							=head1 DESCRIPTION
20
21							This class represents things that define the structure of the regular
22							expression. This typically means brackets of various sorts, but to
23							prevent proliferation of token classes the type of the regular
24							expression is stored here.
25
26							=head1 METHODS
27
28							This class provides no public methods beyond those provided by its
29							superclass.
30
31							=cut
32
33							package PPIx::Regexp::Token::Structure;
34
35	9			9		73	use strict;
	9					16
	9					241
36	9			9		44	use warnings;
	9					29
	9					245
37
38	9			9		49	use base qw{ PPIx::Regexp::Token };
	9					23
	9					845
39
40	9					955	use PPIx::Regexp::Constant qw{
41							COOKIE_CLASS
42							COOKIE_QUANT
43							COOKIE_REGEX_SET
44							MINIMUM_PERL
45							TOKEN_LITERAL
46							@CARP_NOT
47	9			9		72	};
	9					17
48
49							# Tokens we are responsible for making, under at least some
50							# circumstances.
51	9			9		64	use PPIx::Regexp::Token::Comment ();
	9					14
	9					212
52	9			9		4514	use PPIx::Regexp::Token::Modifier ();
	9					26
	9					227
53	9			9		70	use PPIx::Regexp::Token::Backreference ();
	9					17
	9					137
54	9			9		48	use PPIx::Regexp::Token::Backtrack ();
	9					23
	9					117
55	9			9		4658	use PPIx::Regexp::Token::Recursion ();
	9					25
	9					10452
56
57							our $VERSION = '0.088';
58
59							# Return true if the token can be quantified, and false otherwise
60
61							my %quant = map { $_ => 1 } ')', ']';
62							sub can_be_quantified {
63	524			524	1	1147	my ( $self ) = @_;
64	524	50				1443	ref $self or return;
65	524					1590	return $quant{ $self->content() };
66							};
67
68							{
69
70							my %explanation = (
71							'' => 'Match regexp',
72							'(' => 'Capture or grouping',
73							'(?[' => 'Extended character class',
74							')' => 'End capture or grouping',
75							'[' => 'Character class',
76							']' => 'End character class',
77							'])' => 'End extended character class',
78							'm' => 'Match regexp',
79							'qr' => 'Regexp object definition',
80							's' => 'Replace regexp with string or expression',
81							'{' => 'Explicit quantifier',
82							'}' => 'End explicit quantifier',
83							);
84
85							sub __explanation {
86	7			7		17	return \%explanation;
87							}
88
89							}
90
91							sub is_quantifier {
92	540			540	1	1249	my ( $self ) = @_;
93	540	50				1436	ref $self or return;
94	540					2128	return $self->{is_quantifier};
95							}
96
97							{
98
99							# Note that the implementation equivocates on the ::Token::Structure
100							# class, using it both for the initial token that determines the
101							# type of the regex and things like parentheses internal to the
102							# regex. Rather than sort out this equivocation, I have relied on
103							# the currently-true assumption that 'qr' will not satisfy the
104							# ::Token::Structure recognition logic, and the only way this class
105							# can acquire this content is by the brute-force approach used to
106							# generate the initial token object.
107
108							my %perl_version_introduced = (
109							qr => '5.005',
110							'(?[' => '5.017008',
111							);
112
113							sub perl_version_introduced {
114	222			222	1	3775	my ( $self ) = @_;
115							return $self->{perl_version_introduced} \|\|
116	222		100			1029	$perl_version_introduced{ $self->content() } \|\|
117							MINIMUM_PERL;
118							}
119							}
120
121							{
122
123							my %delim = map { $_ => 1 } qw/ ( ) { } [ ] /;
124
125							# Regular expressions to match various parenthesized tokens, and the
126							# classes to make them into.
127
128							my @paren_token = map {
129							[ $_ => $_->__PPIX_TOKEN__recognize() ]
130							}
131							'PPIx::Regexp::Token::Comment',
132							'PPIx::Regexp::Token::Modifier',
133							'PPIx::Regexp::Token::Backreference',
134							'PPIx::Regexp::Token::Backtrack',
135							'PPIx::Regexp::Token::Recursion',
136							;
137
138							sub __PPIX_TOKENIZER__regexp {
139	1046			1046		2523	my ( undef, $tokenizer, $character ) = @_;
140
141							# We are not interested in anything but delimiters.
142	1046	100				3126	$delim{$character} or return;
143
144							# Inside a character class, all the delimiters are normal characters
145							# except for the close square bracket.
146	893	100				2292	if ( $tokenizer->cookie( COOKIE_CLASS ) ) {
147	59	100				251	$character eq ']'
148							or return $tokenizer->make_token( 1, TOKEN_LITERAL );
149	53					173	$tokenizer->cookie( COOKIE_CLASS, undef );
150	53					240	return 1;
151							}
152
153							# Open parentheses have various interesting possibilities ...
154	834	100				2526	if ( $character eq '(' ) {
155
156							# Sometimes the whole bunch of parenthesized characters seems
157							# naturally to be a token.
158	347					1055	foreach ( @paren_token ) {
159	1659					2513	my ( $class, @recognize ) = @{ $_ };
	1659					4204
160	1659					2833	foreach ( @recognize ) {
161	2595					3557	my ( $regexp, $arg ) = @{ $_ };
	2595					4771
162	2595	100				5336	my $accept = $tokenizer->find_regexp( $regexp ) or next;
163	55					248	return $tokenizer->make_token( $accept, $class, $arg );
164							}
165							}
166
167							# Modifier changes are local to this parenthesis group
168	292					1231	$tokenizer->modifier_duplicate();
169
170							# The regex-set functionality introduced with 5.17.8 is most
171							# conveniently handled by treating the initial '(?[' and
172							# final '])' as ::Structure tokens. Fortunately for us,
173							# perl5178delta documents that these may not have interior
174							# spaces.
175
176	292	100				1451	if ( my $accept = $tokenizer->find_regexp(
177							qr{ \A [(] [?] [[] }smx # ] ) - help for vim
178							)
179							) {
180	8			105		62	$tokenizer->cookie( COOKIE_REGEX_SET, sub { return 1 } );
	105					291
181	8					43	$tokenizer->modifier_modify( x => 1 ); # Implicitly /x
182	8					41	return $accept;
183							}
184
185							# We expect certain tokens only after a left paren.
186							$tokenizer->expect(
187	284					1611	'PPIx::Regexp::Token::GroupType::Modifier',
188							'PPIx::Regexp::Token::GroupType::NamedCapture',
189							'PPIx::Regexp::Token::GroupType::Assertion',
190							'PPIx::Regexp::Token::GroupType::Code',
191							'PPIx::Regexp::Token::GroupType::BranchReset',
192							'PPIx::Regexp::Token::GroupType::Subexpression',
193							'PPIx::Regexp::Token::GroupType::Switch',
194							'PPIx::Regexp::Token::GroupType::Script_Run',
195							'PPIx::Regexp::Token::GroupType::Atomic_Script_Run',
196							);
197
198							# Accept the parenthesis.
199	284					816	return 1;
200							}
201
202							# Close parentheses end modifier localization
203	487	100				1360	if ( $character eq ')' ) {
204	288					1200	$tokenizer->modifier_pop();
205	288					756	return 1;
206							}
207
208							# Open curlys are complicated because they may or may not represent
209							# the beginning of a quantifier, depending on what comes before the
210							# close curly. So we set a cookie to monitor the token stream for
211							# interlopers. If all goes well, the right curly will find the
212							# cookie and know it is supposed to be a quantifier.
213	199	100				719	if ( $character eq '{' ) {
214
215							# If the prior token can not be quantified, all this is
216							# unnecessary.
217	71	100				225	$tokenizer->prior_significant_token( 'can_be_quantified' )
218							or return 1;
219
220							# We make our token now, before setting the cookie. Otherwise
221							# the cookie has to deal with this token.
222	61					220	my $token = $tokenizer->make_token( 1 );
223
224							# A cookie for the next '}'.
225	61					187	my $commas = 0;
226	61					142	my $allow_digit = 1;
227							$tokenizer->cookie( COOKIE_QUANT, sub {
228	115			115		326	my ( $tokenizer, $token ) = @_;
229	115	50				324	$token or return 1;
230
231							# Code for 5.33.6 and after.
232							# We allow {,...}, and we allow space inside and
233							# adjacent to the curlys, and around the comma if
234							# any. But not interior to the numbers.
235	115	100				414	if ( $token->isa( TOKEN_LITERAL ) ) {
236	105					277	my $character = $token->content();
237	105	100				428	if ( $character =~ m/ \A \s \z /smx ) {
238							# Digits only allowed if the prior
239							# significant was an open curly or a comma.
240	4					12	$allow_digit = $tokenizer->prior_significant_token(
241							'content' ) =~ m/ \A [{,] \z /smx; # }
242	4					14	return 1;
243							}
244	101	100				392	$character eq ','
245							and return( ! $commas++ );
246	68	50				193	$allow_digit
247							or return;
248	68					386	return $character =~ m/ \A [0-9] \z /smx;
249							}
250
251							# Since we do not know what is in an interpolation, we
252							# trustingly accept it.
253	10	100				56	if ( $token->isa( 'PPIx::Regexp::Token::Interpolation' )
254							) {
255	5					24	return 1;
256							}
257
258	5					50	return;
259							},
260	61					464	);
261
262	61					208	return $token;
263							}
264
265							# The close curly bracket is a little complicated because if the
266							# cookie posted by the left curly bracket is still around, we are a
267							# quantifier, otherwise not.
268	128	100				520	if ( $character eq '}' ) {
269	63	100				182	$tokenizer->cookie( COOKIE_QUANT, undef )
270							or return 1;
271	50	100				454	$tokenizer->prior_significant_token( 'class' )->isa( __PACKAGE__ )
272							and return 1;
273	47					153	my $token = $tokenizer->make_token( 1 );
274	47					157	$token->{is_quantifier} = 1;
275	47					144	return $token;
276							}
277
278							# The parse rules are different inside a character class, so we set
279							# another cookie. Sigh. If your tool is a hammer ...
280	65	100				278	if ( $character eq '[' ) {
281
282							# Set our cookie. Since it always returns 1, it does not matter
283							# where in the following mess we set it.
284	53			226		371	$tokenizer->cookie( COOKIE_CLASS, sub { return 1 } );
	226					691
285
286							# Make our token now, since the easiest place to deal with the
287							# beginning-of-character-class strangeness seems to be right
288							# here.
289	53					236	my @tokens = $tokenizer->make_token( 1 );
290
291							# Get the next character, returning tokens if there is none.
292	53	50				254	defined ( $character = $tokenizer->peek() )
293							or return @tokens;
294
295							# If we have a caret, it is a negation operator. Make its token
296							# and fetch the next character, returning if none.
297	53	100				227	if ( $character eq '^' ) {
298	5					28	push @tokens, $tokenizer->make_token(
299							1, 'PPIx::Regexp::Token::Operator' );
300	5	50				26	defined ( $character = $tokenizer->peek() )
301							or return @tokens;
302							}
303
304							# If we have a close square at this point, it is not the end of
305							# the class, but just a literal. Make its token.
306	53	100				161	$character eq ']'
307							and push @tokens, $tokenizer->make_token( 1, TOKEN_LITERAL );
308
309							# Return all tokens made.
310	53					179	return @tokens;
311							}
312							# per perlop, the metas inside a [] are -]\^$.
313							# per perlop, the metas outside a [] are {}[]()^$.\|*+?\
314							# The difference is that {}[().\|*+? are not metas in [], but - is.
315
316							# Close bracket is complicated by the addition of regex sets.
317							# And more complicated by the fact that you can have an
318							# old-style character class inside a regex set. Fortunately they
319							# have not (yet!) permitted nested regex sets.
320	12	50				77	if ( $character eq ']' ) {
321
322							# If we find '])' and COOKIE_REGEX_SET is present, we have a
323							# regex set. We need to delete the cookie and accept both
324							# characters.
325	12	100	100			60	if ( ( my $accept = $tokenizer->find_regexp(
326							# help vim - ( [
327							qr{ \A []] [)] }smx
328							) )
329							&& $tokenizer->cookie( COOKIE_REGEX_SET )
330
331							) {
332	8					42	$tokenizer->cookie( COOKIE_REGEX_SET, undef );
333	8					47	return $accept;
334							}
335
336							# Otherwise we assume we're in a bracketed character class,
337							# delete the cookie, and accept the close bracket.
338	4					22	$tokenizer->cookie( COOKIE_CLASS, undef );
339	4					16	return 1;
340							}
341
342	0					0	return 1;
343							}
344
345							}
346
347							# Called by the lexer once it has done its worst to all the tokens.
348							# Called as a method with no arguments. The return is the number of
349							# parse failures discovered when finalizing.
350							sub __PPIX_LEXER__finalize {
351	1452			1452		2536	my ( $self ) = @_;
352	1452					2218	delete $self->{is_quantifier};
353	1452					3118	return 0;
354							}
355
356							1;
357
358							__END__