File Coverage

blib/lib/PPIx/Regexp/Token/Structure.pm

Criterion	Covered	Total	%
statement	107	108	99.0
branch	47	54	87.0
condition	5	5	100.0
subroutine	18	18	100.0
pod	3	3	100.0
total	180	188	95.7

line	stmt	bran	cond	sub	pod	time	code
1							=head1 NAME
2
3							PPIx::Regexp::Token::Structure - Represent structural elements.
4
5							=head1 SYNOPSIS
6
7							use PPIx::Regexp::Dumper;
8							PPIx::Regexp::Dumper->new( 'qr{(foo)}smx' )
9							->print();
10
11							=head1 INHERITANCE
12
13							C is a
14							L.
15
16							C is the parent of
17							L.
18
19							=head1 DESCRIPTION
20
21							This class represents things that define the structure of the regular
22							expression. This typically means brackets of various sorts, but to
23							prevent proliferation of token classes the type of the regular
24							expression is stored here.
25
26							=head1 METHODS
27
28							This class provides no public methods beyond those provided by its
29							superclass.
30
31							=cut
32
33							package PPIx::Regexp::Token::Structure;
34
35	9			9		90	use strict;
	9					42
	9					257
36	9			9		45	use warnings;
	9					24
	9					255
37
38	9			9		46	use base qw{ PPIx::Regexp::Token };
	9					19
	9					826
39
40	9					1055	use PPIx::Regexp::Constant qw{
41							COOKIE_CLASS
42							COOKIE_QUANT
43							COOKIE_REGEX_SET
44							MINIMUM_PERL
45							TOKEN_LITERAL
46							@CARP_NOT
47	9			9		80	};
	9					20
48
49							# Tokens we are responsible for making, under at least some
50							# circumstances.
51	9			9		76	use PPIx::Regexp::Token::Comment ();
	9					23
	9					152
52	9			9		4702	use PPIx::Regexp::Token::Modifier ();
	9					28
	9					215
53	9			9		105	use PPIx::Regexp::Token::Backreference ();
	9					22
	9					134
54	9			9		56	use PPIx::Regexp::Token::Backtrack ();
	9					18
	9					138
55	9			9		4973	use PPIx::Regexp::Token::Recursion ();
	9					32
	9					10611
56
57							our $VERSION = '0.087_01';
58
59							# Return true if the token can be quantified, and false otherwise
60
61							my %quant = map { $_ => 1 } ')', ']';
62							sub can_be_quantified {
63	524			524	1	1294	my ( $self ) = @_;
64	524	50				1522	ref $self or return;
65	524					1816	return $quant{ $self->content() };
66							};
67
68							{
69
70							my %explanation = (
71							'' => 'Match regexp',
72							'(' => 'Capture or grouping',
73							'(?[' => 'Extended character class',
74							')' => 'End capture or grouping',
75							'[' => 'Character class',
76							']' => 'End character class',
77							'])' => 'End extended character class',
78							'm' => 'Match regexp',
79							'qr' => 'Regexp object definition',
80							's' => 'Replace regexp with string or expression',
81							'{' => 'Explicit quantifier',
82							'}' => 'End explicit quantifier',
83							);
84
85							sub __explanation {
86	7			7		16	return \%explanation;
87							}
88
89							}
90
91							sub is_quantifier {
92	540			540	1	1264	my ( $self ) = @_;
93	540	50				1623	ref $self or return;
94	540					2006	return $self->{is_quantifier};
95							}
96
97							{
98
99							# Note that the implementation equivocates on the ::Token::Structure
100							# class, using it both for the initial token that determines the
101							# type of the regex and things like parentheses internal to the
102							# regex. Rather than sort out this equivocation, I have relied on
103							# the currently-true assumption that 'qr' will not satisfy the
104							# ::Token::Structure recognition logic, and the only way this class
105							# can acquire this content is by the brute-force approach used to
106							# generate the initial token object.
107
108							my %perl_version_introduced = (
109							qr => '5.005',
110							'(?[' => '5.017008',
111							);
112
113							sub perl_version_introduced {
114	222			222	1	3772	my ( $self ) = @_;
115							return $self->{perl_version_introduced} \|\|
116	222		100			891	$perl_version_introduced{ $self->content() } \|\|
117							MINIMUM_PERL;
118							}
119							}
120
121							{
122
123							my %delim = map { $_ => 1 } qw/ ( ) { } [ ] /;
124
125							# Regular expressions to match various parenthesized tokens, and the
126							# classes to make them into.
127
128							my @paren_token = map {
129							[ $_ => $_->__PPIX_TOKEN__recognize() ]
130							}
131							'PPIx::Regexp::Token::Comment',
132							'PPIx::Regexp::Token::Modifier',
133							'PPIx::Regexp::Token::Backreference',
134							'PPIx::Regexp::Token::Backtrack',
135							'PPIx::Regexp::Token::Recursion',
136							;
137
138							sub __PPIX_TOKENIZER__regexp {
139	1046			1046		2544	my ( undef, $tokenizer, $character ) = @_;
140
141							# We are not interested in anything but delimiters.
142	1046	100				3329	$delim{$character} or return;
143
144							# Inside a character class, all the delimiters are normal characters
145							# except for the close square bracket.
146	893	100				2182	if ( $tokenizer->cookie( COOKIE_CLASS ) ) {
147	59	100				289	$character eq ']'
148							or return $tokenizer->make_token( 1, TOKEN_LITERAL );
149	53					205	$tokenizer->cookie( COOKIE_CLASS, undef );
150	53					239	return 1;
151							}
152
153							# Open parentheses have various interesting possibilities ...
154	834	100				2517	if ( $character eq '(' ) {
155
156							# Sometimes the whole bunch of parenthesized characters seems
157							# naturally to be a token.
158	347					991	foreach ( @paren_token ) {
159	1659					2642	my ( $class, @recognize ) = @{ $_ };
	1659					4244
160	1659					2883	foreach ( @recognize ) {
161	2595					3614	my ( $regexp, $arg ) = @{ $_ };
	2595					4833
162	2595	100				5399	my $accept = $tokenizer->find_regexp( $regexp ) or next;
163	55					278	return $tokenizer->make_token( $accept, $class, $arg );
164							}
165							}
166
167							# Modifier changes are local to this parenthesis group
168	292					1278	$tokenizer->modifier_duplicate();
169
170							# The regex-set functionality introduced with 5.17.8 is most
171							# conveniently handled by treating the initial '(?[' and
172							# final '])' as ::Structure tokens. Fortunately for us,
173							# perl5178delta documents that these may not have interior
174							# spaces.
175
176	292	100				1266	if ( my $accept = $tokenizer->find_regexp(
177							qr{ \A [(] [?] [[] }smx # ] ) - help for vim
178							)
179							) {
180	8			105		75	$tokenizer->cookie( COOKIE_REGEX_SET, sub { return 1 } );
	105					296
181	8					71	$tokenizer->modifier_modify( x => 1 ); # Implicitly /x
182	8					38	return $accept;
183							}
184
185							# We expect certain tokens only after a left paren.
186							$tokenizer->expect(
187	284					1856	'PPIx::Regexp::Token::GroupType::Modifier',
188							'PPIx::Regexp::Token::GroupType::NamedCapture',
189							'PPIx::Regexp::Token::GroupType::Assertion',
190							'PPIx::Regexp::Token::GroupType::Code',
191							'PPIx::Regexp::Token::GroupType::BranchReset',
192							'PPIx::Regexp::Token::GroupType::Subexpression',
193							'PPIx::Regexp::Token::GroupType::Switch',
194							'PPIx::Regexp::Token::GroupType::Script_Run',
195							'PPIx::Regexp::Token::GroupType::Atomic_Script_Run',
196							);
197
198							# Accept the parenthesis.
199	284					848	return 1;
200							}
201
202							# Close parentheses end modifier localization
203	487	100				1657	if ( $character eq ')' ) {
204	288					1133	$tokenizer->modifier_pop();
205	288					817	return 1;
206							}
207
208							# Open curlys are complicated because they may or may not represent
209							# the beginning of a quantifier, depending on what comes before the
210							# close curly. So we set a cookie to monitor the token stream for
211							# interlopers. If all goes well, the right curly will find the
212							# cookie and know it is supposed to be a quantifier.
213	199	100				604	if ( $character eq '{' ) {
214
215							# If the prior token can not be quantified, all this is
216							# unnecessary.
217	71	100				187	$tokenizer->prior_significant_token( 'can_be_quantified' )
218							or return 1;
219
220							# We make our token now, before setting the cookie. Otherwise
221							# the cookie has to deal with this token.
222	61					247	my $token = $tokenizer->make_token( 1 );
223
224							# A cookie for the next '}'.
225	61					212	my $commas = 0;
226	61					136	my $allow_digit = 1;
227							$tokenizer->cookie( COOKIE_QUANT, sub {
228	115			115		452	my ( $tokenizer, $token ) = @_;
229	115	50				300	$token or return 1;
230
231							# Code for 5.33.6 and after.
232							# We allow {,...}, and we allow space inside and
233							# adjacent to the curlys, and around the comma if
234							# any. But not interior to the numbers.
235	115	100				416	if ( $token->isa( TOKEN_LITERAL ) ) {
236	105					300	my $character = $token->content();
237	105	100				391	if ( $character =~ m/ \A \s \z /smx ) {
238							# Digits only allowed if the prior
239							# significant was an open curly or a comma.
240	4					11	$allow_digit = $tokenizer->prior_significant_token(
241							'content' ) =~ m/ \A [{,] \z /smx; # }
242	4					15	return 1;
243							}
244	101	100				447	$character eq ','
245							and return( ! $commas++ );
246	68	50				176	$allow_digit
247							or return;
248	68					379	return $character =~ m/ \A [0-9] \z /smx;
249							}
250
251							# Since we do not know what is in an interpolation, we
252							# trustingly accept it.
253	10	100				91	if ( $token->isa( 'PPIx::Regexp::Token::Interpolation' )
254							) {
255	5					23	return 1;
256							}
257
258	5					49	return;
259							},
260	61					497	);
261
262	61					181	return $token;
263							}
264
265							# The close curly bracket is a little complicated because if the
266							# cookie posted by the left curly bracket is still around, we are a
267							# quantifier, otherwise not.
268	128	100				412	if ( $character eq '}' ) {
269	63	100				191	$tokenizer->cookie( COOKIE_QUANT, undef )
270							or return 1;
271	50	100				540	$tokenizer->prior_significant_token( 'class' )->isa( __PACKAGE__ )
272							and return 1;
273	47					170	my $token = $tokenizer->make_token( 1 );
274	47					200	$token->{is_quantifier} = 1;
275	47					139	return $token;
276							}
277
278							# The parse rules are different inside a character class, so we set
279							# another cookie. Sigh. If your tool is a hammer ...
280	65	100				230	if ( $character eq '[' ) {
281
282							# Set our cookie. Since it always returns 1, it does not matter
283							# where in the following mess we set it.
284	53			226		378	$tokenizer->cookie( COOKIE_CLASS, sub { return 1 } );
	226					648
285
286							# Make our token now, since the easiest place to deal with the
287							# beginning-of-character-class strangeness seems to be right
288							# here.
289	53					236	my @tokens = $tokenizer->make_token( 1 );
290
291							# Get the next character, returning tokens if there is none.
292	53	50				242	defined ( $character = $tokenizer->peek() )
293							or return @tokens;
294
295							# If we have a caret, it is a negation operator. Make its token
296							# and fetch the next character, returning if none.
297	53	100				185	if ( $character eq '^' ) {
298	5					49	push @tokens, $tokenizer->make_token(
299							1, 'PPIx::Regexp::Token::Operator' );
300	5	50				37	defined ( $character = $tokenizer->peek() )
301							or return @tokens;
302							}
303
304							# If we have a close square at this point, it is not the end of
305							# the class, but just a literal. Make its token.
306	53	100				154	$character eq ']'
307							and push @tokens, $tokenizer->make_token( 1, TOKEN_LITERAL );
308
309							# Return all tokens made.
310	53					185	return @tokens;
311							}
312							# per perlop, the metas inside a [] are -]\^$.
313							# per perlop, the metas outside a [] are {}[]()^$.\|*+?\
314							# The difference is that {}[().\|*+? are not metas in [], but - is.
315
316							# Close bracket is complicated by the addition of regex sets.
317							# And more complicated by the fact that you can have an
318							# old-style character class inside a regex set. Fortunately they
319							# have not (yet!) permitted nested regex sets.
320	12	50				81	if ( $character eq ']' ) {
321
322							# If we find '])' and COOKIE_REGEX_SET is present, we have a
323							# regex set. We need to delete the cookie and accept both
324							# characters.
325	12	100	100			76	if ( ( my $accept = $tokenizer->find_regexp(
326							# help vim - ( [
327							qr{ \A []] [)] }smx
328							) )
329							&& $tokenizer->cookie( COOKIE_REGEX_SET )
330
331							) {
332	8					50	$tokenizer->cookie( COOKIE_REGEX_SET, undef );
333	8					45	return $accept;
334							}
335
336							# Otherwise we assume we're in a bracketed character class,
337							# delete the cookie, and accept the close bracket.
338	4					20	$tokenizer->cookie( COOKIE_CLASS, undef );
339	4					12	return 1;
340							}
341
342	0					0	return 1;
343							}
344
345							}
346
347							# Called by the lexer once it has done its worst to all the tokens.
348							# Called as a method with no arguments. The return is the number of
349							# parse failures discovered when finalizing.
350							sub __PPIX_LEXER__finalize {
351	1452			1452		2653	my ( $self ) = @_;
352	1452					2200	delete $self->{is_quantifier};
353	1452					3295	return 0;
354							}
355
356							1;
357
358							__END__