File Coverage

blib/lib/PPIx/Regexp/Lexer.pm

Criterion	Covered	Total	%
statement	250	264	94.7
branch	64	80	80.0
condition	26	35	74.2
subroutine	44	45	97.7
pod	5	5	100.0
total	389	429	90.6

line	stmt	bran	cond	sub	pod	time	code
1							=head1 NAME
2
3							PPIx::Regexp::Lexer - Assemble tokenizer output.
4
5							=head1 SYNOPSIS
6
7							use PPIx::Regexp::Lexer;
8							use PPIx::Regexp::Dumper;
9							my $lex = PPIx::Regexp::Lexer->new('qr{foo}smx');
10							my $dmp = PPIx::Regexp::Dumper->new( $lex );
11							$dmp->print();
12
13							=head1 INHERITANCE
14
15							C is a
16							L.
17
18							C has no descendants.
19
20							=head1 DESCRIPTION
21
22							This class takes the token stream generated by
23							L and generates the
24							parse tree.
25
26							=head1 METHODS
27
28							This class provides the following public methods. Methods not documented
29							here are private, and unsupported in the sense that the author reserves
30							the right to change or remove them without notice.
31
32							=cut
33
34							package PPIx::Regexp::Lexer;
35
36	9			9		65	use strict;
	9					19
	9					285
37	9			9		45	use warnings;
	9					26
	9					288
38
39	9			9		51	use base qw{ PPIx::Regexp::Support };
	9					16
	9					4271
40
41	9			9		64	use Carp qw{ confess };
	9					18
	9					429
42	9					804	use PPIx::Regexp::Constant qw{
43							ARRAY_REF
44							TOKEN_LITERAL
45							TOKEN_UNKNOWN
46							@CARP_NOT
47	9			9		52	};
	9					15
48	9			9		4224	use PPIx::Regexp::Node::Range ();
	9					27
	9					196
49	9			9		3679	use PPIx::Regexp::Node::Unknown ();
	9					22
	9					196
50	9			9		4272	use PPIx::Regexp::Structure ();
	9					28
	9					210
51	9			9		4245	use PPIx::Regexp::Structure::Assertion ();
	9					31
	9					204
52	9			9		3896	use PPIx::Regexp::Structure::Atomic_Script_Run ();
	9					23
	9					195
53	9			9		3934	use PPIx::Regexp::Structure::BranchReset ();
	9					26
	9					189
54	9			9		3945	use PPIx::Regexp::Structure::Code ();
	9					27
	9					195
55	9			9		4064	use PPIx::Regexp::Structure::Capture ();
	9					145
	9					218
56	9			9		3943	use PPIx::Regexp::Structure::CharClass ();
	9					25
	9					182
57	9			9		4098	use PPIx::Regexp::Structure::Subexpression ();
	9					23
	9					190
58	9			9		4005	use PPIx::Regexp::Structure::Main ();
	9					21
	9					182
59	9			9		4053	use PPIx::Regexp::Structure::Modifier ();
	9					24
	9					183
60	9			9		4062	use PPIx::Regexp::Structure::NamedCapture ();
	9					23
	9					181
61	9			9		4118	use PPIx::Regexp::Structure::Quantifier ();
	9					25
	9					231
62	9			9		4027	use PPIx::Regexp::Structure::Regexp ();
	9					39
	9					197
63	9			9		3953	use PPIx::Regexp::Structure::RegexSet ();
	9					26
	9					181
64	9			9		4097	use PPIx::Regexp::Structure::Replacement ();
	9					23
	9					197
65	9			9		4283	use PPIx::Regexp::Structure::Script_Run ();
	9					29
	9					188
66	9			9		4099	use PPIx::Regexp::Structure::Switch ();
	9					24
	9					184
67	9			9		3924	use PPIx::Regexp::Structure::Unknown ();
	9					26
	9					190
68	9			9		4265	use PPIx::Regexp::Token::Unmatched ();
	9					22
	9					195
69	9			9		5560	use PPIx::Regexp::Tokenizer ();
	9					43
	9					357
70	9			9		67	use PPIx::Regexp::Util qw{ __choose_tokenizer_class __instance };
	9					21
	9					20577
71
72							our $VERSION = '0.087';
73
74							=head2 new
75
76							This method instantiates the lexer. It takes as its argument either a
77							L or the text to be
78							parsed. In the latter case the tokenizer is instantiated from the text.
79
80							Any optional name/value pairs after the first argument are passed to the
81							tokenizer, which interprets them or not as the case may be.
82
83							=cut
84
85							{
86
87							my $errstr;
88
89							sub new {
90	332			332	1	1410	my ( $class, $tokenizer, %args ) = @_;
91	332	50				1109	ref $class and $class = ref $class;
92
93	332	50				1141	unless ( __instance( $tokenizer, 'PPIx::Regexp::Tokenizer' ) ) {
94							my $tokenizer_class = __choose_tokenizer_class(
95							$tokenizer, \%args )
96	0	0				0	or do {
97	0					0	$errstr = 'Data not supported';
98	0					0	return;
99							};
100							$tokenizer = $tokenizer_class->new( $tokenizer, %args )
101	0	0				0	or do {
102	0					0	$errstr = $tokenizer_class->errstr();
103	0					0	return;
104							};
105							}
106
107							my $self = {
108							deferred => [], # Deferred tokens
109							failures => 0,
110							strict => $args{strict},
111	332					1957	tokenizer => $tokenizer,
112							};
113
114	332					763	bless $self, $class;
115	332					993	return $self;
116							}
117
118							sub errstr {
119	0			0	1	0	return $errstr;
120							}
121
122							}
123
124							=head2 errstr
125
126							This method returns the error string from the last attempt to
127							instantiate a C. If the last attempt succeeded, the
128							error will be C.
129
130							=cut
131
132							# Defined above
133
134							=head2 failures
135
136							print $lexer->failures(), " parse failures\n";
137
138							This method returns the number of parse failures encountered. A
139							parse failure is either a tokenization failure (see
140							L<< PPIx::Regexp::Tokenizer->failures()\|PPIx::Regexp::Tokenizer/failures >>)
141							or a structural error.
142
143							=cut
144
145							sub failures {
146	332			332	1	797	my ( $self ) = @_;
147	332					1216	return $self->{failures};
148							}
149
150							=head2 lex
151
152							This method lexes the tokens in the text, and returns the lexed list of
153							elements.
154
155							=cut
156
157							sub lex {
158	332			332	1	930	my ( $self ) = @_;
159
160	332					676	my @content;
161	332					898	$self->{failures} = 0;
162
163							# Accept everything up to the first delimiter.
164	332					667	my $kind; # Initial PPIx::Regexp::Token::Structure
165							{
166	332	100				762	my $token = $self->_get_token()
	668					2574
167							or return $self->_finalize( @content );
168	660	100				3279	$token->isa( 'PPIx::Regexp::Token::Delimiter' ) or do {
169	336	100	100			2402	not $kind
170							and $token->isa( 'PPIx::Regexp::Token::Structure' )
171							and $kind = $token;
172	336					912	push @content, $token;
173	336					645	redo;
174							};
175	324					2341	$self->_unget_token( $token );
176							}
177
178							my ( $part_0_class, $part_1_class ) =
179	324					1545	$self->{tokenizer}->__part_classes();
180
181							# Accept the first delimited structure.
182	324					1547	push @content, ( my $part_0 = $self->_get_delimited(
183							$part_0_class ) );
184
185							# If we are a substitution ...
186	324	100				1405	if ( defined $part_1_class ) {
187
188							# Accept any insignificant stuff.
189	24					144	while ( my $token = $self->_get_token() ) {
190	28	100				161	if ( $token->significant() ) {
191	24					104	$self->_unget_token( $token );
192	24					65	last;
193							} else {
194	4					17	push @content, $token;
195							}
196							}
197
198							# Figure out if we should expect an opening bracket.
199	24		100			185	my $expect_open_bracket = $self->close_bracket(
200							$part_0->start( 0 ) ) \|\| 0;
201
202							# Accept the next delimited structure.
203	24					116	push @content, $self->_get_delimited(
204							$part_1_class,
205							$expect_open_bracket,
206							);
207							}
208
209							# Accept the modifiers (we hope!) plus any trailing white space.
210	324					1310	while ( my $token = $self->_get_token() ) {
211	326					897	push @content, $token;
212							}
213
214							# Let all the elements finalize themselves, recording any additional
215							# errors as they do so.
216	324					2066	$self->_finalize( @content );
217
218							# If we found a regular expression (and we should have done so) ...
219	324	50	33			2349	if ( $part_0 && $part_0->can( 'max_capture_number' ) ) {
220							# TODO the above line is really ugly. I'm wondering about
221							# string implementations like:
222							# * return a $part_0_class of undef (but that complicates the
223							# lexing of the structure itself);
224							# * hang this logic on the tokenizer somehow (where it seems out
225							# of place)
226							# * hang this logic on PPIx::Regexp::Structure::Regexp and
227							# ::Replacement.
228							# I also need to figure out how to make \n backreferences come
229							# out as literals. Maybe that is a job best done by the
230							# tokenizer.
231
232							# Retrieve the maximum capture group.
233	324					1163	my $max_capture = $part_0->max_capture_number();
234
235							# Hashify the known capture names
236							my $capture_name = {
237	324					1381	map { $_ => 1 } $part_0->capture_names(),
	20					96
238							};
239
240							# For all the backreferences found
241	324	100				766	foreach my $elem ( @{ $part_0->find(
	324					931
242							'PPIx::Regexp::Token::Backreference' ) \|\| [] } ) {
243							# Rebless them as needed, recording any errors found.
244							$self->{failures} +=
245	25					128	$elem->__PPIX_LEXER__rebless(
246							capture_name => $capture_name,
247							max_capture => $max_capture,
248							);
249							}
250							}
251
252	324					1583	return @content;
253
254							}
255
256							=head2 strict
257
258							This method returns true or false based on the value of the C<'strict'>
259							argument to C.
260
261							=cut
262
263							sub strict {
264	13			13	1	42	my ( $self ) = @_;
265	13					66	return $self->{strict};
266							}
267
268							# Finalize the content array, updating the parse failures count as we
269							# go.
270							sub _finalize {
271	332			332		1294	my ( $self, @content ) = @_;
272	332					820	foreach my $elem ( @content ) {
273	1014					4408	$self->{failures} += $elem->__PPIX_LEXER__finalize( $self );
274							}
275	332	100				1163	defined wantarray and return @content;
276	324					803	return;
277							}
278
279							{
280
281							my %bracket = (
282							'{' => '}',
283							'(' => ')',
284							'[' => ']',
285							'(?[' => '])',
286							## '<' => '>',
287							);
288
289							my %unclosed = (
290							'{' => '_recover_curly',
291							);
292
293							sub _get_delimited {
294	348			348		1048	my ( $self, $class, $expect_open_bracket ) = @_;
295	348	100				1187	defined $expect_open_bracket or $expect_open_bracket = 1;
296
297	348					597	my @rslt;
298	348					1132	$self->{_rslt} = \@rslt;
299
300	348	100				1045	if ( $expect_open_bracket ) {
301	329	50				837	if ( my $token = $self->_get_token() ) {
302	329					1067	push @rslt, [];
303	329	50				1364	if ( $token->isa( 'PPIx::Regexp::Token::Delimiter' ) ) {
304	329					613	push @{ $rslt[-1] }, '', $token;
	329					1210
305							} else {
306	0					0	push @{ $rslt[-1] }, '', undef;
	0					0
307	0					0	$self->_unget_token( $token );
308							}
309							} else {
310	0					0	return;
311							}
312							} else {
313	19					83	push @rslt, [ '', undef ];
314							}
315
316	348					1046	while ( my $token = $self->_get_token() ) {
317	2296	100				9308	if ( $token->isa( 'PPIx::Regexp::Token::Delimiter' ) ) {
318	348					1616	$self->_unget_token( $token );
319	348					845	last;
320							}
321	1948	100				6716	if ( $token->isa( 'PPIx::Regexp::Token::Structure' ) ) {
322	555					1528	my $content = $token->content();
323
324	555	100	66			2894	if ( my $finish = $bracket{$content} ) {
		100	66
		100
		100
325							# Open bracket
326	276					915	push @rslt, [ $finish, $token ];
327
328							} elsif ( $content eq $rslt[-1][0] ) {
329
330							# Matched close bracket
331	269					1313	$self->_make_node( $token );
332
333							} elsif ( $content ne ')' ) {
334
335							# If the close bracket is not a parenthesis, it becomes
336							# a literal.
337	4					29	TOKEN_LITERAL->__PPIX_ELEM__rebless( $token );
338	4					8	push @{ $rslt[-1] }, $token;
	4					15
339
340							} elsif ( $content eq ')'
341							and @rslt > 1 # Ignore enclosing delimiter
342							and my $recover = $unclosed{$rslt[-1][1]->content()} ) {
343							# If the close bracket is a parenthesis and there is a
344							# recovery procedure, we use it.
345	1					7	$self->$recover( $token );
346
347							} else {
348
349							# Unmatched close with no recovery.
350	5					18	$self->{failures}++;
351	5					67	PPIx::Regexp::Token::Unmatched->
352							__PPIX_ELEM__rebless( $token );
353	5					12	push @{ $rslt[-1] }, $token;
	5					21
354							}
355
356							} else {
357	1393					2366	push @{ $rslt[-1] }, $token;
	1393					3327
358							}
359
360							# We have to hand-roll the Range object.
361	1948	100	100			6299	if ( __instance( $rslt[-1][-2], 'PPIx::Regexp::Token::Operator' )
			100
362							&& $rslt[-1][-2]->content() eq '-'
363							&& $rslt[-1][0] eq ']' # It's a character class
364							) {
365	13					40	my @tokens = splice @{ $rslt[-1] }, -3;
	13					58
366	13					33	push @{ $rslt[-1] },
	13					135
367							PPIx::Regexp::Node::Range->__new( @tokens );
368							}
369							}
370
371	348					1558	while ( @rslt > 1 ) {
372	6	100				37	if ( my $recover = $unclosed{$rslt[-1][1]->content()} ) {
373	5					28	$self->$recover();
374							} else {
375	1					5	$self->{failures}++;
376	1					6	$self->_make_node( undef );
377							}
378							}
379
380	348	50				1074	if ( @rslt == 1 ) {
381	348					622	my @last = @{ pop @rslt };
	348					1109
382	348					764	shift @last;
383	348					915	push @last, $self->_get_token();
384	348					2420	return $class->__new( @last );
385							} else {
386	0					0	confess "Missing data";
387							}
388
389							}
390
391							}
392
393							# $token = $self->_get_token();
394							#
395							# This method returns the next token from the tokenizer.
396
397							sub _get_token {
398	4319			4319		7808	my ( $self ) = @_;
399
400	4319	100				6122	if ( @{ $self->{deferred} } ) {
	4319					10259
401	697					1286	return shift @{ $self->{deferred} };
	697					2267
402							}
403
404	3622	100				9806	my $token = $self->{tokenizer}->next_token() or return;
405
406	3290					10787	return $token;
407							}
408
409							{
410
411							my %handler = (
412							'(' => '_round',
413							'[' => '_square',
414							'{' => '_curly',
415							'(?[' => '_regex_set',
416							);
417
418							sub _make_node {
419	270			270		768	my ( $self, $token ) = @_;
420	270					535	my @args = @{ pop @{ $self->{_rslt} } };
	270					399
	270					914
421	270					641	shift @args;
422	270					565	push @args, $token;
423	270					545	my @node;
424	270	50				920	if ( my $method = $handler{ $args[0]->content() } ) {
425	270					1559	@node = $self->$method( \@args );
426							}
427	270	50				926	@node or @node = PPIx::Regexp::Structure->__new( @args );
428	270					627	push @{ $self->{_rslt}[-1] }, @node;
	270					875
429	270					767	return;
430							}
431
432							}
433
434							# Called as $self->$method( ... ) in _make_node(), above
435							sub _curly { ## no critic (ProhibitUnusedPrivateSubroutines)
436	35			35		130	my ( $self, $args ) = @_;
437
438	35	100	66			279	if ( $args->[-1] && $args->[-1]->is_quantifier() ) {
		50
439
440							# If the tokenizer has marked the right curly as a quantifier,
441							# make the whole thing a quantifier structure.
442	29					80	return PPIx::Regexp::Structure::Quantifier->__new( @{ $args } );
	29					239
443
444							} elsif ( $args->[-1] ) {
445
446							# If there is a right curly but it is not a quantifier,
447							# make both curlys into literals.
448	6					39	foreach my $inx ( 0, -1 ) {
449	12					53	TOKEN_LITERAL->__PPIX_ELEM__rebless( $args->[$inx] );
450							}
451
452							# Try to recover possible quantifiers not recognized because we
453							# thought this was a structure.
454	6					52	$self->_recover_curly_quantifiers( $args );
455
456	6					13	return @{ $args };
	6					28
457
458							} else {
459
460							# If there is no right curly, just make a generic structure
461							# TODO maybe this should be something else?
462	0					0	return PPIx::Regexp::Structure->__new( @{ $args } );
	0					0
463							}
464							}
465
466							# Recover from an unclosed left curly.
467							# Called as $self->$revover( ... ) in _get_delimited, above
468							sub _recover_curly { ## no critic (ProhibitUnusedPrivateSubroutines)
469	6			6		22	my ( $self, $token ) = @_;
470
471							# Get all the stuff we have accumulated for this curly.
472	6					16	my @content = @{ pop @{ $self->{_rslt} } };
	6					11
	6					24
473
474							# Lose the right bracket, which we have already failed to match.
475	6					17	shift @content;
476
477							# Rebless the left curly appropriately
478	6	100	66			94	if ( $self->{_rslt}[0][-1]->isa( 'PPIx::Regexp::Token::Assertion' )
479							&& q<\b> eq $self->{_rslt}[0][-1]->content() ) {
480							# If following \b, it becomes an unknown.
481	1					7	TOKEN_UNKNOWN->__PPIX_ELEM__rebless( $content[0],
482							error => 'Unterminated bound type',
483							);
484							} else {
485							# Rebless the left curly to a literal.
486	5					45	TOKEN_LITERAL->__PPIX_ELEM__rebless( $content[0] );
487							}
488
489							# Try to recover possible quantifiers not recognized because we
490							# thought this was a structure.
491	6					31	$self->_recover_curly_quantifiers( \@content );
492
493							# Shove the curly and its putative contents into whatever structure
494							# we have going.
495							# The checks are to try to trap things like RT 56864, though on
496							# further reflection it turned out that you could get here with an
497							# empty $self->{_rslt} on things like 'm{)}'. This one did not get
498							# made into an RT ticket, but was fixed by not calling the recovery
499							# code if $self->{_rslt} contained only the enclosing delimiters.
500							ARRAY_REF eq ref $self->{_rslt}
501							or confess 'Programming error - $self->{_rslt} not array ref, ',
502	6	50				38	"parsing '", $self->{tokenizer}->content(), "' at ",
503							$token->content();
504	6					34	@{ $self->{_rslt} }
505							or confess 'Programming error - $self->{_rslt} empty, ',
506	6	50				12	"parsing '", $self->{tokenizer}->content(), "' at ",
507							$token->content();
508	6					14	push @{ $self->{_rslt}[-1] }, @content;
	6					26
509
510							# Shove the mismatched delimiter back into the input so we can have
511							# another crack at it.
512	6	100				25	$token and $self->_unget_token( $token );
513
514							# We gone.
515	6					20	return;
516							}
517
518							sub _recover_curly_quantifiers {
519	12			12		34	my ( undef, $args ) = @_; # Invocant unused
520
521	12	100	100			87	if ( __instance( $args->[0], TOKEN_LITERAL )
			66
522							&& __instance( $args->[1], TOKEN_UNKNOWN )
523							&& PPIx::Regexp::Token::Quantifier->could_be_quantifier(
524							$args->[1]->content() )
525							) {
526	2					29	PPIx::Regexp::Token::Quantifier->
527							__PPIX_ELEM__rebless( $args->[1] );
528
529	2	50	33			12	if ( __instance( $args->[2], TOKEN_UNKNOWN )
530							&& PPIx::Regexp::Token::Greediness->could_be_greediness(
531							$args->[2]->content() )
532							) {
533	2					18	PPIx::Regexp::Token::Greediness
534							->__PPIX_ELEM__rebless( $args->[2] );
535							}
536
537							}
538
539	12					36	return;
540							}
541
542							sub _in_regex_set {
543	193			193		468	my ( $self ) = @_;
544	193					324	foreach my $stack_entry ( reverse @{ $self->{_rslt} } ) {
	193					522
545	302	100				864	$stack_entry->[0] eq '])'
546							and return 1;
547							}
548	189					552	return 0;
549							}
550
551							# Called as $self->$method( ... ) in _make_node(), above
552							sub _round { ## no critic (ProhibitUnusedPrivateSubroutines)
553	193			193		526	my ( $self, $args ) = @_;
554
555							# If we're inside a regex set, parens do not capture.
556							$self->_in_regex_set()
557	193	100				1004	and return PPIx::Regexp::Structure->__new( @{ $args } );
	4					36
558
559							# If /n is asserted, parens do not capture.
560							$self->{tokenizer}->modifier( 'n' )
561	189	100				750	and return PPIx::Regexp::Structure->__new( @{ $args } );
	7					49
562
563							# The instantiator will rebless based on the first token if need be.
564	182					591	return PPIx::Regexp::Structure::Capture->__new( @{ $args } );
	182					1254
565							}
566
567							# Called as $self->$method( ... ) in _make_node(), above
568							sub _square { ## no critic (ProhibitUnusedPrivateSubroutines)
569	36			36		134	my ( undef, $args ) = @_; # Invocant unused
570	36					74	return PPIx::Regexp::Structure::CharClass->__new( @{ $args } );
	36					284
571							}
572
573							# Called as $self->$method( ... ) in _make_node(), above
574							sub _regex_set { ## no critic (ProhibitUnusedPrivateSubroutines)
575	6			6		27	my ( undef, $args ) = @_; # Invocant unused
576	6					16	return PPIx::Regexp::Structure::RegexSet->__new( @{ $args } );
	6					70
577							}
578
579							# $self->_unget_token( $token );
580							#
581							# This method caches its argument so that it will be returned by
582							# the next call to C<_get_token()>. If more than one argument is
583							# passed, they will be returned in the order given; that is,
584							# _unget_token/_get_token work like unshift/shift.
585
586							sub _unget_token {
587	697			697		1965	my ( $self, @args ) = @_;
588	697					1116	unshift @{ $self->{deferred} }, @args;
	697					1731
589	697					1542	return $self;
590							}
591
592							1;
593
594							__END__