File Coverage

blib/lib/PPIx/Regexp/Lexer.pm

Criterion	Covered	Total	%
statement	250	264	94.7
branch	64	80	80.0
condition	26	35	74.2
subroutine	44	45	97.7
pod	5	5	100.0
total	389	429	90.6

line	stmt	bran	cond	sub	pod	time	code
1							=head1 NAME
2
3							PPIx::Regexp::Lexer - Assemble tokenizer output.
4
5							=head1 SYNOPSIS
6
7							use PPIx::Regexp::Lexer;
8							use PPIx::Regexp::Dumper;
9							my $lex = PPIx::Regexp::Lexer->new('qr{foo}smx');
10							my $dmp = PPIx::Regexp::Dumper->new( $lex );
11							$dmp->print();
12
13							=head1 INHERITANCE
14
15							C is a
16							L.
17
18							C has no descendants.
19
20							=head1 DESCRIPTION
21
22							This class takes the token stream generated by
23							L and generates the
24							parse tree.
25
26							=head1 METHODS
27
28							This class provides the following public methods. Methods not documented
29							here are private, and unsupported in the sense that the author reserves
30							the right to change or remove them without notice.
31
32							=cut
33
34							package PPIx::Regexp::Lexer;
35
36	9			9		61	use strict;
	9					17
	9					279
37	9			9		70	use warnings;
	9					18
	9					270
38
39	9			9		50	use base qw{ PPIx::Regexp::Support };
	9					17
	9					4178
40
41	9			9		61	use Carp qw{ confess };
	9					16
	9					461
42	9					806	use PPIx::Regexp::Constant qw{
43							ARRAY_REF
44							TOKEN_LITERAL
45							TOKEN_UNKNOWN
46							@CARP_NOT
47	9			9		52	};
	9					15
48	9			9		4014	use PPIx::Regexp::Node::Range ();
	9					22
	9					186
49	9			9		3590	use PPIx::Regexp::Node::Unknown ();
	9					25
	9					188
50	9			9		4146	use PPIx::Regexp::Structure ();
	9					26
	9					193
51	9			9		4059	use PPIx::Regexp::Structure::Assertion ();
	9					25
	9					197
52	9			9		3848	use PPIx::Regexp::Structure::Atomic_Script_Run ();
	9					28
	9					198
53	9			9		3778	use PPIx::Regexp::Structure::BranchReset ();
	9					26
	9					178
54	9			9		3938	use PPIx::Regexp::Structure::Code ();
	9					25
	9					176
55	9			9		3877	use PPIx::Regexp::Structure::Capture ();
	9					139
	9					201
56	9			9		3819	use PPIx::Regexp::Structure::CharClass ();
	9					26
	9					203
57	9			9		4006	use PPIx::Regexp::Structure::Subexpression ();
	9					26
	9					230
58	9			9		4070	use PPIx::Regexp::Structure::Main ();
	9					25
	9					171
59	9			9		3792	use PPIx::Regexp::Structure::Modifier ();
	9					25
	9					174
60	9			9		3873	use PPIx::Regexp::Structure::NamedCapture ();
	9					26
	9					180
61	9			9		3980	use PPIx::Regexp::Structure::Quantifier ();
	9					34
	9					177
62	9			9		3828	use PPIx::Regexp::Structure::Regexp ();
	9					37
	9					176
63	9			9		4040	use PPIx::Regexp::Structure::RegexSet ();
	9					28
	9					184
64	9			9		3999	use PPIx::Regexp::Structure::Replacement ();
	9					25
	9					232
65	9			9		3910	use PPIx::Regexp::Structure::Script_Run ();
	9					23
	9					174
66	9			9		3843	use PPIx::Regexp::Structure::Switch ();
	9					25
	9					173
67	9			9		3808	use PPIx::Regexp::Structure::Unknown ();
	9					22
	9					171
68	9			9		3961	use PPIx::Regexp::Token::Unmatched ();
	9					25
	9					165
69	9			9		5563	use PPIx::Regexp::Tokenizer ();
	9					48
	9					388
70	9			9		69	use PPIx::Regexp::Util qw{ __choose_tokenizer_class __instance };
	9					21
	9					19976
71
72							our $VERSION = '0.087_01';
73
74							=head2 new
75
76							This method instantiates the lexer. It takes as its argument either a
77							L or the text to be
78							parsed. In the latter case the tokenizer is instantiated from the text.
79
80							Any optional name/value pairs after the first argument are passed to the
81							tokenizer, which interprets them or not as the case may be.
82
83							=cut
84
85							{
86
87							my $errstr;
88
89							sub new {
90	332			332	1	1214	my ( $class, $tokenizer, %args ) = @_;
91	332	50				1056	ref $class and $class = ref $class;
92
93	332	50				1223	unless ( __instance( $tokenizer, 'PPIx::Regexp::Tokenizer' ) ) {
94							my $tokenizer_class = __choose_tokenizer_class(
95							$tokenizer, \%args )
96	0	0				0	or do {
97	0					0	$errstr = 'Data not supported';
98	0					0	return;
99							};
100							$tokenizer = $tokenizer_class->new( $tokenizer, %args )
101	0	0				0	or do {
102	0					0	$errstr = $tokenizer_class->errstr();
103	0					0	return;
104							};
105							}
106
107							my $self = {
108							deferred => [], # Deferred tokens
109							failures => 0,
110							strict => $args{strict},
111	332					2297	tokenizer => $tokenizer,
112							};
113
114	332					850	bless $self, $class;
115	332					930	return $self;
116							}
117
118							sub errstr {
119	0			0	1	0	return $errstr;
120							}
121
122							}
123
124							=head2 errstr
125
126							This method returns the error string from the last attempt to
127							instantiate a C. If the last attempt succeeded, the
128							error will be C.
129
130							=cut
131
132							# Defined above
133
134							=head2 failures
135
136							print $lexer->failures(), " parse failures\n";
137
138							This method returns the number of parse failures encountered. A
139							parse failure is either a tokenization failure (see
140							L<< PPIx::Regexp::Tokenizer->failures()\|PPIx::Regexp::Tokenizer/failures >>)
141							or a structural error.
142
143							=cut
144
145							sub failures {
146	332			332	1	966	my ( $self ) = @_;
147	332					1070	return $self->{failures};
148							}
149
150							=head2 lex
151
152							This method lexes the tokens in the text, and returns the lexed list of
153							elements.
154
155							=cut
156
157							sub lex {
158	332			332	1	926	my ( $self ) = @_;
159
160	332					634	my @content;
161	332					843	$self->{failures} = 0;
162
163							# Accept everything up to the first delimiter.
164	332					730	my $kind; # Initial PPIx::Regexp::Token::Structure
165							{
166	332	100				568	my $token = $self->_get_token()
	668					2042
167							or return $self->_finalize( @content );
168	660	100				2972	$token->isa( 'PPIx::Regexp::Token::Delimiter' ) or do {
169	336	100	100			2231	not $kind
170							and $token->isa( 'PPIx::Regexp::Token::Structure' )
171							and $kind = $token;
172	336					972	push @content, $token;
173	336					598	redo;
174							};
175	324					1994	$self->_unget_token( $token );
176							}
177
178							my ( $part_0_class, $part_1_class ) =
179	324					1422	$self->{tokenizer}->__part_classes();
180
181							# Accept the first delimited structure.
182	324					1240	push @content, ( my $part_0 = $self->_get_delimited(
183							$part_0_class ) );
184
185							# If we are a substitution ...
186	324	100				1148	if ( defined $part_1_class ) {
187
188							# Accept any insignificant stuff.
189	24					109	while ( my $token = $self->_get_token() ) {
190	28	100				97	if ( $token->significant() ) {
191	24					104	$self->_unget_token( $token );
192	24					59	last;
193							} else {
194	4					13	push @content, $token;
195							}
196							}
197
198							# Figure out if we should expect an opening bracket.
199	24		100			132	my $expect_open_bracket = $self->close_bracket(
200							$part_0->start( 0 ) ) \|\| 0;
201
202							# Accept the next delimited structure.
203	24					130	push @content, $self->_get_delimited(
204							$part_1_class,
205							$expect_open_bracket,
206							);
207							}
208
209							# Accept the modifiers (we hope!) plus any trailing white space.
210	324					1135	while ( my $token = $self->_get_token() ) {
211	326					1192	push @content, $token;
212							}
213
214							# Let all the elements finalize themselves, recording any additional
215							# errors as they do so.
216	324					2132	$self->_finalize( @content );
217
218							# If we found a regular expression (and we should have done so) ...
219	324	50	33			2455	if ( $part_0 && $part_0->can( 'max_capture_number' ) ) {
220							# TODO the above line is really ugly. I'm wondering about
221							# string implementations like:
222							# * return a $part_0_class of undef (but that complicates the
223							# lexing of the structure itself);
224							# * hang this logic on the tokenizer somehow (where it seems out
225							# of place)
226							# * hang this logic on PPIx::Regexp::Structure::Regexp and
227							# ::Replacement.
228							# I also need to figure out how to make \n backreferences come
229							# out as literals. Maybe that is a job best done by the
230							# tokenizer.
231
232							# Retrieve the maximum capture group.
233	324					1194	my $max_capture = $part_0->max_capture_number();
234
235							# Hashify the known capture names
236							my $capture_name = {
237	324					1255	map { $_ => 1 } $part_0->capture_names(),
	20					99
238							};
239
240							# For all the backreferences found
241	324	100				679	foreach my $elem ( @{ $part_0->find(
	324					840
242							'PPIx::Regexp::Token::Backreference' ) \|\| [] } ) {
243							# Rebless them as needed, recording any errors found.
244							$self->{failures} +=
245	25					148	$elem->__PPIX_LEXER__rebless(
246							capture_name => $capture_name,
247							max_capture => $max_capture,
248							);
249							}
250							}
251
252	324					1593	return @content;
253
254							}
255
256							=head2 strict
257
258							This method returns true or false based on the value of the C<'strict'>
259							argument to C.
260
261							=cut
262
263							sub strict {
264	13			13	1	39	my ( $self ) = @_;
265	13					52	return $self->{strict};
266							}
267
268							# Finalize the content array, updating the parse failures count as we
269							# go.
270							sub _finalize {
271	332			332		1211	my ( $self, @content ) = @_;
272	332					893	foreach my $elem ( @content ) {
273	1014					4025	$self->{failures} += $elem->__PPIX_LEXER__finalize( $self );
274							}
275	332	100				1407	defined wantarray and return @content;
276	324					667	return;
277							}
278
279							{
280
281							my %bracket = (
282							'{' => '}',
283							'(' => ')',
284							'[' => ']',
285							'(?[' => '])',
286							## '<' => '>',
287							);
288
289							my %unclosed = (
290							'{' => '_recover_curly',
291							);
292
293							sub _get_delimited {
294	348			348		942	my ( $self, $class, $expect_open_bracket ) = @_;
295	348	100				1205	defined $expect_open_bracket or $expect_open_bracket = 1;
296
297	348					606	my @rslt;
298	348					874	$self->{_rslt} = \@rslt;
299
300	348	100				884	if ( $expect_open_bracket ) {
301	329	50				780	if ( my $token = $self->_get_token() ) {
302	329					805	push @rslt, [];
303	329	50				1361	if ( $token->isa( 'PPIx::Regexp::Token::Delimiter' ) ) {
304	329					628	push @{ $rslt[-1] }, '', $token;
	329					1083
305							} else {
306	0					0	push @{ $rslt[-1] }, '', undef;
	0					0
307	0					0	$self->_unget_token( $token );
308							}
309							} else {
310	0					0	return;
311							}
312							} else {
313	19					94	push @rslt, [ '', undef ];
314							}
315
316	348					944	while ( my $token = $self->_get_token() ) {
317	2296	100				8740	if ( $token->isa( 'PPIx::Regexp::Token::Delimiter' ) ) {
318	348					1431	$self->_unget_token( $token );
319	348					977	last;
320							}
321	1948	100				6599	if ( $token->isa( 'PPIx::Regexp::Token::Structure' ) ) {
322	555					1620	my $content = $token->content();
323
324	555	100	66			2857	if ( my $finish = $bracket{$content} ) {
		100	66
		100
		100
325							# Open bracket
326	276					730	push @rslt, [ $finish, $token ];
327
328							} elsif ( $content eq $rslt[-1][0] ) {
329
330							# Matched close bracket
331	269					1125	$self->_make_node( $token );
332
333							} elsif ( $content ne ')' ) {
334
335							# If the close bracket is not a parenthesis, it becomes
336							# a literal.
337	4					21	TOKEN_LITERAL->__PPIX_ELEM__rebless( $token );
338	4					10	push @{ $rslt[-1] }, $token;
	4					16
339
340							} elsif ( $content eq ')'
341							and @rslt > 1 # Ignore enclosing delimiter
342							and my $recover = $unclosed{$rslt[-1][1]->content()} ) {
343							# If the close bracket is a parenthesis and there is a
344							# recovery procedure, we use it.
345	1					7	$self->$recover( $token );
346
347							} else {
348
349							# Unmatched close with no recovery.
350	5					18	$self->{failures}++;
351	5					70	PPIx::Regexp::Token::Unmatched->
352							__PPIX_ELEM__rebless( $token );
353	5					14	push @{ $rslt[-1] }, $token;
	5					17
354							}
355
356							} else {
357	1393					2298	push @{ $rslt[-1] }, $token;
	1393					3481
358							}
359
360							# We have to hand-roll the Range object.
361	1948	100	100			5930	if ( __instance( $rslt[-1][-2], 'PPIx::Regexp::Token::Operator' )
			100
362							&& $rslt[-1][-2]->content() eq '-'
363							&& $rslt[-1][0] eq ']' # It's a character class
364							) {
365	13					35	my @tokens = splice @{ $rslt[-1] }, -3;
	13					66
366	13					26	push @{ $rslt[-1] },
	13					131
367							PPIx::Regexp::Node::Range->__new( @tokens );
368							}
369							}
370
371	348					1650	while ( @rslt > 1 ) {
372	6	100				34	if ( my $recover = $unclosed{$rslt[-1][1]->content()} ) {
373	5					32	$self->$recover();
374							} else {
375	1					2	$self->{failures}++;
376	1					5	$self->_make_node( undef );
377							}
378							}
379
380	348	50				958	if ( @rslt == 1 ) {
381	348					636	my @last = @{ pop @rslt };
	348					1011
382	348					821	shift @last;
383	348					886	push @last, $self->_get_token();
384	348					2208	return $class->__new( @last );
385							} else {
386	0					0	confess "Missing data";
387							}
388
389							}
390
391							}
392
393							# $token = $self->_get_token();
394							#
395							# This method returns the next token from the tokenizer.
396
397							sub _get_token {
398	4319			4319		7878	my ( $self ) = @_;
399
400	4319	100				6213	if ( @{ $self->{deferred} } ) {
	4319					9660
401	697					1228	return shift @{ $self->{deferred} };
	697					1948
402							}
403
404	3622	100				9753	my $token = $self->{tokenizer}->next_token() or return;
405
406	3290					10563	return $token;
407							}
408
409							{
410
411							my %handler = (
412							'(' => '_round',
413							'[' => '_square',
414							'{' => '_curly',
415							'(?[' => '_regex_set',
416							);
417
418							sub _make_node {
419	270			270		667	my ( $self, $token ) = @_;
420	270					463	my @args = @{ pop @{ $self->{_rslt} } };
	270					426
	270					929
421	270					638	shift @args;
422	270					569	push @args, $token;
423	270					413	my @node;
424	270	50				823	if ( my $method = $handler{ $args[0]->content() } ) {
425	270					1340	@node = $self->$method( \@args );
426							}
427	270	50				912	@node or @node = PPIx::Regexp::Structure->__new( @args );
428	270					467	push @{ $self->{_rslt}[-1] }, @node;
	270					742
429	270					706	return;
430							}
431
432							}
433
434							# Called as $self->$method( ... ) in _make_node(), above
435							sub _curly { ## no critic (ProhibitUnusedPrivateSubroutines)
436	35			35		105	my ( $self, $args ) = @_;
437
438	35	100	66			215	if ( $args->[-1] && $args->[-1]->is_quantifier() ) {
		50
439
440							# If the tokenizer has marked the right curly as a quantifier,
441							# make the whole thing a quantifier structure.
442	29					100	return PPIx::Regexp::Structure::Quantifier->__new( @{ $args } );
	29					191
443
444							} elsif ( $args->[-1] ) {
445
446							# If there is a right curly but it is not a quantifier,
447							# make both curlys into literals.
448	6					19	foreach my $inx ( 0, -1 ) {
449	12					54	TOKEN_LITERAL->__PPIX_ELEM__rebless( $args->[$inx] );
450							}
451
452							# Try to recover possible quantifiers not recognized because we
453							# thought this was a structure.
454	6					40	$self->_recover_curly_quantifiers( $args );
455
456	6					12	return @{ $args };
	6					26
457
458							} else {
459
460							# If there is no right curly, just make a generic structure
461							# TODO maybe this should be something else?
462	0					0	return PPIx::Regexp::Structure->__new( @{ $args } );
	0					0
463							}
464							}
465
466							# Recover from an unclosed left curly.
467							# Called as $self->$revover( ... ) in _get_delimited, above
468							sub _recover_curly { ## no critic (ProhibitUnusedPrivateSubroutines)
469	6			6		21	my ( $self, $token ) = @_;
470
471							# Get all the stuff we have accumulated for this curly.
472	6					16	my @content = @{ pop @{ $self->{_rslt} } };
	6					14
	6					21
473
474							# Lose the right bracket, which we have already failed to match.
475	6					16	shift @content;
476
477							# Rebless the left curly appropriately
478	6	100	66			56	if ( $self->{_rslt}[0][-1]->isa( 'PPIx::Regexp::Token::Assertion' )
479							&& q<\b> eq $self->{_rslt}[0][-1]->content() ) {
480							# If following \b, it becomes an unknown.
481	1					9	TOKEN_UNKNOWN->__PPIX_ELEM__rebless( $content[0],
482							error => 'Unterminated bound type',
483							);
484							} else {
485							# Rebless the left curly to a literal.
486	5					43	TOKEN_LITERAL->__PPIX_ELEM__rebless( $content[0] );
487							}
488
489							# Try to recover possible quantifiers not recognized because we
490							# thought this was a structure.
491	6					37	$self->_recover_curly_quantifiers( \@content );
492
493							# Shove the curly and its putative contents into whatever structure
494							# we have going.
495							# The checks are to try to trap things like RT 56864, though on
496							# further reflection it turned out that you could get here with an
497							# empty $self->{_rslt} on things like 'm{)}'. This one did not get
498							# made into an RT ticket, but was fixed by not calling the recovery
499							# code if $self->{_rslt} contained only the enclosing delimiters.
500							ARRAY_REF eq ref $self->{_rslt}
501							or confess 'Programming error - $self->{_rslt} not array ref, ',
502	6	50				41	"parsing '", $self->{tokenizer}->content(), "' at ",
503							$token->content();
504	6					22	@{ $self->{_rslt} }
505							or confess 'Programming error - $self->{_rslt} empty, ',
506	6	50				12	"parsing '", $self->{tokenizer}->content(), "' at ",
507							$token->content();
508	6					14	push @{ $self->{_rslt}[-1] }, @content;
	6					33
509
510							# Shove the mismatched delimiter back into the input so we can have
511							# another crack at it.
512	6	100				22	$token and $self->_unget_token( $token );
513
514							# We gone.
515	6					20	return;
516							}
517
518							sub _recover_curly_quantifiers {
519	12			12		36	my ( undef, $args ) = @_; # Invocant unused
520
521	12	100	100			57	if ( __instance( $args->[0], TOKEN_LITERAL )
			66
522							&& __instance( $args->[1], TOKEN_UNKNOWN )
523							&& PPIx::Regexp::Token::Quantifier->could_be_quantifier(
524							$args->[1]->content() )
525							) {
526	2					30	PPIx::Regexp::Token::Quantifier->
527							__PPIX_ELEM__rebless( $args->[1] );
528
529	2	50	33			10	if ( __instance( $args->[2], TOKEN_UNKNOWN )
530							&& PPIx::Regexp::Token::Greediness->could_be_greediness(
531							$args->[2]->content() )
532							) {
533	2					18	PPIx::Regexp::Token::Greediness
534							->__PPIX_ELEM__rebless( $args->[2] );
535							}
536
537							}
538
539	12					43	return;
540							}
541
542							sub _in_regex_set {
543	193			193		398	my ( $self ) = @_;
544	193					283	foreach my $stack_entry ( reverse @{ $self->{_rslt} } ) {
	193					472
545	302	100				852	$stack_entry->[0] eq '])'
546							and return 1;
547							}
548	189					548	return 0;
549							}
550
551							# Called as $self->$method( ... ) in _make_node(), above
552							sub _round { ## no critic (ProhibitUnusedPrivateSubroutines)
553	193			193		425	my ( $self, $args ) = @_;
554
555							# If we're inside a regex set, parens do not capture.
556							$self->_in_regex_set()
557	193	100				649	and return PPIx::Regexp::Structure->__new( @{ $args } );
	4					27
558
559							# If /n is asserted, parens do not capture.
560							$self->{tokenizer}->modifier( 'n' )
561	189	100				634	and return PPIx::Regexp::Structure->__new( @{ $args } );
	7					47
562
563							# The instantiator will rebless based on the first token if need be.
564	182					430	return PPIx::Regexp::Structure::Capture->__new( @{ $args } );
	182					1417
565							}
566
567							# Called as $self->$method( ... ) in _make_node(), above
568							sub _square { ## no critic (ProhibitUnusedPrivateSubroutines)
569	36			36		121	my ( undef, $args ) = @_; # Invocant unused
570	36					84	return PPIx::Regexp::Structure::CharClass->__new( @{ $args } );
	36					286
571							}
572
573							# Called as $self->$method( ... ) in _make_node(), above
574							sub _regex_set { ## no critic (ProhibitUnusedPrivateSubroutines)
575	6			6		27	my ( undef, $args ) = @_; # Invocant unused
576	6					18	return PPIx::Regexp::Structure::RegexSet->__new( @{ $args } );
	6					63
577							}
578
579							# $self->_unget_token( $token );
580							#
581							# This method caches its argument so that it will be returned by
582							# the next call to C<_get_token()>. If more than one argument is
583							# passed, they will be returned in the order given; that is,
584							# _unget_token/_get_token work like unshift/shift.
585
586							sub _unget_token {
587	697			697		1846	my ( $self, @args ) = @_;
588	697					1054	unshift @{ $self->{deferred} }, @args;
	697					1702
589	697					1331	return $self;
590							}
591
592							1;
593
594							__END__