File Coverage

blib/lib/Parser/MGC.pm

Criterion	Covered	Total	%
statement	279	282	98.9
branch	92	104	88.4
condition	20	23	86.9
subroutine	56	58	96.5
pod	32	34	94.1
total	479	501	95.6

line	stmt	bran	cond	sub	pod	time	code
1							# You may distribute under the terms of either the GNU General Public License
2							# or the Artistic License (the same terms as Perl itself)
3							#
4							# (C) Paul Evans, 2010-2022 -- leonerd@leonerd.org.uk
5
6							package Parser::MGC 0.20;
7
8	31			31		1418337	use v5.14;
	31					1556
9	31			31		141	use warnings;
	31					44
	31					741
10
11	31			31		127	use Carp;
	31					48
	31					1633
12	31			31		12849	use Feature::Compat::Try;
	31					8602
	31					106
13
14	31			31		71230	use Scalar::Util qw( blessed );
	31					61
	31					3009
15
16							=head1 NAME
17
18							C - build simple recursive-descent parsers
19
20							=head1 SYNOPSIS
21
22							package My::Grammar::Parser;
23							use base qw( Parser::MGC );
24
25							sub parse
26							{
27							my $self = shift;
28
29							$self->sequence_of( sub {
30							$self->any_of(
31							sub { $self->token_int },
32							sub { $self->token_string },
33							sub { \$self->token_ident },
34							sub { $self->scope_of( "(", \&parse, ")" ) }
35							);
36							} );
37							}
38
39							my $parser = My::Grammar::Parser->new;
40
41							my $tree = $parser->from_file( $ARGV[0] );
42
43							...
44
45							=head1 DESCRIPTION
46
47							This base class provides a low-level framework for building recursive-descent
48							parsers that consume a given input string from left to right, returning a
49							parse structure. It takes its name from the C regexps used to implement
50							the token parsing behaviour.
51
52							It provides a number of token-parsing methods, which each extract a
53							grammatical token from the string. It also provides wrapping methods that can
54							be used to build up a possibly-recursive grammar structure, by applying a
55							structure around other parts of parsing code.
56
57							=head2 Backtracking
58
59							Each method, both token and structural, atomically either consumes a prefix of
60							the string and returns its result, or fails and consumes nothing. This makes
61							it simple to implement grammars that require backtracking.
62
63							Several structure-forming methods have some form of "optional" behaviour; they
64							can optionally consume some amount of input or take some particular choice,
65							but if the code invoked inside that subsequently fails, the structure can
66							backtrack and take some different behaviour. This is usually what is required
67							when testing whether the structure of the input string matches some part of
68							the grammar that is optional, or has multiple choices.
69
70							However, once the choice of grammar has been made, it is often useful to be
71							able to fix on that one choice, thus making subsequent failures propagate up
72							rather than taking that alternative behaviour. Control of this backtracking
73							is given by the C method; and careful use of this method is one of the
74							key advantages that C has over more simple parsing using single
75							regexps alone.
76
77							=cut
78
79							=head1 CONSTRUCTOR
80
81							=cut
82
83							=head2 new
84
85							$parser = Parser::MGC->new( %args )
86
87							Returns a new instance of a C object. This must be called on a
88							subclass that provides method of the name provided as C, by default
89							called C.
90
91							Takes the following named arguments
92
93							=over 8
94
95							=item toplevel => STRING
96
97							Name of the toplevel method to use to start the parse from. If not supplied,
98							will try to use a method called C.
99
100							=item patterns => HASH
101
102							Keys in this hash should map to quoted regexp (C) references, to
103							override the default patterns used to match tokens. See C below
104
105							=item accept_0o_oct => BOOL
106
107							If true, the C method will also accept integers with a C<0o> prefix
108							as octal.
109
110							=back
111
112							=cut
113
114							=head1 PATTERNS
115
116							The following pattern names are recognised. They may be passed to the
117							constructor in the C hash, or provided as a class method under the
118							name C>.
119
120							=over 4
121
122							=item * ws
123
124							Pattern used to skip whitespace between tokens. Defaults to C
125
126							=item * comment
127
128							Pattern used to skip comments between tokens. Undefined by default.
129
130							=item * int
131
132							Pattern used to parse an integer by C. Defaults to
133							C. If C is given, then
134							this will be expanded to match C as well.
135
136							=item * float
137
138							Pattern used to parse a floating-point number by C. Defaults to
139							C.
140
141							=item * ident
142
143							Pattern used to parse an identifier by C. Defaults to
144							C
145
146							=item * string_delim
147
148							Pattern used to delimit a string by C. Defaults to C.
149
150							=back
151
152							=cut
153
154							my @patterns = qw(
155							ws
156							comment
157							int
158							float
159							ident
160							string_delim
161							);
162
163	31			31		177	use constant pattern_ws => qr/[\s\n\t]+/;
	31					51
	31					1635
164	31			31		146	use constant pattern_comment => undef;
	31					56
	31					2365
165	31			31		178	use constant pattern_int => qr/-?(?:0x[[:xdigit:]]+\|[[:digit:]]+)/;
	31					58
	31					2394
166	31			31		178	use constant pattern_float => qr/-?(?:\d*\.\d+\|\d+\.)(?:e-?\d+)?\|-?\d+e-?\d+/i;
	31					53
	31					3078
167	31			31		504	use constant pattern_ident => qr/[[:alpha:]_]\w*/;
	31					108
	31					1685
168	31			31		151	use constant pattern_string_delim => qr/["']/;
	31					51
	31					113440
169
170							sub new
171							{
172	46			46	1	2555	my $class = shift;
173	46					114	my %args = @_;
174
175	46		100			231	my $toplevel = $args{toplevel} \|\| "parse";
176
177	46	50				299	$class->can( $toplevel ) or
178							croak "Expected to be a subclass that can ->$toplevel";
179
180	46					204	my $self = bless {
181							toplevel => $toplevel,
182							patterns => {},
183							scope_level => 0,
184							}, $class;
185
186	46		100			241	$self->{patterns}{$_} = $args{patterns}{$_} \|\| $self->${\"pattern_$_"} for @patterns;
187
188	46	100				141	if( $args{accept_0o_oct} ) {
189	1					30	$self->{patterns}{int} = qr/0o[0-7]+\|$self->{patterns}{int}/;
190							}
191
192	46	100				142	if( defined $self->{patterns}{comment} ) {
193	1					23	$self->{patterns}{_skip} = qr/$self->{patterns}{ws}\|$self->{patterns}{comment}/;
194							}
195							else {
196	45					101	$self->{patterns}{_skip} = $self->{patterns}{ws};
197							}
198
199	46					253	return $self;
200							}
201
202							=head1 METHODS
203
204							=cut
205
206							=head2 from_string
207
208							$result = $parser->from_string( $str )
209
210							Parse the given literal string and return the result from the toplevel method.
211
212							=cut
213
214							sub from_string
215							{
216	161			161	1	33770	my $self = shift;
217	161					287	my ( $str ) = @_;
218
219	161					286	$self->{str} = $str;
220
221	161					372	pos $self->{str} = 0;
222
223	161					282	my $toplevel = $self->{toplevel};
224	161					475	my $result = $self->$toplevel;
225
226	143	100				434	$self->at_eos or
227							$self->fail( "Expected end of input" );
228
229	138					602	return $result;
230							}
231
232							=head2 from_file
233
234							$result = $parser->from_file( $file, %opts )
235
236							Parse the given file, which may be a pathname in a string, or an opened IO
237							handle, and return the result from the toplevel method.
238
239							The following options are recognised:
240
241							=over 8
242
243							=item binmode => STRING
244
245							If set, applies the given binmode to the filehandle before reading. Typically
246							this can be used to set the encoding of the file.
247
248							$parser->from_file( $file, binmode => ":encoding(UTF-8)" )
249
250							=back
251
252							=cut
253
254							sub from_file
255							{
256	3			3	1	1350	my $self = shift;
257	3					8	my ( $file, %opts ) = @_;
258
259	3	50				10	defined $file or croak "Expected a filename to ->from_file";
260
261	3					6	$self->{filename} = $file;
262
263	3					5	my $fh;
264	3	100				10	if( ref $file ) {
265	2					4	$fh = $file;
266							}
267							else {
268	1	50				35	open $fh, "<", $file or die "Cannot open $file for reading - $!";
269							}
270
271	3	50				11	binmode $fh, $opts{binmode} if $opts{binmode};
272
273	3					6	$self->from_string( do { local $/; <$fh>; } );
	3					12
	3					106
274							}
275
276							=head2 filename
277
278							$filename = $parser->filename
279
280							I
281
282							Returns the name of the file currently being parsed, if invoked from within
283							L.
284
285							=cut
286
287							sub filename
288							{
289	43			43	1	68	my $self = shift;
290	43					101	return $self->{filename};
291							}
292
293							=head2 from_reader
294
295							$result = $parser->from_reader( \&reader )
296
297							I
298
299							Parse the input which is read by the C function. This function will be
300							called in scalar context to generate portions of string to parse, being passed
301							the C<$parser> object. The function should return C when it has no more
302							string to return.
303
304							$reader->( $parser )
305
306							Note that because it is not generally possible to detect exactly when more
307							input may be required due to failed regexp parsing, the reader function is
308							only invoked during searching for skippable whitespace. This makes it suitable
309							for reading lines of a file in the common case where lines are considered as
310							skippable whitespace, or for reading lines of input interactively from a
311							user. It cannot be used in all cases (for example, reading fixed-size buffers
312							from a file) because two successive invocations may split a single token
313							across the buffer boundaries, and cause parse failures.
314
315							=cut
316
317							sub from_reader
318							{
319	1			1	1	13	my $self = shift;
320	1					2	my ( $reader ) = @_;
321
322	1					2	local $self->{reader} = $reader;
323
324	1					2	$self->{str} = "";
325	1					3	pos $self->{str} = 0;
326
327	1					4	my $result = $self->parse;
328
329	1	50				2	$self->at_eos or
330							$self->fail( "Expected end of input" );
331
332	1					9	return $result;
333							}
334
335							=head2 pos
336
337							$pos = $parser->pos
338
339							I
340
341							Returns the current parse position, as a character offset from the beginning
342							of the file or string.
343
344							=cut
345
346							sub pos
347							{
348	279			279	1	358	my $self = shift;
349	279					600	return pos $self->{str};
350							}
351
352							=head2 take
353
354							$str = $parser->take( $len )
355
356							I
357
358							Returns the next C<$len> characters directly from the input, prior to any
359							whitespace or comment skipping. This does I take account of any
360							end-of-scope marker that may be pending. It is intended for use by parsers of
361							partially-binary protocols, or other situations in which it would be incorrect
362							for the end-of-scope marker to take effect at this time.
363
364							=cut
365
366							sub take
367							{
368	46			46	1	64	my $self = shift;
369	46					66	my ( $len ) = @_;
370
371	46					65	my $start = pos( $self->{str} );
372
373	46					90	pos( $self->{str} ) += $len;
374
375	46					218	return substr( $self->{str}, $start, $len );
376							}
377
378							=head2 where
379
380							( $lineno, $col, $text ) = $parser->where
381
382							Returns the current parse position, as a line and column number, and
383							the entire current line of text. The first line is numbered 1, and the first
384							column is numbered 0.
385
386							=cut
387
388							sub where
389							{
390	49			49	1	79	my $self = shift;
391	49					89	my ( $pos ) = @_;
392
393	49	100				127	defined $pos or $pos = pos $self->{str};
394
395	49					83	my $str = $self->{str};
396
397	49					62	my $sol = $pos;
398	49	100	100			217	$sol-- if $sol > 0 and substr( $str, $sol, 1 ) =~ m/^[\r\n]$/;
399	49		100			299	$sol-- while $sol > 0 and substr( $str, $sol-1, 1 ) !~ m/^[\r\n]$/;
400
401	49					83	my $eol = $pos;
402	49		100			477	$eol++ while $eol < length($str) and substr( $str, $eol, 1 ) !~ m/^[\r\n]$/;
403
404	49					117	my $line = substr( $str, $sol, $eol - $sol );
405
406	49					105	my $col = $pos - $sol;
407	49					146	my $lineno = ( () = substr( $str, 0, $pos ) =~ m/\n/g ) + 1;
408
409	49					164	return ( $lineno, $col, $line );
410							}
411
412							=head2 fail
413
414							=head2 fail_from
415
416							$parser->fail( $message )
417
418							$parser->fail_from( $pos, $message )
419
420							I since version 0.09.>
421
422							Aborts the current parse attempt with the given message string. The failure
423							message will include the line and column position, and the line of input that
424							failed at the current parse position (C), or a position earlier obtained
425							using the C method (C).
426
427							This failure will propagate up to the inner-most structure parsing method that
428							has not been committed; or will cause the entire parser to fail if there are
429							no further options to take.
430
431							=cut
432
433							sub fail
434							{
435	272			272	1	346	my $self = shift;
436	272					377	my ( $message ) = @_;
437	272					484	$self->fail_from( $self->pos, $message );
438							}
439
440							sub fail_from
441							{
442	272			272	1	311	my $self = shift;
443	272					374	my ( $pos, $message ) = @_;
444	272					527	die Parser::MGC::Failure->new( $message, $self, $pos );
445							}
446
447							# On perl 5.32 onwards we can use the nicer `isa` infix operator
448							# Problem is it won't even parse correctly on older perls so we'll have to go
449							# the long way around
450							*_isa_failure = ( $^V ge v5.32 )
451							? do { eval 'use experimental "isa"; sub { $_[0] isa Parser::MGC::Failure }' // die $@ }
452							: do { require Scalar::Util;
453	253	100		253		2186	sub { Scalar::Util::blessed($_[0]) and $_[0]->isa( "Parser::MGC::Failure" ) } };
454
455							=head2 die
456
457							=head2 die_from
458
459							$parser->die( $message )
460
461							$parser->die_from( $pos, $message )
462
463							I
464
465							Throws an exception that propagates as normal for C, entirely out of the
466							entire parser and to the caller of the toplevel C method that invoked
467							it, bypassing all of the back-tracking logic.
468
469							This is much like using core's C directly, except that the message string
470							will include the line and column position, and the line of input that the
471							parser was working on, as it does in the L method.
472
473							This method is intended for reporting fatal errors where the parsed input was
474							correctly recognised at a grammar level, but is requesting something that
475							cannot be fulfilled semantically.
476
477							=cut
478
479							sub die :method
480							{
481	1			1	1	6	my $self = shift;
482	1					1	my ( $message ) = @_;
483	1					3	$self->die_from( $self->pos, $message );
484							}
485
486							sub die_from
487							{
488	1			1	1	2	my $self = shift;
489	1					1	my ( $pos, $message ) = @_;
490							# Convenient just to use the ->STRING method of a Failure object but don't
491							# throw it directly
492	1					3	die Parser::MGC::Failure->new( $message, $self, $pos )->STRING;
493							}
494
495							=head2 at_eos
496
497							$eos = $parser->at_eos
498
499							Returns true if the input string is at the end of the string.
500
501							=cut
502
503							sub at_eos
504							{
505	596			596	1	743	my $self = shift;
506
507							# Save pos() before skipping ws so we don't break the substring_before method
508	596					797	my $pos = pos $self->{str};
509
510	596					1050	$self->skip_ws;
511
512	596					632	my $at_eos;
513	596	100				1186	if( pos( $self->{str} ) >= length $self->{str} ) {
		100
514	162					200	$at_eos = 1;
515							}
516							elsif( defined $self->{endofscope} ) {
517	114					396	$at_eos = $self->{str} =~ m/\G$self->{endofscope}/;
518							}
519							else {
520	320					357	$at_eos = 0;
521							}
522
523	596					1022	pos( $self->{str} ) = $pos;
524
525	596					1326	return $at_eos;
526							}
527
528							=head2 scope_level
529
530							$level = $parser->scope_level
531
532							I
533
534							Returns the number of nested C calls that have been made.
535
536							=cut
537
538							sub scope_level
539							{
540	5			5	1	7	my $self = shift;
541	5					34	return $self->{scope_level};
542							}
543
544							=head1 STRUCTURE-FORMING METHODS
545
546							The following methods may be used to build a grammatical structure out of the
547							defined basic token-parsing methods. Each takes at least one code reference,
548							which will be passed the actual C<$parser> object as its first argument.
549
550							Anywhere that a code reference is expected also permits a plain string giving
551							the name of a method to invoke. This is sufficient in many simple cases, such
552							as
553
554							$self->any_of(
555							'token_int',
556							'token_string',
557							...
558							);
559
560							=cut
561
562							=head2 maybe
563
564							$ret = $parser->maybe( $code )
565
566							Attempts to execute the given C<$code> in scalar context, and returns what it
567							returned, accepting that it might fail. C<$code> may either be a CODE
568							reference or a method name given as a string.
569
570							If the code fails (either by calling C itself, or by propagating a
571							failure from another method it invoked) before it has invoked C, then
572							none of the input string will be consumed; the current parsing position will
573							be restored. C will be returned in this case.
574
575							If it calls C then any subsequent failure will be propagated to the
576							caller, rather than returning C.
577
578							This may be considered to be similar to the C regexp qualifier.
579
580							sub parse_declaration
581							{
582							my $self = shift;
583
584							[ $self->parse_type,
585							$self->token_ident,
586							$self->maybe( sub {
587							$self->expect( "=" );
588							$self->parse_expression
589							} ),
590							];
591							}
592
593							=cut
594
595							sub maybe
596							{
597	4			4	1	26	my $self = shift;
598	4					7	my ( $code ) = @_;
599
600	4					5	my $pos = pos $self->{str};
601
602	4					6	my $committed = 0;
603	4			0		10	local $self->{committer} = sub { $committed++ };
	0					0
604
605							try {
606							return $self->$code;
607							}
608	4					10	catch ( $e ) {
609							pos($self->{str}) = $pos;
610
611							die $e if $committed or not _isa_failure( $e );
612							return undef;
613							}
614							}
615
616							=head2 scope_of
617
618							$ret = $parser->scope_of( $start, $code, $stop )
619
620							Expects to find the C<$start> pattern, then attempts to execute the given
621							C<$code>, then expects to find the C<$stop> pattern. Returns whatever the
622							code returned. C<$code> may either be a CODE reference of a method name given
623							as a string.
624
625							While the code is being executed, the C<$stop> pattern will be used by the
626							token parsing methods as an end-of-scope marker; causing them to raise a
627							failure if called at the end of a scope.
628
629							sub parse_block
630							{
631							my $self = shift;
632
633							$self->scope_of( "{", 'parse_statements', "}" );
634							}
635
636							If the C<$start> pattern is undefined, it is presumed the caller has already
637							checked for this. This is useful when the stop pattern needs to be calculated
638							based on the start pattern.
639
640							sub parse_bracketed
641							{
642							my $self = shift;
643
644							my $delim = $self->expect( qr/[\(\[\<\{]/ );
645							$delim =~ tr/([<{/)]>}/;
646
647							$self->scope_of( undef, 'parse_body', $delim );
648							}
649
650							This method does not have any optional parts to it; any failures are
651							immediately propagated to the caller.
652
653							=cut
654
655							sub scope_of
656							{
657	19			19	1	93	my $self = shift;
658	19					53	$self->_scope_of( 0, @_ );
659							}
660
661							sub _scope_of
662							{
663	73			73		79	my $self = shift;
664	73					123	my ( $commit_if_started, $start, $code, $stop ) = @_;
665
666	73	50				426	ref $stop or $stop = qr/\Q$stop/;
667
668	73	100				249	$self->expect( $start ) if defined $start;
669
670	34	100				108	$self->commit if $commit_if_started;
671
672	34					66	local $self->{endofscope} = $stop;
673	34					69	local $self->{scope_level} = $self->{scope_level} + 1;
674
675	34					89	my $ret = $self->$code;
676
677	31					87	$self->expect( $stop );
678
679	30					156	return $ret;
680							}
681
682							=head2 committed_scope_of
683
684							$ret = $parser->committed_scope_of( $start, $code, $stop )
685
686							I
687
688							A variant of L that calls L after a successful match of
689							the start pattern. This is usually what you want if using C from
690							within an C choice, if no other alternative following this one could
691							possibly match if the start pattern has.
692
693							=cut
694
695							sub committed_scope_of
696							{
697	54			54	1	70	my $self = shift;
698	54					99	$self->_scope_of( 1, @_ );
699							}
700
701							=head2 list_of
702
703							$ret = $parser->list_of( $sep, $code )
704
705							Expects to find a list of instances of something parsed by C<$code>,
706							separated by the C<$sep> pattern. Returns an ARRAY ref containing a list of
707							the return values from the C<$code>. A single trailing delimiter is allowed,
708							and does not affect the return value. C<$code> may either be a CODE reference
709							or a method name given as a string. It is called in list context, and whatever
710							values it returns are appended to the eventual result - similar to perl's
711							C.
712
713							This method does not consider it an error if the returned list is empty; that
714							is, that the scope ended before any item instances were parsed from it.
715
716							sub parse_numbers
717							{
718							my $self = shift;
719
720							$self->list_of( ",", 'token_int' );
721							}
722
723							If the code fails (either by invoking C itself, or by propagating a
724							failure from another method it invoked) before it has invoked C on a
725							particular item, then the item is aborted and the parsing position will be
726							restored to the beginning of that failed item. The list of results from
727							previous successful attempts will be returned.
728
729							If it calls C within an item then any subsequent failure for that item
730							will cause the entire C to fail, propagating that to the caller.
731
732							=cut
733
734							sub list_of
735							{
736	72			72	1	128	my $self = shift;
737	72					108	my ( $sep, $code ) = @_;
738
739	72	100	33			207	ref $sep or $sep = qr/\Q$sep/ if defined $sep;
740
741	72					81	my $committed;
742	72			14		226	local $self->{committer} = sub { $committed++ };
	14					22
743
744	72					99	my @ret;
745
746	72					140	while( !$self->at_eos ) {
747	127					176	$committed = 0;
748	127					154	my $pos = pos $self->{str};
749
750							try {
751							push @ret, $self->$code;
752							next;
753							}
754							catch ( $e ) {
755							pos($self->{str}) = $pos;
756							die $e if $committed or not _isa_failure( $e );
757
758							last;
759							}
760	127					212	}
761							continue {
762	103	100				224	if( defined $sep ) {
763	32					58	$self->skip_ws;
764	32	100				145	$self->{str} =~ m/\G$sep/gc or last;
765							}
766							}
767
768	71					247	return \@ret;
769							}
770
771							=head2 sequence_of
772
773							$ret = $parser->sequence_of( $code )
774
775							A shortcut for calling C with an empty string as separator; expects
776							to find at least one instance of something parsed by C<$code>, separated only
777							by skipped whitespace.
778
779							This may be considered to be similar to the C<+> or C<*> regexp qualifiers.
780
781							sub parse_statements
782							{
783							my $self = shift;
784
785							$self->sequence_of( 'parse_statement' );
786							}
787
788							The interaction of failures in the code and the C method is identical
789							to that of C.
790
791							=cut
792
793							sub sequence_of
794							{
795	56			56	1	169	my $self = shift;
796	56					91	my ( $code ) = @_;
797
798	56					125	$self->list_of( undef, $code );
799							}
800
801							=head2 any_of
802
803							$ret = $parser->any_of( @codes )
804
805							I
806
807							Expects that one of the given code instances can parse something from the
808							input, returning what it returned. Each code instance may indicate a failure
809							to parse by calling the C method or otherwise propagating a failure.
810							Each code instance may either be a CODE reference or a method name given as a
811							string.
812
813							This may be considered to be similar to the C<\|> regexp operator for forming
814							alternations of possible parse trees.
815
816							sub parse_statement
817							{
818							my $self = shift;
819
820							$self->any_of(
821							sub { $self->parse_declaration; $self->expect(";") },
822							sub { $self->parse_expression; $self->expect(";") },
823							sub { $self->parse_block },
824							);
825							}
826
827							If the code for a given choice fails (either by invoking C itself, or by
828							propagating a failure from another method it invoked) before it has invoked
829							C itself, then the parsing position restored and the next choice will
830							be attempted.
831
832							If it calls C then any subsequent failure for that choice will cause
833							the entire C to fail, propagating that to the caller and no further
834							choices will be attempted.
835
836							If none of the choices match then a simple failure message is printed:
837
838							Found nothing parseable
839
840							As this is unlikely to be helpful to users, a better message can be provided
841							by the final choice instead. Don't forget to C before printing the
842							failure message, or it won't count.
843
844							$self->any_of(
845							'token_int',
846							'token_string',
847							...,
848
849							sub { $self->commit; $self->fail( "Expected an int or string" ) }
850							);
851
852							=cut
853
854							sub any_of
855							{
856	218			218	1	379	my $self = shift;
857
858	218					399	while( @_ ) {
859	433					536	my $code = shift;
860	433					508	my $pos = pos $self->{str};
861
862	433					457	my $committed = 0;
863	433			55		1091	local $self->{committer} = sub { $committed++ };
	55					80
864
865							try {
866							return $self->$code;
867							}
868	433					741	catch ( $e ) {
869							pos( $self->{str} ) = $pos;
870
871							die $e if $committed or not _isa_failure( $e );
872							}
873							}
874
875	12					26	$self->fail( "Found nothing parseable" );
876							}
877
878							sub one_of {
879	0			0	0	0	croak "Parser::MGC->one_of is deprecated; use ->any_of instead";
880							}
881
882							=head2 commit
883
884							$parser->commit
885
886							Calling this method will cancel the backtracking behaviour of the innermost
887							C, C, C, or C structure forming method.
888							That is, if later code then calls C, the exception will be propagated
889							out of C, no further list items will be attempted by C or
890							C, and no further code blocks will be attempted by C.
891
892							Typically this will be called once the grammatical structure alter has been
893							determined, ensuring that any further failures are raised as real exceptions,
894							rather than by attempting other alternatives.
895
896							sub parse_statement
897							{
898							my $self = shift;
899
900							$self->any_of(
901							...
902							sub {
903							$self->scope_of( "{",
904							sub { $self->commit; $self->parse_statements; },
905							"}" ),
906							},
907							);
908							}
909
910							Though in this common pattern, L may be used instead.
911
912							=cut
913
914							sub commit
915							{
916	69			69	1	101	my $self = shift;
917	69	50				117	if( $self->{committer} ) {
918	69					108	$self->{committer}->();
919							}
920							else {
921	0					0	croak "Cannot commit except within a backtrack-able structure";
922							}
923							}
924
925							=head1 TOKEN PARSING METHODS
926
927							The following methods attempt to consume some part of the input string, to be
928							used as part of the parsing process.
929
930							=cut
931
932							sub skip_ws
933							{
934	1283			1283	0	1368	my $self = shift;
935
936	1283					1476	my $pattern = $self->{patterns}{_skip};
937
938							{
939	1283					1283	1 while $self->{str} =~ m/\G$pattern/gc;
	1286					5533
940
941	1286	100				2800	return if pos( $self->{str} ) < length $self->{str};
942
943	222	100				483	return unless $self->{reader};
944
945	4					9	my $more = $self->{reader}->( $self );
946	4	100				12	if( defined $more ) {
947	3					5	my $pos = pos( $self->{str} );
948	3					7	$self->{str} .= $more;
949	3					4	pos( $self->{str} ) = $pos;
950
951	3					7	redo;
952							}
953
954	1					3	undef $self->{reader};
955	1					1	return;
956							}
957							}
958
959							=head2 expect
960
961							$str = $parser->expect( $literal )
962
963							$str = $parser->expect( qr/pattern/ )
964
965							@groups = $parser->expect( qr/pattern/ )
966
967							Expects to find a literal string or regexp pattern match, and consumes it.
968							In scalar context, this method returns the string that was captured. In list
969							context it returns the matching substring and the contents of any subgroups
970							contained in the pattern.
971
972							This method will raise a parse error (by calling C) if the regexp fails
973							to match. Note that if the pattern could match an empty string (such as for
974							example C), the pattern will always match, even if it has to match an
975							empty string. This method will not consider a failure if the regexp matches
976							with zero-width.
977
978							=head2 maybe_expect
979
980							$str = $parser->maybe_expect( ... )
981
982							@groups = $parser->maybe_expect( ... )
983
984							I
985
986							A convenient shortcut equivalent to calling C within C, but
987							implemented more efficiently, avoiding the exception-handling set up by
988							C. Returns C or an empty list if the match fails.
989
990							=cut
991
992							sub maybe_expect
993							{
994	371			371	1	461	my $self = shift;
995	371					482	my ( $expect ) = @_;
996
997	371	100				624	ref $expect or $expect = qr/\Q$expect/;
998
999	371					637	$self->skip_ws;
1000	371	100				3512	$self->{str} =~ m/\G$expect/gc or return;
1001
1002	210	100				1098	return substr( $self->{str}, $-[0], $+[0]-$-[0] ) if !wantarray;
1003	22	100				56	return map { defined $-[$_] ? substr( $self->{str}, $-[$_], $+[$_]-$-[$_] ) : undef } 0 .. $#+;
	37					226
1004							}
1005
1006							sub expect
1007							{
1008	355			355	1	544	my $self = shift;
1009	355					507	my ( $expect ) = @_;
1010
1011	355	100				2156	ref $expect or $expect = qr/\Q$expect/;
1012
1013	355	100				657	if( wantarray ) {
1014	31	100				64	my @ret = $self->maybe_expect( $expect ) or
1015							$self->fail( "Expected $expect" );
1016	20					80	return @ret;
1017							}
1018							else {
1019	324	100				553	defined( my $ret = $self->maybe_expect( $expect ) ) or
1020							$self->fail( "Expected $expect" );
1021	185					502	return $ret;
1022							}
1023							}
1024
1025							=head2 substring_before
1026
1027							$str = $parser->substring_before( $literal )
1028
1029							$str = $parser->substring_before( qr/pattern/ )
1030
1031							I
1032
1033							Expects to possibly find a literal string or regexp pattern match. If it finds
1034							such, consume all the input text before but excluding this match, and return
1035							it. If it fails to find a match before the end of the current scope, consumes
1036							all the input text until the end of scope and return it.
1037
1038							This method does not consume the part of input that matches, only the text
1039							before it. It is not considered a failure if the substring before this match
1040							is empty. If a non-empty match is required, use the C method:
1041
1042							sub token_nonempty_part
1043							{
1044							my $self = shift;
1045
1046							my $str = $parser->substring_before( "," );
1047							length $str or $self->fail( "Expected a string fragment before ," );
1048
1049							return $str;
1050							}
1051
1052							Note that unlike most of the other token parsing methods, this method does not
1053							consume either leading or trailing whitespace around the substring. It is
1054							expected that this method would be used as part a parser to read quoted
1055							strings, or similar cases where whitespace should be preserved.
1056
1057							=head2 nonempty_substring_before
1058
1059							$str = $parser->nonempty_substring_before( $literal )
1060
1061							$str = $parser->nonempty_substring_before( qr/pattern/ )
1062
1063							I
1064
1065							A variant of L which fails if the matched part is empty.
1066
1067							The example above could have been written:
1068
1069							sub token_nonempty_part
1070							{
1071							my $self = shift;
1072
1073							return $parser->nonempty_substring_before( "," );
1074							}
1075
1076							This is often useful for breaking out of repeating loops; e.g.
1077
1078							sub token_escaped_string
1079							{
1080							my $self = shift;
1081							$self->expect( '"' );
1082
1083							my $ret = "";
1084							1 while $self->any_of(
1085							sub { $ret .= $self->nonempty_substring_before( qr/%\|$/m ); 1 }
1086							sub { my $escape = ( $self->expect( qr/%(.)/ ) )[1];
1087							$ret .= _handle_escape( $escape );
1088							1 },
1089							sub { 0 },
1090							)
1091
1092							return $ret;
1093							}
1094
1095							=cut
1096
1097							sub _substring_before
1098							{
1099	41			41		50	my $self = shift;
1100	41					65	my ( $expect, $fail_if_empty ) = @_;
1101
1102	41	100				150	ref $expect or $expect = qr/\Q$expect/;
1103
1104	41	100				131	my $endre = ( defined $self->{endofscope} ) ?
1105							qr/$expect\|$self->{endofscope}/ :
1106							$expect;
1107
1108							# NO skip_ws
1109
1110	41					59	my $start = pos $self->{str};
1111	41					43	my $end;
1112	41	100				422	if( $self->{str} =~ m/\G(?s:.*?)($endre)/ ) {
1113	38					94	$end = $-[1];
1114							}
1115							else {
1116	3					8	$end = length $self->{str};
1117							}
1118
1119	41	100	66			106	$self->fail( "Expected to find a non-empty substring before $expect" )
1120							if $fail_if_empty and $end == $start;
1121
1122	40					92	return $self->take( $end - $start );
1123							}
1124
1125							sub substring_before
1126							{
1127	40			40	1	63	my $self = shift;
1128	40					71	return $self->_substring_before( $_[0], 0 );
1129							}
1130
1131							sub nonempty_substring_before
1132							{
1133	1			1	1	9	my $self = shift;
1134	1					13	return $self->_substring_before( $_[0], 1 );
1135							}
1136
1137							=head2 generic_token
1138
1139							$val = $parser->generic_token( $name, $re, $convert )
1140
1141							I
1142
1143							Expects to find a token matching the precompiled regexp C<$re>. If provided,
1144							the C<$convert> CODE reference can be used to convert the string into a more
1145							convenient form. C<$name> is used in the failure message if the pattern fails
1146							to match.
1147
1148							If provided, the C<$convert> function will be passed the parser and the
1149							matching substring; the value it returns is returned from C.
1150
1151							$convert->( $parser, $substr )
1152
1153							If not provided, the substring will be returned as it stands.
1154
1155							This method is mostly provided for subclasses to define their own token types.
1156							For example:
1157
1158							sub token_hex
1159							{
1160							my $self = shift;
1161							$self->generic_token( hex => qr/[0-9A-F]{2}h/, sub { hex $_[1] } );
1162							}
1163
1164							=cut
1165
1166							sub generic_token
1167							{
1168	230			230	1	273	my $self = shift;
1169	230					359	my ( $name, $re, $convert ) = @_;
1170
1171	230	50				377	$self->fail( "Expected $name" ) if $self->at_eos;
1172
1173	230					432	$self->skip_ws;
1174	230	100				2985	$self->{str} =~ m/\G$re/gc or
1175							$self->fail( "Expected $name" );
1176
1177	169					762	my $match = substr( $self->{str}, $-[0], $+[0] - $-[0] );
1178
1179	169	100				558	return $convert ? $convert->( $self, $match ) : $match;
1180							}
1181
1182							sub _token_generic
1183							{
1184	226			226		270	my $self = shift;
1185	226					559	my %args = @_;
1186
1187	226					311	my $name = $args{name};
1188	226	50				501	my $re = $args{pattern} ? $self->{patterns}{ $args{pattern} } : $args{re};
1189	226					257	my $convert = $args{convert};
1190
1191	226					433	$self->generic_token( $name, $re, $convert );
1192							}
1193
1194							=head2 token_int
1195
1196							$int = $parser->token_int
1197
1198							Expects to find an integer in decimal, octal or hexadecimal notation, and
1199							consumes it. Negative integers, preceeded by C<->, are also recognised.
1200
1201							=cut
1202
1203							sub token_int
1204							{
1205	134			134	1	289	my $self = shift;
1206							$self->_token_generic(
1207							name => "int",
1208
1209							pattern => "int",
1210							convert => sub {
1211	104			104		151	my $int = $_[1];
1212	104	100				213	my $sign = ( $int =~ s/^-// ) ? -1 : 1;
1213
1214	104					126	$int =~ s/^0o/0/;
1215
1216	104	100				227	return $sign * oct $int if $int =~ m/^0/;
1217	99					659	return $sign * $int;
1218							},
1219	134					463	);
1220							}
1221
1222							=head2 token_float
1223
1224							$float = $parser->token_float
1225
1226							I
1227
1228							Expects to find a number expressed in floating-point notation; a sequence of
1229							digits possibly prefixed by C<->, possibly containing a decimal point,
1230							possibly followed by an exponent specified by C followed by an integer. The
1231							numerical value is then returned.
1232
1233							=cut
1234
1235							sub token_float
1236							{
1237	20			20	1	59	my $self = shift;
1238							$self->_token_generic(
1239							name => "float",
1240
1241							pattern => "float",
1242	18			18		122	convert => sub { $_[1] + 0 },
1243	20					76	);
1244							}
1245
1246							=head2 token_number
1247
1248							$number = $parser->token_number
1249
1250							I
1251
1252							Expects to find a number expressed in either of the above forms.
1253
1254							=cut
1255
1256							sub token_number
1257							{
1258	7			7	1	28	my $self = shift;
1259	7					21	$self->any_of( \&token_float, \&token_int );
1260							}
1261
1262							=head2 token_string
1263
1264							$str = $parser->token_string
1265
1266							Expects to find a quoted string, and consumes it. The string should be quoted
1267							using C<"> or C<'> quote marks.
1268
1269							The content of the quoted string can contain character escapes similar to
1270							those accepted by C or Perl. Specifically, the following forms are recognised:
1271
1272							\a Bell ("alert")
1273							\b Backspace
1274							\e Escape
1275							\f Form feed
1276							\n Newline
1277							\r Return
1278							\t Horizontal Tab
1279							\0, \012 Octal character
1280							\x34, \x{5678} Hexadecimal character
1281
1282							C's C<\v> for vertical tab is not supported as it is rarely used in practice
1283							and it collides with Perl's C<\v> regexp escape. Perl's C<\c> for forming other
1284							control characters is also not supported.
1285
1286							=cut
1287
1288							my %escapes = (
1289							a => "\a",
1290							b => "\b",
1291							e => "\e",
1292							f => "\f",
1293							n => "\n",
1294							r => "\r",
1295							t => "\t",
1296							);
1297
1298							sub token_string
1299							{
1300	53			53	1	137	my $self = shift;
1301
1302	53	100				102	$self->fail( "Expected string" ) if $self->at_eos;
1303
1304	52					115	my $pos = pos $self->{str};
1305
1306	52					109	$self->skip_ws;
1307	52	100				347	$self->{str} =~ m/\G($self->{patterns}{string_delim})/gc or
1308							$self->fail( "Expected string delimiter" );
1309
1310	32					89	my $delim = $1;
1311
1312							$self->{str} =~ m/
1313							\G(
1314							(?:
1315							\\[0-7]{1,3} # octal escape
1316							\|\\x[0-9A-F]{2} # 2-digit hex escape
1317							\|\\x\{[0-9A-F]+\} # {}-delimited hex escape
1318							\|\\. # symbolic escape
1319							\|[^\\$delim]+ # plain chunk
1320							)*?
1321							)$delim/gcix or
1322	32	50				780	pos($self->{str}) = $pos, $self->fail( "Expected contents of string" );
1323
1324	32					84	my $string = $1;
1325
1326	32					80	$string =~ s<\\(?:([0-7]{1,3})\|x([0-9A-F]{2})\|x\{([0-9A-F]+)\}\|(.))>
1327							[defined $1 ? chr oct $1 :
1328							defined $2 ? chr hex $2 :
1329	11	50				67	defined $3 ? chr hex $3 :
		100
		100
		100
1330							exists $escapes{$4} ? $escapes{$4} : $4]egi;
1331	32					101
1332							return $string;
1333							}
1334
1335							=head2 token_ident
1336
1337							$ident = $parser->token_ident
1338
1339							Expects to find an identifier, and consumes it.
1340
1341							=cut
1342
1343							sub token_ident
1344	72			72	1	121	{
1345	72					123	my $self = shift;
1346							$self->_token_generic(
1347							name => "ident",
1348
1349							pattern => "ident",
1350							);
1351							}
1352
1353							=head2 token_kw
1354
1355							$keyword = $parser->token_kw( @keywords )
1356
1357							Expects to find a keyword, and consumes it. A keyword is defined as an
1358							identifier which is exactly one of the literal values passed in.
1359
1360							=cut
1361
1362							sub token_kw
1363	2			2	1	9	{
1364	2					5	my $self = shift;
1365							my @acceptable = @_;
1366	2					6
1367							$self->skip_ws;
1368	2					3
1369							my $pos = pos $self->{str};
1370	2	50				7
1371							defined( my $kw = $self->token_ident ) or
1372							return undef;
1373	4					17
1374	2	100				3	grep { $_ eq $kw } @acceptable or
1375							pos($self->{str}) = $pos, $self->fail( "Expected any of ".join( ", ", @acceptable ) );
1376	1					3
1377							return $kw;
1378							}
1379
1380							package # hide from indexer
1381							Parser::MGC::Failure;
1382
1383							sub new
1384	273			273		319	{
1385	273					442	my $class = shift;
1386	273					459	my $self = bless {}, $class;
	273					1107
1387	273					1114	@{$self}{qw( message parser pos )} = @_;
1388							return $self;
1389							}
1390	31			31		25447
	31					19299
	31					200
1391							use overload '""' => "STRING";
1392							sub STRING
1393	43			43		3444	{
1394							my $self = shift;
1395	43					88
1396	43					164	my $parser = $self->{parser};
1397							my ( $linenum, $col, $text ) = $parser->where( $self->{pos} );
1398
1399							# Column number only counts characters. There may be tabs in there.
1400							# Rather than trying to calculate the visual column number, just print the
1401							# indentation as it stands.
1402	43					85
1403	43					122	my $indent = substr( $text, 0, $col );
1404							$indent =~ s/[^ \t]/ /g; # blank out all the non-whitespace
1405	43					144
1406	43	100	100			152	my $filename = $parser->filename;
1407							my $in_file = ( defined $filename and !ref $filename )
1408							? "in $filename " : "";
1409	43					422
1410							return "$self->{message} ${in_file}on line $linenum at:\n" .
1411							"$text\n" .
1412							"$indent^\n";
1413							}
1414
1415	31			31		6153	# Provide fallback operators for cmp, eq, etc...
	31					58
	31					137
1416							use overload fallback => 1;
1417
1418							=head1 EXAMPLES
1419
1420							=head2 Accumulating Results Using Variables
1421
1422							Although the structure-forming methods all return a value, obtained from their
1423							nested parsing code, it can sometimes be more convenient to use a variable to
1424							accumulate a result in instead. For example, consider the following parser
1425							method, designed to parse a set of C assignments, such as might
1426							be found in a configuration file, or YAML/JSON-style mapping value.
1427
1428							sub parse_dict
1429							{
1430							my $self = shift;
1431
1432							my %ret;
1433							$self->list_of( ",", sub {
1434							my $key = $self->token_ident;
1435							exists $ret{$key} and $self->fail( "Already have a mapping for '$key'" );
1436
1437							$self->expect( ":" );
1438
1439							$ret{$key} = $self->parse_value;
1440							} );
1441
1442							return \%ret
1443							}
1444
1445							Instead of using the return value from C, this method accumulates
1446							values in the C<%ret> hash, eventually returning a reference to it as its
1447							result. Because of this, it can perform some error checking while it parses;
1448							namely, rejecting duplicate keys.
1449
1450							=head1 TODO
1451
1452							=over 4
1453
1454							=item *
1455
1456							Make unescaping of string constants more customisable. Possibly consider
1457							instead a C using a loop over C.
1458
1459							=item *
1460
1461							Easy ability for subclasses to define more token types as methods. Perhaps
1462							provide a class method such as
1463
1464							__PACKAGE__->has_token( hex => qr/[0-9A-F]+/i, sub { hex $_[1] } );
1465
1466							=item *
1467
1468							Investigate how well C can cope with buffer splitting across
1469							other tokens than simply skippable whitespace
1470
1471							=back
1472
1473							=head1 AUTHOR
1474
1475							Paul Evans
1476
1477							=cut
1478
1479							0x55AA;