File Coverage

blib/lib/Parser/MGC.pm

Criterion	Covered	Total	%
statement	265	268	98.8
branch	89	100	89.0
condition	18	20	90.0
subroutine	52	54	96.3
pod	28	30	93.3
total	452	472	95.7

line	stmt	bran	cond	sub	pod	time	code
1							# You may distribute under the terms of either the GNU General Public License
2							# or the Artistic License (the same terms as Perl itself)
3							#
4							# (C) Paul Evans, 2010-2021 -- leonerd@leonerd.org.uk
5
6							package Parser::MGC 0.19;
7
8	31			31		1726036	use v5.14;
	31					337
9	31			31		179	use warnings;
	31					57
	31					998
10
11	31			31		179	use Carp;
	31					66
	31					2185
12	31			31		16410	use Feature::Compat::Try;
	31					11601
	31					133
13
14	31			31		67687	use File::Slurp::Tiny qw( read_file );
	31					426616
	31					2145
15	31			31		301	use Scalar::Util qw( blessed );
	31					69
	31					3210
16
17							=head1 NAME
18
19							C - build simple recursive-descent parsers
20
21							=head1 SYNOPSIS
22
23							package My::Grammar::Parser;
24							use base qw( Parser::MGC );
25
26							sub parse
27							{
28							my $self = shift;
29
30							$self->sequence_of( sub {
31							$self->any_of(
32							sub { $self->token_int },
33							sub { $self->token_string },
34							sub { \$self->token_ident },
35							sub { $self->scope_of( "(", \&parse, ")" ) }
36							);
37							} );
38							}
39
40							my $parser = My::Grammar::Parser->new;
41
42							my $tree = $parser->from_file( $ARGV[0] );
43
44							...
45
46							=head1 DESCRIPTION
47
48							This base class provides a low-level framework for building recursive-descent
49							parsers that consume a given input string from left to right, returning a
50							parse structure. It takes its name from the C regexps used to implement
51							the token parsing behaviour.
52
53							It provides a number of token-parsing methods, which each extract a
54							grammatical token from the string. It also provides wrapping methods that can
55							be used to build up a possibly-recursive grammar structure, by applying a
56							structure around other parts of parsing code.
57
58							=head2 Backtracking
59
60							Each method, both token and structural, atomically either consumes a prefix of
61							the string and returns its result, or fails and consumes nothing. This makes
62							it simple to implement grammars that require backtracking.
63
64							Several structure-forming methods have some form of "optional" behaviour; they
65							can optionally consume some amount of input or take some particular choice,
66							but if the code invoked inside that subsequently fails, the structure can
67							backtrack and take some different behaviour. This is usually what is required
68							when testing whether the structure of the input string matches some part of
69							the grammar that is optional, or has multiple choices.
70
71							However, once the choice of grammar has been made, it is often useful to be
72							able to fix on that one choice, thus making subsequent failures propagate up
73							rather than taking that alternative behaviour. Control of this backtracking
74							is given by the C method; and careful use of this method is one of the
75							key advantages that C has over more simple parsing using single
76							regexps alone.
77
78							=cut
79
80							=head1 CONSTRUCTOR
81
82							=cut
83
84							=head2 new
85
86							$parser = Parser::MGC->new( %args )
87
88							Returns a new instance of a C object. This must be called on a
89							subclass that provides method of the name provided as C, by default
90							called C.
91
92							Takes the following named arguments
93
94							=over 8
95
96							=item toplevel => STRING
97
98							Name of the toplevel method to use to start the parse from. If not supplied,
99							will try to use a method called C.
100
101							=item patterns => HASH
102
103							Keys in this hash should map to quoted regexp (C) references, to
104							override the default patterns used to match tokens. See C below
105
106							=item accept_0o_oct => BOOL
107
108							If true, the C method will also accept integers with a C<0o> prefix
109							as octal.
110
111							=back
112
113							=cut
114
115							=head1 PATTERNS
116
117							The following pattern names are recognised. They may be passed to the
118							constructor in the C hash, or provided as a class method under the
119							name C>.
120
121							=over 4
122
123							=item * ws
124
125							Pattern used to skip whitespace between tokens. Defaults to C
126
127							=item * comment
128
129							Pattern used to skip comments between tokens. Undefined by default.
130
131							=item * int
132
133							Pattern used to parse an integer by C. Defaults to
134							C. If C is given, then
135							this will be expanded to match C as well.
136
137							=item * float
138
139							Pattern used to parse a floating-point number by C. Defaults to
140							C.
141
142							=item * ident
143
144							Pattern used to parse an identifier by C. Defaults to
145							C
146
147							=item * string_delim
148
149							Pattern used to delimit a string by C. Defaults to C.
150
151							=back
152
153							=cut
154
155							my @patterns = qw(
156							ws
157							comment
158							int
159							float
160							ident
161							string_delim
162							);
163
164	31			31		238	use constant pattern_ws => qr/[\s\n\t]+/;
	31					73
	31					2494
165	31			31		203	use constant pattern_comment => undef;
	31					66
	31					2529
166	31			31		223	use constant pattern_int => qr/-?(?:0x[[:xdigit:]]+\|[[:digit:]]+)/;
	31					52
	31					3351
167	31			31		229	use constant pattern_float => qr/-?(?:\d*\.\d+\|\d+\.)(?:e-?\d+)?\|-?\d+e-?\d+/i;
	31					224
	31					4125
168	31			31		232	use constant pattern_ident => qr/[[:alpha:]_]\w*/;
	31					67
	31					2003
169	31			31		189	use constant pattern_string_delim => qr/["']/;
	31					57
	31					123267
170
171							sub new
172							{
173	46			46	1	2417	my $class = shift;
174	46					146	my %args = @_;
175
176	46		100			312	my $toplevel = $args{toplevel} \|\| "parse";
177
178	46	50				387	$class->can( $toplevel ) or
179							croak "Expected to be a subclass that can ->$toplevel";
180
181	46					255	my $self = bless {
182							toplevel => $toplevel,
183							patterns => {},
184							scope_level => 0,
185							}, $class;
186
187	46		100			303	$self->{patterns}{$_} = $args{patterns}{$_} \|\| $self->${\"pattern_$_"} for @patterns;
188
189	46	100				173	if( $args{accept_0o_oct} ) {
190	1					39	$self->{patterns}{int} = qr/0o[0-7]+\|$self->{patterns}{int}/;
191							}
192
193	46	100				178	if( defined $self->{patterns}{comment} ) {
194	1					42	$self->{patterns}{_skip} = qr/$self->{patterns}{ws}\|$self->{patterns}{comment}/;
195							}
196							else {
197	45					138	$self->{patterns}{_skip} = $self->{patterns}{ws};
198							}
199
200	46					213	return $self;
201							}
202
203							=head1 METHODS
204
205							=cut
206
207							=head2 from_string
208
209							$result = $parser->from_string( $str )
210
211							Parse the given literal string and return the result from the toplevel method.
212
213							=cut
214
215							sub from_string
216							{
217	160			160	1	31108	my $self = shift;
218	160					359	my ( $str ) = @_;
219
220	160					352	$self->{str} = $str;
221
222	160					439	pos $self->{str} = 0;
223
224	160					345	my $toplevel = $self->{toplevel};
225	160					571	my $result = $self->$toplevel;
226
227	143	100				586	$self->at_eos or
228							$self->fail( "Expected end of input" );
229
230	138					766	return $result;
231							}
232
233							=head2 from_file
234
235							$result = $parser->from_file( $file, %opts )
236
237							Parse the given file, which may be a pathname in a string, or an opened IO
238							handle, and return the result from the toplevel method.
239
240							The following options are recognised:
241
242							=over 8
243
244							=item binmode => STRING
245
246							If set, applies the given binmode to the filehandle before reading. Typically
247							this can be used to set the encoding of the file.
248
249							$parser->from_file( $file, binmode => ":encoding(UTF-8)" )
250
251							=back
252
253							=cut
254
255							sub from_file
256							{
257	3			3	1	1372	my $self = shift;
258	3					11	my ( $file, %opts ) = @_;
259
260	3	50				11	defined $file or croak "Expected a filename to ->from_file";
261
262	3					7	$self->{filename} = $file;
263
264							$self->from_string( ref $file ?
265	2	50				8	do { local $/; binmode $file, $opts{binmode} if $opts{binmode}; <$file> } :
	2					7
	2					78
266	3	100				21	( read_file $file, binmode => $opts{binmode} ) );
267							}
268
269							=head2 from_reader
270
271							$result = $parser->from_reader( \&reader )
272
273							I
274
275							Parse the input which is read by the C function. This function will be
276							called in scalar context to generate portions of string to parse, being passed
277							the C<$parser> object. The function should return C when it has no more
278							string to return.
279
280							$reader->( $parser )
281
282							Note that because it is not generally possible to detect exactly when more
283							input may be required due to failed regexp parsing, the reader function is
284							only invoked during searching for skippable whitespace. This makes it suitable
285							for reading lines of a file in the common case where lines are considered as
286							skippable whitespace, or for reading lines of input interactively from a
287							user. It cannot be used in all cases (for example, reading fixed-size buffers
288							from a file) because two successive invocations may split a single token
289							across the buffer boundaries, and cause parse failures.
290
291							=cut
292
293							sub from_reader
294							{
295	1			1	1	12	my $self = shift;
296	1					3	my ( $reader ) = @_;
297
298	1					3	local $self->{reader} = $reader;
299
300	1					3	$self->{str} = "";
301	1					4	pos $self->{str} = 0;
302
303	1					5	my $result = $self->parse;
304
305	1	50				3	$self->at_eos or
306							$self->fail( "Expected end of input" );
307
308	1					8	return $result;
309							}
310
311							=head2 pos
312
313							$pos = $parser->pos
314
315							I
316
317							Returns the current parse position, as a character offset from the beginning
318							of the file or string.
319
320							=cut
321
322							sub pos
323							{
324	277			277	1	379	my $self = shift;
325	277					761	return pos $self->{str};
326							}
327
328							=head2 take
329
330							$str = $parser->take( $len )
331
332							I
333
334							Returns the next C<$len> characters directly from the input, prior to any
335							whitespace or comment skipping. This does I take account of any
336							end-of-scope marker that may be pending. It is intended for use by parsers of
337							partially-binary protocols, or other situations in which it would be incorrect
338							for the end-of-scope marker to take effect at this time.
339
340							=cut
341
342							sub take
343							{
344	46			46	1	84	my $self = shift;
345	46					76	my ( $len ) = @_;
346
347	46					164	my $start = pos( $self->{str} );
348
349	46					132	pos( $self->{str} ) += $len;
350
351	46					274	return substr( $self->{str}, $start, $len );
352							}
353
354							=head2 where
355
356							( $lineno, $col, $text ) = $parser->where
357
358							Returns the current parse position, as a line and column number, and
359							the entire current line of text. The first line is numbered 1, and the first
360							column is numbered 0.
361
362							=cut
363
364							sub where
365							{
366	14			14	1	31	my $self = shift;
367	14					31	my ( $pos ) = @_;
368
369	14	100				51	defined $pos or $pos = pos $self->{str};
370
371	14					32	my $str = $self->{str};
372
373	14					24	my $sol = $pos;
374	14	100	100			83	$sol-- if $sol > 0 and substr( $str, $sol, 1 ) =~ m/^[\r\n]$/;
375	14		100			140	$sol-- while $sol > 0 and substr( $str, $sol-1, 1 ) !~ m/^[\r\n]$/;
376
377	14					24	my $eol = $pos;
378	14		100			170	$eol++ while $eol < length($str) and substr( $str, $eol, 1 ) !~ m/^[\r\n]$/;
379
380	14					37	my $line = substr( $str, $sol, $eol - $sol );
381
382	14					24	my $col = $pos - $sol;
383	14					49	my $lineno = ( () = substr( $str, 0, $pos ) =~ m/\n/g ) + 1;
384
385	14					68	return ( $lineno, $col, $line );
386							}
387
388							=head2 fail
389
390							=head2 fail_from
391
392							$parser->fail( $message )
393
394							$parser->fail_from( $pos, $message )
395
396							I since version 0.09.>
397
398							Aborts the current parse attempt with the given message string. The failure
399							message will include the line and column position, and the line of input that
400							failed at the current parse position (C), or a position earlier obtained
401							using the C method (C).
402
403							This failure will propagate up to the inner-most structure parsing method that
404							has not been committed; or will cause the entire parser to fail if there are
405							no further options to take.
406
407							=cut
408
409							sub fail
410							{
411	271			271	1	423	my $self = shift;
412	271					471	my ( $message ) = @_;
413	271					649	$self->fail_from( $self->pos, $message );
414							}
415
416							sub fail_from
417							{
418	271			271	1	356	my $self = shift;
419	271					496	my ( $pos, $message ) = @_;
420	271					659	die Parser::MGC::Failure->new( $message, $self, $pos );
421							}
422
423	253	100		253		2782	sub _isa_failure { blessed $_[0] and $_[0]->isa( "Parser::MGC::Failure" ) }
424
425							=head2 at_eos
426
427							$eos = $parser->at_eos
428
429							Returns true if the input string is at the end of the string.
430
431							=cut
432
433							sub at_eos
434							{
435	596			596	1	840	my $self = shift;
436
437							# Save pos() before skipping ws so we don't break the substring_before method
438	596					954	my $pos = pos $self->{str};
439
440	596					1414	$self->skip_ws;
441
442	596					830	my $at_eos;
443	596	100				1461	if( pos( $self->{str} ) >= length $self->{str} ) {
		100
444	162					246	$at_eos = 1;
445							}
446							elsif( defined $self->{endofscope} ) {
447	114					530	$at_eos = $self->{str} =~ m/\G$self->{endofscope}/;
448							}
449							else {
450	320					501	$at_eos = 0;
451							}
452
453	596					1271	pos( $self->{str} ) = $pos;
454
455	596					1649	return $at_eos;
456							}
457
458							=head2 scope_level
459
460							$level = $parser->scope_level
461
462							I
463
464							Returns the number of nested C calls that have been made.
465
466							=cut
467
468							sub scope_level
469							{
470	5			5	1	10	my $self = shift;
471	5					45	return $self->{scope_level};
472							}
473
474							=head1 STRUCTURE-FORMING METHODS
475
476							The following methods may be used to build a grammatical structure out of the
477							defined basic token-parsing methods. Each takes at least one code reference,
478							which will be passed the actual C<$parser> object as its first argument.
479
480							Anywhere that a code reference is expected also permits a plain string giving
481							the name of a method to invoke. This is sufficient in many simple cases, such
482							as
483
484							$self->any_of(
485							'token_int',
486							'token_string',
487							...
488							);
489
490							=cut
491
492							=head2 maybe
493
494							$ret = $parser->maybe( $code )
495
496							Attempts to execute the given C<$code> in scalar context, and returns what it
497							returned, accepting that it might fail. C<$code> may either be a CODE
498							reference or a method name given as a string.
499
500							If the code fails (either by calling C itself, or by propagating a
501							failure from another method it invoked) before it has invoked C, then
502							none of the input string will be consumed; the current parsing position will
503							be restored. C will be returned in this case.
504
505							If it calls C then any subsequent failure will be propagated to the
506							caller, rather than returning C.
507
508							This may be considered to be similar to the C regexp qualifier.
509
510							sub parse_declaration
511							{
512							my $self = shift;
513
514							[ $self->parse_type,
515							$self->token_ident,
516							$self->maybe( sub {
517							$self->expect( "=" );
518							$self->parse_expression
519							} ),
520							];
521							}
522
523							=cut
524
525							sub maybe
526							{
527	4			4	1	39	my $self = shift;
528	4					9	my ( $code ) = @_;
529
530	4					7	my $pos = pos $self->{str};
531
532	4					5	my $committed = 0;
533	4			0		16	local $self->{committer} = sub { $committed++ };
	0					0
534
535							try {
536							return $self->$code;
537							}
538	4					13	catch ( $e ) {
539							pos($self->{str}) = $pos;
540
541							die $e if $committed or not _isa_failure( $e );
542							return undef;
543							}
544							}
545
546							=head2 scope_of
547
548							$ret = $parser->scope_of( $start, $code, $stop )
549
550							Expects to find the C<$start> pattern, then attempts to execute the given
551							C<$code>, then expects to find the C<$stop> pattern. Returns whatever the
552							code returned. C<$code> may either be a CODE reference of a method name given
553							as a string.
554
555							While the code is being executed, the C<$stop> pattern will be used by the
556							token parsing methods as an end-of-scope marker; causing them to raise a
557							failure if called at the end of a scope.
558
559							sub parse_block
560							{
561							my $self = shift;
562
563							$self->scope_of( "{", 'parse_statements', "}" );
564							}
565
566							If the C<$start> pattern is undefined, it is presumed the caller has already
567							checked for this. This is useful when the stop pattern needs to be calculated
568							based on the start pattern.
569
570							sub parse_bracketed
571							{
572							my $self = shift;
573
574							my $delim = $self->expect( qr/[\(\[\<\{]/ );
575							$delim =~ tr/([<{/)]>}/;
576
577							$self->scope_of( undef, 'parse_body', $delim );
578							}
579
580							This method does not have any optional parts to it; any failures are
581							immediately propagated to the caller.
582
583							=cut
584
585							sub scope_of
586							{
587	19			19	1	108	my $self = shift;
588	19					77	$self->_scope_of( 0, @_ );
589							}
590
591							sub _scope_of
592							{
593	73			73		102	my $self = shift;
594	73					155	my ( $commit_if_started, $start, $code, $stop ) = @_;
595
596	73	50				574	ref $stop or $stop = qr/\Q$stop/;
597
598	73	100				295	$self->expect( $start ) if defined $start;
599
600	34	100				135	$self->commit if $commit_if_started;
601
602	34					81	local $self->{endofscope} = $stop;
603	34					82	local $self->{scope_level} = $self->{scope_level} + 1;
604
605	34					135	my $ret = $self->$code;
606
607	31					118	$self->expect( $stop );
608
609	30					215	return $ret;
610							}
611
612							=head2 committed_scope_of
613
614							$ret = $parser->committed_scope_of( $start, $code, $stop )
615
616							I
617
618							A variant of L that calls L after a successful match of
619							the start pattern. This is usually what you want if using C from
620							within an C choice, if no other alternative following this one could
621							possibly match if the start pattern has.
622
623							=cut
624
625							sub committed_scope_of
626							{
627	54			54	1	95	my $self = shift;
628	54					118	$self->_scope_of( 1, @_ );
629							}
630
631							=head2 list_of
632
633							$ret = $parser->list_of( $sep, $code )
634
635							Expects to find a list of instances of something parsed by C<$code>,
636							separated by the C<$sep> pattern. Returns an ARRAY ref containing a list of
637							the return values from the C<$code>. A single trailing delimiter is allowed,
638							and does not affect the return value. C<$code> may either be a CODE reference
639							or a method name given as a string. It is called in list context, and whatever
640							values it returns are appended to the eventual result - similar to perl's
641							C.
642
643							This method does not consider it an error if the returned list is empty; that
644							is, that the scope ended before any item instances were parsed from it.
645
646							sub parse_numbers
647							{
648							my $self = shift;
649
650							$self->list_of( ",", 'token_int' );
651							}
652
653							If the code fails (either by invoking C itself, or by propagating a
654							failure from another method it invoked) before it has invoked C on a
655							particular item, then the item is aborted and the parsing position will be
656							restored to the beginning of that failed item. The list of results from
657							previous successful attempts will be returned.
658
659							If it calls C within an item then any subsequent failure for that item
660							will cause the entire C to fail, propagating that to the caller.
661
662							=cut
663
664							sub list_of
665							{
666	72			72	1	155	my $self = shift;
667	72					129	my ( $sep, $code ) = @_;
668
669	72	100	33			308	ref $sep or $sep = qr/\Q$sep/ if defined $sep;
670
671	72					101	my $committed;
672	72			14		277	local $self->{committer} = sub { $committed++ };
	14					31
673
674	72					126	my @ret;
675
676	72					193	while( !$self->at_eos ) {
677	127					182	$committed = 0;
678	127					215	my $pos = pos $self->{str};
679
680							try {
681							push @ret, $self->$code;
682							next;
683							}
684							catch ( $e ) {
685							pos($self->{str}) = $pos;
686							die $e if $committed or not _isa_failure( $e );
687
688							last;
689							}
690	127					267	}
691							continue {
692	103	100				287	if( defined $sep ) {
693	32					79	$self->skip_ws;
694	32	100				222	$self->{str} =~ m/\G$sep/gc or last;
695							}
696							}
697
698	71					331	return \@ret;
699							}
700
701							=head2 sequence_of
702
703							$ret = $parser->sequence_of( $code )
704
705							A shortcut for calling C with an empty string as separator; expects
706							to find at least one instance of something parsed by C<$code>, separated only
707							by skipped whitespace.
708
709							This may be considered to be similar to the C<+> or C<*> regexp qualifiers.
710
711							sub parse_statements
712							{
713							my $self = shift;
714
715							$self->sequence_of( 'parse_statement' );
716							}
717
718							The interaction of failures in the code and the C method is identical
719							to that of C.
720
721							=cut
722
723							sub sequence_of
724							{
725	56			56	1	198	my $self = shift;
726	56					129	my ( $code ) = @_;
727
728	56					188	$self->list_of( undef, $code );
729							}
730
731							=head2 any_of
732
733							$ret = $parser->any_of( @codes )
734
735							I
736
737							Expects that one of the given code instances can parse something from the
738							input, returning what it returned. Each code instance may indicate a failure
739							to parse by calling the C method or otherwise propagating a failure.
740							Each code instance may either be a CODE reference or a method name given as a
741							string.
742
743							This may be considered to be similar to the C<\|> regexp operator for forming
744							alternations of possible parse trees.
745
746							sub parse_statement
747							{
748							my $self = shift;
749
750							$self->any_of(
751							sub { $self->parse_declaration; $self->expect(";") },
752							sub { $self->parse_expression; $self->expect(";") },
753							sub { $self->parse_block },
754							);
755							}
756
757							If the code for a given choice fails (either by invoking C itself, or by
758							propagating a failure from another method it invoked) before it has invoked
759							C itself, then the parsing position restored and the next choice will
760							be attempted.
761
762							If it calls C then any subsequent failure for that choice will cause
763							the entire C to fail, propagating that to the caller and no further
764							choices will be attempted.
765
766							If none of the choices match then a simple failure message is printed:
767
768							Found nothing parseable
769
770							As this is unlikely to be helpful to users, a better message can be provided
771							by the final choice instead. Don't forget to C before printing the
772							failure message, or it won't count.
773
774							$self->any_of(
775							'token_int',
776							'token_string',
777							...,
778
779							sub { $self->commit; $self->fail( "Expected an int or string" ) }
780							);
781
782							=cut
783
784							sub any_of
785							{
786	218			218	1	491	my $self = shift;
787
788	218					489	while( @_ ) {
789	433					646	my $code = shift;
790	433					685	my $pos = pos $self->{str};
791
792	433					546	my $committed = 0;
793	433			55		1355	local $self->{committer} = sub { $committed++ };
	55					110
794
795							try {
796							return $self->$code;
797							}
798	433					862	catch ( $e ) {
799							pos( $self->{str} ) = $pos;
800
801							die $e if $committed or not _isa_failure( $e );
802							}
803							}
804
805	12					30	$self->fail( "Found nothing parseable" );
806							}
807
808							sub one_of {
809	0			0	0	0	croak "Parser::MGC->one_of is deprecated; use ->any_of instead";
810							}
811
812							=head2 commit
813
814							$parser->commit
815
816							Calling this method will cancel the backtracking behaviour of the innermost
817							C, C, C, or C structure forming method.
818							That is, if later code then calls C, the exception will be propagated
819							out of C, no further list items will be attempted by C or
820							C, and no further code blocks will be attempted by C.
821
822							Typically this will be called once the grammatical structure alter has been
823							determined, ensuring that any further failures are raised as real exceptions,
824							rather than by attempting other alternatives.
825
826							sub parse_statement
827							{
828							my $self = shift;
829
830							$self->any_of(
831							...
832							sub {
833							$self->scope_of( "{",
834							sub { $self->commit; $self->parse_statements; },
835							"}" ),
836							},
837							);
838							}
839
840							Though in this common pattern, L may be used instead.
841
842							=cut
843
844							sub commit
845							{
846	69			69	1	143	my $self = shift;
847	69	50				159	if( $self->{committer} ) {
848	69					125	$self->{committer}->();
849							}
850							else {
851	0					0	croak "Cannot commit except within a backtrack-able structure";
852							}
853							}
854
855							=head1 TOKEN PARSING METHODS
856
857							The following methods attempt to consume some part of the input string, to be
858							used as part of the parsing process.
859
860							=cut
861
862							sub skip_ws
863							{
864	1283			1283	0	1737	my $self = shift;
865
866	1283					1831	my $pattern = $self->{patterns}{_skip};
867
868							{
869	1283					1655	1 while $self->{str} =~ m/\G$pattern/gc;
	1286					7202
870
871	1286	100				3453	return if pos( $self->{str} ) < length $self->{str};
872
873	222	100				614	return unless $self->{reader};
874
875	4					10	my $more = $self->{reader}->( $self );
876	4	100				17	if( defined $more ) {
877	3					5	my $pos = pos( $self->{str} );
878	3					7	$self->{str} .= $more;
879	3					7	pos( $self->{str} ) = $pos;
880
881	3					7	redo;
882							}
883
884	1					3	undef $self->{reader};
885	1					2	return;
886							}
887							}
888
889							=head2 expect
890
891							$str = $parser->expect( $literal )
892
893							$str = $parser->expect( qr/pattern/ )
894
895							@groups = $parser->expect( qr/pattern/ )
896
897							Expects to find a literal string or regexp pattern match, and consumes it.
898							In scalar context, this method returns the string that was captured. In list
899							context it returns the matching substring and the contents of any subgroups
900							contained in the pattern.
901
902							This method will raise a parse error (by calling C) if the regexp fails
903							to match. Note that if the pattern could match an empty string (such as for
904							example C), the pattern will always match, even if it has to match an
905							empty string. This method will not consider a failure if the regexp matches
906							with zero-width.
907
908							=head2 maybe_expect
909
910							$str = $parser->maybe_expect( ... )
911
912							@groups = $parser->maybe_expect( ... )
913
914							I
915
916							A convenient shortcut equivalent to calling C within C, but
917							implemented more efficiently, avoiding the exception-handling set up by
918							C. Returns C or an empty list if the match fails.
919
920							=cut
921
922							sub maybe_expect
923							{
924	371			371	1	534	my $self = shift;
925	371					615	my ( $expect ) = @_;
926
927	371	100				801	ref $expect or $expect = qr/\Q$expect/;
928
929	371					789	$self->skip_ws;
930	371	100				4304	$self->{str} =~ m/\G$expect/gc or return;
931
932	210	100				1378	return substr( $self->{str}, $-[0], $+[0]-$-[0] ) if !wantarray;
933	22	100				75	return map { defined $-[$_] ? substr( $self->{str}, $-[$_], $+[$_]-$-[$_] ) : undef } 0 .. $#+;
	37					269
934							}
935
936							sub expect
937							{
938	355			355	1	1121	my $self = shift;
939	355					614	my ( $expect ) = @_;
940
941	355	100				2545	ref $expect or $expect = qr/\Q$expect/;
942
943	355	100				812	if( wantarray ) {
944	31	100				75	my @ret = $self->maybe_expect( $expect ) or
945							$self->fail( "Expected $expect" );
946	20					98	return @ret;
947							}
948							else {
949	324	100				658	defined( my $ret = $self->maybe_expect( $expect ) ) or
950							$self->fail( "Expected $expect" );
951	185					595	return $ret;
952							}
953							}
954
955							=head2 substring_before
956
957							$str = $parser->substring_before( $literal )
958
959							$str = $parser->substring_before( qr/pattern/ )
960
961							I
962
963							Expects to possibly find a literal string or regexp pattern match. If it finds
964							such, consume all the input text before but excluding this match, and return
965							it. If it fails to find a match before the end of the current scope, consumes
966							all the input text until the end of scope and return it.
967
968							This method does not consume the part of input that matches, only the text
969							before it. It is not considered a failure if the substring before this match
970							is empty. If a non-empty match is required, use the C method:
971
972							sub token_nonempty_part
973							{
974							my $self = shift;
975
976							my $str = $parser->substring_before( "," );
977							length $str or $self->fail( "Expected a string fragment before ," );
978
979							return $str;
980							}
981
982							Note that unlike most of the other token parsing methods, this method does not
983							consume either leading or trailing whitespace around the substring. It is
984							expected that this method would be used as part a parser to read quoted
985							strings, or similar cases where whitespace should be preserved.
986
987							=cut
988
989							sub substring_before
990							{
991	40			40	1	71	my $self = shift;
992	40					68	my ( $expect ) = @_;
993
994	40	100				182	ref $expect or $expect = qr/\Q$expect/;
995
996	40	100				186	my $endre = ( defined $self->{endofscope} ) ?
997							qr/$expect\|$self->{endofscope}/ :
998							$expect;
999
1000							# NO skip_ws
1001
1002	40					123	my $start = pos $self->{str};
1003	40					54	my $end;
1004	40	100				522	if( $self->{str} =~ m/\G(?s:.*?)($endre)/ ) {
1005	37					153	$end = $-[1];
1006							}
1007							else {
1008	3					9	$end = length $self->{str};
1009							}
1010
1011	40					137	return $self->take( $end - $start );
1012							}
1013
1014							=head2 generic_token
1015
1016							$val = $parser->generic_token( $name, $re, $convert )
1017
1018							I
1019
1020							Expects to find a token matching the precompiled regexp C<$re>. If provided,
1021							the C<$convert> CODE reference can be used to convert the string into a more
1022							convenient form. C<$name> is used in the failure message if the pattern fails
1023							to match.
1024
1025							If provided, the C<$convert> function will be passed the parser and the
1026							matching substring; the value it returns is returned from C.
1027
1028							$convert->( $parser, $substr )
1029
1030							If not provided, the substring will be returned as it stands.
1031
1032							This method is mostly provided for subclasses to define their own token types.
1033							For example:
1034
1035							sub token_hex
1036							{
1037							my $self = shift;
1038							$self->generic_token( hex => qr/[0-9A-F]{2}h/, sub { hex $_[1] } );
1039							}
1040
1041							=cut
1042
1043							sub generic_token
1044							{
1045	230			230	1	348	my $self = shift;
1046	230					401	my ( $name, $re, $convert ) = @_;
1047
1048	230	50				483	$self->fail( "Expected $name" ) if $self->at_eos;
1049
1050	230					549	$self->skip_ws;
1051	230	100				4273	$self->{str} =~ m/\G$re/gc or
1052							$self->fail( "Expected $name" );
1053
1054	169					945	my $match = substr( $self->{str}, $-[0], $+[0] - $-[0] );
1055
1056	169	100				688	return $convert ? $convert->( $self, $match ) : $match;
1057							}
1058
1059							sub _token_generic
1060							{
1061	226			226		346	my $self = shift;
1062	226					1045	my %args = @_;
1063
1064	226					407	my $name = $args{name};
1065	226	50				573	my $re = $args{pattern} ? $self->{patterns}{ $args{pattern} } : $args{re};
1066	226					319	my $convert = $args{convert};
1067
1068	226					523	$self->generic_token( $name, $re, $convert );
1069							}
1070
1071							=head2 token_int
1072
1073							$int = $parser->token_int
1074
1075							Expects to find an integer in decimal, octal or hexadecimal notation, and
1076							consumes it. Negative integers, preceeded by C<->, are also recognised.
1077
1078							=cut
1079
1080							sub token_int
1081							{
1082	134			134	1	345	my $self = shift;
1083							$self->_token_generic(
1084							name => "int",
1085
1086							pattern => "int",
1087							convert => sub {
1088	104			104		195	my $int = $_[1];
1089	104	100				299	my $sign = ( $int =~ s/^-// ) ? -1 : 1;
1090
1091	104					166	$int =~ s/^0o/0/;
1092
1093	104	100				326	return $sign * oct $int if $int =~ m/^0/;
1094	99					834	return $sign * $int;
1095							},
1096	134					596	);
1097							}
1098
1099							=head2 token_float
1100
1101							$float = $parser->token_float
1102
1103							I
1104
1105							Expects to find a number expressed in floating-point notation; a sequence of
1106							digits possibly prefixed by C<->, possibly containing a decimal point,
1107							possibly followed by an exponent specified by C followed by an integer. The
1108							numerical value is then returned.
1109
1110							=cut
1111
1112							sub token_float
1113							{
1114	20			20	1	68	my $self = shift;
1115							$self->_token_generic(
1116							name => "float",
1117
1118							pattern => "float",
1119	18			18		135	convert => sub { $_[1] + 0 },
1120	20					82	);
1121							}
1122
1123							=head2 token_number
1124
1125							$number = $parser->token_number
1126
1127							I
1128
1129							Expects to find a number expressed in either of the above forms.
1130
1131							=cut
1132
1133							sub token_number
1134							{
1135	7			7	1	34	my $self = shift;
1136	7					26	$self->any_of( \&token_float, \&token_int );
1137							}
1138
1139							=head2 token_string
1140
1141							$str = $parser->token_string
1142
1143							Expects to find a quoted string, and consumes it. The string should be quoted
1144							using C<"> or C<'> quote marks.
1145
1146							The content of the quoted string can contain character escapes similar to
1147							those accepted by C or Perl. Specifically, the following forms are recognised:
1148
1149							\a Bell ("alert")
1150							\b Backspace
1151							\e Escape
1152							\f Form feed
1153							\n Newline
1154							\r Return
1155							\t Horizontal Tab
1156							\0, \012 Octal character
1157							\x34, \x{5678} Hexadecimal character
1158
1159							C's C<\v> for vertical tab is not supported as it is rarely used in practice
1160							and it collides with Perl's C<\v> regexp escape. Perl's C<\c> for forming other
1161							control characters is also not supported.
1162
1163							=cut
1164
1165							my %escapes = (
1166							a => "\a",
1167							b => "\b",
1168							e => "\e",
1169							f => "\f",
1170							n => "\n",
1171							r => "\r",
1172							t => "\t",
1173							);
1174
1175							sub token_string
1176							{
1177	53			53	1	252	my $self = shift;
1178
1179	53	100				126	$self->fail( "Expected string" ) if $self->at_eos;
1180
1181	52					144	my $pos = pos $self->{str};
1182
1183	52					131	$self->skip_ws;
1184	52	100				448	$self->{str} =~ m/\G($self->{patterns}{string_delim})/gc or
1185							$self->fail( "Expected string delimiter" );
1186
1187	32					94	my $delim = $1;
1188
1189							$self->{str} =~ m/
1190							\G(
1191							(?:
1192							\\[0-7]{1,3} # octal escape
1193							\|\\x[0-9A-F]{2} # 2-digit hex escape
1194							\|\\x\{[0-9A-F]+\} # {}-delimited hex escape
1195							\|\\. # symbolic escape
1196							\|[^\\$delim]+ # plain chunk
1197							)*?
1198							)$delim/gcix or
1199	32	50				1033	pos($self->{str}) = $pos, $self->fail( "Expected contents of string" );
1200
1201	32					116	my $string = $1;
1202
1203	32					98	$string =~ s<\\(?:([0-7]{1,3})\|x([0-9A-F]{2})\|x\{([0-9A-F]+)\}\|(.))>
1204							[defined $1 ? chr oct $1 :
1205							defined $2 ? chr hex $2 :
1206	11	50				77	defined $3 ? chr hex $3 :
		100
		100
		100
1207							exists $escapes{$4} ? $escapes{$4} : $4]egi;
1208	32					126
1209							return $string;
1210							}
1211
1212							=head2 token_ident
1213
1214							$ident = $parser->token_ident
1215
1216							Expects to find an identifier, and consumes it.
1217
1218							=cut
1219
1220							sub token_ident
1221	72			72	1	145	{
1222	72					229	my $self = shift;
1223							$self->_token_generic(
1224							name => "ident",
1225
1226							pattern => "ident",
1227							);
1228							}
1229
1230							=head2 token_kw
1231
1232							$keyword = $parser->token_kw( @keywords )
1233
1234							Expects to find a keyword, and consumes it. A keyword is defined as an
1235							identifier which is exactly one of the literal values passed in.
1236
1237							=cut
1238
1239							sub token_kw
1240	2			2	1	14	{
1241	2					5	my $self = shift;
1242							my @acceptable = @_;
1243	2					9
1244							$self->skip_ws;
1245	2					5
1246							my $pos = pos $self->{str};
1247	2	50				10
1248							defined( my $kw = $self->token_ident ) or
1249							return undef;
1250	4					22
1251	2	100				6	grep { $_ eq $kw } @acceptable or
1252							pos($self->{str}) = $pos, $self->fail( "Expected any of ".join( ", ", @acceptable ) );
1253	1					4
1254							return $kw;
1255							}
1256
1257							package # hide from indexer
1258							Parser::MGC::Failure;
1259
1260							sub new
1261	271			271		376	{
1262	271					535	my $class = shift;
1263	271					501	my $self = bless {}, $class;
	271					1402
1264	271					1401	@{$self}{qw( message parser pos )} = @_;
1265							return $self;
1266							}
1267	31			31		31085
	31					23988
	31					255
1268							use overload '""' => "STRING";
1269							sub STRING
1270	8			8		2112	{
1271							my $self = shift;
1272	8					23
1273	8					54	my $parser = $self->{parser};
1274							my ( $linenum, $col, $text ) = $parser->where( $self->{pos} );
1275
1276							# Column number only counts characters. There may be tabs in there.
1277							# Rather than trying to calculate the visual column number, just print the
1278							# indentation as it stands.
1279	8					19
1280	8					37	my $indent = substr( $text, 0, $col );
1281							$indent =~ s/[^ \t]/ /g; # blank out all the non-whitespace
1282	8					19
1283	8	100	100			52	my $filename = $parser->{filename};
1284							my $in_file = ( defined $filename and !ref $filename )
1285							? "in $filename " : "";
1286	8					157
1287							return "$self->{message} ${in_file}on line $linenum at:\n" .
1288							"$text\n" .
1289							"$indent^\n";
1290							}
1291
1292	31			31		7825	# Provide fallback operators for cmp, eq, etc...
	31					71
	31					156
1293							use overload fallback => 1;
1294
1295							=head1 EXAMPLES
1296
1297							=head2 Accumulating Results Using Variables
1298
1299							Although the structure-forming methods all return a value, obtained from their
1300							nested parsing code, it can sometimes be more convenient to use a variable to
1301							accumulate a result in instead. For example, consider the following parser
1302							method, designed to parse a set of C assignments, such as might
1303							be found in a configuration file, or YAML/JSON-style mapping value.
1304
1305							sub parse_dict
1306							{
1307							my $self = shift;
1308
1309							my %ret;
1310							$self->list_of( ",", sub {
1311							my $key = $self->token_ident;
1312							exists $ret{$key} and $self->fail( "Already have a mapping for '$key'" );
1313
1314							$self->expect( ":" );
1315
1316							$ret{$key} = $self->parse_value;
1317							} );
1318
1319							return \%ret
1320							}
1321
1322							Instead of using the return value from C, this method accumulates
1323							values in the C<%ret> hash, eventually returning a reference to it as its
1324							result. Because of this, it can perform some error checking while it parses;
1325							namely, rejecting duplicate keys.
1326
1327							=head1 TODO
1328
1329							=over 4
1330
1331							=item *
1332
1333							Make unescaping of string constants more customisable. Possibly consider
1334							instead a C using a loop over C.
1335
1336							=item *
1337
1338							Easy ability for subclasses to define more token types as methods. Perhaps
1339							provide a class method such as
1340
1341							__PACKAGE__->has_token( hex => qr/[0-9A-F]+/i, sub { hex $_[1] } );
1342
1343							=item *
1344
1345							Investigate how well C can cope with buffer splitting across
1346							other tokens than simply skippable whitespace
1347
1348							=back
1349
1350							=head1 AUTHOR
1351
1352							Paul Evans
1353
1354							=cut
1355
1356							0x55AA;