File Coverage

blib/lib/Parser/MGC.pm
Criterion Covered Total %
statement 300 303 99.0
branch 99 112 88.3
condition 22 28 78.5
subroutine 58 60 96.6
pod 33 35 94.2
total 512 538 95.1


line stmt bran cond sub pod time code
1             # You may distribute under the terms of either the GNU General Public License
2             # or the Artistic License (the same terms as Perl itself)
3             #
4             # (C) Paul Evans, 2010-2022 -- leonerd@leonerd.org.uk
5              
6             package Parser::MGC 0.21;
7              
8 33     33   1581975 use v5.14;
  33         311  
9 33     33   143 use warnings;
  33         51  
  33         810  
10              
11 33     33   146 use Carp;
  33         72  
  33         1799  
12 33     33   14004 use Feature::Compat::Try;
  33         9114  
  33         120  
13              
14 33     33   78752 use Scalar::Util qw( blessed );
  33         64  
  33         3495  
15              
16             =head1 NAME
17              
18             C - build simple recursive-descent parsers
19              
20             =head1 SYNOPSIS
21              
22             package My::Grammar::Parser;
23             use base qw( Parser::MGC );
24              
25             sub parse
26             {
27             my $self = shift;
28              
29             $self->sequence_of( sub {
30             $self->any_of(
31             sub { $self->token_int },
32             sub { $self->token_string },
33             sub { \$self->token_ident },
34             sub { $self->scope_of( "(", \&parse, ")" ) }
35             );
36             } );
37             }
38              
39             my $parser = My::Grammar::Parser->new;
40              
41             my $tree = $parser->from_file( $ARGV[0] );
42              
43             ...
44              
45             =head1 DESCRIPTION
46              
47             This base class provides a low-level framework for building recursive-descent
48             parsers that consume a given input string from left to right, returning a
49             parse structure. It takes its name from the C regexps used to implement
50             the token parsing behaviour.
51              
52             It provides a number of token-parsing methods, which each extract a
53             grammatical token from the string. It also provides wrapping methods that can
54             be used to build up a possibly-recursive grammar structure, by applying a
55             structure around other parts of parsing code.
56              
57             =head2 Backtracking
58              
59             Each method, both token and structural, atomically either consumes a prefix of
60             the string and returns its result, or fails and consumes nothing. This makes
61             it simple to implement grammars that require backtracking.
62              
63             Several structure-forming methods have some form of "optional" behaviour; they
64             can optionally consume some amount of input or take some particular choice,
65             but if the code invoked inside that subsequently fails, the structure can
66             backtrack and take some different behaviour. This is usually what is required
67             when testing whether the structure of the input string matches some part of
68             the grammar that is optional, or has multiple choices.
69              
70             However, once the choice of grammar has been made, it is often useful to be
71             able to fix on that one choice, thus making subsequent failures propagate up
72             rather than taking that alternative behaviour. Control of this backtracking
73             is given by the C method; and careful use of this method is one of the
74             key advantages that C has over more simple parsing using single
75             regexps alone.
76              
77             =head2 Stall Detection
78              
79             Most of the methods in this class have bounded execution time, but some
80             methods (L and L) repeatedly recuse into other code
81             to build up a list of results until some ending condition is reached. A
82             possible class of bug is that whatever they recurse into might successfully
83             match an empty string, and thus make no progress.
84              
85             These methods will automatically detect this situation if they repeatedly
86             encounter the same string position more than a certain number of times (given
87             by the C argument). If this count is reached, the entire parse
88             attempt will be aborted by the L method.
89              
90             =cut
91              
92             =head1 CONSTRUCTOR
93              
94             =cut
95              
96             =head2 new
97              
98             $parser = Parser::MGC->new( %args )
99              
100             Returns a new instance of a C object. This must be called on a
101             subclass that provides method of the name provided as C, by default
102             called C.
103              
104             Takes the following named arguments
105              
106             =over 8
107              
108             =item toplevel => STRING
109              
110             Name of the toplevel method to use to start the parse from. If not supplied,
111             will try to use a method called C.
112              
113             =item patterns => HASH
114              
115             Keys in this hash should map to quoted regexp (C) references, to
116             override the default patterns used to match tokens. See C below
117              
118             =item accept_0o_oct => BOOL
119              
120             If true, the C method will also accept integers with a C<0o> prefix
121             as octal.
122              
123             =item stallcount => INT
124              
125             I
126              
127             The number of times that the stall-detector would have to see the same
128             position before it aborts the parse attempt. If not supplied, a default of
129             C<10> will apply.
130              
131             =back
132              
133             =cut
134              
135             =head1 PATTERNS
136              
137             The following pattern names are recognised. They may be passed to the
138             constructor in the C hash, or provided as a class method under the
139             name C>.
140              
141             =over 4
142              
143             =item * ws
144              
145             Pattern used to skip whitespace between tokens. Defaults to C
146              
147             =item * comment
148              
149             Pattern used to skip comments between tokens. Undefined by default.
150              
151             =item * int
152              
153             Pattern used to parse an integer by C. Defaults to
154             C. If C is given, then
155             this will be expanded to match C as well.
156              
157             =item * float
158              
159             Pattern used to parse a floating-point number by C. Defaults to
160             C.
161              
162             =item * ident
163              
164             Pattern used to parse an identifier by C. Defaults to
165             C
166              
167             =item * string_delim
168              
169             Pattern used to delimit a string by C. Defaults to C.
170              
171             =back
172              
173             =cut
174              
175             my @patterns = qw(
176             ws
177             comment
178             int
179             float
180             ident
181             string_delim
182             );
183              
184 33     33   235 use constant pattern_ws => qr/[\s\n\t]+/;
  33         63  
  33         1754  
185 33     33   153 use constant pattern_comment => undef;
  33         64  
  33         2446  
186 33     33   198 use constant pattern_int => qr/-?(?:0x[[:xdigit:]]+|[[:digit:]]+)/;
  33         62  
  33         2536  
187 33     33   198 use constant pattern_float => qr/-?(?:\d*\.\d+|\d+\.)(?:e-?\d+)?|-?\d+e-?\d+/i;
  33         60  
  33         3500  
188 33     33   202 use constant pattern_ident => qr/[[:alpha:]_]\w*/;
  33         133  
  33         1793  
189 33     33   166 use constant pattern_string_delim => qr/["']/;
  33         49  
  33         1272  
190              
191 33     33   150 use constant DEFAULT_STALLCOUNT => 10;
  33         57  
  33         128183  
192              
193             sub new
194             {
195 48     48 1 2834 my $class = shift;
196 48         120 my %args = @_;
197              
198 48   100     287 my $toplevel = $args{toplevel} || "parse";
199              
200 48 50       325 $class->can( $toplevel ) or
201             croak "Expected to be a subclass that can ->$toplevel";
202              
203             my $self = bless {
204             toplevel => $toplevel,
205             patterns => {},
206             scope_level => 0,
207 48   50     368 stallcount => $args{stallcount} // DEFAULT_STALLCOUNT,
208             }, $class;
209              
210 48   100     264 $self->{patterns}{$_} = $args{patterns}{$_} || $self->${\"pattern_$_"} for @patterns;
211              
212 48 100       157 if( $args{accept_0o_oct} ) {
213 1         30 $self->{patterns}{int} = qr/0o[0-7]+|$self->{patterns}{int}/;
214             }
215              
216 48 100       151 if( defined $self->{patterns}{comment} ) {
217 1         35 $self->{patterns}{_skip} = qr/$self->{patterns}{ws}|$self->{patterns}{comment}/;
218             }
219             else {
220 47         107 $self->{patterns}{_skip} = $self->{patterns}{ws};
221             }
222              
223 48         236 return $self;
224             }
225              
226             =head1 SUBCLASSING METHODS
227              
228             The following optional methods may be defined by subclasses, to customise
229             their parsing.
230              
231             =head2 on_parse_start
232              
233             $parser->on_parse_start
234              
235             I
236              
237             If defined, is invoked by the C method that begins a new parse
238             operation, just before invoking the toplevel structure method.
239              
240             =head2 on_parse_end
241              
242             $result = $parser->on_parse_end( $result )
243              
244             I
245              
246             If defined, is invoked by the C method once it has finished the
247             toplevel structure method. This is passed the tentative result from the
248             structure method, and whatever it returns becomes the result of the C
249             method itself.
250              
251             =cut
252              
253             =head1 METHODS
254              
255             =cut
256              
257             =head2 from_string
258              
259             $result = $parser->from_string( $str )
260              
261             Parse the given literal string and return the result from the toplevel method.
262              
263             =cut
264              
265             sub from_string
266             {
267 167     167 1 30088 my $self = shift;
268 167         325 my ( $str ) = @_;
269              
270 167         312 $self->{str} = $str;
271              
272 167         394 pos $self->{str} = 0;
273              
274 167 100       733 if( my $code = $self->can( "on_parse_start" ) ) {
275 2         6 $self->$code;
276             }
277              
278 167         264 my $toplevel = $self->{toplevel};
279 167         452 my $result = $self->$toplevel;
280              
281 148 100       462 $self->at_eos or
282             $self->fail( "Expected end of input" );
283              
284 143 100       620 if( my $code = $self->can( "on_parse_end" ) ) {
285 2         5 $result = $self->$code( $result );
286             }
287              
288 143         697 return $result;
289             }
290              
291             =head2 from_file
292              
293             $result = $parser->from_file( $file, %opts )
294              
295             Parse the given file, which may be a pathname in a string, or an opened IO
296             handle, and return the result from the toplevel method.
297              
298             The following options are recognised:
299              
300             =over 8
301              
302             =item binmode => STRING
303              
304             If set, applies the given binmode to the filehandle before reading. Typically
305             this can be used to set the encoding of the file.
306              
307             $parser->from_file( $file, binmode => ":encoding(UTF-8)" )
308              
309             =back
310              
311             =cut
312              
313             sub from_file
314             {
315 3     3 1 1975 my $self = shift;
316 3         8 my ( $file, %opts ) = @_;
317              
318 3 50       11 defined $file or croak "Expected a filename to ->from_file";
319              
320 3         16 $self->{filename} = $file;
321              
322 3         4 my $fh;
323 3 100       11 if( ref $file ) {
324 2         4 $fh = $file;
325             }
326             else {
327 1 50       40 open $fh, "<", $file or die "Cannot open $file for reading - $!";
328             }
329              
330 3 50       10 binmode $fh, $opts{binmode} if $opts{binmode};
331              
332 3         4 $self->from_string( do { local $/; <$fh>; } );
  3         12  
  3         96  
333             }
334              
335             =head2 filename
336              
337             $filename = $parser->filename
338              
339             I
340              
341             Returns the name of the file currently being parsed, if invoked from within
342             L.
343              
344             =cut
345              
346             sub filename
347             {
348 44     44 1 61 my $self = shift;
349 44         93 return $self->{filename};
350             }
351              
352             =head2 from_reader
353              
354             $result = $parser->from_reader( \&reader )
355              
356             I
357              
358             Parse the input which is read by the C function. This function will be
359             called in scalar context to generate portions of string to parse, being passed
360             the C<$parser> object. The function should return C when it has no more
361             string to return.
362              
363             $reader->( $parser )
364              
365             Note that because it is not generally possible to detect exactly when more
366             input may be required due to failed regexp parsing, the reader function is
367             only invoked during searching for skippable whitespace. This makes it suitable
368             for reading lines of a file in the common case where lines are considered as
369             skippable whitespace, or for reading lines of input interactively from a
370             user. It cannot be used in all cases (for example, reading fixed-size buffers
371             from a file) because two successive invocations may split a single token
372             across the buffer boundaries, and cause parse failures.
373              
374             =cut
375              
376             sub from_reader
377             {
378 1     1 1 13 my $self = shift;
379 1         2 my ( $reader ) = @_;
380              
381 1         3 local $self->{reader} = $reader;
382              
383 1         2 $self->{str} = "";
384 1         4 pos $self->{str} = 0;
385              
386 1         4 my $result = $self->parse;
387              
388 1 50       2 $self->at_eos or
389             $self->fail( "Expected end of input" );
390              
391 1         9 return $result;
392             }
393              
394             =head2 pos
395              
396             $pos = $parser->pos
397              
398             I
399              
400             Returns the current parse position, as a character offset from the beginning
401             of the file or string.
402              
403             =cut
404              
405             sub pos
406             {
407 291     291 1 358 my $self = shift;
408 291         611 return pos $self->{str};
409             }
410              
411             =head2 take
412              
413             $str = $parser->take( $len )
414              
415             I
416              
417             Returns the next C<$len> characters directly from the input, prior to any
418             whitespace or comment skipping. This does I take account of any
419             end-of-scope marker that may be pending. It is intended for use by parsers of
420             partially-binary protocols, or other situations in which it would be incorrect
421             for the end-of-scope marker to take effect at this time.
422              
423             =cut
424              
425             sub take
426             {
427 57     57 1 79 my $self = shift;
428 57         84 my ( $len ) = @_;
429              
430 57         102 my $start = pos( $self->{str} );
431              
432 57         109 pos( $self->{str} ) += $len;
433              
434 57         265 return substr( $self->{str}, $start, $len );
435             }
436              
437             =head2 where
438              
439             ( $lineno, $col, $text ) = $parser->where
440              
441             Returns the current parse position, as a line and column number, and
442             the entire current line of text. The first line is numbered 1, and the first
443             column is numbered 0.
444              
445             =cut
446              
447             sub where
448             {
449 51     51 1 89 my $self = shift;
450 51         98 my ( $pos ) = @_;
451              
452 51 100       131 defined $pos or $pos = pos $self->{str};
453              
454 51         93 my $str = $self->{str};
455              
456 51         64 my $sol = $pos;
457 51 100 100     240 $sol-- if $sol > 0 and substr( $str, $sol, 1 ) =~ m/^[\r\n]$/;
458 51   100     340 $sol-- while $sol > 0 and substr( $str, $sol-1, 1 ) !~ m/^[\r\n]$/;
459              
460 51         76 my $eol = $pos;
461 51   100     561 $eol++ while $eol < length($str) and substr( $str, $eol, 1 ) !~ m/^[\r\n]$/;
462              
463 51         125 my $line = substr( $str, $sol, $eol - $sol );
464              
465 51         80 my $col = $pos - $sol;
466 51         129 my $lineno = ( () = substr( $str, 0, $pos ) =~ m/\n/g ) + 1;
467              
468 51         174 return ( $lineno, $col, $line );
469             }
470              
471             =head2 fail
472              
473             =head2 fail_from
474              
475             $parser->fail( $message )
476              
477             $parser->fail_from( $pos, $message )
478              
479             I since version 0.09.>
480              
481             Aborts the current parse attempt with the given message string. The failure
482             message will include the line and column position, and the line of input that
483             failed at the current parse position (C), or a position earlier obtained
484             using the C method (C).
485              
486             This failure will propagate up to the inner-most structure parsing method that
487             has not been committed; or will cause the entire parser to fail if there are
488             no further options to take.
489              
490             =cut
491              
492             sub fail
493             {
494 283     283 1 360 my $self = shift;
495 283         415 my ( $message ) = @_;
496 283         541 $self->fail_from( $self->pos, $message );
497             }
498              
499             sub fail_from
500             {
501 283     283 1 362 my $self = shift;
502 283         398 my ( $pos, $message ) = @_;
503 283         573 die Parser::MGC::Failure->new( $message, $self, $pos );
504             }
505              
506             # On perl 5.32 onwards we can use the nicer `isa` infix operator
507             # Problem is it won't even parse correctly on older perls so we'll have to go
508             # the long way around
509             *_isa_failure = ( $^V ge v5.32 )
510             ? do { eval 'use experimental "isa"; sub { $_[0] isa Parser::MGC::Failure }' // die $@ }
511             : do { require Scalar::Util;
512 264 100   264   2631 sub { Scalar::Util::blessed($_[0]) and $_[0]->isa( "Parser::MGC::Failure" ) } };
513              
514             =head2 die
515              
516             =head2 die_from
517              
518             $parser->die( $message )
519              
520             $parser->die_from( $pos, $message )
521              
522             I
523              
524             Throws an exception that propagates as normal for C, entirely out of the
525             entire parser and to the caller of the toplevel C method that invoked
526             it, bypassing all of the back-tracking logic.
527              
528             This is much like using core's C directly, except that the message string
529             will include the line and column position, and the line of input that the
530             parser was working on, as it does in the L method.
531              
532             This method is intended for reporting fatal errors where the parsed input was
533             correctly recognised at a grammar level, but is requesting something that
534             cannot be fulfilled semantically.
535              
536             =cut
537              
538             sub die :method
539             {
540 2     2 1 7 my $self = shift;
541 2         4 my ( $message ) = @_;
542 2         6 $self->die_from( $self->pos, $message );
543             }
544              
545             sub die_from
546             {
547 2     2 1 4 my $self = shift;
548 2         5 my ( $pos, $message ) = @_;
549             # Convenient just to use the ->STRING method of a Failure object but don't
550             # throw it directly
551 2         8 die Parser::MGC::Failure->new( $message, $self, $pos )->STRING;
552             }
553              
554             =head2 at_eos
555              
556             $eos = $parser->at_eos
557              
558             Returns true if the input string is at the end of the string.
559              
560             =cut
561              
562             sub at_eos
563             {
564 641     641 1 757 my $self = shift;
565              
566             # Save pos() before skipping ws so we don't break the substring_before method
567 641         860 my $pos = pos $self->{str};
568              
569 641         1153 $self->skip_ws;
570              
571 641         714 my $at_eos;
572 641 100       1324 if( pos( $self->{str} ) >= length $self->{str} ) {
    100          
573 177         247 $at_eos = 1;
574             }
575             elsif( defined $self->{endofscope} ) {
576 114         417 $at_eos = $self->{str} =~ m/\G$self->{endofscope}/;
577             }
578             else {
579 350         405 $at_eos = 0;
580             }
581              
582 641         1096 pos( $self->{str} ) = $pos;
583              
584 641         1370 return $at_eos;
585             }
586              
587             =head2 scope_level
588              
589             $level = $parser->scope_level
590              
591             I
592              
593             Returns the number of nested C calls that have been made.
594              
595             =cut
596              
597             sub scope_level
598             {
599 5     5 1 7 my $self = shift;
600 5         34 return $self->{scope_level};
601             }
602              
603             =head2 include_string
604              
605             $result = $parser->include_string( $str, %opts )
606              
607             I
608              
609             Parses a given string into the existing parser object.
610              
611             The current parser state is moved aside from the duration of this method, and
612             is replaced by the given string. Then the toplevel parser method (or a
613             different as specified) is invoked over it. Its result is returned by this
614             method.
615              
616             This would typically be used to handle some sort of "include" or "macro
617             expansion" ability, by injecting new content in as if the current parse
618             location had encountered it. Other than the internal parser state, other
619             object fields are not altered, so whatever effects the invoked parsing methods
620             will have on it can continue to inspect and alter it as required.
621              
622             The following options are recognised:
623              
624             =over 8
625              
626             =item filename => STRING
627              
628             If set, provides a filename (or other descriptive text) to pretend for the
629             source of this string. It need not be a real file on the filesystem; it could
630             for example explain the source of the string in some other way. It is the
631             value reported by the L method and printed in failure messages.
632              
633             =item toplevel => STRING | CODE
634              
635             If set, provides the toplevel parser method to use within this inclusion,
636             overriding the object's defined default.
637              
638             =back
639              
640             =cut
641              
642             sub include_string
643             {
644 5     5 1 22 my $self = shift;
645 5         8 my ( $str, %opts ) = @_;
646              
647             # local'ize everything out of the way
648 5         7 local @{$self}{qw( str filename reader )};
  5         33  
649              
650 5         20 $self->{str} = $str;
651 5         12 pos($self->{str}) = 0;
652              
653 5         8 $self->{filename} = $opts{filename};
654              
655 5   33     16 my $toplevel = $opts{toplevel} // $self->{toplevel};
656 5         15 my $result = $self->$toplevel;
657              
658 5         41 return $result;
659             }
660              
661             =head1 STRUCTURE-FORMING METHODS
662              
663             The following methods may be used to build a grammatical structure out of the
664             defined basic token-parsing methods. Each takes at least one code reference,
665             which will be passed the actual C<$parser> object as its first argument.
666              
667             Anywhere that a code reference is expected also permits a plain string giving
668             the name of a method to invoke. This is sufficient in many simple cases, such
669             as
670              
671             $self->any_of(
672             'token_int',
673             'token_string',
674             ...
675             );
676              
677             =cut
678              
679             =head2 maybe
680              
681             $ret = $parser->maybe( $code )
682              
683             Attempts to execute the given C<$code> in scalar context, and returns what it
684             returned, accepting that it might fail. C<$code> may either be a CODE
685             reference or a method name given as a string.
686              
687             If the code fails (either by calling C itself, or by propagating a
688             failure from another method it invoked) before it has invoked C, then
689             none of the input string will be consumed; the current parsing position will
690             be restored. C will be returned in this case.
691              
692             If it calls C then any subsequent failure will be propagated to the
693             caller, rather than returning C.
694              
695             This may be considered to be similar to the C regexp qualifier.
696              
697             sub parse_declaration
698             {
699             my $self = shift;
700              
701             [ $self->parse_type,
702             $self->token_ident,
703             $self->maybe( sub {
704             $self->expect( "=" );
705             $self->parse_expression
706             } ),
707             ];
708             }
709              
710             =cut
711              
712             sub maybe
713             {
714 4     4 1 27 my $self = shift;
715 4         12 my ( $code ) = @_;
716              
717 4         8 my $pos = pos $self->{str};
718              
719 4         4 my $committed = 0;
720 4     0   12 local $self->{committer} = sub { $committed++ };
  0         0  
721              
722             try {
723             return $self->$code;
724             }
725 4         8 catch ( $e ) {
726             pos($self->{str}) = $pos;
727              
728             die $e if $committed or not _isa_failure( $e );
729             return undef;
730             }
731             }
732              
733             =head2 scope_of
734              
735             $ret = $parser->scope_of( $start, $code, $stop )
736              
737             Expects to find the C<$start> pattern, then attempts to execute the given
738             C<$code>, then expects to find the C<$stop> pattern. Returns whatever the
739             code returned. C<$code> may either be a CODE reference of a method name given
740             as a string.
741              
742             While the code is being executed, the C<$stop> pattern will be used by the
743             token parsing methods as an end-of-scope marker; causing them to raise a
744             failure if called at the end of a scope.
745              
746             sub parse_block
747             {
748             my $self = shift;
749              
750             $self->scope_of( "{", 'parse_statements', "}" );
751             }
752              
753             If the C<$start> pattern is undefined, it is presumed the caller has already
754             checked for this. This is useful when the stop pattern needs to be calculated
755             based on the start pattern.
756              
757             sub parse_bracketed
758             {
759             my $self = shift;
760              
761             my $delim = $self->expect( qr/[\(\[\<\{]/ );
762             $delim =~ tr/([<{/)]>}/;
763              
764             $self->scope_of( undef, 'parse_body', $delim );
765             }
766              
767             This method does not have any optional parts to it; any failures are
768             immediately propagated to the caller.
769              
770             =cut
771              
772             sub scope_of
773             {
774 19     19 1 87 my $self = shift;
775 19         69 $self->_scope_of( 0, @_ );
776             }
777              
778             sub _scope_of
779             {
780 73     73   88 my $self = shift;
781 73         131 my ( $commit_if_started, $start, $code, $stop ) = @_;
782              
783 73 50       403 ref $stop or $stop = qr/\Q$stop/;
784              
785 73 100       235 $self->expect( $start ) if defined $start;
786              
787 34 100       108 $self->commit if $commit_if_started;
788              
789 34         105 local $self->{endofscope} = $stop;
790 34         74 local $self->{scope_level} = $self->{scope_level} + 1;
791              
792 34         104 my $ret = $self->$code;
793              
794 31         103 $self->expect( $stop );
795              
796 30         176 return $ret;
797             }
798              
799             =head2 committed_scope_of
800              
801             $ret = $parser->committed_scope_of( $start, $code, $stop )
802              
803             I
804              
805             A variant of L that calls L after a successful match of
806             the start pattern. This is usually what you want if using C from
807             within an C choice, if no other alternative following this one could
808             possibly match if the start pattern has.
809              
810             =cut
811              
812             sub committed_scope_of
813             {
814 54     54 1 71 my $self = shift;
815 54         95 $self->_scope_of( 1, @_ );
816             }
817              
818             =head2 list_of
819              
820             $ret = $parser->list_of( $sep, $code )
821              
822             Expects to find a list of instances of something parsed by C<$code>,
823             separated by the C<$sep> pattern. Returns an ARRAY ref containing a list of
824             the return values from the C<$code>. A single trailing delimiter is allowed,
825             and does not affect the return value. C<$code> may either be a CODE reference
826             or a method name given as a string. It is called in list context, and whatever
827             values it returns are appended to the eventual result - similar to perl's
828             C.
829              
830             This method does not consider it an error if the returned list is empty; that
831             is, that the scope ended before any item instances were parsed from it.
832              
833             sub parse_numbers
834             {
835             my $self = shift;
836              
837             $self->list_of( ",", 'token_int' );
838             }
839              
840             If the code fails (either by invoking C itself, or by propagating a
841             failure from another method it invoked) before it has invoked C on a
842             particular item, then the item is aborted and the parsing position will be
843             restored to the beginning of that failed item. The list of results from
844             previous successful attempts will be returned.
845              
846             If it calls C within an item then any subsequent failure for that item
847             will cause the entire C to fail, propagating that to the caller.
848              
849             =cut
850              
851             sub list_of
852             {
853 83     83 1 139 my $self = shift;
854 83         124 my ( $sep, $code ) = @_;
855              
856 83 100 33     230 ref $sep or $sep = qr/\Q$sep/ if defined $sep;
857              
858 83         95 my $committed;
859 83     14   251 local $self->{committer} = sub { $committed++ };
  14         21  
860              
861 83         119 my @ret;
862              
863             my @lastpos;
864              
865 83         172 while( !$self->at_eos ) {
866 157         188 $committed = 0;
867 157         188 my $pos = pos $self->{str};
868              
869 157         202 push @lastpos, $pos;
870 157 100       293 if( @lastpos > $self->{stallcount} ) {
871 1         2 shift @lastpos;
872 1 50       7 $self->die( ref($self) . " failed to make progress" ) if $lastpos[0] == $pos;
873             }
874              
875             try {
876             push @ret, $self->$code;
877             next;
878             }
879             catch ( $e ) {
880             pos($self->{str}) = $pos;
881             die $e if $committed or not _isa_failure( $e );
882              
883             last;
884             }
885 156         268 }
886             continue {
887 132 100       299 if( defined $sep ) {
888 32         62 $self->skip_ws;
889 32 100       149 $self->{str} =~ m/\G$sep/gc or last;
890             }
891             }
892              
893 81         358 return \@ret;
894             }
895              
896             =head2 sequence_of
897              
898             $ret = $parser->sequence_of( $code )
899              
900             A shortcut for calling C with an empty string as separator; expects
901             to find at least one instance of something parsed by C<$code>, separated only
902             by skipped whitespace.
903              
904             This may be considered to be similar to the C<+> or C<*> regexp qualifiers.
905              
906             sub parse_statements
907             {
908             my $self = shift;
909              
910             $self->sequence_of( 'parse_statement' );
911             }
912              
913             The interaction of failures in the code and the C method is identical
914             to that of C.
915              
916             =cut
917              
918             sub sequence_of
919             {
920 67     67 1 223 my $self = shift;
921 67         96 my ( $code ) = @_;
922              
923 67         156 $self->list_of( undef, $code );
924             }
925              
926             =head2 any_of
927              
928             $ret = $parser->any_of( @codes )
929              
930             I
931              
932             Expects that one of the given code instances can parse something from the
933             input, returning what it returned. Each code instance may indicate a failure
934             to parse by calling the C method or otherwise propagating a failure.
935             Each code instance may either be a CODE reference or a method name given as a
936             string.
937              
938             This may be considered to be similar to the C<|> regexp operator for forming
939             alternations of possible parse trees.
940              
941             sub parse_statement
942             {
943             my $self = shift;
944              
945             $self->any_of(
946             sub { $self->parse_declaration; $self->expect(";") },
947             sub { $self->parse_expression; $self->expect(";") },
948             sub { $self->parse_block },
949             );
950             }
951              
952             If the code for a given choice fails (either by invoking C itself, or by
953             propagating a failure from another method it invoked) before it has invoked
954             C itself, then the parsing position restored and the next choice will
955             be attempted.
956              
957             If it calls C then any subsequent failure for that choice will cause
958             the entire C to fail, propagating that to the caller and no further
959             choices will be attempted.
960              
961             If none of the choices match then a simple failure message is printed:
962              
963             Found nothing parseable
964              
965             As this is unlikely to be helpful to users, a better message can be provided
966             by the final choice instead. Don't forget to C before printing the
967             failure message, or it won't count.
968              
969             $self->any_of(
970             'token_int',
971             'token_string',
972             ...,
973              
974             sub { $self->commit; $self->fail( "Expected an int or string" ) }
975             );
976              
977             =cut
978              
979             sub any_of
980             {
981 234     234 1 498 my $self = shift;
982              
983 234         418 while( @_ ) {
984 460         578 my $code = shift;
985 460         555 my $pos = pos $self->{str};
986              
987 460         467 my $committed = 0;
988 460     55   1190 local $self->{committer} = sub { $committed++ };
  55         114  
989              
990             try {
991             return $self->$code;
992             }
993 460         713 catch ( $e ) {
994             pos( $self->{str} ) = $pos;
995              
996             die $e if $committed or not _isa_failure( $e );
997             }
998             }
999              
1000 12         26 $self->fail( "Found nothing parseable" );
1001             }
1002              
1003             sub one_of {
1004 0     0 0 0 croak "Parser::MGC->one_of is deprecated; use ->any_of instead";
1005             }
1006              
1007             =head2 commit
1008              
1009             $parser->commit
1010              
1011             Calling this method will cancel the backtracking behaviour of the innermost
1012             C, C, C, or C structure forming method.
1013             That is, if later code then calls C, the exception will be propagated
1014             out of C, no further list items will be attempted by C or
1015             C, and no further code blocks will be attempted by C.
1016              
1017             Typically this will be called once the grammatical structure alter has been
1018             determined, ensuring that any further failures are raised as real exceptions,
1019             rather than by attempting other alternatives.
1020              
1021             sub parse_statement
1022             {
1023             my $self = shift;
1024              
1025             $self->any_of(
1026             ...
1027             sub {
1028             $self->scope_of( "{",
1029             sub { $self->commit; $self->parse_statements; },
1030             "}" ),
1031             },
1032             );
1033             }
1034              
1035             Though in this common pattern, L may be used instead.
1036              
1037             =cut
1038              
1039             sub commit
1040             {
1041 69     69 1 108 my $self = shift;
1042 69 50       128 if( $self->{committer} ) {
1043 69         105 $self->{committer}->();
1044             }
1045             else {
1046 0         0 croak "Cannot commit except within a backtrack-able structure";
1047             }
1048             }
1049              
1050             =head1 TOKEN PARSING METHODS
1051              
1052             The following methods attempt to consume some part of the input string, to be
1053             used as part of the parsing process.
1054              
1055             =cut
1056              
1057             sub skip_ws
1058             {
1059 1357     1357 0 1484 my $self = shift;
1060              
1061 1357         1672 my $pattern = $self->{patterns}{_skip};
1062              
1063             {
1064 1357         1386 1 while $self->{str} =~ m/\G$pattern/gc;
  1360         5808  
1065              
1066 1360 100       2955 return if pos( $self->{str} ) < length $self->{str};
1067              
1068 237 100       545 return unless $self->{reader};
1069              
1070 4         6 my $more = $self->{reader}->( $self );
1071 4 100       16 if( defined $more ) {
1072 3         4 my $pos = pos( $self->{str} );
1073 3         6 $self->{str} .= $more;
1074 3         4 pos( $self->{str} ) = $pos;
1075              
1076 3         8 redo;
1077             }
1078              
1079 1         2 undef $self->{reader};
1080 1         2 return;
1081             }
1082             }
1083              
1084             =head2 expect
1085              
1086             $str = $parser->expect( $literal )
1087              
1088             $str = $parser->expect( qr/pattern/ )
1089              
1090             @groups = $parser->expect( qr/pattern/ )
1091              
1092             Expects to find a literal string or regexp pattern match, and consumes it.
1093             In scalar context, this method returns the string that was captured. In list
1094             context it returns the matching substring and the contents of any subgroups
1095             contained in the pattern.
1096              
1097             This method will raise a parse error (by calling C) if the regexp fails
1098             to match. Note that if the pattern could match an empty string (such as for
1099             example C), the pattern will always match, even if it has to match an
1100             empty string. This method will not consider a failure if the regexp matches
1101             with zero-width.
1102              
1103             =head2 maybe_expect
1104              
1105             $str = $parser->maybe_expect( ... )
1106              
1107             @groups = $parser->maybe_expect( ... )
1108              
1109             I
1110              
1111             A convenient shortcut equivalent to calling C within C, but
1112             implemented more efficiently, avoiding the exception-handling set up by
1113             C. Returns C or an empty list if the match fails.
1114              
1115             =cut
1116              
1117             sub maybe_expect
1118             {
1119 400     400 1 501 my $self = shift;
1120 400         496 my ( $expect ) = @_;
1121              
1122 400 100       655 ref $expect or $expect = qr/\Q$expect/;
1123              
1124 400         715 $self->skip_ws;
1125 400 100       3767 $self->{str} =~ m/\G$expect/gc or return;
1126              
1127 228 100       1153 return substr( $self->{str}, $-[0], $+[0]-$-[0] ) if !wantarray;
1128 40 100       102 return map { defined $-[$_] ? substr( $self->{str}, $-[$_], $+[$_]-$-[$_] ) : undef } 0 .. $#+;
  60         361  
1129             }
1130              
1131             sub expect
1132             {
1133 384     384 1 657 my $self = shift;
1134 384         582 my ( $expect ) = @_;
1135              
1136 384 100       2258 ref $expect or $expect = qr/\Q$expect/;
1137              
1138 384 100       746 if( wantarray ) {
1139 60 100       138 my @ret = $self->maybe_expect( $expect ) or
1140             $self->fail( "Expected $expect" );
1141 38         129 return @ret;
1142             }
1143             else {
1144 324 100       573 defined( my $ret = $self->maybe_expect( $expect ) ) or
1145             $self->fail( "Expected $expect" );
1146 185         476 return $ret;
1147             }
1148             }
1149              
1150             =head2 substring_before
1151              
1152             $str = $parser->substring_before( $literal )
1153              
1154             $str = $parser->substring_before( qr/pattern/ )
1155              
1156             I
1157              
1158             Expects to possibly find a literal string or regexp pattern match. If it finds
1159             such, consume all the input text before but excluding this match, and return
1160             it. If it fails to find a match before the end of the current scope, consumes
1161             all the input text until the end of scope and return it.
1162              
1163             This method does not consume the part of input that matches, only the text
1164             before it. It is not considered a failure if the substring before this match
1165             is empty. If a non-empty match is required, use the C method:
1166              
1167             sub token_nonempty_part
1168             {
1169             my $self = shift;
1170              
1171             my $str = $parser->substring_before( "," );
1172             length $str or $self->fail( "Expected a string fragment before ," );
1173              
1174             return $str;
1175             }
1176              
1177             Note that unlike most of the other token parsing methods, this method does not
1178             consume either leading or trailing whitespace around the substring. It is
1179             expected that this method would be used as part a parser to read quoted
1180             strings, or similar cases where whitespace should be preserved.
1181              
1182             =head2 nonempty_substring_before
1183              
1184             $str = $parser->nonempty_substring_before( $literal )
1185              
1186             $str = $parser->nonempty_substring_before( qr/pattern/ )
1187              
1188             I
1189              
1190             A variant of L which fails if the matched part is empty.
1191              
1192             The example above could have been written:
1193              
1194             sub token_nonempty_part
1195             {
1196             my $self = shift;
1197              
1198             return $parser->nonempty_substring_before( "," );
1199             }
1200              
1201             This is often useful for breaking out of repeating loops; e.g.
1202              
1203             sub token_escaped_string
1204             {
1205             my $self = shift;
1206             $self->expect( '"' );
1207              
1208             my $ret = "";
1209             1 while $self->any_of(
1210             sub { $ret .= $self->nonempty_substring_before( qr/%|$/m ); 1 }
1211             sub { my $escape = ( $self->expect( qr/%(.)/ ) )[1];
1212             $ret .= _handle_escape( $escape );
1213             1 },
1214             sub { 0 },
1215             )
1216              
1217             return $ret;
1218             }
1219              
1220             =cut
1221              
1222             sub _substring_before
1223             {
1224 52     52   60 my $self = shift;
1225 52         80 my ( $expect, $fail_if_empty ) = @_;
1226              
1227 52 100       192 ref $expect or $expect = qr/\Q$expect/;
1228              
1229 52 100       195 my $endre = ( defined $self->{endofscope} ) ?
1230             qr/$expect|$self->{endofscope}/ :
1231             $expect;
1232              
1233             # NO skip_ws
1234              
1235 52         70 my $start = pos $self->{str};
1236 52         56 my $end;
1237 52 100       487 if( $self->{str} =~ m/\G(?s:.*?)($endre)/ ) {
1238 42         109 $end = $-[1];
1239             }
1240             else {
1241 10         19 $end = length $self->{str};
1242             }
1243              
1244 52 100 66     126 $self->fail( "Expected to find a non-empty substring before $expect" )
1245             if $fail_if_empty and $end == $start;
1246              
1247 51         131 return $self->take( $end - $start );
1248             }
1249              
1250             sub substring_before
1251             {
1252 51     51 1 84 my $self = shift;
1253 51         95 return $self->_substring_before( $_[0], 0 );
1254             }
1255              
1256             sub nonempty_substring_before
1257             {
1258 1     1 1 8 my $self = shift;
1259 1         3 return $self->_substring_before( $_[0], 1 );
1260             }
1261              
1262             =head2 generic_token
1263              
1264             $val = $parser->generic_token( $name, $re, $convert )
1265              
1266             I
1267              
1268             Expects to find a token matching the precompiled regexp C<$re>. If provided,
1269             the C<$convert> CODE reference can be used to convert the string into a more
1270             convenient form. C<$name> is used in the failure message if the pattern fails
1271             to match.
1272              
1273             If provided, the C<$convert> function will be passed the parser and the
1274             matching substring; the value it returns is returned from C.
1275              
1276             $convert->( $parser, $substr )
1277              
1278             If not provided, the substring will be returned as it stands.
1279              
1280             This method is mostly provided for subclasses to define their own token types.
1281             For example:
1282              
1283             sub token_hex
1284             {
1285             my $self = shift;
1286             $self->generic_token( hex => qr/[0-9A-F]{2}h/, sub { hex $_[1] } );
1287             }
1288              
1289             =cut
1290              
1291             sub generic_token
1292             {
1293 230     230 1 269 my $self = shift;
1294 230         376 my ( $name, $re, $convert ) = @_;
1295              
1296 230 50       372 $self->fail( "Expected $name" ) if $self->at_eos;
1297              
1298 230         453 $self->skip_ws;
1299 230 100       3179 $self->{str} =~ m/\G$re/gc or
1300             $self->fail( "Expected $name" );
1301              
1302 169         777 my $match = substr( $self->{str}, $-[0], $+[0] - $-[0] );
1303              
1304 169 100       579 return $convert ? $convert->( $self, $match ) : $match;
1305             }
1306              
1307             sub _token_generic
1308             {
1309 226     226   286 my $self = shift;
1310 226         564 my %args = @_;
1311              
1312 226         341 my $name = $args{name};
1313 226 50       470 my $re = $args{pattern} ? $self->{patterns}{ $args{pattern} } : $args{re};
1314 226         254 my $convert = $args{convert};
1315              
1316 226         416 $self->generic_token( $name, $re, $convert );
1317             }
1318              
1319             =head2 token_int
1320              
1321             $int = $parser->token_int
1322              
1323             Expects to find an integer in decimal, octal or hexadecimal notation, and
1324             consumes it. Negative integers, preceeded by C<->, are also recognised.
1325              
1326             =cut
1327              
1328             sub token_int
1329             {
1330 134     134 1 280 my $self = shift;
1331             $self->_token_generic(
1332             name => "int",
1333              
1334             pattern => "int",
1335             convert => sub {
1336 104     104   145 my $int = $_[1];
1337 104 100       210 my $sign = ( $int =~ s/^-// ) ? -1 : 1;
1338              
1339 104         130 $int =~ s/^0o/0/;
1340              
1341 104 100       216 return $sign * oct $int if $int =~ m/^0/;
1342 99         683 return $sign * $int;
1343             },
1344 134         459 );
1345             }
1346              
1347             =head2 token_float
1348              
1349             $float = $parser->token_float
1350              
1351             I
1352              
1353             Expects to find a number expressed in floating-point notation; a sequence of
1354             digits possibly prefixed by C<->, possibly containing a decimal point,
1355             possibly followed by an exponent specified by C followed by an integer. The
1356             numerical value is then returned.
1357              
1358             =cut
1359              
1360             sub token_float
1361             {
1362 20     20 1 52 my $self = shift;
1363             $self->_token_generic(
1364             name => "float",
1365              
1366             pattern => "float",
1367 18     18   119 convert => sub { $_[1] + 0 },
1368 20         77 );
1369             }
1370              
1371             =head2 token_number
1372              
1373             $number = $parser->token_number
1374              
1375             I
1376              
1377             Expects to find a number expressed in either of the above forms.
1378              
1379             =cut
1380              
1381             sub token_number
1382             {
1383 7     7 1 26 my $self = shift;
1384 7         18 $self->any_of( \&token_float, \&token_int );
1385             }
1386              
1387             =head2 token_string
1388              
1389             $str = $parser->token_string
1390              
1391             Expects to find a quoted string, and consumes it. The string should be quoted
1392             using C<"> or C<'> quote marks.
1393              
1394             The content of the quoted string can contain character escapes similar to
1395             those accepted by C or Perl. Specifically, the following forms are recognised:
1396              
1397             \a Bell ("alert")
1398             \b Backspace
1399             \e Escape
1400             \f Form feed
1401             \n Newline
1402             \r Return
1403             \t Horizontal Tab
1404             \0, \012 Octal character
1405             \x34, \x{5678} Hexadecimal character
1406              
1407             C's C<\v> for vertical tab is not supported as it is rarely used in practice
1408             and it collides with Perl's C<\v> regexp escape. Perl's C<\c> for forming other
1409             control characters is also not supported.
1410              
1411             =cut
1412              
1413             my %escapes = (
1414             a => "\a",
1415             b => "\b",
1416             e => "\e",
1417             f => "\f",
1418             n => "\n",
1419             r => "\r",
1420             t => "\t",
1421             );
1422              
1423             sub token_string
1424             {
1425 53     53 1 136 my $self = shift;
1426              
1427 53 100       151 $self->fail( "Expected string" ) if $self->at_eos;
1428              
1429 52         77 my $pos = pos $self->{str};
1430              
1431 52         101 $self->skip_ws;
1432 52 100       448 $self->{str} =~ m/\G($self->{patterns}{string_delim})/gc or
1433             $self->fail( "Expected string delimiter" );
1434              
1435 32         88 my $delim = $1;
1436              
1437             $self->{str} =~ m/
1438             \G(
1439             (?:
1440             \\[0-7]{1,3} # octal escape
1441             |\\x[0-9A-F]{2} # 2-digit hex escape
1442             |\\x\{[0-9A-F]+\} # {}-delimited hex escape
1443             |\\. # symbolic escape
1444             |[^\\$delim]+ # plain chunk
1445             )*?
1446             )$delim/gcix or
1447 32 50       835 pos($self->{str}) = $pos, $self->fail( "Expected contents of string" );
1448              
1449 32         85 my $string = $1;
1450              
1451 32         101 $string =~ s<\\(?:([0-7]{1,3})|x([0-9A-F]{2})|x\{([0-9A-F]+)\}|(.))>
1452             [defined $1 ? chr oct $1 :
1453             defined $2 ? chr hex $2 :
1454 11 50       79 defined $3 ? chr hex $3 :
    100          
    100          
    100          
1455             exists $escapes{$4} ? $escapes{$4} : $4]egi;
1456 32         129  
1457             return $string;
1458             }
1459              
1460             =head2 token_ident
1461              
1462             $ident = $parser->token_ident
1463              
1464             Expects to find an identifier, and consumes it.
1465              
1466             =cut
1467              
1468             sub token_ident
1469 72     72 1 111 {
1470 72         119 my $self = shift;
1471             $self->_token_generic(
1472             name => "ident",
1473              
1474             pattern => "ident",
1475             );
1476             }
1477              
1478             =head2 token_kw
1479              
1480             $keyword = $parser->token_kw( @keywords )
1481              
1482             Expects to find a keyword, and consumes it. A keyword is defined as an
1483             identifier which is exactly one of the literal values passed in.
1484              
1485             =cut
1486              
1487             sub token_kw
1488 2     2 1 11 {
1489 2         4 my $self = shift;
1490             my @acceptable = @_;
1491 2         6  
1492             $self->skip_ws;
1493 2         3  
1494             my $pos = pos $self->{str};
1495 2 50       5  
1496             defined( my $kw = $self->token_ident ) or
1497             return undef;
1498 4         16  
1499 2 100       4 grep { $_ eq $kw } @acceptable or
1500             pos($self->{str}) = $pos, $self->fail( "Expected any of ".join( ", ", @acceptable ) );
1501 1         3  
1502             return $kw;
1503             }
1504              
1505             package # hide from indexer
1506             Parser::MGC::Failure;
1507              
1508             sub new
1509 285     285   345 {
1510 285         450 my $class = shift;
1511 285         421 my $self = bless {}, $class;
  285         1279  
1512 285         1184 @{$self}{qw( message parser pos )} = @_;
1513             return $self;
1514             }
1515 33     33   27633  
  33         21615  
  33         205  
1516             use overload '""' => "STRING";
1517             sub STRING
1518 44     44   3607 {
1519             my $self = shift;
1520 44         92  
1521 44         163 my $parser = $self->{parser};
1522             my ( $linenum, $col, $text ) = $parser->where( $self->{pos} );
1523              
1524             # Column number only counts characters. There may be tabs in there.
1525             # Rather than trying to calculate the visual column number, just print the
1526             # indentation as it stands.
1527 44         99  
1528 44         178 my $indent = substr( $text, 0, $col );
1529             $indent =~ s/[^ \t]/ /g; # blank out all the non-whitespace
1530 44         170  
1531 44 100 100     167 my $filename = $parser->filename;
1532             my $in_file = ( defined $filename and !ref $filename )
1533             ? "in $filename " : "";
1534 44         462  
1535             return "$self->{message} ${in_file}on line $linenum at:\n" .
1536             "$text\n" .
1537             "$indent^\n";
1538             }
1539              
1540 33     33   6768 # Provide fallback operators for cmp, eq, etc...
  33         76  
  33         164  
1541             use overload fallback => 1;
1542              
1543             =head1 EXAMPLES
1544              
1545             =head2 Accumulating Results Using Variables
1546              
1547             Although the structure-forming methods all return a value, obtained from their
1548             nested parsing code, it can sometimes be more convenient to use a variable to
1549             accumulate a result in instead. For example, consider the following parser
1550             method, designed to parse a set of C assignments, such as might
1551             be found in a configuration file, or YAML/JSON-style mapping value.
1552              
1553             sub parse_dict
1554             {
1555             my $self = shift;
1556              
1557             my %ret;
1558             $self->list_of( ",", sub {
1559             my $key = $self->token_ident;
1560             exists $ret{$key} and $self->fail( "Already have a mapping for '$key'" );
1561              
1562             $self->expect( ":" );
1563              
1564             $ret{$key} = $self->parse_value;
1565             } );
1566              
1567             return \%ret
1568             }
1569              
1570             Instead of using the return value from C, this method accumulates
1571             values in the C<%ret> hash, eventually returning a reference to it as its
1572             result. Because of this, it can perform some error checking while it parses;
1573             namely, rejecting duplicate keys.
1574              
1575             =head1 TODO
1576              
1577             =over 4
1578              
1579             =item *
1580              
1581             Make unescaping of string constants more customisable. Possibly consider
1582             instead a C using a loop over C.
1583              
1584             =item *
1585              
1586             Easy ability for subclasses to define more token types as methods. Perhaps
1587             provide a class method such as
1588              
1589             __PACKAGE__->has_token( hex => qr/[0-9A-F]+/i, sub { hex $_[1] } );
1590              
1591             =item *
1592              
1593             Investigate how well C can cope with buffer splitting across
1594             other tokens than simply skippable whitespace
1595              
1596             =back
1597              
1598             =head1 AUTHOR
1599              
1600             Paul Evans
1601              
1602             =cut
1603              
1604             0x55AA;