File Coverage

blib/lib/Parser/MGC.pm
Criterion Covered Total %
statement 265 268 98.8
branch 89 100 89.0
condition 18 20 90.0
subroutine 52 54 96.3
pod 28 30 93.3
total 452 472 95.7


line stmt bran cond sub pod time code
1             # You may distribute under the terms of either the GNU General Public License
2             # or the Artistic License (the same terms as Perl itself)
3             #
4             # (C) Paul Evans, 2010-2021 -- leonerd@leonerd.org.uk
5              
6             package Parser::MGC 0.19;
7              
8 31     31   1726036 use v5.14;
  31         337  
9 31     31   179 use warnings;
  31         57  
  31         998  
10              
11 31     31   179 use Carp;
  31         66  
  31         2185  
12 31     31   16410 use Feature::Compat::Try;
  31         11601  
  31         133  
13              
14 31     31   67687 use File::Slurp::Tiny qw( read_file );
  31         426616  
  31         2145  
15 31     31   301 use Scalar::Util qw( blessed );
  31         69  
  31         3210  
16              
17             =head1 NAME
18              
19             C - build simple recursive-descent parsers
20              
21             =head1 SYNOPSIS
22              
23             package My::Grammar::Parser;
24             use base qw( Parser::MGC );
25              
26             sub parse
27             {
28             my $self = shift;
29              
30             $self->sequence_of( sub {
31             $self->any_of(
32             sub { $self->token_int },
33             sub { $self->token_string },
34             sub { \$self->token_ident },
35             sub { $self->scope_of( "(", \&parse, ")" ) }
36             );
37             } );
38             }
39              
40             my $parser = My::Grammar::Parser->new;
41              
42             my $tree = $parser->from_file( $ARGV[0] );
43              
44             ...
45              
46             =head1 DESCRIPTION
47              
48             This base class provides a low-level framework for building recursive-descent
49             parsers that consume a given input string from left to right, returning a
50             parse structure. It takes its name from the C regexps used to implement
51             the token parsing behaviour.
52              
53             It provides a number of token-parsing methods, which each extract a
54             grammatical token from the string. It also provides wrapping methods that can
55             be used to build up a possibly-recursive grammar structure, by applying a
56             structure around other parts of parsing code.
57              
58             =head2 Backtracking
59              
60             Each method, both token and structural, atomically either consumes a prefix of
61             the string and returns its result, or fails and consumes nothing. This makes
62             it simple to implement grammars that require backtracking.
63              
64             Several structure-forming methods have some form of "optional" behaviour; they
65             can optionally consume some amount of input or take some particular choice,
66             but if the code invoked inside that subsequently fails, the structure can
67             backtrack and take some different behaviour. This is usually what is required
68             when testing whether the structure of the input string matches some part of
69             the grammar that is optional, or has multiple choices.
70              
71             However, once the choice of grammar has been made, it is often useful to be
72             able to fix on that one choice, thus making subsequent failures propagate up
73             rather than taking that alternative behaviour. Control of this backtracking
74             is given by the C method; and careful use of this method is one of the
75             key advantages that C has over more simple parsing using single
76             regexps alone.
77              
78             =cut
79              
80             =head1 CONSTRUCTOR
81              
82             =cut
83              
84             =head2 new
85              
86             $parser = Parser::MGC->new( %args )
87              
88             Returns a new instance of a C object. This must be called on a
89             subclass that provides method of the name provided as C, by default
90             called C.
91              
92             Takes the following named arguments
93              
94             =over 8
95              
96             =item toplevel => STRING
97              
98             Name of the toplevel method to use to start the parse from. If not supplied,
99             will try to use a method called C.
100              
101             =item patterns => HASH
102              
103             Keys in this hash should map to quoted regexp (C) references, to
104             override the default patterns used to match tokens. See C below
105              
106             =item accept_0o_oct => BOOL
107              
108             If true, the C method will also accept integers with a C<0o> prefix
109             as octal.
110              
111             =back
112              
113             =cut
114              
115             =head1 PATTERNS
116              
117             The following pattern names are recognised. They may be passed to the
118             constructor in the C hash, or provided as a class method under the
119             name C>.
120              
121             =over 4
122              
123             =item * ws
124              
125             Pattern used to skip whitespace between tokens. Defaults to C
126              
127             =item * comment
128              
129             Pattern used to skip comments between tokens. Undefined by default.
130              
131             =item * int
132              
133             Pattern used to parse an integer by C. Defaults to
134             C. If C is given, then
135             this will be expanded to match C as well.
136              
137             =item * float
138              
139             Pattern used to parse a floating-point number by C. Defaults to
140             C.
141              
142             =item * ident
143              
144             Pattern used to parse an identifier by C. Defaults to
145             C
146              
147             =item * string_delim
148              
149             Pattern used to delimit a string by C. Defaults to C.
150              
151             =back
152              
153             =cut
154              
155             my @patterns = qw(
156             ws
157             comment
158             int
159             float
160             ident
161             string_delim
162             );
163              
164 31     31   238 use constant pattern_ws => qr/[\s\n\t]+/;
  31         73  
  31         2494  
165 31     31   203 use constant pattern_comment => undef;
  31         66  
  31         2529  
166 31     31   223 use constant pattern_int => qr/-?(?:0x[[:xdigit:]]+|[[:digit:]]+)/;
  31         52  
  31         3351  
167 31     31   229 use constant pattern_float => qr/-?(?:\d*\.\d+|\d+\.)(?:e-?\d+)?|-?\d+e-?\d+/i;
  31         224  
  31         4125  
168 31     31   232 use constant pattern_ident => qr/[[:alpha:]_]\w*/;
  31         67  
  31         2003  
169 31     31   189 use constant pattern_string_delim => qr/["']/;
  31         57  
  31         123267  
170              
171             sub new
172             {
173 46     46 1 2417 my $class = shift;
174 46         146 my %args = @_;
175              
176 46   100     312 my $toplevel = $args{toplevel} || "parse";
177              
178 46 50       387 $class->can( $toplevel ) or
179             croak "Expected to be a subclass that can ->$toplevel";
180              
181 46         255 my $self = bless {
182             toplevel => $toplevel,
183             patterns => {},
184             scope_level => 0,
185             }, $class;
186              
187 46   100     303 $self->{patterns}{$_} = $args{patterns}{$_} || $self->${\"pattern_$_"} for @patterns;
188              
189 46 100       173 if( $args{accept_0o_oct} ) {
190 1         39 $self->{patterns}{int} = qr/0o[0-7]+|$self->{patterns}{int}/;
191             }
192              
193 46 100       178 if( defined $self->{patterns}{comment} ) {
194 1         42 $self->{patterns}{_skip} = qr/$self->{patterns}{ws}|$self->{patterns}{comment}/;
195             }
196             else {
197 45         138 $self->{patterns}{_skip} = $self->{patterns}{ws};
198             }
199              
200 46         213 return $self;
201             }
202              
203             =head1 METHODS
204              
205             =cut
206              
207             =head2 from_string
208              
209             $result = $parser->from_string( $str )
210              
211             Parse the given literal string and return the result from the toplevel method.
212              
213             =cut
214              
215             sub from_string
216             {
217 160     160 1 31108 my $self = shift;
218 160         359 my ( $str ) = @_;
219              
220 160         352 $self->{str} = $str;
221              
222 160         439 pos $self->{str} = 0;
223              
224 160         345 my $toplevel = $self->{toplevel};
225 160         571 my $result = $self->$toplevel;
226              
227 143 100       586 $self->at_eos or
228             $self->fail( "Expected end of input" );
229              
230 138         766 return $result;
231             }
232              
233             =head2 from_file
234              
235             $result = $parser->from_file( $file, %opts )
236              
237             Parse the given file, which may be a pathname in a string, or an opened IO
238             handle, and return the result from the toplevel method.
239              
240             The following options are recognised:
241              
242             =over 8
243              
244             =item binmode => STRING
245              
246             If set, applies the given binmode to the filehandle before reading. Typically
247             this can be used to set the encoding of the file.
248              
249             $parser->from_file( $file, binmode => ":encoding(UTF-8)" )
250              
251             =back
252              
253             =cut
254              
255             sub from_file
256             {
257 3     3 1 1372 my $self = shift;
258 3         11 my ( $file, %opts ) = @_;
259              
260 3 50       11 defined $file or croak "Expected a filename to ->from_file";
261              
262 3         7 $self->{filename} = $file;
263              
264             $self->from_string( ref $file ?
265 2 50       8 do { local $/; binmode $file, $opts{binmode} if $opts{binmode}; <$file> } :
  2         7  
  2         78  
266 3 100       21 ( read_file $file, binmode => $opts{binmode} ) );
267             }
268              
269             =head2 from_reader
270              
271             $result = $parser->from_reader( \&reader )
272              
273             I
274              
275             Parse the input which is read by the C function. This function will be
276             called in scalar context to generate portions of string to parse, being passed
277             the C<$parser> object. The function should return C when it has no more
278             string to return.
279              
280             $reader->( $parser )
281              
282             Note that because it is not generally possible to detect exactly when more
283             input may be required due to failed regexp parsing, the reader function is
284             only invoked during searching for skippable whitespace. This makes it suitable
285             for reading lines of a file in the common case where lines are considered as
286             skippable whitespace, or for reading lines of input interactively from a
287             user. It cannot be used in all cases (for example, reading fixed-size buffers
288             from a file) because two successive invocations may split a single token
289             across the buffer boundaries, and cause parse failures.
290              
291             =cut
292              
293             sub from_reader
294             {
295 1     1 1 12 my $self = shift;
296 1         3 my ( $reader ) = @_;
297              
298 1         3 local $self->{reader} = $reader;
299              
300 1         3 $self->{str} = "";
301 1         4 pos $self->{str} = 0;
302              
303 1         5 my $result = $self->parse;
304              
305 1 50       3 $self->at_eos or
306             $self->fail( "Expected end of input" );
307              
308 1         8 return $result;
309             }
310              
311             =head2 pos
312              
313             $pos = $parser->pos
314              
315             I
316              
317             Returns the current parse position, as a character offset from the beginning
318             of the file or string.
319              
320             =cut
321              
322             sub pos
323             {
324 277     277 1 379 my $self = shift;
325 277         761 return pos $self->{str};
326             }
327              
328             =head2 take
329              
330             $str = $parser->take( $len )
331              
332             I
333              
334             Returns the next C<$len> characters directly from the input, prior to any
335             whitespace or comment skipping. This does I take account of any
336             end-of-scope marker that may be pending. It is intended for use by parsers of
337             partially-binary protocols, or other situations in which it would be incorrect
338             for the end-of-scope marker to take effect at this time.
339              
340             =cut
341              
342             sub take
343             {
344 46     46 1 84 my $self = shift;
345 46         76 my ( $len ) = @_;
346              
347 46         164 my $start = pos( $self->{str} );
348              
349 46         132 pos( $self->{str} ) += $len;
350              
351 46         274 return substr( $self->{str}, $start, $len );
352             }
353              
354             =head2 where
355              
356             ( $lineno, $col, $text ) = $parser->where
357              
358             Returns the current parse position, as a line and column number, and
359             the entire current line of text. The first line is numbered 1, and the first
360             column is numbered 0.
361              
362             =cut
363              
364             sub where
365             {
366 14     14 1 31 my $self = shift;
367 14         31 my ( $pos ) = @_;
368              
369 14 100       51 defined $pos or $pos = pos $self->{str};
370              
371 14         32 my $str = $self->{str};
372              
373 14         24 my $sol = $pos;
374 14 100 100     83 $sol-- if $sol > 0 and substr( $str, $sol, 1 ) =~ m/^[\r\n]$/;
375 14   100     140 $sol-- while $sol > 0 and substr( $str, $sol-1, 1 ) !~ m/^[\r\n]$/;
376              
377 14         24 my $eol = $pos;
378 14   100     170 $eol++ while $eol < length($str) and substr( $str, $eol, 1 ) !~ m/^[\r\n]$/;
379              
380 14         37 my $line = substr( $str, $sol, $eol - $sol );
381              
382 14         24 my $col = $pos - $sol;
383 14         49 my $lineno = ( () = substr( $str, 0, $pos ) =~ m/\n/g ) + 1;
384              
385 14         68 return ( $lineno, $col, $line );
386             }
387              
388             =head2 fail
389              
390             =head2 fail_from
391              
392             $parser->fail( $message )
393              
394             $parser->fail_from( $pos, $message )
395              
396             I since version 0.09.>
397              
398             Aborts the current parse attempt with the given message string. The failure
399             message will include the line and column position, and the line of input that
400             failed at the current parse position (C), or a position earlier obtained
401             using the C method (C).
402              
403             This failure will propagate up to the inner-most structure parsing method that
404             has not been committed; or will cause the entire parser to fail if there are
405             no further options to take.
406              
407             =cut
408              
409             sub fail
410             {
411 271     271 1 423 my $self = shift;
412 271         471 my ( $message ) = @_;
413 271         649 $self->fail_from( $self->pos, $message );
414             }
415              
416             sub fail_from
417             {
418 271     271 1 356 my $self = shift;
419 271         496 my ( $pos, $message ) = @_;
420 271         659 die Parser::MGC::Failure->new( $message, $self, $pos );
421             }
422              
423 253 100   253   2782 sub _isa_failure { blessed $_[0] and $_[0]->isa( "Parser::MGC::Failure" ) }
424              
425             =head2 at_eos
426              
427             $eos = $parser->at_eos
428              
429             Returns true if the input string is at the end of the string.
430              
431             =cut
432              
433             sub at_eos
434             {
435 596     596 1 840 my $self = shift;
436              
437             # Save pos() before skipping ws so we don't break the substring_before method
438 596         954 my $pos = pos $self->{str};
439              
440 596         1414 $self->skip_ws;
441              
442 596         830 my $at_eos;
443 596 100       1461 if( pos( $self->{str} ) >= length $self->{str} ) {
    100          
444 162         246 $at_eos = 1;
445             }
446             elsif( defined $self->{endofscope} ) {
447 114         530 $at_eos = $self->{str} =~ m/\G$self->{endofscope}/;
448             }
449             else {
450 320         501 $at_eos = 0;
451             }
452              
453 596         1271 pos( $self->{str} ) = $pos;
454              
455 596         1649 return $at_eos;
456             }
457              
458             =head2 scope_level
459              
460             $level = $parser->scope_level
461              
462             I
463              
464             Returns the number of nested C calls that have been made.
465              
466             =cut
467              
468             sub scope_level
469             {
470 5     5 1 10 my $self = shift;
471 5         45 return $self->{scope_level};
472             }
473              
474             =head1 STRUCTURE-FORMING METHODS
475              
476             The following methods may be used to build a grammatical structure out of the
477             defined basic token-parsing methods. Each takes at least one code reference,
478             which will be passed the actual C<$parser> object as its first argument.
479              
480             Anywhere that a code reference is expected also permits a plain string giving
481             the name of a method to invoke. This is sufficient in many simple cases, such
482             as
483              
484             $self->any_of(
485             'token_int',
486             'token_string',
487             ...
488             );
489              
490             =cut
491              
492             =head2 maybe
493              
494             $ret = $parser->maybe( $code )
495              
496             Attempts to execute the given C<$code> in scalar context, and returns what it
497             returned, accepting that it might fail. C<$code> may either be a CODE
498             reference or a method name given as a string.
499              
500             If the code fails (either by calling C itself, or by propagating a
501             failure from another method it invoked) before it has invoked C, then
502             none of the input string will be consumed; the current parsing position will
503             be restored. C will be returned in this case.
504              
505             If it calls C then any subsequent failure will be propagated to the
506             caller, rather than returning C.
507              
508             This may be considered to be similar to the C regexp qualifier.
509              
510             sub parse_declaration
511             {
512             my $self = shift;
513              
514             [ $self->parse_type,
515             $self->token_ident,
516             $self->maybe( sub {
517             $self->expect( "=" );
518             $self->parse_expression
519             } ),
520             ];
521             }
522              
523             =cut
524              
525             sub maybe
526             {
527 4     4 1 39 my $self = shift;
528 4         9 my ( $code ) = @_;
529              
530 4         7 my $pos = pos $self->{str};
531              
532 4         5 my $committed = 0;
533 4     0   16 local $self->{committer} = sub { $committed++ };
  0         0  
534              
535             try {
536             return $self->$code;
537             }
538 4         13 catch ( $e ) {
539             pos($self->{str}) = $pos;
540              
541             die $e if $committed or not _isa_failure( $e );
542             return undef;
543             }
544             }
545              
546             =head2 scope_of
547              
548             $ret = $parser->scope_of( $start, $code, $stop )
549              
550             Expects to find the C<$start> pattern, then attempts to execute the given
551             C<$code>, then expects to find the C<$stop> pattern. Returns whatever the
552             code returned. C<$code> may either be a CODE reference of a method name given
553             as a string.
554              
555             While the code is being executed, the C<$stop> pattern will be used by the
556             token parsing methods as an end-of-scope marker; causing them to raise a
557             failure if called at the end of a scope.
558              
559             sub parse_block
560             {
561             my $self = shift;
562              
563             $self->scope_of( "{", 'parse_statements', "}" );
564             }
565              
566             If the C<$start> pattern is undefined, it is presumed the caller has already
567             checked for this. This is useful when the stop pattern needs to be calculated
568             based on the start pattern.
569              
570             sub parse_bracketed
571             {
572             my $self = shift;
573              
574             my $delim = $self->expect( qr/[\(\[\<\{]/ );
575             $delim =~ tr/([<{/)]>}/;
576              
577             $self->scope_of( undef, 'parse_body', $delim );
578             }
579              
580             This method does not have any optional parts to it; any failures are
581             immediately propagated to the caller.
582              
583             =cut
584              
585             sub scope_of
586             {
587 19     19 1 108 my $self = shift;
588 19         77 $self->_scope_of( 0, @_ );
589             }
590              
591             sub _scope_of
592             {
593 73     73   102 my $self = shift;
594 73         155 my ( $commit_if_started, $start, $code, $stop ) = @_;
595              
596 73 50       574 ref $stop or $stop = qr/\Q$stop/;
597              
598 73 100       295 $self->expect( $start ) if defined $start;
599              
600 34 100       135 $self->commit if $commit_if_started;
601              
602 34         81 local $self->{endofscope} = $stop;
603 34         82 local $self->{scope_level} = $self->{scope_level} + 1;
604              
605 34         135 my $ret = $self->$code;
606              
607 31         118 $self->expect( $stop );
608              
609 30         215 return $ret;
610             }
611              
612             =head2 committed_scope_of
613              
614             $ret = $parser->committed_scope_of( $start, $code, $stop )
615              
616             I
617              
618             A variant of L that calls L after a successful match of
619             the start pattern. This is usually what you want if using C from
620             within an C choice, if no other alternative following this one could
621             possibly match if the start pattern has.
622              
623             =cut
624              
625             sub committed_scope_of
626             {
627 54     54 1 95 my $self = shift;
628 54         118 $self->_scope_of( 1, @_ );
629             }
630              
631             =head2 list_of
632              
633             $ret = $parser->list_of( $sep, $code )
634              
635             Expects to find a list of instances of something parsed by C<$code>,
636             separated by the C<$sep> pattern. Returns an ARRAY ref containing a list of
637             the return values from the C<$code>. A single trailing delimiter is allowed,
638             and does not affect the return value. C<$code> may either be a CODE reference
639             or a method name given as a string. It is called in list context, and whatever
640             values it returns are appended to the eventual result - similar to perl's
641             C.
642              
643             This method does not consider it an error if the returned list is empty; that
644             is, that the scope ended before any item instances were parsed from it.
645              
646             sub parse_numbers
647             {
648             my $self = shift;
649              
650             $self->list_of( ",", 'token_int' );
651             }
652              
653             If the code fails (either by invoking C itself, or by propagating a
654             failure from another method it invoked) before it has invoked C on a
655             particular item, then the item is aborted and the parsing position will be
656             restored to the beginning of that failed item. The list of results from
657             previous successful attempts will be returned.
658              
659             If it calls C within an item then any subsequent failure for that item
660             will cause the entire C to fail, propagating that to the caller.
661              
662             =cut
663              
664             sub list_of
665             {
666 72     72 1 155 my $self = shift;
667 72         129 my ( $sep, $code ) = @_;
668              
669 72 100 33     308 ref $sep or $sep = qr/\Q$sep/ if defined $sep;
670              
671 72         101 my $committed;
672 72     14   277 local $self->{committer} = sub { $committed++ };
  14         31  
673              
674 72         126 my @ret;
675              
676 72         193 while( !$self->at_eos ) {
677 127         182 $committed = 0;
678 127         215 my $pos = pos $self->{str};
679              
680             try {
681             push @ret, $self->$code;
682             next;
683             }
684             catch ( $e ) {
685             pos($self->{str}) = $pos;
686             die $e if $committed or not _isa_failure( $e );
687              
688             last;
689             }
690 127         267 }
691             continue {
692 103 100       287 if( defined $sep ) {
693 32         79 $self->skip_ws;
694 32 100       222 $self->{str} =~ m/\G$sep/gc or last;
695             }
696             }
697              
698 71         331 return \@ret;
699             }
700              
701             =head2 sequence_of
702              
703             $ret = $parser->sequence_of( $code )
704              
705             A shortcut for calling C with an empty string as separator; expects
706             to find at least one instance of something parsed by C<$code>, separated only
707             by skipped whitespace.
708              
709             This may be considered to be similar to the C<+> or C<*> regexp qualifiers.
710              
711             sub parse_statements
712             {
713             my $self = shift;
714              
715             $self->sequence_of( 'parse_statement' );
716             }
717              
718             The interaction of failures in the code and the C method is identical
719             to that of C.
720              
721             =cut
722              
723             sub sequence_of
724             {
725 56     56 1 198 my $self = shift;
726 56         129 my ( $code ) = @_;
727              
728 56         188 $self->list_of( undef, $code );
729             }
730              
731             =head2 any_of
732              
733             $ret = $parser->any_of( @codes )
734              
735             I
736              
737             Expects that one of the given code instances can parse something from the
738             input, returning what it returned. Each code instance may indicate a failure
739             to parse by calling the C method or otherwise propagating a failure.
740             Each code instance may either be a CODE reference or a method name given as a
741             string.
742              
743             This may be considered to be similar to the C<|> regexp operator for forming
744             alternations of possible parse trees.
745              
746             sub parse_statement
747             {
748             my $self = shift;
749              
750             $self->any_of(
751             sub { $self->parse_declaration; $self->expect(";") },
752             sub { $self->parse_expression; $self->expect(";") },
753             sub { $self->parse_block },
754             );
755             }
756              
757             If the code for a given choice fails (either by invoking C itself, or by
758             propagating a failure from another method it invoked) before it has invoked
759             C itself, then the parsing position restored and the next choice will
760             be attempted.
761              
762             If it calls C then any subsequent failure for that choice will cause
763             the entire C to fail, propagating that to the caller and no further
764             choices will be attempted.
765              
766             If none of the choices match then a simple failure message is printed:
767              
768             Found nothing parseable
769              
770             As this is unlikely to be helpful to users, a better message can be provided
771             by the final choice instead. Don't forget to C before printing the
772             failure message, or it won't count.
773              
774             $self->any_of(
775             'token_int',
776             'token_string',
777             ...,
778              
779             sub { $self->commit; $self->fail( "Expected an int or string" ) }
780             );
781              
782             =cut
783              
784             sub any_of
785             {
786 218     218 1 491 my $self = shift;
787              
788 218         489 while( @_ ) {
789 433         646 my $code = shift;
790 433         685 my $pos = pos $self->{str};
791              
792 433         546 my $committed = 0;
793 433     55   1355 local $self->{committer} = sub { $committed++ };
  55         110  
794              
795             try {
796             return $self->$code;
797             }
798 433         862 catch ( $e ) {
799             pos( $self->{str} ) = $pos;
800              
801             die $e if $committed or not _isa_failure( $e );
802             }
803             }
804              
805 12         30 $self->fail( "Found nothing parseable" );
806             }
807              
808             sub one_of {
809 0     0 0 0 croak "Parser::MGC->one_of is deprecated; use ->any_of instead";
810             }
811              
812             =head2 commit
813              
814             $parser->commit
815              
816             Calling this method will cancel the backtracking behaviour of the innermost
817             C, C, C, or C structure forming method.
818             That is, if later code then calls C, the exception will be propagated
819             out of C, no further list items will be attempted by C or
820             C, and no further code blocks will be attempted by C.
821              
822             Typically this will be called once the grammatical structure alter has been
823             determined, ensuring that any further failures are raised as real exceptions,
824             rather than by attempting other alternatives.
825              
826             sub parse_statement
827             {
828             my $self = shift;
829              
830             $self->any_of(
831             ...
832             sub {
833             $self->scope_of( "{",
834             sub { $self->commit; $self->parse_statements; },
835             "}" ),
836             },
837             );
838             }
839              
840             Though in this common pattern, L may be used instead.
841              
842             =cut
843              
844             sub commit
845             {
846 69     69 1 143 my $self = shift;
847 69 50       159 if( $self->{committer} ) {
848 69         125 $self->{committer}->();
849             }
850             else {
851 0         0 croak "Cannot commit except within a backtrack-able structure";
852             }
853             }
854              
855             =head1 TOKEN PARSING METHODS
856              
857             The following methods attempt to consume some part of the input string, to be
858             used as part of the parsing process.
859              
860             =cut
861              
862             sub skip_ws
863             {
864 1283     1283 0 1737 my $self = shift;
865              
866 1283         1831 my $pattern = $self->{patterns}{_skip};
867              
868             {
869 1283         1655 1 while $self->{str} =~ m/\G$pattern/gc;
  1286         7202  
870              
871 1286 100       3453 return if pos( $self->{str} ) < length $self->{str};
872              
873 222 100       614 return unless $self->{reader};
874              
875 4         10 my $more = $self->{reader}->( $self );
876 4 100       17 if( defined $more ) {
877 3         5 my $pos = pos( $self->{str} );
878 3         7 $self->{str} .= $more;
879 3         7 pos( $self->{str} ) = $pos;
880              
881 3         7 redo;
882             }
883              
884 1         3 undef $self->{reader};
885 1         2 return;
886             }
887             }
888              
889             =head2 expect
890              
891             $str = $parser->expect( $literal )
892              
893             $str = $parser->expect( qr/pattern/ )
894              
895             @groups = $parser->expect( qr/pattern/ )
896              
897             Expects to find a literal string or regexp pattern match, and consumes it.
898             In scalar context, this method returns the string that was captured. In list
899             context it returns the matching substring and the contents of any subgroups
900             contained in the pattern.
901              
902             This method will raise a parse error (by calling C) if the regexp fails
903             to match. Note that if the pattern could match an empty string (such as for
904             example C), the pattern will always match, even if it has to match an
905             empty string. This method will not consider a failure if the regexp matches
906             with zero-width.
907              
908             =head2 maybe_expect
909              
910             $str = $parser->maybe_expect( ... )
911              
912             @groups = $parser->maybe_expect( ... )
913              
914             I
915              
916             A convenient shortcut equivalent to calling C within C, but
917             implemented more efficiently, avoiding the exception-handling set up by
918             C. Returns C or an empty list if the match fails.
919              
920             =cut
921              
922             sub maybe_expect
923             {
924 371     371 1 534 my $self = shift;
925 371         615 my ( $expect ) = @_;
926              
927 371 100       801 ref $expect or $expect = qr/\Q$expect/;
928              
929 371         789 $self->skip_ws;
930 371 100       4304 $self->{str} =~ m/\G$expect/gc or return;
931              
932 210 100       1378 return substr( $self->{str}, $-[0], $+[0]-$-[0] ) if !wantarray;
933 22 100       75 return map { defined $-[$_] ? substr( $self->{str}, $-[$_], $+[$_]-$-[$_] ) : undef } 0 .. $#+;
  37         269  
934             }
935              
936             sub expect
937             {
938 355     355 1 1121 my $self = shift;
939 355         614 my ( $expect ) = @_;
940              
941 355 100       2545 ref $expect or $expect = qr/\Q$expect/;
942              
943 355 100       812 if( wantarray ) {
944 31 100       75 my @ret = $self->maybe_expect( $expect ) or
945             $self->fail( "Expected $expect" );
946 20         98 return @ret;
947             }
948             else {
949 324 100       658 defined( my $ret = $self->maybe_expect( $expect ) ) or
950             $self->fail( "Expected $expect" );
951 185         595 return $ret;
952             }
953             }
954              
955             =head2 substring_before
956              
957             $str = $parser->substring_before( $literal )
958              
959             $str = $parser->substring_before( qr/pattern/ )
960              
961             I
962              
963             Expects to possibly find a literal string or regexp pattern match. If it finds
964             such, consume all the input text before but excluding this match, and return
965             it. If it fails to find a match before the end of the current scope, consumes
966             all the input text until the end of scope and return it.
967              
968             This method does not consume the part of input that matches, only the text
969             before it. It is not considered a failure if the substring before this match
970             is empty. If a non-empty match is required, use the C method:
971              
972             sub token_nonempty_part
973             {
974             my $self = shift;
975              
976             my $str = $parser->substring_before( "," );
977             length $str or $self->fail( "Expected a string fragment before ," );
978              
979             return $str;
980             }
981              
982             Note that unlike most of the other token parsing methods, this method does not
983             consume either leading or trailing whitespace around the substring. It is
984             expected that this method would be used as part a parser to read quoted
985             strings, or similar cases where whitespace should be preserved.
986              
987             =cut
988              
989             sub substring_before
990             {
991 40     40 1 71 my $self = shift;
992 40         68 my ( $expect ) = @_;
993              
994 40 100       182 ref $expect or $expect = qr/\Q$expect/;
995              
996 40 100       186 my $endre = ( defined $self->{endofscope} ) ?
997             qr/$expect|$self->{endofscope}/ :
998             $expect;
999              
1000             # NO skip_ws
1001              
1002 40         123 my $start = pos $self->{str};
1003 40         54 my $end;
1004 40 100       522 if( $self->{str} =~ m/\G(?s:.*?)($endre)/ ) {
1005 37         153 $end = $-[1];
1006             }
1007             else {
1008 3         9 $end = length $self->{str};
1009             }
1010              
1011 40         137 return $self->take( $end - $start );
1012             }
1013              
1014             =head2 generic_token
1015              
1016             $val = $parser->generic_token( $name, $re, $convert )
1017              
1018             I
1019              
1020             Expects to find a token matching the precompiled regexp C<$re>. If provided,
1021             the C<$convert> CODE reference can be used to convert the string into a more
1022             convenient form. C<$name> is used in the failure message if the pattern fails
1023             to match.
1024              
1025             If provided, the C<$convert> function will be passed the parser and the
1026             matching substring; the value it returns is returned from C.
1027              
1028             $convert->( $parser, $substr )
1029              
1030             If not provided, the substring will be returned as it stands.
1031              
1032             This method is mostly provided for subclasses to define their own token types.
1033             For example:
1034              
1035             sub token_hex
1036             {
1037             my $self = shift;
1038             $self->generic_token( hex => qr/[0-9A-F]{2}h/, sub { hex $_[1] } );
1039             }
1040              
1041             =cut
1042              
1043             sub generic_token
1044             {
1045 230     230 1 348 my $self = shift;
1046 230         401 my ( $name, $re, $convert ) = @_;
1047              
1048 230 50       483 $self->fail( "Expected $name" ) if $self->at_eos;
1049              
1050 230         549 $self->skip_ws;
1051 230 100       4273 $self->{str} =~ m/\G$re/gc or
1052             $self->fail( "Expected $name" );
1053              
1054 169         945 my $match = substr( $self->{str}, $-[0], $+[0] - $-[0] );
1055              
1056 169 100       688 return $convert ? $convert->( $self, $match ) : $match;
1057             }
1058              
1059             sub _token_generic
1060             {
1061 226     226   346 my $self = shift;
1062 226         1045 my %args = @_;
1063              
1064 226         407 my $name = $args{name};
1065 226 50       573 my $re = $args{pattern} ? $self->{patterns}{ $args{pattern} } : $args{re};
1066 226         319 my $convert = $args{convert};
1067              
1068 226         523 $self->generic_token( $name, $re, $convert );
1069             }
1070              
1071             =head2 token_int
1072              
1073             $int = $parser->token_int
1074              
1075             Expects to find an integer in decimal, octal or hexadecimal notation, and
1076             consumes it. Negative integers, preceeded by C<->, are also recognised.
1077              
1078             =cut
1079              
1080             sub token_int
1081             {
1082 134     134 1 345 my $self = shift;
1083             $self->_token_generic(
1084             name => "int",
1085              
1086             pattern => "int",
1087             convert => sub {
1088 104     104   195 my $int = $_[1];
1089 104 100       299 my $sign = ( $int =~ s/^-// ) ? -1 : 1;
1090              
1091 104         166 $int =~ s/^0o/0/;
1092              
1093 104 100       326 return $sign * oct $int if $int =~ m/^0/;
1094 99         834 return $sign * $int;
1095             },
1096 134         596 );
1097             }
1098              
1099             =head2 token_float
1100              
1101             $float = $parser->token_float
1102              
1103             I
1104              
1105             Expects to find a number expressed in floating-point notation; a sequence of
1106             digits possibly prefixed by C<->, possibly containing a decimal point,
1107             possibly followed by an exponent specified by C followed by an integer. The
1108             numerical value is then returned.
1109              
1110             =cut
1111              
1112             sub token_float
1113             {
1114 20     20 1 68 my $self = shift;
1115             $self->_token_generic(
1116             name => "float",
1117              
1118             pattern => "float",
1119 18     18   135 convert => sub { $_[1] + 0 },
1120 20         82 );
1121             }
1122              
1123             =head2 token_number
1124              
1125             $number = $parser->token_number
1126              
1127             I
1128              
1129             Expects to find a number expressed in either of the above forms.
1130              
1131             =cut
1132              
1133             sub token_number
1134             {
1135 7     7 1 34 my $self = shift;
1136 7         26 $self->any_of( \&token_float, \&token_int );
1137             }
1138              
1139             =head2 token_string
1140              
1141             $str = $parser->token_string
1142              
1143             Expects to find a quoted string, and consumes it. The string should be quoted
1144             using C<"> or C<'> quote marks.
1145              
1146             The content of the quoted string can contain character escapes similar to
1147             those accepted by C or Perl. Specifically, the following forms are recognised:
1148              
1149             \a Bell ("alert")
1150             \b Backspace
1151             \e Escape
1152             \f Form feed
1153             \n Newline
1154             \r Return
1155             \t Horizontal Tab
1156             \0, \012 Octal character
1157             \x34, \x{5678} Hexadecimal character
1158              
1159             C's C<\v> for vertical tab is not supported as it is rarely used in practice
1160             and it collides with Perl's C<\v> regexp escape. Perl's C<\c> for forming other
1161             control characters is also not supported.
1162              
1163             =cut
1164              
1165             my %escapes = (
1166             a => "\a",
1167             b => "\b",
1168             e => "\e",
1169             f => "\f",
1170             n => "\n",
1171             r => "\r",
1172             t => "\t",
1173             );
1174              
1175             sub token_string
1176             {
1177 53     53 1 252 my $self = shift;
1178              
1179 53 100       126 $self->fail( "Expected string" ) if $self->at_eos;
1180              
1181 52         144 my $pos = pos $self->{str};
1182              
1183 52         131 $self->skip_ws;
1184 52 100       448 $self->{str} =~ m/\G($self->{patterns}{string_delim})/gc or
1185             $self->fail( "Expected string delimiter" );
1186              
1187 32         94 my $delim = $1;
1188              
1189             $self->{str} =~ m/
1190             \G(
1191             (?:
1192             \\[0-7]{1,3} # octal escape
1193             |\\x[0-9A-F]{2} # 2-digit hex escape
1194             |\\x\{[0-9A-F]+\} # {}-delimited hex escape
1195             |\\. # symbolic escape
1196             |[^\\$delim]+ # plain chunk
1197             )*?
1198             )$delim/gcix or
1199 32 50       1033 pos($self->{str}) = $pos, $self->fail( "Expected contents of string" );
1200              
1201 32         116 my $string = $1;
1202              
1203 32         98 $string =~ s<\\(?:([0-7]{1,3})|x([0-9A-F]{2})|x\{([0-9A-F]+)\}|(.))>
1204             [defined $1 ? chr oct $1 :
1205             defined $2 ? chr hex $2 :
1206 11 50       77 defined $3 ? chr hex $3 :
    100          
    100          
    100          
1207             exists $escapes{$4} ? $escapes{$4} : $4]egi;
1208 32         126  
1209             return $string;
1210             }
1211              
1212             =head2 token_ident
1213              
1214             $ident = $parser->token_ident
1215              
1216             Expects to find an identifier, and consumes it.
1217              
1218             =cut
1219              
1220             sub token_ident
1221 72     72 1 145 {
1222 72         229 my $self = shift;
1223             $self->_token_generic(
1224             name => "ident",
1225              
1226             pattern => "ident",
1227             );
1228             }
1229              
1230             =head2 token_kw
1231              
1232             $keyword = $parser->token_kw( @keywords )
1233              
1234             Expects to find a keyword, and consumes it. A keyword is defined as an
1235             identifier which is exactly one of the literal values passed in.
1236              
1237             =cut
1238              
1239             sub token_kw
1240 2     2 1 14 {
1241 2         5 my $self = shift;
1242             my @acceptable = @_;
1243 2         9  
1244             $self->skip_ws;
1245 2         5  
1246             my $pos = pos $self->{str};
1247 2 50       10  
1248             defined( my $kw = $self->token_ident ) or
1249             return undef;
1250 4         22  
1251 2 100       6 grep { $_ eq $kw } @acceptable or
1252             pos($self->{str}) = $pos, $self->fail( "Expected any of ".join( ", ", @acceptable ) );
1253 1         4  
1254             return $kw;
1255             }
1256              
1257             package # hide from indexer
1258             Parser::MGC::Failure;
1259              
1260             sub new
1261 271     271   376 {
1262 271         535 my $class = shift;
1263 271         501 my $self = bless {}, $class;
  271         1402  
1264 271         1401 @{$self}{qw( message parser pos )} = @_;
1265             return $self;
1266             }
1267 31     31   31085  
  31         23988  
  31         255  
1268             use overload '""' => "STRING";
1269             sub STRING
1270 8     8   2112 {
1271             my $self = shift;
1272 8         23  
1273 8         54 my $parser = $self->{parser};
1274             my ( $linenum, $col, $text ) = $parser->where( $self->{pos} );
1275              
1276             # Column number only counts characters. There may be tabs in there.
1277             # Rather than trying to calculate the visual column number, just print the
1278             # indentation as it stands.
1279 8         19  
1280 8         37 my $indent = substr( $text, 0, $col );
1281             $indent =~ s/[^ \t]/ /g; # blank out all the non-whitespace
1282 8         19  
1283 8 100 100     52 my $filename = $parser->{filename};
1284             my $in_file = ( defined $filename and !ref $filename )
1285             ? "in $filename " : "";
1286 8         157  
1287             return "$self->{message} ${in_file}on line $linenum at:\n" .
1288             "$text\n" .
1289             "$indent^\n";
1290             }
1291              
1292 31     31   7825 # Provide fallback operators for cmp, eq, etc...
  31         71  
  31         156  
1293             use overload fallback => 1;
1294              
1295             =head1 EXAMPLES
1296              
1297             =head2 Accumulating Results Using Variables
1298              
1299             Although the structure-forming methods all return a value, obtained from their
1300             nested parsing code, it can sometimes be more convenient to use a variable to
1301             accumulate a result in instead. For example, consider the following parser
1302             method, designed to parse a set of C assignments, such as might
1303             be found in a configuration file, or YAML/JSON-style mapping value.
1304              
1305             sub parse_dict
1306             {
1307             my $self = shift;
1308              
1309             my %ret;
1310             $self->list_of( ",", sub {
1311             my $key = $self->token_ident;
1312             exists $ret{$key} and $self->fail( "Already have a mapping for '$key'" );
1313              
1314             $self->expect( ":" );
1315              
1316             $ret{$key} = $self->parse_value;
1317             } );
1318              
1319             return \%ret
1320             }
1321              
1322             Instead of using the return value from C, this method accumulates
1323             values in the C<%ret> hash, eventually returning a reference to it as its
1324             result. Because of this, it can perform some error checking while it parses;
1325             namely, rejecting duplicate keys.
1326              
1327             =head1 TODO
1328              
1329             =over 4
1330              
1331             =item *
1332              
1333             Make unescaping of string constants more customisable. Possibly consider
1334             instead a C using a loop over C.
1335              
1336             =item *
1337              
1338             Easy ability for subclasses to define more token types as methods. Perhaps
1339             provide a class method such as
1340              
1341             __PACKAGE__->has_token( hex => qr/[0-9A-F]+/i, sub { hex $_[1] } );
1342              
1343             =item *
1344              
1345             Investigate how well C can cope with buffer splitting across
1346             other tokens than simply skippable whitespace
1347              
1348             =back
1349              
1350             =head1 AUTHOR
1351              
1352             Paul Evans
1353              
1354             =cut
1355              
1356             0x55AA;