File Coverage

blib/lib/Parser/MGC.pm
Criterion Covered Total %
statement 279 282 98.9
branch 92 104 88.4
condition 20 23 86.9
subroutine 56 58 96.5
pod 32 34 94.1
total 479 501 95.6


line stmt bran cond sub pod time code
1             # You may distribute under the terms of either the GNU General Public License
2             # or the Artistic License (the same terms as Perl itself)
3             #
4             # (C) Paul Evans, 2010-2022 -- leonerd@leonerd.org.uk
5              
6             package Parser::MGC 0.20;
7              
8 31     31   1418337 use v5.14;
  31         1556  
9 31     31   141 use warnings;
  31         44  
  31         741  
10              
11 31     31   127 use Carp;
  31         48  
  31         1633  
12 31     31   12849 use Feature::Compat::Try;
  31         8602  
  31         106  
13              
14 31     31   71230 use Scalar::Util qw( blessed );
  31         61  
  31         3009  
15              
16             =head1 NAME
17              
18             C - build simple recursive-descent parsers
19              
20             =head1 SYNOPSIS
21              
22             package My::Grammar::Parser;
23             use base qw( Parser::MGC );
24              
25             sub parse
26             {
27             my $self = shift;
28              
29             $self->sequence_of( sub {
30             $self->any_of(
31             sub { $self->token_int },
32             sub { $self->token_string },
33             sub { \$self->token_ident },
34             sub { $self->scope_of( "(", \&parse, ")" ) }
35             );
36             } );
37             }
38              
39             my $parser = My::Grammar::Parser->new;
40              
41             my $tree = $parser->from_file( $ARGV[0] );
42              
43             ...
44              
45             =head1 DESCRIPTION
46              
47             This base class provides a low-level framework for building recursive-descent
48             parsers that consume a given input string from left to right, returning a
49             parse structure. It takes its name from the C regexps used to implement
50             the token parsing behaviour.
51              
52             It provides a number of token-parsing methods, which each extract a
53             grammatical token from the string. It also provides wrapping methods that can
54             be used to build up a possibly-recursive grammar structure, by applying a
55             structure around other parts of parsing code.
56              
57             =head2 Backtracking
58              
59             Each method, both token and structural, atomically either consumes a prefix of
60             the string and returns its result, or fails and consumes nothing. This makes
61             it simple to implement grammars that require backtracking.
62              
63             Several structure-forming methods have some form of "optional" behaviour; they
64             can optionally consume some amount of input or take some particular choice,
65             but if the code invoked inside that subsequently fails, the structure can
66             backtrack and take some different behaviour. This is usually what is required
67             when testing whether the structure of the input string matches some part of
68             the grammar that is optional, or has multiple choices.
69              
70             However, once the choice of grammar has been made, it is often useful to be
71             able to fix on that one choice, thus making subsequent failures propagate up
72             rather than taking that alternative behaviour. Control of this backtracking
73             is given by the C method; and careful use of this method is one of the
74             key advantages that C has over more simple parsing using single
75             regexps alone.
76              
77             =cut
78              
79             =head1 CONSTRUCTOR
80              
81             =cut
82              
83             =head2 new
84              
85             $parser = Parser::MGC->new( %args )
86              
87             Returns a new instance of a C object. This must be called on a
88             subclass that provides method of the name provided as C, by default
89             called C.
90              
91             Takes the following named arguments
92              
93             =over 8
94              
95             =item toplevel => STRING
96              
97             Name of the toplevel method to use to start the parse from. If not supplied,
98             will try to use a method called C.
99              
100             =item patterns => HASH
101              
102             Keys in this hash should map to quoted regexp (C) references, to
103             override the default patterns used to match tokens. See C below
104              
105             =item accept_0o_oct => BOOL
106              
107             If true, the C method will also accept integers with a C<0o> prefix
108             as octal.
109              
110             =back
111              
112             =cut
113              
114             =head1 PATTERNS
115              
116             The following pattern names are recognised. They may be passed to the
117             constructor in the C hash, or provided as a class method under the
118             name C>.
119              
120             =over 4
121              
122             =item * ws
123              
124             Pattern used to skip whitespace between tokens. Defaults to C
125              
126             =item * comment
127              
128             Pattern used to skip comments between tokens. Undefined by default.
129              
130             =item * int
131              
132             Pattern used to parse an integer by C. Defaults to
133             C. If C is given, then
134             this will be expanded to match C as well.
135              
136             =item * float
137              
138             Pattern used to parse a floating-point number by C. Defaults to
139             C.
140              
141             =item * ident
142              
143             Pattern used to parse an identifier by C. Defaults to
144             C
145              
146             =item * string_delim
147              
148             Pattern used to delimit a string by C. Defaults to C.
149              
150             =back
151              
152             =cut
153              
154             my @patterns = qw(
155             ws
156             comment
157             int
158             float
159             ident
160             string_delim
161             );
162              
163 31     31   177 use constant pattern_ws => qr/[\s\n\t]+/;
  31         51  
  31         1635  
164 31     31   146 use constant pattern_comment => undef;
  31         56  
  31         2365  
165 31     31   178 use constant pattern_int => qr/-?(?:0x[[:xdigit:]]+|[[:digit:]]+)/;
  31         58  
  31         2394  
166 31     31   178 use constant pattern_float => qr/-?(?:\d*\.\d+|\d+\.)(?:e-?\d+)?|-?\d+e-?\d+/i;
  31         53  
  31         3078  
167 31     31   504 use constant pattern_ident => qr/[[:alpha:]_]\w*/;
  31         108  
  31         1685  
168 31     31   151 use constant pattern_string_delim => qr/["']/;
  31         51  
  31         113440  
169              
170             sub new
171             {
172 46     46 1 2555 my $class = shift;
173 46         114 my %args = @_;
174              
175 46   100     231 my $toplevel = $args{toplevel} || "parse";
176              
177 46 50       299 $class->can( $toplevel ) or
178             croak "Expected to be a subclass that can ->$toplevel";
179              
180 46         204 my $self = bless {
181             toplevel => $toplevel,
182             patterns => {},
183             scope_level => 0,
184             }, $class;
185              
186 46   100     241 $self->{patterns}{$_} = $args{patterns}{$_} || $self->${\"pattern_$_"} for @patterns;
187              
188 46 100       141 if( $args{accept_0o_oct} ) {
189 1         30 $self->{patterns}{int} = qr/0o[0-7]+|$self->{patterns}{int}/;
190             }
191              
192 46 100       142 if( defined $self->{patterns}{comment} ) {
193 1         23 $self->{patterns}{_skip} = qr/$self->{patterns}{ws}|$self->{patterns}{comment}/;
194             }
195             else {
196 45         101 $self->{patterns}{_skip} = $self->{patterns}{ws};
197             }
198              
199 46         253 return $self;
200             }
201              
202             =head1 METHODS
203              
204             =cut
205              
206             =head2 from_string
207              
208             $result = $parser->from_string( $str )
209              
210             Parse the given literal string and return the result from the toplevel method.
211              
212             =cut
213              
214             sub from_string
215             {
216 161     161 1 33770 my $self = shift;
217 161         287 my ( $str ) = @_;
218              
219 161         286 $self->{str} = $str;
220              
221 161         372 pos $self->{str} = 0;
222              
223 161         282 my $toplevel = $self->{toplevel};
224 161         475 my $result = $self->$toplevel;
225              
226 143 100       434 $self->at_eos or
227             $self->fail( "Expected end of input" );
228              
229 138         602 return $result;
230             }
231              
232             =head2 from_file
233              
234             $result = $parser->from_file( $file, %opts )
235              
236             Parse the given file, which may be a pathname in a string, or an opened IO
237             handle, and return the result from the toplevel method.
238              
239             The following options are recognised:
240              
241             =over 8
242              
243             =item binmode => STRING
244              
245             If set, applies the given binmode to the filehandle before reading. Typically
246             this can be used to set the encoding of the file.
247              
248             $parser->from_file( $file, binmode => ":encoding(UTF-8)" )
249              
250             =back
251              
252             =cut
253              
254             sub from_file
255             {
256 3     3 1 1350 my $self = shift;
257 3         8 my ( $file, %opts ) = @_;
258              
259 3 50       10 defined $file or croak "Expected a filename to ->from_file";
260              
261 3         6 $self->{filename} = $file;
262              
263 3         5 my $fh;
264 3 100       10 if( ref $file ) {
265 2         4 $fh = $file;
266             }
267             else {
268 1 50       35 open $fh, "<", $file or die "Cannot open $file for reading - $!";
269             }
270              
271 3 50       11 binmode $fh, $opts{binmode} if $opts{binmode};
272              
273 3         6 $self->from_string( do { local $/; <$fh>; } );
  3         12  
  3         106  
274             }
275              
276             =head2 filename
277              
278             $filename = $parser->filename
279              
280             I
281              
282             Returns the name of the file currently being parsed, if invoked from within
283             L.
284              
285             =cut
286              
287             sub filename
288             {
289 43     43 1 68 my $self = shift;
290 43         101 return $self->{filename};
291             }
292              
293             =head2 from_reader
294              
295             $result = $parser->from_reader( \&reader )
296              
297             I
298              
299             Parse the input which is read by the C function. This function will be
300             called in scalar context to generate portions of string to parse, being passed
301             the C<$parser> object. The function should return C when it has no more
302             string to return.
303              
304             $reader->( $parser )
305              
306             Note that because it is not generally possible to detect exactly when more
307             input may be required due to failed regexp parsing, the reader function is
308             only invoked during searching for skippable whitespace. This makes it suitable
309             for reading lines of a file in the common case where lines are considered as
310             skippable whitespace, or for reading lines of input interactively from a
311             user. It cannot be used in all cases (for example, reading fixed-size buffers
312             from a file) because two successive invocations may split a single token
313             across the buffer boundaries, and cause parse failures.
314              
315             =cut
316              
317             sub from_reader
318             {
319 1     1 1 13 my $self = shift;
320 1         2 my ( $reader ) = @_;
321              
322 1         2 local $self->{reader} = $reader;
323              
324 1         2 $self->{str} = "";
325 1         3 pos $self->{str} = 0;
326              
327 1         4 my $result = $self->parse;
328              
329 1 50       2 $self->at_eos or
330             $self->fail( "Expected end of input" );
331              
332 1         9 return $result;
333             }
334              
335             =head2 pos
336              
337             $pos = $parser->pos
338              
339             I
340              
341             Returns the current parse position, as a character offset from the beginning
342             of the file or string.
343              
344             =cut
345              
346             sub pos
347             {
348 279     279 1 358 my $self = shift;
349 279         600 return pos $self->{str};
350             }
351              
352             =head2 take
353              
354             $str = $parser->take( $len )
355              
356             I
357              
358             Returns the next C<$len> characters directly from the input, prior to any
359             whitespace or comment skipping. This does I take account of any
360             end-of-scope marker that may be pending. It is intended for use by parsers of
361             partially-binary protocols, or other situations in which it would be incorrect
362             for the end-of-scope marker to take effect at this time.
363              
364             =cut
365              
366             sub take
367             {
368 46     46 1 64 my $self = shift;
369 46         66 my ( $len ) = @_;
370              
371 46         65 my $start = pos( $self->{str} );
372              
373 46         90 pos( $self->{str} ) += $len;
374              
375 46         218 return substr( $self->{str}, $start, $len );
376             }
377              
378             =head2 where
379              
380             ( $lineno, $col, $text ) = $parser->where
381              
382             Returns the current parse position, as a line and column number, and
383             the entire current line of text. The first line is numbered 1, and the first
384             column is numbered 0.
385              
386             =cut
387              
388             sub where
389             {
390 49     49 1 79 my $self = shift;
391 49         89 my ( $pos ) = @_;
392              
393 49 100       127 defined $pos or $pos = pos $self->{str};
394              
395 49         83 my $str = $self->{str};
396              
397 49         62 my $sol = $pos;
398 49 100 100     217 $sol-- if $sol > 0 and substr( $str, $sol, 1 ) =~ m/^[\r\n]$/;
399 49   100     299 $sol-- while $sol > 0 and substr( $str, $sol-1, 1 ) !~ m/^[\r\n]$/;
400              
401 49         83 my $eol = $pos;
402 49   100     477 $eol++ while $eol < length($str) and substr( $str, $eol, 1 ) !~ m/^[\r\n]$/;
403              
404 49         117 my $line = substr( $str, $sol, $eol - $sol );
405              
406 49         105 my $col = $pos - $sol;
407 49         146 my $lineno = ( () = substr( $str, 0, $pos ) =~ m/\n/g ) + 1;
408              
409 49         164 return ( $lineno, $col, $line );
410             }
411              
412             =head2 fail
413              
414             =head2 fail_from
415              
416             $parser->fail( $message )
417              
418             $parser->fail_from( $pos, $message )
419              
420             I since version 0.09.>
421              
422             Aborts the current parse attempt with the given message string. The failure
423             message will include the line and column position, and the line of input that
424             failed at the current parse position (C), or a position earlier obtained
425             using the C method (C).
426              
427             This failure will propagate up to the inner-most structure parsing method that
428             has not been committed; or will cause the entire parser to fail if there are
429             no further options to take.
430              
431             =cut
432              
433             sub fail
434             {
435 272     272 1 346 my $self = shift;
436 272         377 my ( $message ) = @_;
437 272         484 $self->fail_from( $self->pos, $message );
438             }
439              
440             sub fail_from
441             {
442 272     272 1 311 my $self = shift;
443 272         374 my ( $pos, $message ) = @_;
444 272         527 die Parser::MGC::Failure->new( $message, $self, $pos );
445             }
446              
447             # On perl 5.32 onwards we can use the nicer `isa` infix operator
448             # Problem is it won't even parse correctly on older perls so we'll have to go
449             # the long way around
450             *_isa_failure = ( $^V ge v5.32 )
451             ? do { eval 'use experimental "isa"; sub { $_[0] isa Parser::MGC::Failure }' // die $@ }
452             : do { require Scalar::Util;
453 253 100   253   2186 sub { Scalar::Util::blessed($_[0]) and $_[0]->isa( "Parser::MGC::Failure" ) } };
454              
455             =head2 die
456              
457             =head2 die_from
458              
459             $parser->die( $message )
460              
461             $parser->die_from( $pos, $message )
462              
463             I
464              
465             Throws an exception that propagates as normal for C, entirely out of the
466             entire parser and to the caller of the toplevel C method that invoked
467             it, bypassing all of the back-tracking logic.
468              
469             This is much like using core's C directly, except that the message string
470             will include the line and column position, and the line of input that the
471             parser was working on, as it does in the L method.
472              
473             This method is intended for reporting fatal errors where the parsed input was
474             correctly recognised at a grammar level, but is requesting something that
475             cannot be fulfilled semantically.
476              
477             =cut
478              
479             sub die :method
480             {
481 1     1 1 6 my $self = shift;
482 1         1 my ( $message ) = @_;
483 1         3 $self->die_from( $self->pos, $message );
484             }
485              
486             sub die_from
487             {
488 1     1 1 2 my $self = shift;
489 1         1 my ( $pos, $message ) = @_;
490             # Convenient just to use the ->STRING method of a Failure object but don't
491             # throw it directly
492 1         3 die Parser::MGC::Failure->new( $message, $self, $pos )->STRING;
493             }
494              
495             =head2 at_eos
496              
497             $eos = $parser->at_eos
498              
499             Returns true if the input string is at the end of the string.
500              
501             =cut
502              
503             sub at_eos
504             {
505 596     596 1 743 my $self = shift;
506              
507             # Save pos() before skipping ws so we don't break the substring_before method
508 596         797 my $pos = pos $self->{str};
509              
510 596         1050 $self->skip_ws;
511              
512 596         632 my $at_eos;
513 596 100       1186 if( pos( $self->{str} ) >= length $self->{str} ) {
    100          
514 162         200 $at_eos = 1;
515             }
516             elsif( defined $self->{endofscope} ) {
517 114         396 $at_eos = $self->{str} =~ m/\G$self->{endofscope}/;
518             }
519             else {
520 320         357 $at_eos = 0;
521             }
522              
523 596         1022 pos( $self->{str} ) = $pos;
524              
525 596         1326 return $at_eos;
526             }
527              
528             =head2 scope_level
529              
530             $level = $parser->scope_level
531              
532             I
533              
534             Returns the number of nested C calls that have been made.
535              
536             =cut
537              
538             sub scope_level
539             {
540 5     5 1 7 my $self = shift;
541 5         34 return $self->{scope_level};
542             }
543              
544             =head1 STRUCTURE-FORMING METHODS
545              
546             The following methods may be used to build a grammatical structure out of the
547             defined basic token-parsing methods. Each takes at least one code reference,
548             which will be passed the actual C<$parser> object as its first argument.
549              
550             Anywhere that a code reference is expected also permits a plain string giving
551             the name of a method to invoke. This is sufficient in many simple cases, such
552             as
553              
554             $self->any_of(
555             'token_int',
556             'token_string',
557             ...
558             );
559              
560             =cut
561              
562             =head2 maybe
563              
564             $ret = $parser->maybe( $code )
565              
566             Attempts to execute the given C<$code> in scalar context, and returns what it
567             returned, accepting that it might fail. C<$code> may either be a CODE
568             reference or a method name given as a string.
569              
570             If the code fails (either by calling C itself, or by propagating a
571             failure from another method it invoked) before it has invoked C, then
572             none of the input string will be consumed; the current parsing position will
573             be restored. C will be returned in this case.
574              
575             If it calls C then any subsequent failure will be propagated to the
576             caller, rather than returning C.
577              
578             This may be considered to be similar to the C regexp qualifier.
579              
580             sub parse_declaration
581             {
582             my $self = shift;
583              
584             [ $self->parse_type,
585             $self->token_ident,
586             $self->maybe( sub {
587             $self->expect( "=" );
588             $self->parse_expression
589             } ),
590             ];
591             }
592              
593             =cut
594              
595             sub maybe
596             {
597 4     4 1 26 my $self = shift;
598 4         7 my ( $code ) = @_;
599              
600 4         5 my $pos = pos $self->{str};
601              
602 4         6 my $committed = 0;
603 4     0   10 local $self->{committer} = sub { $committed++ };
  0         0  
604              
605             try {
606             return $self->$code;
607             }
608 4         10 catch ( $e ) {
609             pos($self->{str}) = $pos;
610              
611             die $e if $committed or not _isa_failure( $e );
612             return undef;
613             }
614             }
615              
616             =head2 scope_of
617              
618             $ret = $parser->scope_of( $start, $code, $stop )
619              
620             Expects to find the C<$start> pattern, then attempts to execute the given
621             C<$code>, then expects to find the C<$stop> pattern. Returns whatever the
622             code returned. C<$code> may either be a CODE reference of a method name given
623             as a string.
624              
625             While the code is being executed, the C<$stop> pattern will be used by the
626             token parsing methods as an end-of-scope marker; causing them to raise a
627             failure if called at the end of a scope.
628              
629             sub parse_block
630             {
631             my $self = shift;
632              
633             $self->scope_of( "{", 'parse_statements', "}" );
634             }
635              
636             If the C<$start> pattern is undefined, it is presumed the caller has already
637             checked for this. This is useful when the stop pattern needs to be calculated
638             based on the start pattern.
639              
640             sub parse_bracketed
641             {
642             my $self = shift;
643              
644             my $delim = $self->expect( qr/[\(\[\<\{]/ );
645             $delim =~ tr/([<{/)]>}/;
646              
647             $self->scope_of( undef, 'parse_body', $delim );
648             }
649              
650             This method does not have any optional parts to it; any failures are
651             immediately propagated to the caller.
652              
653             =cut
654              
655             sub scope_of
656             {
657 19     19 1 93 my $self = shift;
658 19         53 $self->_scope_of( 0, @_ );
659             }
660              
661             sub _scope_of
662             {
663 73     73   79 my $self = shift;
664 73         123 my ( $commit_if_started, $start, $code, $stop ) = @_;
665              
666 73 50       426 ref $stop or $stop = qr/\Q$stop/;
667              
668 73 100       249 $self->expect( $start ) if defined $start;
669              
670 34 100       108 $self->commit if $commit_if_started;
671              
672 34         66 local $self->{endofscope} = $stop;
673 34         69 local $self->{scope_level} = $self->{scope_level} + 1;
674              
675 34         89 my $ret = $self->$code;
676              
677 31         87 $self->expect( $stop );
678              
679 30         156 return $ret;
680             }
681              
682             =head2 committed_scope_of
683              
684             $ret = $parser->committed_scope_of( $start, $code, $stop )
685              
686             I
687              
688             A variant of L that calls L after a successful match of
689             the start pattern. This is usually what you want if using C from
690             within an C choice, if no other alternative following this one could
691             possibly match if the start pattern has.
692              
693             =cut
694              
695             sub committed_scope_of
696             {
697 54     54 1 70 my $self = shift;
698 54         99 $self->_scope_of( 1, @_ );
699             }
700              
701             =head2 list_of
702              
703             $ret = $parser->list_of( $sep, $code )
704              
705             Expects to find a list of instances of something parsed by C<$code>,
706             separated by the C<$sep> pattern. Returns an ARRAY ref containing a list of
707             the return values from the C<$code>. A single trailing delimiter is allowed,
708             and does not affect the return value. C<$code> may either be a CODE reference
709             or a method name given as a string. It is called in list context, and whatever
710             values it returns are appended to the eventual result - similar to perl's
711             C.
712              
713             This method does not consider it an error if the returned list is empty; that
714             is, that the scope ended before any item instances were parsed from it.
715              
716             sub parse_numbers
717             {
718             my $self = shift;
719              
720             $self->list_of( ",", 'token_int' );
721             }
722              
723             If the code fails (either by invoking C itself, or by propagating a
724             failure from another method it invoked) before it has invoked C on a
725             particular item, then the item is aborted and the parsing position will be
726             restored to the beginning of that failed item. The list of results from
727             previous successful attempts will be returned.
728              
729             If it calls C within an item then any subsequent failure for that item
730             will cause the entire C to fail, propagating that to the caller.
731              
732             =cut
733              
734             sub list_of
735             {
736 72     72 1 128 my $self = shift;
737 72         108 my ( $sep, $code ) = @_;
738              
739 72 100 33     207 ref $sep or $sep = qr/\Q$sep/ if defined $sep;
740              
741 72         81 my $committed;
742 72     14   226 local $self->{committer} = sub { $committed++ };
  14         22  
743              
744 72         99 my @ret;
745              
746 72         140 while( !$self->at_eos ) {
747 127         176 $committed = 0;
748 127         154 my $pos = pos $self->{str};
749              
750             try {
751             push @ret, $self->$code;
752             next;
753             }
754             catch ( $e ) {
755             pos($self->{str}) = $pos;
756             die $e if $committed or not _isa_failure( $e );
757              
758             last;
759             }
760 127         212 }
761             continue {
762 103 100       224 if( defined $sep ) {
763 32         58 $self->skip_ws;
764 32 100       145 $self->{str} =~ m/\G$sep/gc or last;
765             }
766             }
767              
768 71         247 return \@ret;
769             }
770              
771             =head2 sequence_of
772              
773             $ret = $parser->sequence_of( $code )
774              
775             A shortcut for calling C with an empty string as separator; expects
776             to find at least one instance of something parsed by C<$code>, separated only
777             by skipped whitespace.
778              
779             This may be considered to be similar to the C<+> or C<*> regexp qualifiers.
780              
781             sub parse_statements
782             {
783             my $self = shift;
784              
785             $self->sequence_of( 'parse_statement' );
786             }
787              
788             The interaction of failures in the code and the C method is identical
789             to that of C.
790              
791             =cut
792              
793             sub sequence_of
794             {
795 56     56 1 169 my $self = shift;
796 56         91 my ( $code ) = @_;
797              
798 56         125 $self->list_of( undef, $code );
799             }
800              
801             =head2 any_of
802              
803             $ret = $parser->any_of( @codes )
804              
805             I
806              
807             Expects that one of the given code instances can parse something from the
808             input, returning what it returned. Each code instance may indicate a failure
809             to parse by calling the C method or otherwise propagating a failure.
810             Each code instance may either be a CODE reference or a method name given as a
811             string.
812              
813             This may be considered to be similar to the C<|> regexp operator for forming
814             alternations of possible parse trees.
815              
816             sub parse_statement
817             {
818             my $self = shift;
819              
820             $self->any_of(
821             sub { $self->parse_declaration; $self->expect(";") },
822             sub { $self->parse_expression; $self->expect(";") },
823             sub { $self->parse_block },
824             );
825             }
826              
827             If the code for a given choice fails (either by invoking C itself, or by
828             propagating a failure from another method it invoked) before it has invoked
829             C itself, then the parsing position restored and the next choice will
830             be attempted.
831              
832             If it calls C then any subsequent failure for that choice will cause
833             the entire C to fail, propagating that to the caller and no further
834             choices will be attempted.
835              
836             If none of the choices match then a simple failure message is printed:
837              
838             Found nothing parseable
839              
840             As this is unlikely to be helpful to users, a better message can be provided
841             by the final choice instead. Don't forget to C before printing the
842             failure message, or it won't count.
843              
844             $self->any_of(
845             'token_int',
846             'token_string',
847             ...,
848              
849             sub { $self->commit; $self->fail( "Expected an int or string" ) }
850             );
851              
852             =cut
853              
854             sub any_of
855             {
856 218     218 1 379 my $self = shift;
857              
858 218         399 while( @_ ) {
859 433         536 my $code = shift;
860 433         508 my $pos = pos $self->{str};
861              
862 433         457 my $committed = 0;
863 433     55   1091 local $self->{committer} = sub { $committed++ };
  55         80  
864              
865             try {
866             return $self->$code;
867             }
868 433         741 catch ( $e ) {
869             pos( $self->{str} ) = $pos;
870              
871             die $e if $committed or not _isa_failure( $e );
872             }
873             }
874              
875 12         26 $self->fail( "Found nothing parseable" );
876             }
877              
878             sub one_of {
879 0     0 0 0 croak "Parser::MGC->one_of is deprecated; use ->any_of instead";
880             }
881              
882             =head2 commit
883              
884             $parser->commit
885              
886             Calling this method will cancel the backtracking behaviour of the innermost
887             C, C, C, or C structure forming method.
888             That is, if later code then calls C, the exception will be propagated
889             out of C, no further list items will be attempted by C or
890             C, and no further code blocks will be attempted by C.
891              
892             Typically this will be called once the grammatical structure alter has been
893             determined, ensuring that any further failures are raised as real exceptions,
894             rather than by attempting other alternatives.
895              
896             sub parse_statement
897             {
898             my $self = shift;
899              
900             $self->any_of(
901             ...
902             sub {
903             $self->scope_of( "{",
904             sub { $self->commit; $self->parse_statements; },
905             "}" ),
906             },
907             );
908             }
909              
910             Though in this common pattern, L may be used instead.
911              
912             =cut
913              
914             sub commit
915             {
916 69     69 1 101 my $self = shift;
917 69 50       117 if( $self->{committer} ) {
918 69         108 $self->{committer}->();
919             }
920             else {
921 0         0 croak "Cannot commit except within a backtrack-able structure";
922             }
923             }
924              
925             =head1 TOKEN PARSING METHODS
926              
927             The following methods attempt to consume some part of the input string, to be
928             used as part of the parsing process.
929              
930             =cut
931              
932             sub skip_ws
933             {
934 1283     1283 0 1368 my $self = shift;
935              
936 1283         1476 my $pattern = $self->{patterns}{_skip};
937              
938             {
939 1283         1283 1 while $self->{str} =~ m/\G$pattern/gc;
  1286         5533  
940              
941 1286 100       2800 return if pos( $self->{str} ) < length $self->{str};
942              
943 222 100       483 return unless $self->{reader};
944              
945 4         9 my $more = $self->{reader}->( $self );
946 4 100       12 if( defined $more ) {
947 3         5 my $pos = pos( $self->{str} );
948 3         7 $self->{str} .= $more;
949 3         4 pos( $self->{str} ) = $pos;
950              
951 3         7 redo;
952             }
953              
954 1         3 undef $self->{reader};
955 1         1 return;
956             }
957             }
958              
959             =head2 expect
960              
961             $str = $parser->expect( $literal )
962              
963             $str = $parser->expect( qr/pattern/ )
964              
965             @groups = $parser->expect( qr/pattern/ )
966              
967             Expects to find a literal string or regexp pattern match, and consumes it.
968             In scalar context, this method returns the string that was captured. In list
969             context it returns the matching substring and the contents of any subgroups
970             contained in the pattern.
971              
972             This method will raise a parse error (by calling C) if the regexp fails
973             to match. Note that if the pattern could match an empty string (such as for
974             example C), the pattern will always match, even if it has to match an
975             empty string. This method will not consider a failure if the regexp matches
976             with zero-width.
977              
978             =head2 maybe_expect
979              
980             $str = $parser->maybe_expect( ... )
981              
982             @groups = $parser->maybe_expect( ... )
983              
984             I
985              
986             A convenient shortcut equivalent to calling C within C, but
987             implemented more efficiently, avoiding the exception-handling set up by
988             C. Returns C or an empty list if the match fails.
989              
990             =cut
991              
992             sub maybe_expect
993             {
994 371     371 1 461 my $self = shift;
995 371         482 my ( $expect ) = @_;
996              
997 371 100       624 ref $expect or $expect = qr/\Q$expect/;
998              
999 371         637 $self->skip_ws;
1000 371 100       3512 $self->{str} =~ m/\G$expect/gc or return;
1001              
1002 210 100       1098 return substr( $self->{str}, $-[0], $+[0]-$-[0] ) if !wantarray;
1003 22 100       56 return map { defined $-[$_] ? substr( $self->{str}, $-[$_], $+[$_]-$-[$_] ) : undef } 0 .. $#+;
  37         226  
1004             }
1005              
1006             sub expect
1007             {
1008 355     355 1 544 my $self = shift;
1009 355         507 my ( $expect ) = @_;
1010              
1011 355 100       2156 ref $expect or $expect = qr/\Q$expect/;
1012              
1013 355 100       657 if( wantarray ) {
1014 31 100       64 my @ret = $self->maybe_expect( $expect ) or
1015             $self->fail( "Expected $expect" );
1016 20         80 return @ret;
1017             }
1018             else {
1019 324 100       553 defined( my $ret = $self->maybe_expect( $expect ) ) or
1020             $self->fail( "Expected $expect" );
1021 185         502 return $ret;
1022             }
1023             }
1024              
1025             =head2 substring_before
1026              
1027             $str = $parser->substring_before( $literal )
1028              
1029             $str = $parser->substring_before( qr/pattern/ )
1030              
1031             I
1032              
1033             Expects to possibly find a literal string or regexp pattern match. If it finds
1034             such, consume all the input text before but excluding this match, and return
1035             it. If it fails to find a match before the end of the current scope, consumes
1036             all the input text until the end of scope and return it.
1037              
1038             This method does not consume the part of input that matches, only the text
1039             before it. It is not considered a failure if the substring before this match
1040             is empty. If a non-empty match is required, use the C method:
1041              
1042             sub token_nonempty_part
1043             {
1044             my $self = shift;
1045              
1046             my $str = $parser->substring_before( "," );
1047             length $str or $self->fail( "Expected a string fragment before ," );
1048              
1049             return $str;
1050             }
1051              
1052             Note that unlike most of the other token parsing methods, this method does not
1053             consume either leading or trailing whitespace around the substring. It is
1054             expected that this method would be used as part a parser to read quoted
1055             strings, or similar cases where whitespace should be preserved.
1056              
1057             =head2 nonempty_substring_before
1058              
1059             $str = $parser->nonempty_substring_before( $literal )
1060              
1061             $str = $parser->nonempty_substring_before( qr/pattern/ )
1062              
1063             I
1064              
1065             A variant of L which fails if the matched part is empty.
1066              
1067             The example above could have been written:
1068              
1069             sub token_nonempty_part
1070             {
1071             my $self = shift;
1072              
1073             return $parser->nonempty_substring_before( "," );
1074             }
1075              
1076             This is often useful for breaking out of repeating loops; e.g.
1077              
1078             sub token_escaped_string
1079             {
1080             my $self = shift;
1081             $self->expect( '"' );
1082              
1083             my $ret = "";
1084             1 while $self->any_of(
1085             sub { $ret .= $self->nonempty_substring_before( qr/%|$/m ); 1 }
1086             sub { my $escape = ( $self->expect( qr/%(.)/ ) )[1];
1087             $ret .= _handle_escape( $escape );
1088             1 },
1089             sub { 0 },
1090             )
1091              
1092             return $ret;
1093             }
1094              
1095             =cut
1096              
1097             sub _substring_before
1098             {
1099 41     41   50 my $self = shift;
1100 41         65 my ( $expect, $fail_if_empty ) = @_;
1101              
1102 41 100       150 ref $expect or $expect = qr/\Q$expect/;
1103              
1104 41 100       131 my $endre = ( defined $self->{endofscope} ) ?
1105             qr/$expect|$self->{endofscope}/ :
1106             $expect;
1107              
1108             # NO skip_ws
1109              
1110 41         59 my $start = pos $self->{str};
1111 41         43 my $end;
1112 41 100       422 if( $self->{str} =~ m/\G(?s:.*?)($endre)/ ) {
1113 38         94 $end = $-[1];
1114             }
1115             else {
1116 3         8 $end = length $self->{str};
1117             }
1118              
1119 41 100 66     106 $self->fail( "Expected to find a non-empty substring before $expect" )
1120             if $fail_if_empty and $end == $start;
1121              
1122 40         92 return $self->take( $end - $start );
1123             }
1124              
1125             sub substring_before
1126             {
1127 40     40 1 63 my $self = shift;
1128 40         71 return $self->_substring_before( $_[0], 0 );
1129             }
1130              
1131             sub nonempty_substring_before
1132             {
1133 1     1 1 9 my $self = shift;
1134 1         13 return $self->_substring_before( $_[0], 1 );
1135             }
1136              
1137             =head2 generic_token
1138              
1139             $val = $parser->generic_token( $name, $re, $convert )
1140              
1141             I
1142              
1143             Expects to find a token matching the precompiled regexp C<$re>. If provided,
1144             the C<$convert> CODE reference can be used to convert the string into a more
1145             convenient form. C<$name> is used in the failure message if the pattern fails
1146             to match.
1147              
1148             If provided, the C<$convert> function will be passed the parser and the
1149             matching substring; the value it returns is returned from C.
1150              
1151             $convert->( $parser, $substr )
1152              
1153             If not provided, the substring will be returned as it stands.
1154              
1155             This method is mostly provided for subclasses to define their own token types.
1156             For example:
1157              
1158             sub token_hex
1159             {
1160             my $self = shift;
1161             $self->generic_token( hex => qr/[0-9A-F]{2}h/, sub { hex $_[1] } );
1162             }
1163              
1164             =cut
1165              
1166             sub generic_token
1167             {
1168 230     230 1 273 my $self = shift;
1169 230         359 my ( $name, $re, $convert ) = @_;
1170              
1171 230 50       377 $self->fail( "Expected $name" ) if $self->at_eos;
1172              
1173 230         432 $self->skip_ws;
1174 230 100       2985 $self->{str} =~ m/\G$re/gc or
1175             $self->fail( "Expected $name" );
1176              
1177 169         762 my $match = substr( $self->{str}, $-[0], $+[0] - $-[0] );
1178              
1179 169 100       558 return $convert ? $convert->( $self, $match ) : $match;
1180             }
1181              
1182             sub _token_generic
1183             {
1184 226     226   270 my $self = shift;
1185 226         559 my %args = @_;
1186              
1187 226         311 my $name = $args{name};
1188 226 50       501 my $re = $args{pattern} ? $self->{patterns}{ $args{pattern} } : $args{re};
1189 226         257 my $convert = $args{convert};
1190              
1191 226         433 $self->generic_token( $name, $re, $convert );
1192             }
1193              
1194             =head2 token_int
1195              
1196             $int = $parser->token_int
1197              
1198             Expects to find an integer in decimal, octal or hexadecimal notation, and
1199             consumes it. Negative integers, preceeded by C<->, are also recognised.
1200              
1201             =cut
1202              
1203             sub token_int
1204             {
1205 134     134 1 289 my $self = shift;
1206             $self->_token_generic(
1207             name => "int",
1208              
1209             pattern => "int",
1210             convert => sub {
1211 104     104   151 my $int = $_[1];
1212 104 100       213 my $sign = ( $int =~ s/^-// ) ? -1 : 1;
1213              
1214 104         126 $int =~ s/^0o/0/;
1215              
1216 104 100       227 return $sign * oct $int if $int =~ m/^0/;
1217 99         659 return $sign * $int;
1218             },
1219 134         463 );
1220             }
1221              
1222             =head2 token_float
1223              
1224             $float = $parser->token_float
1225              
1226             I
1227              
1228             Expects to find a number expressed in floating-point notation; a sequence of
1229             digits possibly prefixed by C<->, possibly containing a decimal point,
1230             possibly followed by an exponent specified by C followed by an integer. The
1231             numerical value is then returned.
1232              
1233             =cut
1234              
1235             sub token_float
1236             {
1237 20     20 1 59 my $self = shift;
1238             $self->_token_generic(
1239             name => "float",
1240              
1241             pattern => "float",
1242 18     18   122 convert => sub { $_[1] + 0 },
1243 20         76 );
1244             }
1245              
1246             =head2 token_number
1247              
1248             $number = $parser->token_number
1249              
1250             I
1251              
1252             Expects to find a number expressed in either of the above forms.
1253              
1254             =cut
1255              
1256             sub token_number
1257             {
1258 7     7 1 28 my $self = shift;
1259 7         21 $self->any_of( \&token_float, \&token_int );
1260             }
1261              
1262             =head2 token_string
1263              
1264             $str = $parser->token_string
1265              
1266             Expects to find a quoted string, and consumes it. The string should be quoted
1267             using C<"> or C<'> quote marks.
1268              
1269             The content of the quoted string can contain character escapes similar to
1270             those accepted by C or Perl. Specifically, the following forms are recognised:
1271              
1272             \a Bell ("alert")
1273             \b Backspace
1274             \e Escape
1275             \f Form feed
1276             \n Newline
1277             \r Return
1278             \t Horizontal Tab
1279             \0, \012 Octal character
1280             \x34, \x{5678} Hexadecimal character
1281              
1282             C's C<\v> for vertical tab is not supported as it is rarely used in practice
1283             and it collides with Perl's C<\v> regexp escape. Perl's C<\c> for forming other
1284             control characters is also not supported.
1285              
1286             =cut
1287              
1288             my %escapes = (
1289             a => "\a",
1290             b => "\b",
1291             e => "\e",
1292             f => "\f",
1293             n => "\n",
1294             r => "\r",
1295             t => "\t",
1296             );
1297              
1298             sub token_string
1299             {
1300 53     53 1 137 my $self = shift;
1301              
1302 53 100       102 $self->fail( "Expected string" ) if $self->at_eos;
1303              
1304 52         115 my $pos = pos $self->{str};
1305              
1306 52         109 $self->skip_ws;
1307 52 100       347 $self->{str} =~ m/\G($self->{patterns}{string_delim})/gc or
1308             $self->fail( "Expected string delimiter" );
1309              
1310 32         89 my $delim = $1;
1311              
1312             $self->{str} =~ m/
1313             \G(
1314             (?:
1315             \\[0-7]{1,3} # octal escape
1316             |\\x[0-9A-F]{2} # 2-digit hex escape
1317             |\\x\{[0-9A-F]+\} # {}-delimited hex escape
1318             |\\. # symbolic escape
1319             |[^\\$delim]+ # plain chunk
1320             )*?
1321             )$delim/gcix or
1322 32 50       780 pos($self->{str}) = $pos, $self->fail( "Expected contents of string" );
1323              
1324 32         84 my $string = $1;
1325              
1326 32         80 $string =~ s<\\(?:([0-7]{1,3})|x([0-9A-F]{2})|x\{([0-9A-F]+)\}|(.))>
1327             [defined $1 ? chr oct $1 :
1328             defined $2 ? chr hex $2 :
1329 11 50       67 defined $3 ? chr hex $3 :
    100          
    100          
    100          
1330             exists $escapes{$4} ? $escapes{$4} : $4]egi;
1331 32         101  
1332             return $string;
1333             }
1334              
1335             =head2 token_ident
1336              
1337             $ident = $parser->token_ident
1338              
1339             Expects to find an identifier, and consumes it.
1340              
1341             =cut
1342              
1343             sub token_ident
1344 72     72 1 121 {
1345 72         123 my $self = shift;
1346             $self->_token_generic(
1347             name => "ident",
1348              
1349             pattern => "ident",
1350             );
1351             }
1352              
1353             =head2 token_kw
1354              
1355             $keyword = $parser->token_kw( @keywords )
1356              
1357             Expects to find a keyword, and consumes it. A keyword is defined as an
1358             identifier which is exactly one of the literal values passed in.
1359              
1360             =cut
1361              
1362             sub token_kw
1363 2     2 1 9 {
1364 2         5 my $self = shift;
1365             my @acceptable = @_;
1366 2         6  
1367             $self->skip_ws;
1368 2         3  
1369             my $pos = pos $self->{str};
1370 2 50       7  
1371             defined( my $kw = $self->token_ident ) or
1372             return undef;
1373 4         17  
1374 2 100       3 grep { $_ eq $kw } @acceptable or
1375             pos($self->{str}) = $pos, $self->fail( "Expected any of ".join( ", ", @acceptable ) );
1376 1         3  
1377             return $kw;
1378             }
1379              
1380             package # hide from indexer
1381             Parser::MGC::Failure;
1382              
1383             sub new
1384 273     273   319 {
1385 273         442 my $class = shift;
1386 273         459 my $self = bless {}, $class;
  273         1107  
1387 273         1114 @{$self}{qw( message parser pos )} = @_;
1388             return $self;
1389             }
1390 31     31   25447  
  31         19299  
  31         200  
1391             use overload '""' => "STRING";
1392             sub STRING
1393 43     43   3444 {
1394             my $self = shift;
1395 43         88  
1396 43         164 my $parser = $self->{parser};
1397             my ( $linenum, $col, $text ) = $parser->where( $self->{pos} );
1398              
1399             # Column number only counts characters. There may be tabs in there.
1400             # Rather than trying to calculate the visual column number, just print the
1401             # indentation as it stands.
1402 43         85  
1403 43         122 my $indent = substr( $text, 0, $col );
1404             $indent =~ s/[^ \t]/ /g; # blank out all the non-whitespace
1405 43         144  
1406 43 100 100     152 my $filename = $parser->filename;
1407             my $in_file = ( defined $filename and !ref $filename )
1408             ? "in $filename " : "";
1409 43         422  
1410             return "$self->{message} ${in_file}on line $linenum at:\n" .
1411             "$text\n" .
1412             "$indent^\n";
1413             }
1414              
1415 31     31   6153 # Provide fallback operators for cmp, eq, etc...
  31         58  
  31         137  
1416             use overload fallback => 1;
1417              
1418             =head1 EXAMPLES
1419              
1420             =head2 Accumulating Results Using Variables
1421              
1422             Although the structure-forming methods all return a value, obtained from their
1423             nested parsing code, it can sometimes be more convenient to use a variable to
1424             accumulate a result in instead. For example, consider the following parser
1425             method, designed to parse a set of C assignments, such as might
1426             be found in a configuration file, or YAML/JSON-style mapping value.
1427              
1428             sub parse_dict
1429             {
1430             my $self = shift;
1431              
1432             my %ret;
1433             $self->list_of( ",", sub {
1434             my $key = $self->token_ident;
1435             exists $ret{$key} and $self->fail( "Already have a mapping for '$key'" );
1436              
1437             $self->expect( ":" );
1438              
1439             $ret{$key} = $self->parse_value;
1440             } );
1441              
1442             return \%ret
1443             }
1444              
1445             Instead of using the return value from C, this method accumulates
1446             values in the C<%ret> hash, eventually returning a reference to it as its
1447             result. Because of this, it can perform some error checking while it parses;
1448             namely, rejecting duplicate keys.
1449              
1450             =head1 TODO
1451              
1452             =over 4
1453              
1454             =item *
1455              
1456             Make unescaping of string constants more customisable. Possibly consider
1457             instead a C using a loop over C.
1458              
1459             =item *
1460              
1461             Easy ability for subclasses to define more token types as methods. Perhaps
1462             provide a class method such as
1463              
1464             __PACKAGE__->has_token( hex => qr/[0-9A-F]+/i, sub { hex $_[1] } );
1465              
1466             =item *
1467              
1468             Investigate how well C can cope with buffer splitting across
1469             other tokens than simply skippable whitespace
1470              
1471             =back
1472              
1473             =head1 AUTHOR
1474              
1475             Paul Evans
1476              
1477             =cut
1478              
1479             0x55AA;