File Coverage

blib/lib/Parse/FSM/Driver.pm

Criterion	Covered	Total	%
statement	73	74	98.6
branch	34	36	94.4
condition			n/a
subroutine	14	14	100.0
pod	5	5	100.0
total	126	129	97.6

line	stmt	bran	sub	pod	time	code
1						# $Id: Driver.pm,v 1.10 2013/07/27 00:34:39 Paulo Exp $
2
3						package Parse::FSM::Driver;
4
5						#------------------------------------------------------------------------------
6
7						=head1 NAME
8
9						Parse::FSM::Driver - Run-time engine for Parse::FSM parser
10
11						=cut
12
13						#------------------------------------------------------------------------------
14
15	6		6		851	use warnings;
	6				11
	6				155
16	6		6		26	use strict;
	6				12
	6				107
17
18	6		6		24	use Carp; our @CARP_NOT = ('Parse::FSM::Driver');
	6				11
	6				538
19	6		6		782	use Data::Dump 'dump';
	6				5228
	6				489
20
21						our $VERSION = '1.12';
22
23						#------------------------------------------------------------------------------
24
25						=head1 SYNOPSIS
26
27						use MyParser; # isa Parse::FSM::Driver
28
29						$parser = MyParser->new;
30						$parser->input( \&lexer );
31						$parser->user( $user_pointer );
32
33						$result = $parser->parse( $start_rule );
34						$result = $parser->parse_start_rule;
35
36						$token = $parser->peek_token;
37						$token = $parser->get_token;
38						$parser->unget_token(@tokens);
39
40						=head1 DESCRIPTION
41
42						This module implements a deterministic top-down parser based on a
43						pre-computed Finite State Machine (FSM).
44
45						The FSM is generated by L, by
46						reading a BNF-type grammar file and generating
47						a run-time module that includes the state tables. The module also include
48						the run-time parsing routine that follows the state tables to obtain
49						a parse of the input.
50
51						This module is not intended to be used stand alone. It is used as a base class
52						by the modules generated by L.
53
54						=head1 METHODS - SETUP
55
56						=head2 new
57
58						Creates a new object.
59
60						=head2 user
61
62						Get/set of the parser user pointer. The user pointer is not used by the parser,
63						and is available for communication between the parser actions and the
64						calling module.
65
66						It can for example point to a data structure that describes the objects
67						already identified in the parse.
68
69						=cut
70
71						#------------------------------------------------------------------------------
72						# Parsing state machine
73						# Each state hash has:
74						# terminal => (state ID), for a match
75						# terminal => [ (subrule ID), (next state ID) ], for a sub-rule
76						# followed by a match
77						# terminal => [ (subrule ID), sub{} ], for a sub-rule followed by an accept
78						# terminal => sub{}, for an accept
79						# Each sub{} has $self and @args pre-declared
80						# @args is [] of all parsed elements
81						# $self is the Parse::FSM::Driver object
82
83						#------------------------------------------------------------------------------
84						use Class::XSAccessor {
85	6				105	constructor => '_init',
86						accessors => [
87						'input', # input iterator
88						'_head', # unget queue of tokens retrived from input
89						'user', # user pointer
90						'_state_table', # list of states
91						'_start_state', # ID of start state
92						],
93	6		6		738	};
	6				2466
94
95						#------------------------------------------------------------------------------
96						sub new {
97	100		100	1	2351	my($class, @args) = @_;
98						return $class->_init(
99			3			input => sub {},
100	100				1142	_head => [],
101						user => {},
102						_state_table => [],
103						_start_state => 0,
104						@args);
105						}
106						#------------------------------------------------------------------------------
107
108						=head1 METHODS - INPUT STREAM
109
110						=head2 input
111
112						Get/set the parser input lexer iterator. The iterator is a code reference of
113						a function that returns the next token to be parsed as an array ref,
114						with token type and token value C<[$type, $value]>.
115						It returns C on end of input. E.g. for a simple expression lexer:
116
117						sub make_lexer {
118						my($line) = @_;
119						return sub {
120						for ($line) {
121						/\G\s+/gc;
122						return [NUM => $1] if /\G(\d+)/gc;
123						return [NAME => $1] if /\G([a-z]\w*)/gci;
124						return [$1 => $1] if /\G(.)/gc;
125						return;
126						}
127						};
128						}
129						$parser->input(make_lexer("2+3*4"));
130
131						=head2 peek_token
132
133						Returns the next token to be retrieved by the lexer, but keeps it in the input
134						queue. Can be used by a rule action to decide based on the input that follows.
135
136						=cut
137
138						#------------------------------------------------------------------------------
139						sub peek_token {
140	2407		2407	1	2806	my($self) = @_;
141	2407	100			2346	@{$self->_head} or push @{$self->_head}, $self->input->();
	1340				3804
	2407				6055
142	2403				10481	return $self->_head->[0]; # may be undef, if end of input
143						}
144						#------------------------------------------------------------------------------
145
146						=head2 get_token
147
148						Extracts the next token from the lexer stream. Can be used by a rule action to
149						discard the following tokens.
150
151						=cut
152
153						#------------------------------------------------------------------------------
154						sub get_token {
155	886		886	1	28971	my($self) = @_;
156	886	100			829	@{$self->_head} and return shift @{$self->_head};
	698				1499
	886				2531
157	188				646	return $self->_head->[0]; # may be undef, if end of input
158						}
159						#------------------------------------------------------------------------------
160
161						=head2 unget_token
162
163						Pushes back the given list of tokens to the lexer input stream, to be retrieved
164						on the next calls to C.
165
166						=cut
167
168						#------------------------------------------------------------------------------
169						sub unget_token {
170	4		4	1	8	my($self, @tokens) = @_;
171	4				5	unshift @{$self->_head}, @tokens;
	4				16
172	4				43	return;
173						}
174						#------------------------------------------------------------------------------
175
176						=head1 METHODS - PARSING
177
178						=head2 parse
179
180						This function receives an optional start rule name, and uses the default rule
181						of the grammar if not supplied.
182
183						It parses the input stream, leaving the stream at the first unparsed
184						token, and returns the parse value - the result of the action function for the
185						start rule.
186
187						The function dies with an error message indicating the input that cannot
188						be parsed in case of a parse error.
189
190						=head2 parse_XXX
191
192						For each rule C in the grammar, L creates a
193						corresponding C to start the parse at that rule. This is a
194						short-cut to C.
195
196						=cut
197
198						#------------------------------------------------------------------------------
199						sub parse {
200	175		175	1	13064	my($self, $start_rule) = @_;
201
202						# current state
203	175				209	my $state;
204	175	100			344	if (defined($start_rule)) {
205	8	100			199	$state = $self->_state_table->[0]{$start_rule}
206						or croak "Rule $start_rule not found";
207						}
208						else {
209	167	50			578	$state = $self->_start_state
210						or croak "Start state not found";
211						}
212	174				397	return $self->_parse($state);
213						}
214
215						#------------------------------------------------------------------------------
216						sub _parse {
217	194		194		256	my($self, $state) = @_;
218
219	194				282	my @values = ();
220
221						# return stack of states
222	194				218	my @stack = (); # store: [$state, @values]
223
224						# fetch token only after drop and after calling parser rules
225	194				457	my $token = $self->peek_token;
226	194				229	while (1) {
227	2572				2611	my($entry, $found_else);
228	2572	100			8970	if ($entry = $self->_state_table->[$state]{($token ? $token->[0] : "")}) {
		100
		100
229						# entry exists, found token
230						}
231						elsif ($entry = $self->_state_table->[$state]{__else__}) {
232	521				677	$found_else++;
233						}
234						else {
235	82				250	$self->_error_at($token, $state);
236						}
237
238	2490	100			4507	if (ref($entry) eq 'ARRAY') { # call sub-rule
239	1411				1756	my($next_state, $return_state) = @$entry;
240	1411				2386	push(@stack, [ $return_state, @values ]); # return data
241	1411				2577	($state, @values) = ($next_state); # call
242						}
243						else { # accept token
244	1079				1151	$state = $entry;
245
246	1079	100			1816	if (!$found_else) {
247	725	100			1420	push(@values, $token) if $token; # add token to values
248	725				1209	$self->get_token; # drop value
249	725				1300	$token = $self->peek_token; # and get next token
250						}
251
252	1075				2451	while (ref($state) eq 'CODE') { # return from sub-rules
253	1460				8406	my $value = $self->$state(@values);
254	1460				2775	$token = $self->peek_token; # input may have changed
255
256	1460	100			2665	if ( ! @stack ) { # END OF PARSE
257	108				571	return $value;
258						}
259
260	1352				1557	my $top = pop(@stack);
261	1352				2769	($state, @values) = @$top;
262
263						# keep only defined values
264	1352	100			5234	push(@values, $value) if defined($value);
265						}
266						}
267						}
268	0				0	die 'not reached';
269						}
270
271						#------------------------------------------------------------------------------
272						# expected error at given stream position, die with error message
273						sub _error_at {
274	84		84		596	my($self, $token, $state) = @_;
275
276	95				737	my @expected = sort map {_format_token($_)}
277	84				109	keys %{$self->_state_table->[$state]};
	84				300
278	84	100			1545	die("Expected ",
		100
279						scalar(@expected) == 1 ? "@expected" : "one of (@expected)",
280						" at ",
281						defined($token) ? _format_token($token->[0]) : "EOF",
282						"\n");
283						}
284
285						#------------------------------------------------------------------------------
286						# format a token
287						sub _format_token {
288	131		131		179	my($token) = @_;
289	131	50			250	return "" if !defined($token);
290	131	100			299	return "EOF" if $token eq "";
291	113	100			379	return dump($token) if $token =~ /\W/;
292	89				463	return $token;
293						}
294						#------------------------------------------------------------------------------
295
296						=head1 AUTHOR, BUGS, FEEDBACK, LICENSE, COPYRIGHT
297
298						See L
299
300						=cut
301
302						#------------------------------------------------------------------------------
303
304						1;