File Coverage

blib/lib/Parse/FSM/Driver.pm

Criterion	Covered	Total	%
statement	74	75	98.6
branch	34	36	94.4
condition			n/a
subroutine	14	14	100.0
pod	5	5	100.0
total	127	130	97.6

line	stmt	bran	sub	pod	time	code
1						# $Id: Driver.pm,v 1.10 2013/07/27 00:34:39 Paulo Exp $
2
3						package Parse::FSM::Driver;
4
5						#------------------------------------------------------------------------------
6
7						=head1 NAME
8
9						Parse::FSM::Driver - Run-time engine for Parse::FSM parser
10
11						=cut
12
13						#------------------------------------------------------------------------------
14
15	6		6		967	use warnings;
	6				10
	6				200
16	6		6		26	use strict;
	6				7
	6				200
17
18	6		6		24	use Carp; our @CARP_NOT = ('Parse::FSM::Driver');
	6				8
	6				574
19	6		6		752	use Data::Dump 'dump';
	6				6482
	6				621
20
21						our $VERSION = '1.11';
22
23						#------------------------------------------------------------------------------
24
25						=head1 SYNOPSIS
26
27						use MyParser; # isa Parse::FSM::Driver
28
29						$parser = MyParser->new;
30						$parser->input( \&lexer );
31						$parser->user( $user_pointer );
32
33						$result = $parser->parse( $start_rule );
34						$result = $parser->parse_start_rule;
35
36						$token = $parser->peek_token;
37						$token = $parser->get_token;
38						$parser->unget_token(@tokens);
39
40						=head1 DESCRIPTION
41
42						This modules implements a deterministic top-down parser based on a
43						pre-computed Finite State Machine (FSM).
44
45						The FSM is generated by L, by
46						reading a BNF-type grammar file and generating
47						a run-time module that includes the state tables. The module also include
48						the run-time parsing routine that follows the state tables to obtain
49						a parse of the input.
50
51						This module is not intended to be used stand alone. It is used as a base class
52						by the modules generated by L.
53
54						=head1 METHODS - SETUP
55
56						=head2 new
57
58						Creates a new object.
59
60						=head2 user
61
62						Get/set of the parser user pointer. The user pointer is not used by the parser,
63						and is available for communication between the parser actions and the
64						calling module.
65
66						It can for example point to a data structure that describes the objects
67						already identified in the parse.
68
69						=cut
70
71						#------------------------------------------------------------------------------
72						# Parsing state machine
73						# Each state hash has:
74						# terminal => (state ID), for a match
75						# terminal => [ (subrule ID), (next state ID) ], for a sub-rule
76						# followed by a match
77						# terminal => [ (subrule ID), sub{} ], for a sub-rule followed by an accept
78						# terminal => sub{}, for an accept
79						# Each sub{} has $self and @args pre-declared
80						# @args is [] of all parsed elements
81						# $self is the Parse::FSM::Driver object
82
83						#------------------------------------------------------------------------------
84						use Class::XSAccessor {
85	6				58	constructor => '_init',
86						accessors => [
87						'input', # input iterator
88						'_head', # unget queue of tokens retrived from input
89						'user', # user pointer
90						'_state_table', # list of states
91						'_start_state', # ID of start state
92						],
93	6		6		760	};
	6				2723
94
95						#------------------------------------------------------------------------------
96						sub new {
97	100		100	1	2847	my($class, @args) = @_;
98						return $class->_init(
99	3		3		455	input => sub {},
100	100				1469	_head => [],
101						user => {},
102						_state_table => [],
103						_start_state => 0,
104						@args);
105						}
106						#------------------------------------------------------------------------------
107
108						=head1 METHODS - INPUT STREAM
109
110						=head2 input
111
112						Get/set the parser input lexer iterator. The iterator is a code reference of
113						a function that returns the next token to be parsed as an array ref,
114						with token type and token value C<[$type, $value]>.
115						It returns C on end of input. E.g. for a simple expression lexer:
116
117						sub make_lexer {
118						my($line) = @_;
119						return sub {
120						for ($line) {
121						/\G\s+/gc;
122						return [NUM => $1] if /\G(\d+)/gc;
123						return [NAME => $1] if /\G([a-z]\w*)/gci;
124						return [$1 => $1] if /\G(.)/gc;
125						return;
126						}
127						};
128						}
129						$parser->input(make_lexer("2+3*4"));
130
131						=head2 peek_token
132
133						Returns the next token to be retrieved by the lexer, but keeps it in the input
134						queue. Can be used by a rule action to decide based on the input that follows.
135
136						=cut
137
138						#------------------------------------------------------------------------------
139						sub peek_token {
140	2407		2407	1	2135	my($self) = @_;
141	2407	100			1775	@{$self->_head} or push @{$self->_head}, $self->input->();
	1340				2945
	2407				5008
142	2403				8972	return $self->_head->[0]; # may be undef, if end of input
143						}
144						#------------------------------------------------------------------------------
145
146						=head2 get_token
147
148						Extracts the next token from the lexer stream. Can be used by a rule action to
149						discard the following tokens.
150
151						=cut
152
153						#------------------------------------------------------------------------------
154						sub get_token {
155	886		886	1	31433	my($self) = @_;
156	886	100			727	@{$self->_head} and return shift @{$self->_head};
	698				1251
	886				2352
157	188				647	return $self->_head->[0]; # may be undef, if end of input
158						}
159						#------------------------------------------------------------------------------
160
161						=head2 unget_token
162
163						Pushes back the given list of tokens to the lexer input stream, to be retrieved
164						on the next calls to C.
165
166						=cut
167
168						#------------------------------------------------------------------------------
169						sub unget_token {
170	4		4	1	9	my($self, @tokens) = @_;
171	4				5	unshift @{$self->_head}, @tokens;
	4				16
172	4				38	return;
173						}
174						#------------------------------------------------------------------------------
175
176						=head1 METHODS - PARSING
177
178						=head2 parse
179
180						This function receives an optional start rule name, and uses the default rule
181						of the grammar if not supplied.
182
183						It parses the input stream, leaving the stream at the first unparsed
184						token, and returns the parse value - the result of the action function for the
185						start rule.
186
187						The function dies with an error message indicating the input that cannot
188						be parsed in case of a parse error.
189
190						=head2 parse_XXX
191
192						For each rule C in the grammar, L creates a correspnding
193						C to start the parse at that rule. This is a short-cut to
194						C.
195
196						=cut
197
198						#------------------------------------------------------------------------------
199						sub parse {
200	175		175	1	11757	my($self, $start_rule) = @_;
201
202						# current state
203	175				226	my $state;
204	175	100			380	if (defined($start_rule)) {
205	8	100			212	$state = $self->_state_table->[0]{$start_rule}
206						or croak "Rule $start_rule not found";
207						}
208						else {
209	167	50			666	$state = $self->_start_state
210						or croak "Start state not found";
211						}
212	174				434	return $self->_parse($state);
213						}
214
215						#------------------------------------------------------------------------------
216						sub _parse {
217	194		194		235	my($self, $state) = @_;
218
219	194				272	my @values = ();
220
221						# return stack of states
222	194				213	my @stack = (); # store: [$state, @values]
223
224						# fetch token only after drop and after calling parser rules
225	194				422	my $token = $self->peek_token;
226	194				282	while (1) {
227	2572				1934	my($entry, $found_else);
228	2572	100			8424	if ($entry = $self->_state_table->[$state]{($token ? $token->[0] : "")}) {
		100
		100
229						# entry exists, found token
230						}
231						elsif ($entry = $self->_state_table->[$state]{__else__}) {
232	521				478	$found_else++;
233						}
234						else {
235	82				279	$self->_error_at($token, $state);
236						}
237
238	2490	100			3358	if (ref($entry) eq 'ARRAY') { # call sub-rule
239	1411				1536	my($next_state, $return_state) = @$entry;
240	1411				1985	push(@stack, [ $return_state, @values ]); # return data
241	1411				2013	($state, @values) = ($next_state); # call
242						}
243						else { # accept token
244	1079				858	$state = $entry;
245
246	1079	100			1646	if (!$found_else) {
247	725	100			1289	push(@values, $token) if $token; # add token to values
248	725				1194	$self->get_token; # drop value
249	725				1046	$token = $self->peek_token; # and get next token
250						}
251
252	1075				1964	while (ref($state) eq 'CODE') { # return from sub-rules
253	1460				8447	my $value = $self->$state(@values);
254	1460				2271	$token = $self->peek_token; # input may have changed
255
256	1460	100			2351	if ( ! @stack ) { # END OF PARSE
257	108				651	return $value;
258						}
259
260	1352				1257	my $top = pop(@stack);
261	1352				2412	($state, @values) = @$top;
262
263						# keep only defined values
264	1352	100			4388	push(@values, $value) if defined($value);
265						}
266						}
267						}
268	0				0	die 'not reached';
269						}
270
271						#------------------------------------------------------------------------------
272						# expected error at given stream position, die with error message
273						sub _error_at {
274	84		84		493	my($self, $token, $state) = @_;
275
276	95				773	my @expected = sort map {_format_token($_)}
	84				288
277	84				98	keys %{$self->_state_table->[$state]};
278	84	100			1999	die("Expected ",
		100
279						scalar(@expected) == 1 ? "@expected" : "one of (@expected)",
280						" at ",
281						defined($token) ? _format_token($token->[0]) : "EOF",
282						"\n");
283						}
284
285						#------------------------------------------------------------------------------
286						# format a token
287						sub _format_token {
288	131		131		183	my($token) = @_;
289	131	50			258	return "" if !defined($token);
290	131	100			321	return "EOF" if $token eq "";
291	113	100			434	return dump($token) if $token =~ /\W/;
292	89				409	return $token;
293						}
294						#------------------------------------------------------------------------------
295
296						=head1 AUTHOR, BUGS, FEEDBACK, LICENSE, COPYRIGHT
297
298						See L
299
300						=cut
301
302						#------------------------------------------------------------------------------
303
304						1;