File Coverage

blib/lib/Math/Expression/Evaluator/Parser.pm
Criterion Covered Total %
statement 141 143 98.6
branch 48 50 96.0
condition 3 3 100.0
subroutine 24 24 100.0
pod 1 1 100.0
total 217 221 98.1


line stmt bran cond sub pod time code
1             package Math::Expression::Evaluator::Parser;
2              
3             =head1 NAME
4              
5             Math::Expression::Evaluator::Parser - Parse mathematical expressions
6              
7             =head1 SYNOPSIS
8              
9             use Math::Expression::Evaluator::Parser;
10             my $exp = '2 + a * 4';
11             my $ast = Math::Expression::Evaluator::Parser::parse($exp, {});
12             # $ast is now something like this:
13             # $ast = ['+',
14             # 2,
15             # ['*',
16             # ['$', 'a'],
17             # 4
18             # ]
19             # ];
20              
21             =head1 DESCRIPTION
22              
23             This module parses a mathematical expression in usual notation, and
24             turns it into an Abstract Syntax Tree (AST).
25              
26             If you want to have a simple interface and want to evaluate these
27             ASTs, use L.
28              
29             The following description of the AST structure matches the current
30             implementation, but really is an implementation detail that's subject to
31             change without further notice. In particular a possible addition of meta
32             information (like file and line numbers) might require a change of structure.
33              
34             The AST is a tree that consists of nested array refs. The first item
35             is a string (until now always a single character), and denotes the type
36             of the node. The rest of the items in the array is a list of its arguments.
37              
38             For the mathematical symbols C<+>, C<->, C<*>, C, C<^> (exponentation)
39             this is straight forward, but C and C<-> are always treated as prefix ops,
40             so the string '2 - 3' is actually turned into C<['+', 2, ['-', 3]]>.
41              
42             Other AST nodes are
43              
44             =over 4
45              
46             =item '$'
47              
48             C<['$', $var_name]> represents a variable.
49              
50             =item '{'
51              
52             C<['{', $expr1, $expr2, ... ]> represents a block, i.e. a list of expressions.
53              
54             =item '='
55              
56             C<['=', $var, $expr]> represents an assignment, where C<$expr> is assigned
57             to C<$var>.
58              
59             =item '&'
60              
61             C<['&', $name, @args]> is a function toll to the function called C<$name>.
62              
63             =back
64              
65             =head1 METHODS
66              
67             =over
68              
69             =item parse
70              
71             C takes a string and a hash ref, where the hash ref takes
72             configuration parameters. Currently the only allowed option is
73             C. If set to a true value, it forces statements to
74             be forced by semicolons (so C<2 3> will be forbidden, C<2; 3> is still
75             allowed).
76              
77             C throws an exception on parse errors.
78              
79             =back
80              
81             =cut
82              
83 16     16   80 use strict;
  16         30  
  16         476  
84 16     16   107 use warnings;
  16         36  
  16         446  
85              
86 16     16   16002 use Math::Expression::Evaluator::Lexer qw(lex);
  16         45  
  16         1105  
87 16     16   9987 use Math::Expression::Evaluator::Util qw(is_lvalue);
  16         42  
  16         932  
88 16     16   660 use Carp qw(confess);
  16         29  
  16         820  
89 16     16   81 use Data::Dumper;
  16         24  
  16         29881  
90              
91              
92             my @input_tokens = (
93             ['ExpOp' => '\^|\*\*'],
94             ['MulOp' => qr{[*/%]}],
95             ['AddOp' => '\+|-'],
96             # This regex is 'stolen' from Regexp::Common, and a bit simplified
97             # Copyright by Damian Conway and Abigail, 2001-2005
98             ['Float' => "[+-]?(?=[0-9]|[.])[0-9]*(?:[.][0-9]*)?(?:[eE](?:[+-]?[0-9]+)|)"],
99             ['OpenParen' => '\('],
100             ['ClosingParen' => '\)'],
101             ['Colon' => ';'],
102             ['Comma' => ','],
103             ['AssignmentOp' => '='],
104             ['Name' => '[a-zA-Z_][a-zA-Z_0-9]*'],
105             ['Whitespace' => '\s+', sub {return undef}],
106             ['Comment' => qr/\#.*?$/, sub {return undef}],
107             );
108              
109             my %token_description = (
110             ExpOp => 'Operator',
111             MulOp => 'Operator',
112             AddOp => 'Operator',
113             AssignmentOp => 'Operator',
114             Float => 'Term',
115             Name => 'Term',
116             );
117              
118             sub parse {
119              
120 291     291 1 707 my ($text, $parse_opts) = @_;
121              
122             # note that this object is only used internally, to the
123             # world outside we hide it.
124 291         524 my $self = bless {};
125 291         760 $self->{config} = $parse_opts;
126 291         1099 $self->{tokens} = lex($text, \@input_tokens);
127 290         627 $self->{token_pointer} = 0;
128 290         1000 return $self->_program();
129              
130             }
131              
132             # checks if the next token is what you expected, for example
133             # _is_next_token("AddOp") checks if the next token is a '+' or '-'
134             sub _is_next_token {
135 4578     4578   5206 my $self = shift;
136 4578         5102 my $cmp = shift;
137 4578         7596 my $next = $self->{tokens}[$self->{token_pointer}];
138 4578 100 100     26156 if (defined $next && $next->[0] eq $cmp){
139 499         1675 return $next->[1];
140             }
141             }
142              
143             # basically the same _is_next_token, but does an arbitrary number of lookahead
144             # steps.
145             sub _lookahead {
146 1252     1252   1380 my $self = shift;
147 1252         1440 my $i = 0;
148 1252         3653 while (my $v = shift){
149 1422 100       3854 return undef unless($self->{tokens}[$self->{token_pointer}+$i]);
150 1397         2468 my $ref = $self->{tokens}[$self->{token_pointer} + $i]->[0];
151 1397 100       6269 return undef unless($ref eq $v);
152 261         696 $i++;
153             }
154 91         191 return 1;
155             }
156              
157             # move the token pointer one step further.
158             sub _proceed {
159 1340     1340   1594 my $self = shift;
160 1340         2313 $self->{token_pointer}++;
161             }
162              
163             # returns the next not-yet-parsed token
164             sub _next_token {
165 2645     2645   7645 return $_[0]->{tokens}[$_[0]->{token_pointer}];
166             }
167              
168             # program -> statement*
169             # parse a program, e.g. a collection of statements.
170             # The corrsponding AST looks like this: ['{', $s1, $s2, $s3, ... ]
171             sub _program {
172 290     290   439 my $self = shift;
173 290         593 my @res = ('{');
174 290         646 while (defined $self->_next_token()){
175 304         774 push @res, $self->_statement();
176             }
177 277         547 return _return_simplify(@res);
178             }
179              
180             # generates an error message that something was expected but not found,
181             # for example 'a + +' would warn that a value was expected, but an AddOp
182             # was found.
183             sub _expected {
184 4     4   5 my $self = shift;
185 4 50       9 if (scalar @_ > 1){
186 4         15 confess("Parse error: Expected $_[0]; got: '$_[1]'\n"
187             . "near character " . $self->_next_token->[2] . "\n");
188             } else {
189 0         0 confess("Parse error: Expected $_[0]\n"
190             . "near character " . $self->_next_token->[2] . "\n");
191             }
192             }
193              
194             # matches a specific token, and returns its text if successfull. Dies if
195             # unsuccessfull.
196             sub _match {
197 1031     1031   1169 my $self = shift;
198 1031         1265 my $m = shift;
199 1031         1047 my $val;
200 1031         1757 my $next = $self->_next_token();
201 1031 100       3206 confess("Expected $m, got EOF") unless ref $next;
202 1025 100       2071 if ($next->[0] eq $m){
203 1021         1769 $val = $self->_next_token()->[1];
204 1021         2226 $self->_proceed();
205 1021         2443 return $val;
206             } else {
207 4         6 $self->_expected($m, $self->_next_token()->[0]);
208             }
209             }
210              
211             # -> | |
212             # parses a single value: a float, a function call or a variable name
213             # returns the corresponding AST.
214             sub _value {
215 708     708   807 my $self = shift;
216 708 100       1842 if ($self->_lookahead("Name", "OpenParen")){
    100          
    100          
217 85         155 return $self->_function_call();
218             } elsif ($self->_is_next_token("Name")){
219 79         248 return $self->_get_variable();
220             } elsif ($self->_lookahead(qw/AddOp Float/)) {
221 6         14 my $sign = $self->_match('AddOp');
222 6         17 return ("$sign".1) * $self->_match('Float');
223             } else {
224 538         1107 return $self->_match("Float");
225             }
226             }
227              
228             # -> '(' [ [',' ]* ]? ')'
229             # parses a function call, the AST looks like this: ['&', $name, @args]
230             sub _function_call {
231 85     85   115 my $self = shift;
232 85         146 my @res = ('&', $self->_match("Name"));
233 85         190 $self->_match("OpenParen");
234 85 100       181 if ($self->_is_next_token("ClosingParen")){
235 8         19 $self->_proceed();
236 8         27 return \@res;
237             }
238 77         232 push @res, $self->_expression();
239 77         246 while ($self->_is_next_token("Comma")){
240 4         14 $self->_proceed();
241 4 100       10 last if $self->_is_next_token('ClosingParen');
242 2         10 push @res, $self->_expression();
243             }
244 77         269 $self->_match("ClosingParen");
245 77         200 return \@res;
246             }
247              
248             # -> m/[a-zA-Z_]\w*/
249             # parses a variable name, and returns it
250             sub _get_variable {
251 79     79   105 my $self = shift;
252 79         198 my $var_name = $self->_match("Name");
253 79         269 return ['$', $var_name];
254             }
255              
256             # -> <_assignment> |
257             # parses a statement, eg an _assignment or an expression.
258             sub _statement {
259 304     304   370 my $self = shift;
260 304         728 my $e = $self->_expression();
261 296 100       609 if ($self->_is_next_token("AssignmentOp")){
262 10         28 $e = $self->_assignment($e);
263             }
264              
265 293 100       795 if ($self->{config}->{force_semicolon}){
266             # forced semicolon between two statements (but the last statement
267             # isn't forced to have one):
268 4 100       8 if ($self->_next_token()){
269 3         8 $self->_match("Colon");
270             }
271             } else {
272             # optional semicolon at end of statement
273 289 100       584 if ($self->_is_next_token("Colon")){
274 7         19 $self->_proceed();
275             }
276             }
277 291         899 return $e;
278             }
279              
280             # <_assignment> ::= '='
281             # expects the lvalue as first argument
282             sub _assignment {
283 10     10   16 my $self = shift;
284 10         19 my $e = shift;
285 10         25 $self->_match("AssignmentOp");
286 10         27 my $val = $self->_expression();
287 10 100       87 if (is_lvalue($e)){
288 7         27 return ['=', $e, $val];
289             } else {
290 3         508 confess("Not an lvalue in _assignment");
291             }
292             }
293              
294              
295             # ::= [('*'|'/') ]*
296             # the AST is a bit weird, a simple product is expressed as
297             # ['*', $v1, $v2, ... ]
298             # a division is a bit more complex:
299             # a / b / c becomes ['*', a, ['/', b], ['/', c]]
300             sub _term {
301 568     568   725 my $self = shift;
302 568         1116 my $val = $self->_exponential();
303 562         1374 my @res = ('*', $val);
304 562         1073 while (my $op = $self->_is_next_token("MulOp")){
305 157 100       394 if ($op eq '*'){
    100          
    50          
306 87         179 $self->_proceed();
307 87         188 push @res, $self->_exponential();
308             } elsif ($op eq '/'){
309 37         89 $self->_proceed();
310 37         82 push @res, ['/', $self->_exponential()];
311             } elsif ($op eq '%') {
312 33         70 $self->_proceed();
313             # XXX not very efficient
314 33         98 @res = ('*', ['%', [@res], $self->_exponential()]);
315             } else {
316 0         0 die "Don't know how to handle MulOp $op\n";
317             }
318             }
319 560         1189 return _return_simplify(@res);
320             }
321              
322             # ::= ['+'|'-']? [('+'|'-') term]*
323             sub _expression {
324 446     446   542 my $self = shift;
325             # print STDERR "expression...\n";
326 446         779 my @res = ('+');
327 446 100       964 if (my $op = $self->_is_next_token("AddOp")){
328             # unary +/-
329 21         101 $self->_proceed();
330 21 100       45 if ($op eq '+'){
331 4         9 push @res, $self->_term();
332             } else {
333 17         41 push @res, ['-', $self->_term()];
334             }
335             } else {
336 425         1048 push @res, $self->_term();
337             }
338 441         1002 while (my $op = $self->_is_next_token("AddOp")){
339 122 100       245 if ($op eq '+'){
340 95         184 $self->_proceed();
341 95         188 push @res, $self->_term();
342             } else {
343             # a '-'
344 27         65 $self->_proceed();
345 27         60 push @res, ['-', $self->_term()];
346             }
347             }
348 438         1016 return _return_simplify(@res);
349             }
350              
351             # ::= | '(' ')'
352             sub _factor {
353 761     761   979 my $self = shift;
354 761         755 my $val;
355 761 100       1277 if ($self->_is_next_token("OpenParen")){
356 53         135 $self->_match("OpenParen");
357 53         133 $val = $self->_expression();
358 53         121 $self->_match("ClosingParen");
359             } else {
360 708         1405 $val = $self->_value();
361             }
362 753         1795 return $val;
363             }
364              
365             # ::= [ '^' ]?
366             # note that 2^3^4 is not defined, ie ^ is not associative
367             sub _exponential {
368 725     725   890 my $self = shift;
369 725         1277 my $val = $self->_factor();
370 718 100       1347 if ($self->_is_next_token("ExpOp")){
371 36         76 $self->_match("ExpOp");
372 36         80 return ['^', $val, $self->_factor()];
373             } else {
374 682         1639 return $val;
375             }
376             }
377              
378             sub _return_simplify {
379 1275 100   1275   6115 return $_[1] if @_ == 2;
380 205         712 return \@_;
381             }
382              
383              
384             1;
385             # vim: sw=4 ts=4 expandtab