File Coverage

blib/lib/Math/Expression/Evaluator/Lexer.pm
Criterion Covered Total %
statement 42 42 100.0
branch 14 14 100.0
condition n/a
subroutine 6 6 100.0
pod 1 1 100.0
total 63 63 100.0


line stmt bran cond sub pod time code
1             package Math::Expression::Evaluator::Lexer;
2 17     17   28244 use warnings;
  17         33  
  17         463  
3 17     17   81 use strict;
  17         31  
  17         505  
4 17     17   96 use Carp qw(confess);
  17         25  
  17         1219  
5 17     17   16987 use Data::Dumper;
  17         165113  
  17         9011  
6              
7             =head1 NAME
8              
9             Math::Expression::Evaluator::Lexer - Simple Lexer
10              
11             =head1 SYNOPSIS
12              
13             use Math::Expression::Evaluator::Lexer qw(lex);
14             # suppose you want to parse simple math expressions
15             my @input_tokens = (
16             ['Int', qr/[+-]?\d+/ ],
17             ['Op', qr{[+-/*]} ],
18             ['Brace_Open', qr/\(/ ],
19             ['Brace_Close', qr/\)/ ],
20             ['Whitespace', qr/\s+/, sub { return; }],
21             );
22             my $text = "-12 * (3+4)";
23             my $out_tokens = lex($text, \@input_tokens);
24             for (@$out_tokens){
25             my ($name, $text, $pos) = @$_;
26             print "Found Token $name: $text (string pos: $pos)\n";
27             }
28              
29             =head1 DESCRIPTION
30              
31             Math::Expression::Evaluator::Lexer is a simple lexer that breaks up a text
32             into tokens, depending on the input tokens you provide
33              
34             =head1 METHODS
35              
36             =over 2
37              
38             =item lex
39              
40             The only exported routine is lex, which expects input text as its
41             first argument and a array ref to list of input tokens.
42              
43             Each input token consists of a token name (which you can choose freely),
44             a regex which matches the desired token, and optionally a reference to
45             a functions that takes the matched token text as its argument. The
46             token text is replaced by the return value of that function. If the
47             function returns undef, that token will not be included in the list
48             of output tokens. The regex should either fail or match at least one
49             character; zero-width matches utterly confuse the lexer, and are disallowed.
50              
51             lex() returns an array ref to a list of output tokens, each output
52             token is a reference to a list which contains the token name, the matched
53             text, the string position (in characters, counted from the start of
54             the input string, zero based) and the line number.
55              
56             Note that C puts parentheses around the entire regex, so if you
57             want to use backreferences, the numbering of the capturing group is changed.
58              
59             =back
60              
61             =head1 COPYRIGHT AND LICENSE
62              
63             Copyright (C) 2007 by Moritz Lenz, L,
64             L.
65              
66             This Program and its Documentation is free software. You may distribute it
67             under the same terms as perl itself.
68              
69             However all code examples are to be public domain, so you can use it in any
70             way you want to.
71              
72             =cut
73              
74             require Exporter;
75             our @ISA = qw(Exporter);
76             our @EXPORT_OK = qw(lex);
77              
78             our %EXPORT_TAGS = (":all" => \@EXPORT_OK);
79              
80             sub lex {
81 297     297 1 1204 my ($text, $tokens) = @_;
82 297 100       1111 confess("passed undefined value to lex()") unless defined $text;
83 296         489 my $l = length $text;
84 296 100       618 return [] unless $l;
85              
86 295         478 my ($last_line_number, $last_pos) = (0, 0);
87             my $pos_and_line_number = sub {
88 1348     1348   1605 my $pos = shift;
89 1348         2957 $last_line_number +=
90             (substr($text, $last_pos, $pos - $last_pos) =~ tr/\n//);
91 1348         1714 $last_pos = $pos;
92 1348         4654 return ($pos, $last_line_number + 1);
93 295         1400 };
94              
95 295         412 my @res;
96              
97             # avoid 'Use of uninitialized value in numeric lt (<)' warnings:
98 295         724 pos($text) = 0;
99              
100 295         904 while (pos($text) < $l){
101 963         1176 my $matched = 0;
102             REGEXES:
103 963         1649 for (@$tokens){
104 11513         17113 my $re = $_->[1];
105             # failed regex matches reset pos() unless the /c modifier
106             # is present
107 11513 100       211789 if ($text =~ m/\G($re)/gc){
108 1582         2128 $matched = 1;
109 1582         3070 my $match = $1;
110 1582 100       3229 if (length $match == 0){
111 1         185 confess("Each token has to require at least one "
112             . "character; Rule $_->[0] matched Zero!\n");
113             }
114              
115             # safe information before callbacks can modify $match
116             # and thus length($match)
117 1581         2381 my $pos = pos($text) - length($match);
118              
119 1581 100       3297 if ($_->[2]){
120 234         808 $match = $_->[2]->($match);
121             }
122 1581 100       3139 if (defined $match){
123 1348         2946 push @res, [
124             $_->[0],
125             $match,
126             $pos_and_line_number->($pos),
127             ];
128             }
129 1581         5726 next REGEXES;
130             }
131             }
132 962 100       3498 if ($matched == 0){
133 1         120 confess("No token matched input text <$text> at position " . pos($text));
134             }
135             }
136 293         2658 return \@res;
137             }
138              
139             1;
140              
141             # vim: sw=4 ts=4 expandtab