File Coverage

blib/lib/Parse/FSM/Lexer.pm
Criterion Covered Total %
statement 180 182 98.9
branch 48 52 92.3
condition 4 5 80.0
subroutine 34 34 100.0
pod 10 13 76.9
total 276 286 96.5


line stmt bran cond sub pod time code
1             # $Id: Lexer.pm,v 1.10 2013/07/27 00:34:39 Paulo Exp $
2            
3             package Parse::FSM::Lexer;
4            
5             #------------------------------------------------------------------------------
6            
7             =head1 NAME
8            
9             Parse::FSM::Lexer - Companion Lexer for the Parse::FSM parser
10            
11             =cut
12            
13             #------------------------------------------------------------------------------
14            
15 1     1   30447 use 5.010;
  1         3  
16 1     1   3 use strict;
  1         0  
  1         19  
17 1     1   3 use warnings;
  1         0  
  1         17  
18            
19 1     1   3 use File::Spec;
  1         1  
  1         17  
20 1     1   3 use Data::Dump 'dump';
  1         1  
  1         32  
21 1     1   347 use Parse::FSM::Error;
  1         1  
  1         49  
22            
23             our $VERSION = '1.13';
24            
25             #------------------------------------------------------------------------------
26            
27             =head1 SYNOPSIS
28            
29             use Parse::FSM::Lexer;
30             $lex = Parse::FSM::Lexer->new;
31             $lex = Parse::FSM::Lexer->new(@files);
32            
33             $lex->add_path(@dirs); @dirs = $lex->path;
34             $full_path = $lex->path_search($file);
35            
36             $lex->from_file($filename);
37             $lex->from_list(@input);
38             $lex->from_list(sub {});
39            
40             $lex->get_token;
41            
42             $lex->error($message);
43             $lex->warning($message);
44             $lex->file;
45             $lex->line_nr;
46            
47             # in a nearby piece of code
48             use MyParser; # isa Parse::FSM::Driver;
49             my $parser = MyParser->new;
50             $parser->input(sub {$lex->get_token});
51             eval {$parser->parse}; $@ and $lex->error($@);
52            
53             =head1 DESCRIPTION
54            
55             This module implements a generic tokenizer that can be used by
56             L parsers, and can also be used standalone
57             independently of the parser.
58            
59             It supports recursive file includes and takes track of current file name
60             and line number. It keeps the path of search directories to search for
61             input files.
62            
63             The C method can be called by the C method of the parser
64             to retrieves the next input token to parse.
65            
66             The module can be used directly if the supplied tokenizer is enough for the
67             application, but usually a derived class has to be written implementing a
68             custom version of the C method.
69            
70             =head1 METHODS - SETUP
71            
72             =head2 new
73            
74             Creates a new object. If an argument list is given, calls C
75             for each of the file starting from the last, so that the files are
76             read in the given order.
77            
78             =cut
79            
80             #------------------------------------------------------------------------------
81 1     1   4 use constant INPUT => 0; # input stream, code ref
  1         1  
  1         67  
82 1     1   4 use constant FILE => 1; # name of the input file, undef for list
  1         1  
  1         31  
83 1     1   3 use constant LINE_NR => 2; # current input line number
  1         1  
  1         29  
84 1     1   3 use constant LINE_INC => 3; # increment to next line number
  1         1  
  1         28  
85 1     1   3 use constant SAW_NL => 4; # true if saw a newline before
  1         1  
  1         31  
86             # used to increment LINE_INC on next token
87 1     1   2 use constant TEXT => 5; # line text being lexed
  1         5  
  1         31  
88            
89 1     1   3 use constant STACK => 6; # stack of previous contexts for recursive
  1         1  
  1         29  
90             # includes, saves
91             # [input, file, line_nr, line_inc, saw_nl,
92             # text, pos(text)]
93 1     1   2 use constant PATH => 7; # path of search directories
  1         1  
  1         55  
94            
95             # only limited accessors
96             use Class::XSAccessor::Array {
97 1         6 accessors => {
98             file => FILE,
99             line_nr => LINE_NR,
100             line_inc => LINE_INC,
101             }
102 1     1   434 };
  1         2663  
103            
104             #------------------------------------------------------------------------------
105             sub new {
106 34     34 1 8277 my($class, @files) = @_;
107 34         51 my $self = bless [], $class;
108 34         54 $self->[STACK] = [];
109 34         41 $self->[PATH] = [];
110 34         74 $self->from_file($_) for reverse @files;
111 33         50 return $self;
112             }
113            
114             #------------------------------------------------------------------------------
115             # push context for include file
116             sub _push_context {
117 21     21   20 my($self) = @_;
118 21         23 push @{$self->[STACK]},
119 21         18 [ @{$self}[ 0 .. STACK - 1 ], pos($self->[TEXT]) ];
  21         51  
120 21         22 return;
121             }
122            
123             #------------------------------------------------------------------------------
124             # pop context
125             sub _pop_context {
126 47     47   39 my($self) = @_;
127 47         139 ( @{$self}[ 0 .. STACK - 1 ], pos($self->[TEXT]) )
128 47 100       32 = @{ pop(@{$self->[STACK]}) || [] };
  47         31  
  47         155  
129 47         162 return;
130             }
131            
132             #------------------------------------------------------------------------------
133            
134             =head1 METHODS - SEARCH PATH FOR FILES
135            
136             =head2 path
137            
138             Returns the list of directories to search in sequence for source files.
139            
140             =cut
141            
142             #------------------------------------------------------------------------------
143 3     3 1 269 sub path { @{$_[0][PATH]} } ## no critic
  3         12  
144             #------------------------------------------------------------------------------
145            
146             =head2 add_path
147            
148             Adds the given directories to the path searched for include files.
149            
150             =cut
151            
152             #------------------------------------------------------------------------------
153             sub add_path {
154 2     2 1 3 my($self, @dirs) = @_;
155 2         2 push @{$self->[PATH]}, @dirs;
  2         5  
156             }
157             #------------------------------------------------------------------------------
158            
159             =head2 path_search
160            
161             Searches for the given file name in the C created by C, returns
162             the first full path name where the file can be found.
163            
164             Returns the given input file name unchanged if:
165            
166             =over 4
167            
168             =item *
169            
170             the file is found in the current directory; or
171            
172             =item *
173            
174             the file is not found in any of the C directories.
175            
176             =back
177            
178             =cut
179            
180             #------------------------------------------------------------------------------
181             sub path_search {
182 39     39 1 28 my($self, $file) = @_;
183            
184 39 100       458 return $file if -f $file; # found
185            
186 8         8 for my $dir (@{$self->[PATH]}) {
  8         15  
187 8         65 my $full_path = File::Spec->catfile($dir, $file);
188 8 100       106 return $full_path if -f $full_path;
189             }
190            
191 3         10 return $file; # not found
192             }
193             #------------------------------------------------------------------------------
194            
195             =head1 METHODS - INPUT STREAM
196            
197             =head2 from_file
198            
199             Saves the current input context, searches for the given input file name
200             in the C, opens the file and sets-up the object to read
201             each line in sequence. At the end of the
202             file input resumes to the place where it was when C was called.
203            
204             Dies if the input file cannot be read, or if a file is
205             included recursively, to avoid an infinite include loop.
206            
207             =cut
208            
209             #------------------------------------------------------------------------------
210             sub from_file {
211 35     35 1 840 my($self, $file) = @_;
212            
213             # search include path
214 35         47 $file = $self->path_search($file);
215            
216             # check for include loop
217 35 100 100     31 if (grep {($_->[FILE] // "") eq $file} @{$self->[STACK]}) {
  12         44  
  35         69  
218 1         3 $self->error("#include loop");
219             }
220            
221             # open the file
222 34 100       627 open(my $fh, "<", $file)
223             or $self->error("unable to open input file '$file'");
224            
225             # create a new iterator to read file lines
226             my $input = sub {
227 87 50   87   97 $fh or return;
228 87         443 my $line = <$fh>;
229 87 100       116 if (defined $line) {
230 59 100       165 $line .= "\n" unless $line =~ /\n\z/; # add \n if missing
231 59         79 return $line;
232             }
233 28         23 $fh = undef; # free handle when file ends
234 28         186 return;
235 32         93 };
236 32         55 $self->from_list($input);
237 32         29 $self->[FILE] = $file;
238            
239 32         41 return;
240             }
241             #------------------------------------------------------------------------------
242            
243             =head2 from_list
244            
245             Saves the current input context and sets-up the object to read each element
246             of the passed input list. Each element either a text string
247             or a code reference of an iterator that returns text strings.
248             The iterator returns C at the end of input.
249            
250             =cut
251            
252             #------------------------------------------------------------------------------
253             # input from text string (if scalar) or iterator (if CODE ref)
254             sub from_list {
255 51     51 1 3970 my($self, @input) = @_;
256            
257             # save previous context
258 51 100       110 $self->_push_context if defined $self->[INPUT];
259            
260             # iterator
261             my $input = sub {
262 141     141   90 while (1) {
263 170 100       286 @input or return; # end of input
264 122         154 for ($input[0]) {
265 122 100       175 if (! ref $_) {
266 30         71 return shift @input; # scalar -> return it
267             }
268             else { # has to be a CODE ref
269 92         88 my $element = $_->();
270 92 100       110 if (defined $element) { # iterator returned something
271 63         121 return $element;
272             }
273             else { # end of iterator
274 29         89 shift @input; # continue loop
275             }
276             }
277             }
278             }
279 51         102 };
280            
281             # initialize
282 51         51 @{$self}[ INPUT, FILE, LINE_NR, LINE_INC, SAW_NL, TEXT ]
  51         97  
283             = ( $input, undef, 0, 1, 1, undef );
284            
285 51         62 return;
286             }
287             #------------------------------------------------------------------------------
288            
289             =head1 METHODS - INPUT
290            
291             =head2 get_token
292            
293             Retrieves the next token from the input as an array reference containing
294             token type and token value.
295            
296             Returns C on end of input.
297            
298             =head2 tokenizer
299            
300             Method responsible to match the next token from the given input string.
301            
302             This method can be overridden by a child class in order to implement a different
303             set of tokens to be retrieved from the input.
304            
305             It is implemented with features from the Perl 5.010 regex engine:
306            
307             =over 4
308            
309             =item *
310            
311             one big regex with C to match from where the
312             last match ended; the string to match is passed as a scalar reference, so that
313             the position of last match C is preserved;
314            
315             =item *
316            
317             one sequence of C<(?:...|...)> alternations for each token to be matched;
318            
319             =item *
320            
321             using C<(?E...)> for each token to make sure there is no
322             backtracking;
323            
324             =item *
325            
326             using capturing parentheses and embedded code evaluation
327             C<(?{ [TYPE =E $^N] })> to return the token value
328             from the regex match;
329            
330             =item *
331            
332             using C<$^R> as the value of the matched token;
333            
334             As the regex engine is not
335             reentrant, any operation that may call another regex match
336             (e.g. recursive file include) cannot be done inside
337             the C<(?{ ... })> code block, and is done after the regex match by checking the
338             C<$^R> for special tokens.
339            
340             =item *
341            
342             using C as the return of C<$^R> to ignore a token, e.g. white space.
343            
344             =back
345            
346             The default tokenizer recognizes and returns the following token types:
347            
348             =over 4
349            
350             =item [STR => $value]
351            
352             Perl-like single or double quoted string, C<$value> contains the string
353             without the quotes and with any backslash escapes resolved.
354            
355             The string cannot span multiple input lines.
356            
357             =item [NUM => $value]
358            
359             Perl-like integer in decimal, hexadecimal, octal or binary notation,
360             C<$value> contains decimal value of the integer.
361            
362             =item [NAME => $name]
363            
364             Perl-like identifier name, i.e. word starting with a letter or underscore and
365             followed by letters, underscores or digits.
366            
367             =item [$token => $token]
368            
369             All other characters except white space are returned in the form
370             C<[$token=E$token]>, where C<$token> is a single character or one
371             of the following composed tokens: << >> == != >= <=
372            
373             =item white space
374            
375             All white space is ignored, i.e. the tokenizer returns C.
376            
377             =item [INCLUDE => $file]
378            
379             Returned when a C<#include> statement is recognized, causes the lexer to
380             recursively include the file at the current input stream location.
381            
382             =item [INPUT_POS => $file, $line_nr, $line_inc]
383            
384             Returned when a C<#line> statement is recognized, causes the lexer to
385             set the current input location to the given C<$file>, C<$line_nr> and
386             C<$line_inc>.
387            
388             =item [ERROR => $message]
389            
390             Causes the lexer to call C with the given error message, can be
391             used when the input cannot be tokenized.
392            
393             =back
394            
395             =cut
396            
397             #------------------------------------------------------------------------------
398             # get the next line from input, set TEXT, return true
399             # accumulate lines ending in \\, to allow lexer to handle continuation lines
400             sub _readline {
401 148     148   116 my($self) = @_;
402            
403 148         99 while (1) {
404 195 100       449 my $input = $self->[INPUT] or return; # no input, return false
405 139 100       149 if ( defined( $self->[TEXT] = $input->() ) ) {
406 92         170 while ( $self->[TEXT] =~ /\\\Z/ ) {
407 2         3 my $next_line = $input->();
408 2 100       6 last unless defined $next_line;
409 1         4 $self->[TEXT] .= $next_line;
410             }
411 92         137 pos($self->[TEXT]) = 0;
412 92         116 last;
413             }
414             else {
415 47         68 $self->_pop_context; # pop and continue
416             }
417             }
418 92         155 return 1;
419             }
420            
421             #------------------------------------------------------------------------------
422             # get next token as [TYPE => VALUE], undef on end of input
423             sub get_token {
424 212     212 1 149762 my($self) = @_;
425            
426             LINE:
427 212         189 while (1) {
428             # read line
429 301 100       530 if (! defined $self->[TEXT]) {
430 148 100       201 $self->_readline or return; # end of input
431             }
432            
433             # return tokens
434 245         468 while ( (my $start_pos = pos($self->[TEXT]))
435             < length($self->[TEXT])
436             ) {
437             # increment line number if last token included newlines
438             # need to retest after each token
439 334 100       438 if ($self->[SAW_NL]) {
440 106         134 $self->[LINE_NR] += $self->[SAW_NL] * $self->[LINE_INC];
441 106         88 undef $self->[SAW_NL];
442             }
443            
444             # read next token
445 334         464 my $token = $self->tokenizer(\($self->[TEXT]));
446            
447             # check for newlines
448 334         497 my $end_pos = pos($self->[TEXT]);
449 334         539 $self->[SAW_NL] +=
450             substr($self->[TEXT], $start_pos, $end_pos - $start_pos)
451             =~ tr/\n/\n/;
452            
453             # check for special tokens
454 334 100       669 next unless defined $token;
455            
456 181         452 my $method = $self->can( $token->[0] );
457 181 100       203 if ($method) {
458 28         31 my $new_token = $self->$method($token);
459 25 50       36 return $new_token if defined $new_token;
460 25 100       59 next LINE unless defined $self->[TEXT]; # if context changed
461             }
462             else {
463 153         660 return $token;
464             }
465             }
466             # end of line
467 70         75 undef $self->[TEXT];
468             }
469             }
470            
471             #------------------------------------------------------------------------------
472             # special handlers: return $token to return changed token; return undef to continue loop
473             # changeable by subclass
474             sub INCLUDE {
475 21     21 0 17 my($self, $token) = @_;
476            
477 21         32 $self->from_file($token->[1]);
478            
479 19         52 return;
480             }
481            
482             sub INPUT_POS {
483 6     6 0 4 my($self, $token) = @_;
484            
485 6         12 @{$self}[ SAW_NL, FILE, LINE_NR, LINE_INC ] =
486 6         11 ( undef, @{$token}[1 .. $#$token] );
  6         7  
487            
488 6         7 return;
489             }
490            
491             sub ERROR {
492 1     1 0 1 my($self, $token) = @_;
493            
494 1         3 $self->error($token->[1]);
495            
496 0         0 return;
497             }
498            
499             #------------------------------------------------------------------------------
500             # get next token as [TYPE => VALUE] from the given string reference
501             # return undef to ignore a token
502             sub tokenizer {
503 310     310 1 236 my($self, $rtext) = @_;
504 310         202 our $LINE_NR; local $LINE_NR;
  310         226  
505            
506 310 50       1218 $$rtext =~ m{\G
507             (?:
508             # #include
509             (?> ^ (?&SP)* \# include (?&SP)*
510 11         59 (?: \' ( [^\'\n]+ ) \' (?{ [INCLUDE => $^N] })
511 3         16 | \" ( [^\"\n]+ ) \" (?{ [INCLUDE => $^N] })
512 5         25 | < ( [^>\n]+ ) > (?{ [INCLUDE => $^N] })
513 2         12 | ( \S+ ) (?{ [INCLUDE => $^N] })
514 1         6 | (?{ [ERROR =>
515             "#include expects a file name"] })
516             )
517             .* \n? # eat newline
518             )
519            
520             # #line
521             | (?> ^ (?&SP)* \# line (?&SP)+
522 6         19 (\d+) (?&SP)+ (?{ $LINE_NR = $^N })
523 6         26 \"? ([^\"\n]+) \"? (?{ [INPUT_POS => $^N, $LINE_NR, 1] })
524             .* \n? # eat newline
525             )
526            
527             # other #-lines - ignore
528             | (?> ^ (?&SP)* \# .* \n? (?{ undef })
529             )
530            
531             # white space
532             | (?> \s+ (?{ undef })
533             )
534            
535             # string
536             | (?> ( \" (?: \\. | [^\\\"] )* \" )
537 7         256 (?{ [STR => eval($^N)] })
538             )
539             | (?> ( \' (?: \\. | [^\\\'] )* \' )
540 7         259 (?{ [STR => eval($^N)] })
541             )
542            
543             # number
544 3         14 | (?> 0x ( [0-9a-f]+ ) \b (?{ [NUM => hex($^N)] })
545             )
546 3         16 | (?> 0b ( [01]+ ) \b (?{ [NUM => oct("0b".$^N)] })
547             )
548 3         16 | (?> 0 ( [0-7]+ ) \b (?{ [NUM => oct("0".$^N)] })
549             )
550 56         250 | (?> ( \d+ ) \b (?{ [NUM => 0+$^N] })
551             )
552            
553             # name
554 38         154 | (?> ( [a-z_]\w* ) (?{ [NAME => $^N] })
555             )
556            
557             # symbols
558             | (?> ( << | >> | == | != | >= | <= | . )
559 26         102 (?{ [$^N, $^N] })
560             )
561             )
562            
563             (?(DEFINE)
564             # horizontal blanks
565             (? [\t\f\r ] )
566             )
567             }gcxmi or die 'not reached';
568 310         390 return $^R;
569             }
570            
571             #------------------------------------------------------------------------------
572             # implemented by XSAccessor above
573            
574             =head1 METHODS - INPUT LOCATION AND ERRORS
575            
576             =head2 file
577            
578             Returns the current input file, C if reading from a list.
579            
580             =head2 line_nr
581            
582             Returns the current input line number, starting at 1.
583            
584             =head2 line_inc
585            
586             Increment of line number on each new-line found, usually 1.
587            
588             =head2 error
589            
590             Dies with the given error message, indicating the place in the input source file
591             where the error occurred.
592            
593             =cut
594            
595             #------------------------------------------------------------------------------
596             sub error {
597 10     10 1 1511 my($self, $message) = @_;
598 10         15 Parse::FSM::Error::error( $self->_error_msg($message),
599             $self->[FILE], $self->[LINE_NR] );
600             }
601             #------------------------------------------------------------------------------
602            
603             =head2 warning
604            
605             Warns with the given error message, indicating the place in the input source file
606             where the warning occurred.
607            
608             =cut
609            
610             #------------------------------------------------------------------------------
611             sub warning {
612 6     6 1 1432 my($self, $message) = @_;
613 6         8 Parse::FSM::Error::warning( $self->_error_msg($message),
614             $self->[FILE], $self->[LINE_NR] );
615             }
616            
617             #------------------------------------------------------------------------------
618             # error message for error() and warning()
619             sub _error_msg {
620 16     16   11 my($self, $message) = @_;
621            
622 16 100       57 defined($message) and $message =~ s/\s+\z//;
623            
624 16         22 my $near;
625 16 100 66     43 if (defined($self->[TEXT]) && defined(pos($self->[TEXT]))) {
626 3         5 my $code = substr($self->[TEXT], pos($self->[TEXT]), 20);
627 3         4 $code =~ s/\n.*//s;
628 3 50       5 if ($code ne "") {
629 0         0 $near = "near ".dump($code);
630             }
631             }
632            
633 16         19 return join(" ", grep {defined} $message, $near);
  32         88  
634             }
635             #------------------------------------------------------------------------------
636            
637             =head1 AUTHOR, BUGS, FEEDBACK, LICENSE, COPYRIGHT
638            
639             See L
640            
641             =cut
642            
643             #------------------------------------------------------------------------------
644            
645             1;