File Coverage

blib/lib/Parse/FSM/Lexer.pm

Criterion	Covered	Total	%
statement	166	168	98.8
branch	48	52	92.3
condition	4	5	80.0
subroutine	34	34	100.0
pod	10	13	76.9
total	262	272	96.3

line	stmt	bran	cond	sub	pod	time	code
1							# $Id: Lexer.pm,v 1.10 2013/07/27 00:34:39 Paulo Exp $
2
3							package Parse::FSM::Lexer;
4
5							#------------------------------------------------------------------------------
6
7							=head1 NAME
8
9							Parse::FSM::Lexer - Companion Lexer for the Parse::FSM parser
10
11							=cut
12
13							#------------------------------------------------------------------------------
14
15	1			1		39789	use 5.010;
	1					4
	1					36
16	1			1		5	use strict;
	1					4
	1					33
17	1			1		3	use warnings;
	1					2
	1					33
18
19	1			1		7	use File::Spec;
	1					2
	1					33
20	1			1		6	use Data::Dump 'dump';
	1					1
	1					66
21	1			1		751	use Parse::FSM::Error;
	1					2
	1					94
22
23							our $VERSION = '1.11';
24
25							#------------------------------------------------------------------------------
26
27							=head1 SYNOPSIS
28
29							use Parse::FSM::Lexer;
30							$lex = Parse::FSM::Lexer->new;
31							$lex = Parse::FSM::Lexer->new(@files);
32
33							$lex->add_path(@dirs); @dirs = $lex->path;
34							$full_path = $lex->path_search($file);
35
36							$lex->from_file($filename);
37							$lex->from_list(@input);
38							$lex->from_list(sub {});
39
40							$lex->get_token;
41
42							$lex->error($message);
43							$lex->warning($message);
44							$lex->file;
45							$lex->line_nr;
46
47							# in a nearby piece of code
48							use MyParser; # isa Parse::FSM::Driver;
49							my $parser = MyParser->new;
50							$parser->input(sub {$lex->get_token});
51							eval {$parser->parse}; $@ and $lex->error($@);
52
53							=head1 DESCRIPTION
54
55							This module implements a generic tokenizer that can be used by
56							L<Parse::FSM> parsers, and can also be used stand alone
57							independently of the parser.
58
59							It supports recursive file includes and keeps track of current file name
60							and line number. It keeps the path of search directories to search for
61							input files.
62
63							The C<get_token> method can be called by the C<input> method of the parser
64							to retrieve the next input token to parse.
65
66							The module can be used directly if the supplied tokenizer is enough for the
67							application, but usually a derived class has to be written implementing a
68							custom version of the C<tokenizer> method.
69
70							=head1 METHODS - SETUP
71
72							=head2 new
73
74							Creates a new object. If an argument list is given, calls C<from_file>
75							for each of the files starting from the last, so that the files are
76							read in the given order.
77
78							=cut
79
80							#------------------------------------------------------------------------------
81	1			1		7	use constant INPUT => 0; # input stream, code ref
	1					2
	1					85
82	1			1		4	use constant FILE => 1; # name of the input file, undef for list
	1					1
	1					34
83	1			1		3	use constant LINE_NR => 2; # current input line number
	1					2
	1					29
84	1			1		4	use constant LINE_INC => 3; # increment to next line number
	1					1
	1					29
85	1			1		3	use constant SAW_NL => 4; # true if saw a newline before
	1					1
	1					30
86							# used to increment LINE_INC on next token
87	1			1		3	use constant TEXT => 5; # line text being lexed
	1					1
	1					34
88
89	1			1		4	use constant STACK => 6; # stack of previous contexts for recursive
	1					1
	1					35
90							# includes, saves
91							# [input, file, line_nr, line_inc, saw_nl,
92							# text, pos(text)]
93	1			1		4	use constant PATH => 7; # path of search directories
	1					1
	1					58
94
95							# only limited accessors
96							use Class::XSAccessor::Array {
97	1					10	accessors => {
98							file => FILE,
99							line_nr => LINE_NR,
100							line_inc => LINE_INC,
101							}
102	1			1		549	};
	1					3444
103
104							#------------------------------------------------------------------------------
105							sub new {
106	34			34	1	16118	my($class, @files) = @_;
107	34					113	my $self = bless [], $class;
108	34					76	$self->[STACK] = [];
109	34					71	$self->[PATH] = [];
110	34					104	$self->from_file($_) for reverse @files;
111	33					83	return $self;
112							}
113
114							#------------------------------------------------------------------------------
115							# push context for include file
116							sub _push_context {
117	21			21		30	my($self) = @_;
118	21					50	push @{$self->[STACK]},
	21					83
119	21					22	[ @{$self}[ 0 .. STACK - 1 ], pos($self->[TEXT]) ];
120	21					33	return;
121							}
122
123							#------------------------------------------------------------------------------
124							# pop context
125							sub _pop_context {
126	47			47		62	my($self) = @_;
127	47					180	( @{$self}[ 0 .. STACK - 1 ], pos($self->[TEXT]) )
	47					238
128	47	100				51	= @{ pop(@{$self->[STACK]}) || [] };
	47					49
129	47					270	return;
130							}
131
132							#------------------------------------------------------------------------------
133
134							=head1 METHODS - SEARCH PATH FOR FILES
135
136							=head2 path
137
138							Returns the list of directories to search in sequence for source files.
139
140							=cut
141
142							#------------------------------------------------------------------------------
143	3			3	1	608	sub path { @{$_[0][PATH]} } ## no critic
	3					19
144							#------------------------------------------------------------------------------
145
146							=head2 add_path
147
148							Adds the given directories to the path searched for include files.
149
150							=cut
151
152							#------------------------------------------------------------------------------
153							sub add_path {
154	2			2	1	6	my($self, @dirs) = @_;
155	2					2	push @{$self->[PATH]}, @dirs;
	2					6
156							}
157							#------------------------------------------------------------------------------
158
159							=head2 path_search
160
161							Searches for the given file name in the C<path> created by C<add_path>, returns
162							the first full path name where the file can be found.
163
164							Returns the given input file name unchanged if:
165
166							=over 4
167
168							=item *
169
170							the file is found in the current directory; or
171
172							=item *
173
174							the file is not found in any of the C<path> directories.
175
176							=back
177
178							=cut
179
180							#------------------------------------------------------------------------------
181							sub path_search {
182	39			39	1	46	my($self, $file) = @_;
183
184	39	100				672	return $file if -f $file; # found
185
186	8					9	for my $dir (@{$self->[PATH]}) {
	8					23
187	8					77	my $full_path = File::Spec->catfile($dir, $file);
188	8	100				100	return $full_path if -f $full_path;
189							}
190
191	3					43	return $file; # not found
192							}
193							#------------------------------------------------------------------------------
194
195							=head1 METHODS - INPUT STREAM
196
197							=head2 from_file
198
199							Saves the current input context, searches for the given input file name
200							in the C<path>, opens the file and sets up the object to read
201							each line in sequence. At the end of the
202							file input resumes to the place where it was when C<from_file> was called.
203
204							Dies if the input file cannot be read, or if a file is
205							included recursively, to avoid an infinite include loop.
206
207							=cut
208
209							#------------------------------------------------------------------------------
210							sub from_file {
211	35			35	1	1541	my($self, $file) = @_;
212
213							# search include path
214	35					80	$file = $self->path_search($file);
215
216							# check for include loop
217	35	100	100			49	if (grep {($_->[FILE] // "") eq $file} @{$self->[STACK]}) {
	12					78
	35					94
218	1					6	$self->error("#include loop");
219							}
220
221							# open the file
222	34	100				934	open(my $fh, "<", $file)
223							or $self->error("unable to open input file '$file'");
224
225							# create a new iterator to read file lines
226							my $input = sub {
227	87	50		87		139	$fh or return;
228	87					688	my $line = <$fh>;
229	87	100				179	if (defined $line) {
230	59	100				286	$line .= "\n" unless $line =~ /\n\z/; # add \n if missing
231	59					118	return $line;
232							}
233	28					35	$fh = undef; # free handle when file ends
234	28					345	return;
235	32					174	};
236	32					90	$self->from_list($input);
237	32					42	$self->[FILE] = $file;
238
239	32					59	return;
240							}
241							#------------------------------------------------------------------------------
242
243							=head2 from_list
244
245							Saves the current input context and sets-up the object to read each element
246							of the passed input list. Each element is either a text string
247							or a code reference of an iterator that returns text strings.
248							The iterator returns C<undef> at the end of input.
249
250							=cut
251
252							#------------------------------------------------------------------------------
253							# input from text string (if scalar) or iterator (if CODE ref)
254							sub from_list {
255	51			51	1	8056	my($self, @input) = @_;
256
257							# save previous context
258	51	100				164	$self->_push_context if defined $self->[INPUT];
259
260							# iterator
261							my $input = sub {
262	141			141		124	while (1) {
263	170	100				385	@input or return; # end of input
264	122					248	for ($input[0]) {
265	122	100				264	if (! ref $_) {
266	30					97	return shift @input; # scalar -> return it
267							}
268							else { # has to be a CODE ref
269	92					132	my $element = $_->();
270	92	100				171	if (defined $element) { # iterator returned something
271	63					175	return $element;
272							}
273							else { # end of iterator
274	29					180	shift @input; # continue loop
275							}
276							}
277							}
278							}
279	51					182	};
280
281							# initialize
282	51					92	@{$self}[ INPUT, FILE, LINE_NR, LINE_INC, SAW_NL, TEXT ]
	51					137
283							= ( $input, undef, 0, 1, 1, undef );
284
285	51					94	return;
286							}
287							#------------------------------------------------------------------------------
288
289							=head1 METHODS - INPUT
290
291							=head2 get_token
292
293							Retrieves the next token from the input as an array reference containing
294							token type and token value.
295
296							Returns C<undef> on end of input.
297
298							=head2 tokenizer
299
300							Method responsible to match the next token from the given input string.
301
302							This method can be overridden by a child class in order to implement a different
303							set of tokens to be retrieved from the input.
304
305							It is implemented with features from the Perl 5.010 regex engine:
306
307							=over 4
308
309							=item *
310
311							one big regex with C</\G.../gc> to match from where the
312							last match ended; the string to match is passed as a scalar reference, so that
313							the position of last match C<pos()> is preserved;
314
315							=item *
316
317							one sequence of C<(?:...|...)> alternations for each token to be matched;
318
319							=item *
320
321							using C<(?E<gt>...)> for each token to make sure there is no
322							backtracking;
323
324							=item *
325
326							using capturing parentheses and embedded code evaluation
327							C<(?{ [TYPE =E<gt> $^N] })> to return the token value
328							from the regex match;
329
330							=item *
331
332							using C<$^R> as the value of the matched token;
333
334							As the regex engine is not
335							reentrant, any operation that may call another regex match
336							(e.g. recursive file include) cannot be done inside
337							the C<(?{ ... })> code block, and is done after the regex match by checking the
338							C<$^R> for special tokens.
339
340							=item *
341
342							using C<undef> as the return of C<$^R> to ignore a token, e.g. white space.
343
344							=back
345
346							The default tokenizer recognizes and returns the following token types:
347
348							=over 4
349
350							=item [STR => $value]
351
352							Perl-like single or double quoted string, C<$value> contains the string
353							without the quotes and with any backslash escapes resolved.
354
355							The string cannot span multiple input lines.
356
357							=item [NUM => $value]
358
359							Perl-like integer in decimal, hexadecimal, octal or binary notation,
360							C<$value> contains decimal value of the integer.
361
362							=item [NAME => $name]
363
364							Perl-like identifier name, i.e. word starting with a letter or underscore and
365							followed by letters, underscores or digits.
366
367							=item [$token => $token]
368
369							All other characters except white space are returned in the form
370							C<[$token=E<gt>$token]>, where C<$token> is a single character or one
371							of the following composed tokens: << >> == != >= <=
372
373							=item white space
374
375							All white space is ignored, i.e. the tokenizer returns C<undef>.
376
377							=item [INCLUDE => $file]
378
379							Returned when a C<#include> statement is recognized, causes the lexer to
380							recursively include the file at the current input stream location.
381
382							=item [INPUT_POS => $file, $line_nr, $line_inc]
383
384							Returned when a C<#line> statement is recognized, causes the lexer to
385							set the current input location to the given C<$file>, C<$line_nr> and
386							C<$line_inc>.
387
388							=item [ERROR => $message]
389
390							Causes the lexer to call C<error> with the given error message, can be
391							used when the input cannot be tokenized.
392
393							=back
394
395							=cut
396
397							#------------------------------------------------------------------------------
398							# get the next line from input, set TEXT, return true
399							# accumulate lines ending in \\, to allow lexer to handle continuation lines
400							sub _readline {
401	148			148		183	my($self) = @_;
402
403	148					126	while (1) {
404	195	100				697	my $input = $self->[INPUT] or return; # no input, return false
405	139	100				244	if ( defined( $self->[TEXT] = $input->() ) ) {
406	92					262	while ( $self->[TEXT] =~ /\\\Z/ ) {
407	2					5	my $next_line = $input->();
408	2	100				7	last unless defined $next_line;
409	1					4	$self->[TEXT] .= $next_line;
410							}
411	92					214	pos($self->[TEXT]) = 0;
412	92					162	last;
413							}
414							else {
415	47					147	$self->_pop_context; # pop and continue
416							}
417							}
418	92					217	return 1;
419							}
420
421							#------------------------------------------------------------------------------
422							# get next token as [TYPE => VALUE], undef on end of input
423							sub get_token {
424	212			212	1	269387	my($self) = @_;
425
426							LINE:
427	212					308	while (1) {
428							# read line
429	301	100				703	if (! defined $self->[TEXT]) {
430	148	100				293	$self->_readline or return; # end of input
431							}
432
433							# return tokens
434	245					675	while ( (my $start_pos = pos($self->[TEXT]))
435							< length($self->[TEXT])
436							) {
437							# increment line number if last token included newlines
438							# need to retest after each token
439	334	100				598	if ($self->[SAW_NL]) {
440	106					193	$self->[LINE_NR] += $self->[SAW_NL] * $self->[LINE_INC];
441	106					114	undef $self->[SAW_NL];
442							}
443
444							# read next token
445	334					686	my $token = $self->tokenizer(\($self->[TEXT]));
446
447							# check for newlines
448	334					726	my $end_pos = pos($self->[TEXT]);
449	334					758	$self->[SAW_NL] +=
450							substr($self->[TEXT], $start_pos, $end_pos - $start_pos)
451							=~ tr/\n/\n/;
452
453							# check for special tokens
454	334	100				857	next unless defined $token;
455
456	181					693	my $method = $self->can( $token->[0] );
457	181	100				305	if ($method) {
458	28					51	my $new_token = $self->$method($token);
459	25	50				52	return $new_token if defined $new_token;
460	25	100				123	next LINE unless defined $self->[TEXT]; # if context changed
461							}
462							else {
463	153					818	return $token;
464							}
465							}
466							# end of line
467	70					120	undef $self->[TEXT];
468							}
469							}
470
471							#------------------------------------------------------------------------------
472							# special handlers: return $token to return changed token; return undef to continue loop
473							# changeable by subclass
474							sub INCLUDE {
475	21			21	0	31	my($self, $token) = @_;
476
477	21					47	$self->from_file($token->[1]);
478
479	19					74	return;
480							}
481
482							sub INPUT_POS {
483	6			6	0	9	my($self, $token) = @_;
484
485	6					16	@{$self}[ SAW_NL, FILE, LINE_NR, LINE_INC ] =
	6					12
486	6					14	( undef, @{$token}[1 .. $#$token] );
487
488	6					13	return;
489							}
490
491							sub ERROR {
492	1			1	0	2	my($self, $token) = @_;
493
494	1					3	$self->error($token->[1]);
495
496	0					0	return;
497							}
498
499							#------------------------------------------------------------------------------
500							# get next token as [TYPE => VALUE] from the given string reference
501							# return undef to ignore a token
502							sub tokenizer {
503	310			310	1	351	my($self, $rtext) = @_;
504	310					244	our $LINE_NR; local $LINE_NR;
	310					271
505
506	310	50				1890	$$rtext =~ m{\G
507							(?:
508							# #include
509							(?> ^ (?&SP)* \# include (?&SP)*
510							(?: \' ( [^\'\n]+ ) \' (?{ [INCLUDE => $^N] })
511							| \" ( [^\"\n]+ ) \" (?{ [INCLUDE => $^N] })
512							| < ( [^>\n]+ ) > (?{ [INCLUDE => $^N] })
513							| ( \S+ ) (?{ [INCLUDE => $^N] })
514							| (?{ [ERROR =>
515							"#include expects a file name"] })
516							)
517							.* \n? # eat newline
518							)
519
520							# #line
521							| (?> ^ (?&SP)* \# line (?&SP)+
522							(\d+) (?&SP)+ (?{ $LINE_NR = $^N })
523							\"? ([^\"\n]+) \"? (?{ [INPUT_POS => $^N, $LINE_NR, 1] })
524							.* \n? # eat newline
525							)
526
527							# other #-lines - ignore
528							| (?> ^ (?&SP)* \# .* \n? (?{ undef })
529							)
530
531							# white space
532							| (?> \s+ (?{ undef })
533							)
534
535							# string
536							| (?> ( \" (?: \\. | [^\\\"] )* \" )
537							(?{ [STR => eval($^N)] })
538							)
539							| (?> ( \' (?: \\. | [^\\\'] )* \' )
540							(?{ [STR => eval($^N)] })
541							)
542
543							# number
544							| (?> 0x ( [0-9a-f]+ ) \b (?{ [NUM => hex($^N)] })
545							)
546							| (?> 0b ( [01]+ ) \b (?{ [NUM => oct("0b".$^N)] })
547							)
548							| (?> 0 ( [0-7]+ ) \b (?{ [NUM => oct("0".$^N)] })
549							)
550							| (?> ( \d+ ) \b (?{ [NUM => 0+$^N] })
551							)
552
553							# name
554							| (?> ( [a-z_]\w* ) (?{ [NAME => $^N] })
555							)
556
557							# symbols
558							| (?> ( << | >> | == | != | >= | <= | . )
559							(?{ [$^N, $^N] })
560							)
561							)
562
563							(?(DEFINE)
564							# horizontal blanks
565							(?<SP> [\t\f\r ] )
566							)
567							}gcxmi or die 'not reached';
568	310					616	return $^R;
569							}
570
571							#------------------------------------------------------------------------------
572							# implemented by XSAccessor above
573
574							=head1 METHODS - INPUT LOCATION AND ERRORS
575
576							=head2 file
577
578							Returns the current input file, C<undef> if reading from a list.
579
580							=head2 line_nr
581
582							Returns the current input line number, starting at 1.
583
584							=head2 line_inc
585
586							Increment of line number on each new-line found, usually 1.
587
588							=head2 error
589
590							Dies with the given error message, indicating the place in the input source file
591							where the error occurred.
592
593							=cut
594
595							#------------------------------------------------------------------------------
596							sub error {
597	10			10	1	2516	my($self, $message) = @_;
598	10					21	Parse::FSM::Error::error( $self->_error_msg($message),
599							$self->[FILE], $self->[LINE_NR] );
600							}
601							#------------------------------------------------------------------------------
602
603							=head2 warning
604
605							Warns with the given error message, indicating the place in the input source file
606							where the warning occurred.
607
608							=cut
609
610							#------------------------------------------------------------------------------
611							sub warning {
612	6			6	1	2648	my($self, $message) = @_;
613	6					14	Parse::FSM::Error::warning( $self->_error_msg($message),
614							$self->[FILE], $self->[LINE_NR] );
615							}
616
617							#------------------------------------------------------------------------------
618							# error message for error() and warning()
619							sub _error_msg {
620	16			16		19	my($self, $message) = @_;
621
622	16	100				71	defined($message) and $message =~ s/\s+\z//;
623
624	16					17	my $near;
625	16	100	66			56	if (defined($self->[TEXT]) && defined(pos($self->[TEXT]))) {
626	3					7	my $code = substr($self->[TEXT], pos($self->[TEXT]), 20);
627	3					7	$code =~ s/\n.*//s;
628	3	50				9	if ($code ne "") {
629	0					0	$near = "near ".dump($code);
630							}
631							}
632
633	16					24	return join(" ", grep {defined} $message, $near);
	32					114
634							}
635							#------------------------------------------------------------------------------
636
637							=head1 AUTHOR, BUGS, FEEDBACK, LICENSE, COPYRIGHT
638
639							See L<Parse::FSM>
640
641							=cut
642
643							#------------------------------------------------------------------------------
644
645							1;