File Coverage

blib/lib/Parse/FSM/Lexer.pm

Criterion	Covered	Total	%
statement	180	182	98.9
branch	48	52	92.3
condition	4	5	80.0
subroutine	34	34	100.0
pod	10	13	76.9
total	276	286	96.5

line	stmt	bran	cond	sub	pod	time	code
1							# $Id: Lexer.pm,v 1.10 2013/07/27 00:34:39 Paulo Exp $
2
3							package Parse::FSM::Lexer;
4
5							#------------------------------------------------------------------------------
6
7							=head1 NAME
8
9							Parse::FSM::Lexer - Companion Lexer for the Parse::FSM parser
10
11							=cut
12
13							#------------------------------------------------------------------------------
14
15	1			1		30447	use 5.010;
	1					3
16	1			1		3	use strict;
	1					0
	1					19
17	1			1		3	use warnings;
	1					0
	1					17
18
19	1			1		3	use File::Spec;
	1					1
	1					17
20	1			1		3	use Data::Dump 'dump';
	1					1
	1					32
21	1			1		347	use Parse::FSM::Error;
	1					1
	1					49
22
23							our $VERSION = '1.13';
24
25							#------------------------------------------------------------------------------
26
27							=head1 SYNOPSIS
28
29							use Parse::FSM::Lexer;
30							$lex = Parse::FSM::Lexer->new;
31							$lex = Parse::FSM::Lexer->new(@files);
32
33							$lex->add_path(@dirs); @dirs = $lex->path;
34							$full_path = $lex->path_search($file);
35
36							$lex->from_file($filename);
37							$lex->from_list(@input);
38							$lex->from_list(sub {});
39
40							$lex->get_token;
41
42							$lex->error($message);
43							$lex->warning($message);
44							$lex->file;
45							$lex->line_nr;
46
47							# in a nearby piece of code
48							use MyParser; # isa Parse::FSM::Driver;
49							my $parser = MyParser->new;
50							$parser->input(sub {$lex->get_token});
51							eval {$parser->parse}; $@ and $lex->error($@);
52
53							=head1 DESCRIPTION
54
55							This module implements a generic tokenizer that can be used by
56							L parsers, and can also be used standalone
57							independently of the parser.
58
59							It supports recursive file includes and takes track of current file name
60							and line number. It keeps the path of search directories to search for
61							input files.
62
63							The C method can be called by the C method of the parser
64							to retrieves the next input token to parse.
65
66							The module can be used directly if the supplied tokenizer is enough for the
67							application, but usually a derived class has to be written implementing a
68							custom version of the C method.
69
70							=head1 METHODS - SETUP
71
72							=head2 new
73
74							Creates a new object. If an argument list is given, calls C
75							for each of the file starting from the last, so that the files are
76							read in the given order.
77
78							=cut
79
80							#------------------------------------------------------------------------------
81	1			1		4	use constant INPUT => 0; # input stream, code ref
	1					1
	1					67
82	1			1		4	use constant FILE => 1; # name of the input file, undef for list
	1					1
	1					31
83	1			1		3	use constant LINE_NR => 2; # current input line number
	1					1
	1					29
84	1			1		3	use constant LINE_INC => 3; # increment to next line number
	1					1
	1					28
85	1			1		3	use constant SAW_NL => 4; # true if saw a newline before
	1					1
	1					31
86							# used to increment LINE_INC on next token
87	1			1		2	use constant TEXT => 5; # line text being lexed
	1					5
	1					31
88
89	1			1		3	use constant STACK => 6; # stack of previous contexts for recursive
	1					1
	1					29
90							# includes, saves
91							# [input, file, line_nr, line_inc, saw_nl,
92							# text, pos(text)]
93	1			1		2	use constant PATH => 7; # path of search directories
	1					1
	1					55
94
95							# only limited accessors
96							use Class::XSAccessor::Array {
97	1					6	accessors => {
98							file => FILE,
99							line_nr => LINE_NR,
100							line_inc => LINE_INC,
101							}
102	1			1		434	};
	1					2663
103
104							#------------------------------------------------------------------------------
105							sub new {
106	34			34	1	8277	my($class, @files) = @_;
107	34					51	my $self = bless [], $class;
108	34					54	$self->[STACK] = [];
109	34					41	$self->[PATH] = [];
110	34					74	$self->from_file($_) for reverse @files;
111	33					50	return $self;
112							}
113
114							#------------------------------------------------------------------------------
115							# push context for include file
116							sub _push_context {
117	21			21		20	my($self) = @_;
118	21					23	push @{$self->[STACK]},
119	21					18	[ @{$self}[ 0 .. STACK - 1 ], pos($self->[TEXT]) ];
	21					51
120	21					22	return;
121							}
122
123							#------------------------------------------------------------------------------
124							# pop context
125							sub _pop_context {
126	47			47		39	my($self) = @_;
127	47					139	( @{$self}[ 0 .. STACK - 1 ], pos($self->[TEXT]) )
128	47	100				32	= @{ pop(@{$self->[STACK]}) \|\| [] };
	47					31
	47					155
129	47					162	return;
130							}
131
132							#------------------------------------------------------------------------------
133
134							=head1 METHODS - SEARCH PATH FOR FILES
135
136							=head2 path
137
138							Returns the list of directories to search in sequence for source files.
139
140							=cut
141
142							#------------------------------------------------------------------------------
143	3			3	1	269	sub path { @{$_[0][PATH]} } ## no critic
	3					12
144							#------------------------------------------------------------------------------
145
146							=head2 add_path
147
148							Adds the given directories to the path searched for include files.
149
150							=cut
151
152							#------------------------------------------------------------------------------
153							sub add_path {
154	2			2	1	3	my($self, @dirs) = @_;
155	2					2	push @{$self->[PATH]}, @dirs;
	2					5
156							}
157							#------------------------------------------------------------------------------
158
159							=head2 path_search
160
161							Searches for the given file name in the C created by C, returns
162							the first full path name where the file can be found.
163
164							Returns the given input file name unchanged if:
165
166							=over 4
167
168							=item *
169
170							the file is found in the current directory; or
171
172							=item *
173
174							the file is not found in any of the C directories.
175
176							=back
177
178							=cut
179
180							#------------------------------------------------------------------------------
181							sub path_search {
182	39			39	1	28	my($self, $file) = @_;
183
184	39	100				458	return $file if -f $file; # found
185
186	8					8	for my $dir (@{$self->[PATH]}) {
	8					15
187	8					65	my $full_path = File::Spec->catfile($dir, $file);
188	8	100				106	return $full_path if -f $full_path;
189							}
190
191	3					10	return $file; # not found
192							}
193							#------------------------------------------------------------------------------
194
195							=head1 METHODS - INPUT STREAM
196
197							=head2 from_file
198
199							Saves the current input context, searches for the given input file name
200							in the C, opens the file and sets-up the object to read
201							each line in sequence. At the end of the
202							file input resumes to the place where it was when C was called.
203
204							Dies if the input file cannot be read, or if a file is
205							included recursively, to avoid an infinite include loop.
206
207							=cut
208
209							#------------------------------------------------------------------------------
210							sub from_file {
211	35			35	1	840	my($self, $file) = @_;
212
213							# search include path
214	35					47	$file = $self->path_search($file);
215
216							# check for include loop
217	35	100	100			31	if (grep {($_->[FILE] // "") eq $file} @{$self->[STACK]}) {
	12					44
	35					69
218	1					3	$self->error("#include loop");
219							}
220
221							# open the file
222	34	100				627	open(my $fh, "<", $file)
223							or $self->error("unable to open input file '$file'");
224
225							# create a new iterator to read file lines
226							my $input = sub {
227	87	50		87		97	$fh or return;
228	87					443	my $line = <$fh>;
229	87	100				116	if (defined $line) {
230	59	100				165	$line .= "\n" unless $line =~ /\n\z/; # add \n if missing
231	59					79	return $line;
232							}
233	28					23	$fh = undef; # free handle when file ends
234	28					186	return;
235	32					93	};
236	32					55	$self->from_list($input);
237	32					29	$self->[FILE] = $file;
238
239	32					41	return;
240							}
241							#------------------------------------------------------------------------------
242
243							=head2 from_list
244
245							Saves the current input context and sets-up the object to read each element
246							of the passed input list. Each element either a text string
247							or a code reference of an iterator that returns text strings.
248							The iterator returns C at the end of input.
249
250							=cut
251
252							#------------------------------------------------------------------------------
253							# input from text string (if scalar) or iterator (if CODE ref)
254							sub from_list {
255	51			51	1	3970	my($self, @input) = @_;
256
257							# save previous context
258	51	100				110	$self->_push_context if defined $self->[INPUT];
259
260							# iterator
261							my $input = sub {
262	141			141		90	while (1) {
263	170	100				286	@input or return; # end of input
264	122					154	for ($input[0]) {
265	122	100				175	if (! ref $_) {
266	30					71	return shift @input; # scalar -> return it
267							}
268							else { # has to be a CODE ref
269	92					88	my $element = $_->();
270	92	100				110	if (defined $element) { # iterator returned something
271	63					121	return $element;
272							}
273							else { # end of iterator
274	29					89	shift @input; # continue loop
275							}
276							}
277							}
278							}
279	51					102	};
280
281							# initialize
282	51					51	@{$self}[ INPUT, FILE, LINE_NR, LINE_INC, SAW_NL, TEXT ]
	51					97
283							= ( $input, undef, 0, 1, 1, undef );
284
285	51					62	return;
286							}
287							#------------------------------------------------------------------------------
288
289							=head1 METHODS - INPUT
290
291							=head2 get_token
292
293							Retrieves the next token from the input as an array reference containing
294							token type and token value.
295
296							Returns C on end of input.
297
298							=head2 tokenizer
299
300							Method responsible to match the next token from the given input string.
301
302							This method can be overridden by a child class in order to implement a different
303							set of tokens to be retrieved from the input.
304
305							It is implemented with features from the Perl 5.010 regex engine:
306
307							=over 4
308
309							=item *
310
311							one big regex with C to match from where the
312							last match ended; the string to match is passed as a scalar reference, so that
313							the position of last match C is preserved;
314
315							=item *
316
317							one sequence of C<(?:...\|...)> alternations for each token to be matched;
318
319							=item *
320
321							using C<(?E...)> for each token to make sure there is no
322							backtracking;
323
324							=item *
325
326							using capturing parentheses and embedded code evaluation
327							C<(?{ [TYPE =E $^N] })> to return the token value
328							from the regex match;
329
330							=item *
331
332							using C<$^R> as the value of the matched token;
333
334							As the regex engine is not
335							reentrant, any operation that may call another regex match
336							(e.g. recursive file include) cannot be done inside
337							the C<(?{ ... })> code block, and is done after the regex match by checking the
338							C<$^R> for special tokens.
339
340							=item *
341
342							using C as the return of C<$^R> to ignore a token, e.g. white space.
343
344							=back
345
346							The default tokenizer recognizes and returns the following token types:
347
348							=over 4
349
350							=item [STR => $value]
351
352							Perl-like single or double quoted string, C<$value> contains the string
353							without the quotes and with any backslash escapes resolved.
354
355							The string cannot span multiple input lines.
356
357							=item [NUM => $value]
358
359							Perl-like integer in decimal, hexadecimal, octal or binary notation,
360							C<$value> contains decimal value of the integer.
361
362							=item [NAME => $name]
363
364							Perl-like identifier name, i.e. word starting with a letter or underscore and
365							followed by letters, underscores or digits.
366
367							=item [$token => $token]
368
369							All other characters except white space are returned in the form
370							C<[$token=E$token]>, where C<$token> is a single character or one
371							of the following composed tokens: << >> == != >= <=
372
373							=item white space
374
375							All white space is ignored, i.e. the tokenizer returns C.
376
377							=item [INCLUDE => $file]
378
379							Returned when a C<#include> statement is recognized, causes the lexer to
380							recursively include the file at the current input stream location.
381
382							=item [INPUT_POS => $file, $line_nr, $line_inc]
383
384							Returned when a C<#line> statement is recognized, causes the lexer to
385							set the current input location to the given C<$file>, C<$line_nr> and
386							C<$line_inc>.
387
388							=item [ERROR => $message]
389
390							Causes the lexer to call C with the given error message, can be
391							used when the input cannot be tokenized.
392
393							=back
394
395							=cut
396
397							#------------------------------------------------------------------------------
398							# get the next line from input, set TEXT, return true
399							# accumulate lines ending in \\, to allow lexer to handle continuation lines
400							sub _readline {
401	148			148		116	my($self) = @_;
402
403	148					99	while (1) {
404	195	100				449	my $input = $self->[INPUT] or return; # no input, return false
405	139	100				149	if ( defined( $self->[TEXT] = $input->() ) ) {
406	92					170	while ( $self->[TEXT] =~ /\\\Z/ ) {
407	2					3	my $next_line = $input->();
408	2	100				6	last unless defined $next_line;
409	1					4	$self->[TEXT] .= $next_line;
410							}
411	92					137	pos($self->[TEXT]) = 0;
412	92					116	last;
413							}
414							else {
415	47					68	$self->_pop_context; # pop and continue
416							}
417							}
418	92					155	return 1;
419							}
420
421							#------------------------------------------------------------------------------
422							# get next token as [TYPE => VALUE], undef on end of input
423							sub get_token {
424	212			212	1	149762	my($self) = @_;
425
426							LINE:
427	212					189	while (1) {
428							# read line
429	301	100				530	if (! defined $self->[TEXT]) {
430	148	100				201	$self->_readline or return; # end of input
431							}
432
433							# return tokens
434	245					468	while ( (my $start_pos = pos($self->[TEXT]))
435							< length($self->[TEXT])
436							) {
437							# increment line number if last token included newlines
438							# need to retest after each token
439	334	100				438	if ($self->[SAW_NL]) {
440	106					134	$self->[LINE_NR] += $self->[SAW_NL] * $self->[LINE_INC];
441	106					88	undef $self->[SAW_NL];
442							}
443
444							# read next token
445	334					464	my $token = $self->tokenizer(\($self->[TEXT]));
446
447							# check for newlines
448	334					497	my $end_pos = pos($self->[TEXT]);
449	334					539	$self->[SAW_NL] +=
450							substr($self->[TEXT], $start_pos, $end_pos - $start_pos)
451							=~ tr/\n/\n/;
452
453							# check for special tokens
454	334	100				669	next unless defined $token;
455
456	181					452	my $method = $self->can( $token->[0] );
457	181	100				203	if ($method) {
458	28					31	my $new_token = $self->$method($token);
459	25	50				36	return $new_token if defined $new_token;
460	25	100				59	next LINE unless defined $self->[TEXT]; # if context changed
461							}
462							else {
463	153					660	return $token;
464							}
465							}
466							# end of line
467	70					75	undef $self->[TEXT];
468							}
469							}
470
471							#------------------------------------------------------------------------------
472							# special handlers: return $token to return changed token; return undef to continue loop
473							# changeable by subclass
474							sub INCLUDE {
475	21			21	0	17	my($self, $token) = @_;
476
477	21					32	$self->from_file($token->[1]);
478
479	19					52	return;
480							}
481
482							sub INPUT_POS {
483	6			6	0	4	my($self, $token) = @_;
484
485	6					12	@{$self}[ SAW_NL, FILE, LINE_NR, LINE_INC ] =
486	6					11	( undef, @{$token}[1 .. $#$token] );
	6					7
487
488	6					7	return;
489							}
490
491							sub ERROR {
492	1			1	0	1	my($self, $token) = @_;
493
494	1					3	$self->error($token->[1]);
495
496	0					0	return;
497							}
498
499							#------------------------------------------------------------------------------
500							# get next token as [TYPE => VALUE] from the given string reference
501							# return undef to ignore a token
502							sub tokenizer {
503	310			310	1	236	my($self, $rtext) = @_;
504	310					202	our $LINE_NR; local $LINE_NR;
	310					226
505
506	310	50				1218	$$rtext =~ m{\G
507							(?:
508							# #include
509							(?> ^ (?&SP)* \# include (?&SP)*
510	11					59	(?: \' ( [^\'\n]+ ) \' (?{ [INCLUDE => $^N] })
511	3					16	\| \" ( [^\"\n]+ ) \" (?{ [INCLUDE => $^N] })
512	5					25	\| < ( [^>\n]+ ) > (?{ [INCLUDE => $^N] })
513	2					12	\| ( \S+ ) (?{ [INCLUDE => $^N] })
514	1					6	\| (?{ [ERROR =>
515							"#include expects a file name"] })
516							)
517							.* \n? # eat newline
518							)
519
520							# #line
521							\| (?> ^ (?&SP)* \# line (?&SP)+
522	6					19	(\d+) (?&SP)+ (?{ $LINE_NR = $^N })
523	6					26	\"? ([^\"\n]+) \"? (?{ [INPUT_POS => $^N, $LINE_NR, 1] })
524							.* \n? # eat newline
525							)
526
527							# other #-lines - ignore
528							\| (?> ^ (?&SP)* \# .* \n? (?{ undef })
529							)
530
531							# white space
532							\| (?> \s+ (?{ undef })
533							)
534
535							# string
536							\| (?> ( \" (?: \\. \| [^\\\"] )* \" )
537	7					256	(?{ [STR => eval($^N)] })
538							)
539							\| (?> ( \' (?: \\. \| [^\\\'] )* \' )
540	7					259	(?{ [STR => eval($^N)] })
541							)
542
543							# number
544	3					14	\| (?> 0x ( [0-9a-f]+ ) \b (?{ [NUM => hex($^N)] })
545							)
546	3					16	\| (?> 0b ( [01]+ ) \b (?{ [NUM => oct("0b".$^N)] })
547							)
548	3					16	\| (?> 0 ( [0-7]+ ) \b (?{ [NUM => oct("0".$^N)] })
549							)
550	56					250	\| (?> ( \d+ ) \b (?{ [NUM => 0+$^N] })
551							)
552
553							# name
554	38					154	\| (?> ( [a-z_]\w* ) (?{ [NAME => $^N] })
555							)
556
557							# symbols
558							\| (?> ( << \| >> \| == \| != \| >= \| <= \| . )
559	26					102	(?{ [$^N, $^N] })
560							)
561							)
562
563							(?(DEFINE)
564							# horizontal blanks
565							(? [\t\f\r ] )
566							)
567							}gcxmi or die 'not reached';
568	310					390	return $^R;
569							}
570
571							#------------------------------------------------------------------------------
572							# implemented by XSAccessor above
573
574							=head1 METHODS - INPUT LOCATION AND ERRORS
575
576							=head2 file
577
578							Returns the current input file, C if reading from a list.
579
580							=head2 line_nr
581
582							Returns the current input line number, starting at 1.
583
584							=head2 line_inc
585
586							Increment of line number on each new-line found, usually 1.
587
588							=head2 error
589
590							Dies with the given error message, indicating the place in the input source file
591							where the error occurred.
592
593							=cut
594
595							#------------------------------------------------------------------------------
596							sub error {
597	10			10	1	1511	my($self, $message) = @_;
598	10					15	Parse::FSM::Error::error( $self->_error_msg($message),
599							$self->[FILE], $self->[LINE_NR] );
600							}
601							#------------------------------------------------------------------------------
602
603							=head2 warning
604
605							Warns with the given error message, indicating the place in the input source file
606							where the warning occurred.
607
608							=cut
609
610							#------------------------------------------------------------------------------
611							sub warning {
612	6			6	1	1432	my($self, $message) = @_;
613	6					8	Parse::FSM::Error::warning( $self->_error_msg($message),
614							$self->[FILE], $self->[LINE_NR] );
615							}
616
617							#------------------------------------------------------------------------------
618							# error message for error() and warning()
619							sub _error_msg {
620	16			16		11	my($self, $message) = @_;
621
622	16	100				57	defined($message) and $message =~ s/\s+\z//;
623
624	16					22	my $near;
625	16	100	66			43	if (defined($self->[TEXT]) && defined(pos($self->[TEXT]))) {
626	3					5	my $code = substr($self->[TEXT], pos($self->[TEXT]), 20);
627	3					4	$code =~ s/\n.*//s;
628	3	50				5	if ($code ne "") {
629	0					0	$near = "near ".dump($code);
630							}
631							}
632
633	16					19	return join(" ", grep {defined} $message, $near);
	32					88
634							}
635							#------------------------------------------------------------------------------
636
637							=head1 AUTHOR, BUGS, FEEDBACK, LICENSE, COPYRIGHT
638
639							See L
640
641							=cut
642
643							#------------------------------------------------------------------------------
644
645							1;