File Coverage

blib/lib/TOML/Parser/Tokenizer.pm

Criterion	Covered	Total	%
statement	201	204	98.5
branch	74	82	90.2
condition	17	26	65.3
subroutine	19	19	100.0
pod	0	2	0.0
total	311	333	93.3

line	stmt	bran	cond	sub	pod	time	code
1							package TOML::Parser::Tokenizer;
2	20			20		411	use 5.010000;
	20					67
3	20			20		92	use strict;
	20					33
	20					343
4	20			20		85	use warnings;
	20					35
	20					544
5
6	20			20		93	use Exporter 5.57 'import';
	20					244
	20					799
7
8	20	50		20		115	use constant DEBUG => $ENV{TOML_PARSER_TOKENIZER_DEBUG} ? 1 : 0;
	20					39
	20					3117
9
10							BEGIN {
11	20			20		292	my @TOKENS = map uc, qw/
12							comment
13							table
14							array_of_table
15							key
16							integer
17							float
18							boolean
19							datetime
20							string
21							multi_line_string_begin
22							multi_line_string_end
23							inline_table_begin
24							inline_table_end
25							array_begin
26							array_end
27							/;
28							my %CONSTANTS = map {
29	20					51	("TOKEN_$_" => $_)
	300					744
30							} @TOKENS;
31
32	20					113	require constant;
33	20					2667	constant->import(\%CONSTANTS);
34
35							# Exporter
36	20					99	our @EXPORT_OK = keys %CONSTANTS;
37	20					41815	our %EXPORT_TAGS = (
38							constant => [keys %CONSTANTS],
39							);
40							};
41
42							sub grammar_regexp {
43							return +{
44	878			878	0	16151	comment => qr{#(.*)},
45							table => {
46							start => qr{\[},
47							key => qr{(?:"(.*?)(?
48							sep => qr{\.},
49							end => qr{\]},
50							},
51							array_of_table => {
52							start => qr{\[\[},
53							key => qr{(?:"(.*?)(?
54							sep => qr{\.},
55							end => qr{\]\]},
56							},
57							key => qr{(?:"(.*?)(?
58							value => {
59							datetime => qr{([0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}(?:\.[0-9]+)?(?:Z\|[-+][0-9]{2}:[0-9]{2}))},
60							float => qr{([-+]?(?:[0-9_]+(?:\.[0-9_]+)?[eE][-+]?[0-9_]+\|[0-9_]*\.[0-9_]+))},
61							integer => qr{([-+]?[0-9_]+)},
62							boolean => qr{(true\|false)},
63							string => qr{(?:"(.*?)(?
64							mlstring => qr{("""\|''')},
65							inline => {
66							start => qr{\{},
67							sep => qr{\s,\s},
68							end => qr{\}},
69							},
70							array => {
71							start => qr{\[},
72							sep => qr{\s,\s},
73							end => qr{\]},
74							},
75							},
76							};
77							}
78
79							sub tokenize {
80	56			56	0	135	my ($class, $src) = @_;
81
82	56					90	local $_ = $src;
83	56					184	return $class->_tokenize();
84							}
85
86							sub _tokenize {
87	56			56		91	my $class = shift;
88	56					158	my $grammar_regexp = $class->grammar_regexp();
89
90	56					110	my @tokens;
91	56					226	until (/\G\z/mgco) {
92	467	100				3358	if (/\G$grammar_regexp->{comment}/mgc) {
		100
		100
		100
		100
93	72					126	warn "[TOKEN] COMMENT: $1" if DEBUG;
94	72					216	$class->_skip_whitespace();
95	72		100			444	push @tokens => [TOKEN_COMMENT, $1 \|\| ''];
96							}
97							elsif (/\G$grammar_regexp->{array_of_table}->{start}/mgc) {
98	23					70	push @tokens => $class->_tokenize_array_of_table();
99							}
100							elsif (/\G$grammar_regexp->{table}->{start}/mgc) {
101	49					125	push @tokens => $class->_tokenize_table();
102							}
103							elsif (my @t = $class->_tokenize_key_and_value()) {
104	225					839	push @tokens => @t;
105							}
106							elsif (/\G\s+/mgco) {
107							# pass through
108	89					201	$class->_skip_whitespace();
109							}
110							else {
111	3					36	$class->_syntax_error();
112							}
113							}
114	36					453	return @tokens;
115							}
116
117							sub _tokenize_key_and_value {
118	369			369		573	my $class = shift;
119	369					691	my $grammar_regexp = $class->grammar_regexp();
120
121	369					542	my @tokens;
122	369	100				3324	if (/\G$grammar_regexp->{key}/mgc) {
123	277		66			1534	my $key = $1 \|\| $2 \|\| $3;
124	277					353	warn "[TOKEN] KEY: $key" if DEBUG;
125	277					685	$class->_skip_whitespace();
126	277					750	push @tokens => [TOKEN_KEY, $key];
127	277					609	push @tokens => $class->_tokenize_value();
128	271					1932	return @tokens;
129							}
130
131	92					736	return;
132							}
133
134							sub _tokenize_value {
135	332			332		473	my $class = shift;
136	332					618	my $grammar_regexp = $class->grammar_regexp();
137	332					466	warn "[CALL] _tokenize_value" if DEBUG;
138
139	332	50				6647	if (/\G$grammar_regexp->{comment}/mgc) {
		100
		100
		100
		100
		100
		100
		100
		100
140	0					0	warn "[TOKEN] COMMENT: $1" if DEBUG;
141	0					0	$class->_skip_whitespace();
142	0		0			0	return [TOKEN_COMMENT, $1 \|\| ''];
143							}
144							elsif (/\G$grammar_regexp->{value}->{datetime}/mgc) {
145	2					6	warn "[TOKEN] DATETIME: $1" if DEBUG;
146	2					5	$class->_skip_whitespace();
147	2					16	return [TOKEN_DATETIME, $1];
148							}
149							elsif (/\G$grammar_regexp->{value}->{float}/mgc) {
150	20					79	warn "[TOKEN] FLOAT: $1" if DEBUG;
151	20					53	$class->_skip_whitespace();
152	20					182	return [TOKEN_FLOAT, $1];
153							}
154							elsif (/\G$grammar_regexp->{value}->{integer}/mgc) {
155	78					124	warn "[TOKEN] INTEGER: $1" if DEBUG;
156	78					184	$class->_skip_whitespace();
157	78					705	return [TOKEN_INTEGER, $1];
158							}
159							elsif (/\G$grammar_regexp->{value}->{boolean}/mgc) {
160	4					11	warn "[TOKEN] BOOLEAN: $1" if DEBUG;
161	4					16	$class->_skip_whitespace();
162	4					46	return [TOKEN_BOOLEAN, $1];
163							}
164							elsif (/\G$grammar_regexp->{value}->{mlstring}/mgc) {
165	21					31	warn "[TOKEN] MULTI LINE STRING: $1" if DEBUG;
166							return (
167	21					58	[TOKEN_MULTI_LINE_STRING_BEGIN],
168							$class->_extract_multi_line_string($1),
169							[TOKEN_MULTI_LINE_STRING_END],
170							);
171							}
172							elsif (/\G$grammar_regexp->{value}->{string}/mgc) {
173	155					241	warn "[TOKEN] STRING: $1" if DEBUG;
174	155					365	$class->_skip_whitespace();
175
176	155					289	my $is_raw = defined $2;
177	155	50				1351	return [TOKEN_STRING, defined $1 ? $1 : defined $2 ? $2 : '', $is_raw];
		100
178							}
179							elsif (/\G$grammar_regexp->{value}->{inline}->{start}/mgc) {
180	17					22	warn "[TOKEN] INLINE TABLE" if DEBUG;
181	17					38	$class->_skip_whitespace();
182							return (
183	17					41	[TOKEN_INLINE_TABLE_BEGIN],
184							$class->_tokenize_inline_table(),
185							[TOKEN_INLINE_TABLE_END],
186							);
187							}
188							elsif (/\G$grammar_regexp->{value}->{array}->{start}/mgc) {
189	32					54	warn "[TOKEN] ARRAY" if DEBUG;
190	32					80	$class->_skip_whitespace();
191							return (
192	32					117	[TOKEN_ARRAY_BEGIN],
193							$class->_tokenize_array(),
194							[TOKEN_ARRAY_END],
195							);
196							}
197
198	3					10	$class->_syntax_error();
199							}
200
201							sub _tokenize_table {
202	49			49		112	my $class = shift;
203
204	49					132	my $grammar_regexp = $class->grammar_regexp()->{table};
205	49					223	warn "[CALL] _tokenize_table" if DEBUG;
206
207	49					121	$class->_skip_whitespace();
208
209	49					104	my @expected = ($grammar_regexp->{key});
210
211	49					63	my @keys;
212							LOOP:
213	49					65	while (1) {
214	157					236	for my $rx (@expected) {
215	201	100				3410	if (/\G$rx/smgc) {
216	151	100				492	if ($rx eq $grammar_regexp->{key}) {
		100
		50
217	76		66			391	my $key = $1 \|\| $2 \|\| $3;
218	76					101	warn "[TOKEN] table key: $key" if DEBUG;
219	76					138	push @keys => $key;
220	76					158	@expected = ($grammar_regexp->{sep}, $grammar_regexp->{end});
221							}
222							elsif ($rx eq $grammar_regexp->{sep}) {
223	32					42	warn "[TOKEN] table key separator" if DEBUG;
224	32					56	@expected = ($grammar_regexp->{key});
225							}
226							elsif ($rx eq $grammar_regexp->{end}) {
227	43					50	warn "[TOKEN] table key end" if DEBUG;
228	43					73	@expected = ();
229	43					103	last LOOP;
230							}
231	108					238	$class->_skip_whitespace();
232	108					243	next LOOP;
233							}
234							}
235
236	6					17	$class->_syntax_error();
237							}
238
239	43					57	warn "[TOKEN] TABLE: @{[ join '.', @keys ]}" if DEBUG;
240	43					247	return [TOKEN_TABLE, \@keys];
241							}
242
243							sub _tokenize_array_of_table {
244	23			23		34	my $class = shift;
245
246	23					45	my $grammar_regexp = $class->grammar_regexp()->{array_of_table};
247	23					100	warn "[CALL] _tokenize_array_of_table" if DEBUG;
248
249	23					79	$class->_skip_whitespace();
250
251	23					48	my @expected = ($grammar_regexp->{key});
252
253	23					29	my @keys;
254							LOOP:
255	23					30	while (1) {
256	57					91	for my $rx (@expected) {
257	75	100				1317	if (/\G$rx/smgc) {
258	52	100				169	if ($rx eq $grammar_regexp->{key}) {
		100
		50
259	26		66			146	my $key = $1 \|\| $2 \|\| $3;
260	26					33	warn "[TOKEN] table key: $key" if DEBUG;
261	26					50	push @keys => $key;
262	26					342	@expected = ($grammar_regexp->{sep}, $grammar_regexp->{end});
263							}
264							elsif ($rx eq $grammar_regexp->{sep}) {
265	8					10	warn "[TOKEN] table key separator" if DEBUG;
266	8					15	@expected = ($grammar_regexp->{key});
267							}
268							elsif ($rx eq $grammar_regexp->{end}) {
269	18					24	warn "[TOKEN] table key end" if DEBUG;
270	18					33	@expected = ();
271	18					26	last LOOP;
272							}
273	34					79	$class->_skip_whitespace();
274	34					78	next LOOP;
275							}
276							}
277
278	5					13	$class->_syntax_error();
279							}
280
281	18					23	warn "[TOKEN] ARRAY_OF_TABLE: @{[ join '.', @keys ]}" if DEBUG;
282	18					103	return [TOKEN_ARRAY_OF_TABLE, \@keys];
283							}
284
285							sub _extract_multi_line_string {
286	21			21		65	my ($class, $delimiter) = @_;
287	21					37	my $is_raw = $delimiter eq q{'''};
288	21	100				167	if (/\G(.+?)\Q$delimiter/smgc) {
289	20					28	warn "[TOKEN] MULTI LINE STRING: $1" if DEBUG;
290	20					48	$class->_skip_whitespace();
291	20					170	return [TOKEN_STRING, $1, $is_raw];
292							}
293	1					3	$class->_syntax_error();
294							}
295
296							sub _tokenize_inline_table {
297	17			17		23	my $class = shift;
298
299	17					32	my $common_grammar_regexp = $class->grammar_regexp();
300	17					27	my $grammar_regexp = $common_grammar_regexp->{value}->{inline};
301
302	17					21	warn "[CALL] _tokenize_inline_table" if DEBUG;
303	17	50				164	return if /\G(?:$grammar_regexp->{sep})?$grammar_regexp->{end}/smgc;
304
305	17					30	my $need_sep = 0;
306
307	17					22	my @tokens;
308	17					20	while (1) {
309	94					98	warn "[CONTEXT] _tokenize_inline_table [loop]" if DEBUG;
310
311	94					185	$class->_skip_whitespace();
312	94	100				373	if (/\G$common_grammar_regexp->{comment}/mgc) {
		100
313	2					3	warn "[TOKEN] COMMENT: $1" if DEBUG;
314	2		50			8	push @tokens => [TOKEN_COMMENT, $1 \|\| ''];
315	2					2	next;
316							}
317							elsif (/\G$grammar_regexp->{end}/mgc) {
318	16					25	last;
319							}
320
321	76	100				116	if ($need_sep) {
322	30	100				132	if (/\G$grammar_regexp->{sep}/smgc) {
323	29					39	$need_sep = 0;
324	29					42	next;
325							}
326							}
327							else {
328	46	50				83	if (my @t = $class->_tokenize_key_and_value()) {
329	46					70	push @tokens => @t;
330	46					58	$need_sep = 1;
331	46					67	next;
332							}
333							}
334
335	1					2	$class->_syntax_error();
336							}
337
338	16					180	return @tokens;
339							}
340
341							sub _tokenize_array {
342	32			32		50	my $class = shift;
343
344	32					70	my $common_grammar_regexp = $class->grammar_regexp();
345	32					66	my $grammar_regexp = $common_grammar_regexp->{value}->{array};
346
347	32					41	warn "[CALL] _tokenize_array" if DEBUG;
348	32	100				319	return if /\G(?:$grammar_regexp->{sep})?$grammar_regexp->{end}/smgc;
349
350	30					60	my $need_sep = 0;
351
352	30					44	my @tokens;
353	30					40	while (1) {
354	126					146	warn "[CONTEXT] _tokenize_inline_table [loop]" if DEBUG;
355
356	126					251	$class->_skip_whitespace();
357	126	100				660	if (/\G$common_grammar_regexp->{comment}/mgc) {
		100
358	7					10	warn "[TOKEN] COMMENT: $1" if DEBUG;
359	7		50			31	push @tokens => [TOKEN_COMMENT, $1 \|\| ''];
360	7					16	next;
361							}
362							elsif (/\G$grammar_regexp->{end}/mgc) {
363	29					52	last;
364							}
365
366	90	100				167	if ($need_sep) {
367	35	100				208	if (/\G$grammar_regexp->{sep}/smgc) {
368	34					49	$need_sep = 0;
369	34					60	next;
370							}
371							}
372							else {
373	55	50				131	if (my @t = $class->_tokenize_value()) {
374	55					142	push @tokens => @t;
375	55					73	$need_sep = 1;
376	55					99	next;
377							}
378							}
379
380	1					2	$class->_syntax_error();
381							}
382
383	29					328	return @tokens;
384							}
385
386							sub _skip_whitespace {
387	1200			1200		1607	my $class = shift;
388	1200	100				3097	if (/\G\s+/smgco) {
389							# pass through
390	610					920	warn "[PASS] WHITESPACE" if DEBUG;
391							}
392							}
393
394	20			20		46	sub _syntax_error { shift->_error('Syntax Error') }
395
396							sub _error {
397	20			20		36	my ($class, $msg) = @_;
398
399	20					31	my $src = $_;
400	20		100			51	my $curr = pos \|\| 0;
401	20					25	my $line = 1;
402	20		50			59	my $start = pos $src \|\| 0;
403	20		66			103	while ($src =~ /$/smgco and pos $src <= $curr) {
404	28					36	$start = pos $src;
405	28					91	$line++;
406							}
407	20					29	my $end = pos $src;
408	20					23	my $len = $curr - $start;
409	20	100				40	$len-- if $len > 0;
410
411	20		100			103	my $trace = join "\n",
412							"${msg}: line:$line",
413							substr($src, $start \|\| 0, $end - $start),
414							(' ' x $len) . '^';
415	20					300	die $trace, "\n";
416							}
417
418							1;
419							__END__