File Coverage

blib/lib/TOML/Tiny/Tokenizer.pm

Criterion	Covered	Total	%
statement	132	138	95.6
branch	42	44	95.4
condition	24	29	82.7
subroutine	16	18	88.8
pod	0	12	0.0
total	214	241	88.8

line	stmt	bran	cond	sub	pod	time	code
1							package TOML::Tiny::Tokenizer;
2							# ABSTRACT: tokenizer used by TOML::Tiny
3							$TOML::Tiny::Tokenizer::VERSION = '0.16';
4	286			286		2166	use strict;
	286					690
	286					9871
5	286			286		1742	use warnings;
	286					735
	286					8373
6	286			286		1558	no warnings qw(experimental);
	286					763
	286					11734
7	286			286		151252	use charnames qw(:full);
	286					3019573
	286					1943
8	286			286		67067	use v5.18;
	286					1212
9
10	286					628135	use TOML::Tiny::Grammar qw(
11							$Comment
12							$CRLF
13							$DateTime
14							$EOL
15							$Escape
16							$Float
17							$Integer
18							$Key
19							$SimpleKey
20							$String
21							$WS
22	286			286		1957	);
	286					798
23
24							sub new {
25	424			424	0	2029	my ($class, %param) = @_;
26
27							my $self = bless{
28							source => $param{source},
29							last_position => length $param{source},
30	424					3629	position => 0,
31							line => 1,
32							last_token => undef,
33							}, $class;
34
35	424					2053	return $self;
36							}
37
38							sub last_token {
39	0			0	0	0	my $self = shift;
40	0					0	return $self->{last_token};
41							}
42
43							sub next_token {
44	6358			6358	0	9702	my $self = shift;
45
46							return unless defined $self->{source}
47	6358	100	66			23573	&& $self->{position} < $self->{last_position};
48
49	6114	100				12474	if (!$self->{last_token}) {
50	422					4091	return $self->{last_token} = {type => 'table', pos => 0, line => 1, value => []};
51							}
52
53							# Update the regex engine's position marker in case some other regex
54							# attempted to match against the source string and reset it.
55	5692					16063	pos($self->{source}) = $self->{position};
56
57	5692					14682	my $token;
58							my $type;
59	5692					0	my $value;
60
61	5692					42121	state $key_set = qr/\G ($Key) $WS* (?= =)/x;
62	5692					185214	state $table = qr/\G \[ $WS* ($Key) $WS* \] $WS* (?:$EOL \| $)/x;
63	5692					153880	state $array_table = qr/\G \[\[ $WS* ($Key) $WS* \]\] $WS* (?:$EOL \| $)/x;
64
65	5692					131974	state $simple = {
66							'[' => 'inline_array',
67							']' => 'inline_array_close',
68							'{' => 'inline_table',
69							'}' => 'inline_table_close',
70							',' => 'comma',
71							'=' => 'assign',
72							'true' => 'bool',
73							'false' => 'bool',
74							};
75
76							# More complex matches with regexps
77	5692		100			20539	while ($self->{position} < $self->{last_position} && !defined($type)) {
78	5888	50				13442	my $prev = $self->{last_token} ? $self->{last_token}{type} : 'EOL';
79	5888		100			22597	my $newline = !!($prev eq 'EOL' \|\| $prev eq 'table' \|\| $prev eq 'array_table');
80
81	5888					12187	for ($self->{source}) {
82	5888					31274	/\G$WS+/gc; # ignore whitespace
83	5888	100				27894	/\G$Comment$/mgc && next; # ignore comments
84
85	5788	100				16642	last if /\G$/gc;
86
87	5689	100				35682	if (/\G$EOL/gc) {
88	1323					2773	++$self->{line};
89	1323					2653	$type = 'EOL';
90	1323					2668	last;
91							}
92
93	4366	100				9944	if ($newline) {
94	1557	100				10966	if (/$table/gc) {
95	233					531	$type = 'table';
96	233					597	$value = $self->tokenize_key($1);
97	233					518	last;
98							}
99
100	1324	100				9417	if (/$array_table/gc) {
101	74					170	$type = 'array_table';
102	74					256	$value = $self->tokenize_key($1);
103	74					170	last;
104							}
105							}
106
107	4059	100				29969	if (/$key_set/gc) {
108	1059					2491	$type = 'key';
109	1059					2798	$value = $1;
110	1059					2452	last;
111							}
112
113	3000	100				10401	if (/\G ( [\[\]{}=,] \| true \| false )/xgc) {
114	1852					4884	$value = $1;
115	1852					4435	$type = $simple->{$value};
116	1852					3737	last;
117							}
118
119	1148	100				54822	if (/\G($String)/gc) {
120	461					1123	$type = 'string';
121	461					1142	$value = $1;
122	461					1222	last;
123							}
124
125	687	100				126080	if (/\G($DateTime)/gc) {
126	55					150	$type = 'datetime';
127	55					171	$value = $1;
128	55					172	last;
129							}
130
131	632	100				22098	if (/\G($Float)/gc) {
132	101					234	$type = 'float';
133	101					258	$value = $1;
134	101					278	last;
135							}
136
137	531	100				13106	if (/\G($Integer)/gc) {
138	422					1057	$type = 'integer';
139	422					1117	$value = $1;
140	422					1250	last;
141							}
142
143	109		50			1137	my $substr = substr($self->{source}, $self->{position}, 30) // 'undef';
144	109					1785	die "toml syntax error on line $self->{line}\n\t-->\|$substr\|\n";
145							}
146
147	5779	100				11937	if ($type) {
148	5580					8361	state $tokenizers = {};
149	5580		100			24665	my $tokenize = $tokenizers->{$type} //= $self->can("tokenize_$type") \|\| 0;
			100
150
151							$token = {
152							line => $self->{line},
153							pos => $self->{pos},
154							type => $type,
155							value => $tokenize ? $tokenize->($self, $value) : $value,
156							prev => $self->{last_token},
157	5580	100				23456	};
158
159							# Unset the previous token's 'prev' key to prevent keeping the entire
160							# chain of previously parsed tokens alive for the whole process.
161	5575					13699	undef $self->{last_token}{prev};
162
163	5575					9330	$self->{last_token} = $token;
164							}
165
166	5774					12145	$self->update_position;
167							}
168
169	5578					18460	return $token;
170							}
171
172							sub current_line {
173	0			0	0	0	my $self = shift;
174	0					0	my $rest = substr $self->{source}, $self->{position};
175	0					0	my $stop = index $rest, "\n";
176	0					0	substr $rest, 0, $stop;
177							}
178
179							sub update_position {
180	5774			5774	0	8804	my $self = shift;
181	5774		50			29808	$self->{position} = pos($self->{source}) // 0;
182							}
183
184							sub error {
185	5			5	0	15	my $self = shift;
186	5					12	my $token = shift;
187	5		50			18	my $msg = shift // 'unknown';
188	5	50				37	my $line = $token ? $token->{line} : $self->{line};
189	5					80	die "toml: parse error at line $line: $msg\n";
190							}
191
192							sub tokenize_key {
193	1366			1366	0	2465	my $self = shift;
194	1366					2669	my $toml = shift;
195	1366					22785	my @segs = $toml =~ /($SimpleKey)\.?/g;
196	1366					38418	my @keys;
197
198	1366					3450	for my $seg (@segs) {
199	1630					3964	$seg =~ s/^["']//;
200	1630					4359	$seg =~ s/["']$//;
201	1630					4242	$seg = $self->unescape_str($seg);
202	1630					4677	push @keys, $seg;
203							}
204
205	1366					7038	return \@keys;
206							}
207
208							sub tokenize_float {
209	101			101	0	319	$_[1] =~ tr/_//d;
210	101					416	$_[1];
211							}
212
213							sub tokenize_integer {
214	422			422	0	1839	$_[1] =~ tr/_+//d;
215	422					1831	$_[1];
216							}
217
218							sub tokenize_string {
219	461			461	0	891	my $self = shift;
220	461					846	my $toml = shift;
221	461		100			3024	my $ml = index($toml, q{'''}) == 0
222							\|\| index($toml, q{"""}) == 0;
223	461					1123	my $lit = index($toml, q{'}) == 0;
224	461					771	my $str = '';
225
226	461	100				991	if ($ml) {
227	35					167	$str = substr $toml, 3, length($toml) - 6;
228	35					466	my @newlines = $str =~ /($CRLF)/g;
229	35					104	$self->{line} += scalar @newlines;
230	35					383	$str =~ s/^$WS* $CRLF//x; # trim leading whitespace
231	35					408	$str =~ s/\\$EOL\s*//xgs; # trim newlines from lines ending in backslash
232							} else {
233	426					1658	$str = substr($toml, 1, length($toml) - 2);
234							}
235
236	461	100				1151	if (!$lit) {
237	428					1045	$str = $self->unescape_str($str);
238							}
239
240	456					2087	return $str;
241							}
242
243							sub unescape_chars {
244	119			119	0	371	state $esc = {
245							'\b' => "\x08",
246							'\t' => "\x09",
247							'\n' => "\x0A",
248							'\f' => "\x0C",
249							'\r' => "\x0D",
250							'\"' => "\x22",
251							'\/' => "\x2F",
252							'\\\\' => "\x5C",
253							};
254
255	119	100				409	if (exists $esc->{$_[0]}) {
256	100					463	return $esc->{$_[0]};
257							}
258
259	19					102	my $hex = hex substr($_[0], 2);
260
261	19	100	100			121	if ($hex < 0x10FFFF && charnames::viacode($hex)) {
262	14					51586	return chr $hex;
263							}
264
265	5					13824	return;
266							}
267
268							sub unescape_str {
269	2058			2058	0	12451	state $re = qr/($Escape)/;
270	2058		66			8774	$_[1] =~ s\|$re\|unescape_chars($1) // $_[0]->error(undef, "invalid unicode escape: $1")\|xge;
	119					305
271	2053					4766	$_[1];
272							}
273
274							1;
275
276							__END__
277
278							=pod
279
280							=encoding UTF-8
281
282							=head1 NAME
283
284							TOML::Tiny::Tokenizer - tokenizer used by TOML::Tiny
285
286							=head1 VERSION
287
288							version 0.16
289
290							=head1 AUTHOR
291
292							Jeff Ober <sysread@fastmail.fm>
293
294							=head1 COPYRIGHT AND LICENSE
295
296							This software is copyright (c) 2023 by Jeff Ober.
297
298							This is free software; you can redistribute it and/or modify it under
299							the same terms as the Perl 5 programming language system itself.
300
301							=cut