File Coverage

blib/lib/TOML/Tiny/Tokenizer.pm

Criterion	Covered	Total	%
statement	124	130	95.3
branch	22	24	91.6
condition	24	29	82.7
subroutine	16	18	88.8
pod	0	12	0.0
total	186	213	87.3

line	stmt	bran	cond	sub	pod	time	code
1							package TOML::Tiny::Tokenizer;
2							# ABSTRACT: tokenizer used by TOML::Tiny
3							$TOML::Tiny::Tokenizer::VERSION = '0.14';
4	285			285		2317	use strict;
	285					742
	285					10024
5	285			285		1660	use warnings;
	285					701
	285					9992
6	285			285		1558	no warnings qw(experimental);
	285					716
	285					11423
7	285			285		165555	use charnames qw(:full);
	285					3130107
	285					2056
8	285			285		72717	use v5.18;
	285					1181
9
10	285			285		1890	use TOML::Tiny::Grammar;
	285					666
	285					677823
11
12							sub new {
13	424			424	0	1911	my ($class, %param) = @_;
14
15							my $self = bless{
16							source => $param{source},
17							last_position => length $param{source},
18	424					3805	position => 0,
19							line => 1,
20							last_token => undef,
21							}, $class;
22
23	424					1766	return $self;
24							}
25
26							sub last_token {
27	0			0	0	0	my $self = shift;
28	0					0	return $self->{last_token};
29							}
30
31							sub next_token {
32	6358			6358	0	9427	my $self = shift;
33
34							return unless defined $self->{source}
35	6358	100	66			23724	&& $self->{position} < $self->{last_position};
36
37	6114	100				12454	if (!$self->{last_token}) {
38	422					3730	return $self->{last_token} = {type => 'table', pos => 0, line => 1, value => []};
39							}
40
41							# Update the regex engine's position marker in case some other regex
42							# attempted to match against the source string and reset it.
43	5692					15249	pos($self->{source}) = $self->{position};
44
45	5692					14863	my $token;
46							my $type;
47	5692					0	my $value;
48
49	5692					41435	state $key_set = qr/\G ($Key) $WS* (?= =)/x;
50	5692					184194	state $table = qr/\G \[ $WS* ($Key) $WS* \] $WS* (?:$EOL \| $)/x;
51	5692					155486	state $array_table = qr/\G \[\[ $WS* ($Key) $WS* \]\] $WS* (?:$EOL \| $)/x;
52
53	5692					130870	state $simple = {
54							'[' => 'inline_array',
55							']' => 'inline_array_close',
56							'{' => 'inline_table',
57							'}' => 'inline_table_close',
58							',' => 'comma',
59							'=' => 'assign',
60							'true' => 'bool',
61							'false' => 'bool',
62							};
63
64							# More complex matches with regexps
65	5692		100			19862	while ($self->{position} < $self->{last_position} && !defined($type)) {
66	5888	50				13444	my $prev = $self->{last_token} ? $self->{last_token}{type} : 'EOL';
67	5888		100			22009	my $newline = !!($prev eq 'EOL' \|\| $prev eq 'table' \|\| $prev eq 'array_table');
68
69	5888					10836	for ($self->{source}) {
70	5888					31342	/\G$WS+/gc; # ignore whitespace
71	5888	100				27064	/\G$Comment$/mgc && next; # ignore comments
72
73	5788					12955	last when /\G$/gc;
74
75	5689					31916	when (/\G$EOL/gc) {
76	1323					2614	++$self->{line};
77	1323					3601	$type = 'EOL';
78							}
79
80	4366	100				9991	if ($newline) {
81	1557					9799	when (/$table/gc) {
82	233					483	$type = 'table';
83	233					628	$value = $self->tokenize_key($1);
84							}
85
86	1324					6389	when (/$array_table/gc) {
87	74					152	$type = 'array_table';
88	74					210	$value = $self->tokenize_key($1);
89							}
90							}
91
92	4059					25235	when (/$key_set/gc) {
93	1059					2147	$type = 'key';
94	1059					4460	$value = $1;
95							}
96
97	3000					8157	when (/\G ( [\[\]{}=,] \| true \| false )/xgc) {
98	1852					4397	$value = $1;
99	1852					6523	$type = $simple->{$value};
100							}
101
102	1148					52042	when (/\G($String)/gc) {
103	461					1176	$type = 'string';
104	461					2178	$value = $1;
105							}
106
107	687					123979	when (/\G($DateTime)/gc) {
108	55					163	$type = 'datetime';
109	55					346	$value = $1;
110							}
111
112	632					20786	when (/\G($Float)/gc) {
113	101					224	$type = 'float';
114	101					500	$value = $1;
115							}
116
117	531					12124	when (/\G($Integer)/gc) {
118	422					1065	$type = 'integer';
119	422					2333	$value = $1;
120							}
121
122	109					462	default{
123	109		50			860	my $substr = substr($self->{source}, $self->{position}, 30) // 'undef';
124	109					1689	die "toml syntax error on line $self->{line}\n\t-->\|$substr\|\n";
125							}
126							}
127
128	5779	100				12224	if ($type) {
129	5580					7789	state $tokenizers = {};
130	5580		100			25203	my $tokenize = $tokenizers->{$type} //= $self->can("tokenize_$type") \|\| 0;
			100
131
132							$token = {
133							line => $self->{line},
134							pos => $self->{pos},
135							type => $type,
136							value => $tokenize ? $tokenize->($self, $value) : $value,
137							prev => $self->{last_token},
138	5580	100				22680	};
139
140							# Unset the previous token's 'prev' key to prevent keeping the entire
141							# chain of previously parsed tokens alive for the whole process.
142	5575					13210	undef $self->{last_token}{prev};
143
144	5575					9169	$self->{last_token} = $token;
145							}
146
147	5774					11703	$self->update_position;
148							}
149
150	5578					18038	return $token;
151							}
152
153							sub current_line {
154	0			0	0	0	my $self = shift;
155	0					0	my $rest = substr $self->{source}, $self->{position};
156	0					0	my $stop = index $rest, "\n";
157	0					0	substr $rest, 0, $stop;
158							}
159
160							sub update_position {
161	5774			5774	0	8398	my $self = shift;
162	5774		50			30799	$self->{position} = pos($self->{source}) // 0;
163							}
164
165							sub error {
166	5			5	0	13	my $self = shift;
167	5					13	my $token = shift;
168	5		50			18	my $msg = shift // 'unknown';
169	5	50				31	my $line = $token ? $token->{line} : $self->{line};
170	5					78	die "toml: parse error at line $line: $msg\n";
171							}
172
173							sub tokenize_key {
174	1366			1366	0	2574	my $self = shift;
175	1366					2511	my $toml = shift;
176	1366					23329	my @segs = $toml =~ /($SimpleKey)\.?/g;
177	1366					38523	my @keys;
178
179	1366					2883	for my $seg (@segs) {
180	1630					3915	$seg =~ s/^["']//;
181	1630					4367	$seg =~ s/["']$//;
182	1630					4448	$seg = $self->unescape_str($seg);
183	1630					4490	push @keys, $seg;
184							}
185
186	1366					7302	return \@keys;
187							}
188
189							sub tokenize_float {
190	101			101	0	312	$_[1] =~ tr/_//d;
191	101					398	$_[1];
192							}
193
194							sub tokenize_integer {
195	422			422	0	1480	$_[1] =~ tr/_+//d;
196	422					1838	$_[1];
197							}
198
199							sub tokenize_string {
200	461			461	0	782	my $self = shift;
201	461					833	my $toml = shift;
202	461		100			2735	my $ml = index($toml, q{'''}) == 0
203							\|\| index($toml, q{"""}) == 0;
204	461					1054	my $lit = index($toml, q{'}) == 0;
205	461					803	my $str = '';
206
207	461	100				918	if ($ml) {
208	35					136	$str = substr $toml, 3, length($toml) - 6;
209	35					423	my @newlines = $str =~ /($CRLF)/g;
210	35					95	$self->{line} += scalar @newlines;
211	35					398	$str =~ s/^$WS* $CRLF//x; # trim leading whitespace
212	35					473	$str =~ s/\\$EOL\s*//xgs; # trim newlines from lines ending in backslash
213							} else {
214	426					1550	$str = substr($toml, 1, length($toml) - 2);
215							}
216
217	461	100				1131	if (!$lit) {
218	428					998	$str = $self->unescape_str($str);
219							}
220
221	456					2045	return $str;
222							}
223
224							sub unescape_chars {
225	119			119	0	439	state $esc = {
226							'\b' => "\x08",
227							'\t' => "\x09",
228							'\n' => "\x0A",
229							'\f' => "\x0C",
230							'\r' => "\x0D",
231							'\"' => "\x22",
232							'\/' => "\x2F",
233							'\\\\' => "\x5C",
234							};
235
236	119	100				358	if (exists $esc->{$_[0]}) {
237	100					414	return $esc->{$_[0]};
238							}
239
240	19					70	my $hex = hex substr($_[0], 2);
241
242	19	100	100			92	if ($hex < 0x10FFFF && charnames::viacode($hex)) {
243	14					46253	return chr $hex;
244							}
245
246	5					13362	return;
247							}
248
249							sub unescape_str {
250	2058			2058	0	11992	state $re = qr/($Escape)/;
251	2058		66			8104	$_[1] =~ s\|$re\|unescape_chars($1) // $_[0]->error(undef, "invalid unicode escape: $1")\|xge;
	119					342
252	2053					4578	$_[1];
253							}
254
255							1;
256
257							__END__
258
259							=pod
260
261							=encoding UTF-8
262
263							=head1 NAME
264
265							TOML::Tiny::Tokenizer - tokenizer used by TOML::Tiny
266
267							=head1 VERSION
268
269							version 0.14
270
271							=head1 AUTHOR
272
273							Jeff Ober <sysread@fastmail.fm>
274
275							=head1 COPYRIGHT AND LICENSE
276
277							This software is copyright (c) 2021 by Jeff Ober.
278
279							This is free software; you can redistribute it and/or modify it under
280							the same terms as the Perl 5 programming language system itself.
281
282							=cut