File Coverage

json-common.c

Criterion	Covered	Total	%
statement	212	311	68.1
branch	132	228	57.8
condition			n/a
subroutine			n/a
pod			n/a
total	344	539	63.8

line	stmt	bran	code
1			/* These things are common between the validation and the parsing
2			routines. This is #included into "Json3.xs". */
3
4			/* The following matches bytes which are not allowed in JSON
5			strings. "All Unicode characters may be placed within the quotation
6			marks except for the characters that must be escaped: quotation
7			mark, reverse solidus, and the control characters (U+0000 through
8			U+001F)." - from section 2.5 of RFC 4627 */
9
10			#define BADBYTES \
11			'\0':case 0x01:case 0x02:case 0x03: \
12			case 0x04:case 0x05:case 0x06:case 0x07: \
13			case 0x08:case 0x09:case 0x0A:case 0x0B: \
14			case 0x0C:case 0x0D:case 0x0E:case 0x0F: \
15			case 0x10:case 0x11:case 0x12:case 0x13: \
16			case 0x14:case 0x15:case 0x16:case 0x17: \
17			case 0x18:case 0x19:case 0x1A:case 0x1B: \
18			case 0x1C:case 0x1D:case 0x1E:case 0x1F
19
20			/* Match whitespace. Whitespace is as defined by the JSON standard,
21			not by Perl.
22
23			"Insignificant whitespace is allowed before or after any of the six
24			structural characters.
25
26			ws = *(
27			%x20 / ; Space
28			%x09 / ; Horizontal tab
29			%x0A / ; Line feed or New line
30			%x0D ; Carriage return
31			)"
32
33			From JSON RFC.
34			*/
35
36			#define WHITESPACE \
37			'\n': \
38			parser->line++; \
39			/* Fallthrough. */ \
40			case ' ': \
41			case '\t': \
42			case '\r'
43
44			/* Match digits. */
45
46			#define DIGIT \
47			'0': \
48			case '1': \
49			case '2': \
50			case '3': \
51			case '4': \
52			case '5': \
53			case '6': \
54			case '7': \
55			case '8': \
56			case '9'
57
58			/* Match digits from 1-9. This is handled differently because JSON
59			disallows leading zeros in numbers. */
60
61			#define DIGIT19 \
62			'1': \
63			case '2': \
64			case '3': \
65			case '4': \
66			case '5': \
67			case '6': \
68			case '7': \
69			case '8': \
70			case '9'
71
72			/* Hexadecimal, in upper and lower case. */
73
74			#define UHEX 'A': case 'B': case 'C': case 'D': case 'E': case 'F'
75			#define LHEX 'a': case 'b': case 'c': case 'd': case 'e': case 'f'
76
77			/* As of version 0.45 of JSON::Parse, most of the UTF-8 switches are
78			now in "unicode.c", but the following one is JSON-specific. */
79
80			/* This excludes '"' and '\'. */
81
82			#define BYTE_20_7F \
83			0x20: case 0x21:\
84			case 0x23: case 0x24: case 0x25: case 0x26: case 0x27: case 0x28: case 0x29:\
85			case 0x2A: case 0x2B: case 0x2C: case 0x2D: case 0x2E: case 0x2F: case 0x30:\
86			case 0x31: case 0x32: case 0x33: case 0x34: case 0x35: case 0x36: case 0x37:\
87			case 0x38: case 0x39: case 0x3A: case 0x3B: case 0x3C: case 0x3D: case 0x3E:\
88			case 0x3F: case 0x40: case 0x41: case 0x42: case 0x43: case 0x44: case 0x45:\
89			case 0x46: case 0x47: case 0x48: case 0x49: case 0x4A: case 0x4B: case 0x4C:\
90			case 0x4D: case 0x4E: case 0x4F: case 0x50: case 0x51: case 0x52: case 0x53:\
91			case 0x54: case 0x55: case 0x56: case 0x57: case 0x58: case 0x59: case 0x5A:\
92			case 0x5B: case 0x5D: case 0x5E: case 0x5F: case 0x60: case 0x61:\
93			case 0x62: case 0x63: case 0x64: case 0x65: case 0x66: case 0x67: case 0x68:\
94			case 0x69: case 0x6A: case 0x6B: case 0x6C: case 0x6D: case 0x6E: case 0x6F:\
95			case 0x70: case 0x71: case 0x72: case 0x73: case 0x74: case 0x75: case 0x76:\
96			case 0x77: case 0x78: case 0x79: case 0x7A: case 0x7B: case 0x7C: case 0x7D:\
97			case 0x7E: case 0x7F
98
99			/* A "string_t" is a pointer into the input, which lives in
100			"parser->input". The "string_t" structure is used for copying
101			strings when the string does not contain any escapes. When a string
102			contains escapes, it is copied into "parser->buffer". */
103
104			typedef struct string {
105
106			unsigned char * start;
107			#ifdef NOPERL
108			int length;
109			#else /* def NOPERL */
110			STRLEN length;
111			#endif /* def NOPERL */
112			/* The "contains_escapes" flag is set if there are backslash escapes in
113			the string like "\r", so that it needs to be cleaned up before
114			using it. That means we use "parser->buffer". This is to speed
115			things up, by not doing the cleanup when it isn't necessary. */
116
117			unsigned contains_escapes : 1;
118			}
119			string_t;
120
121			typedef enum
122			{
123			json_invalid,
124			json_initial_state,
125			json_string,
126			json_number,
127			json_literal,
128			json_object,
129			json_array,
130			json_unicode_escape,
131			json_overflow
132			}
133			json_type_t;
134
135			const char * type_names[json_overflow] = {
136			"invalid",
137			"initial state",
138			"string",
139			"number",
140			"literal",
141			"object",
142			"array",
143			"unicode escape"
144			};
145
146			/* The maximum value of bytes to check for. */
147
148			#define JSON3MAXBYTE 0x100
149
150			// uncomment this when running random test to terminal otherwise the
151			// random characters will mess up the terminal.
152
153			//#define JSON3MAXBYTE 0x80
154
155			#include "errors.c"
156
157			/* Anything which could be the start of a value. */
158
159			#define VALUE_START (XARRAYOBJECTSTART|XSTRING_START|XDIGIT|XMINUS|XLITERAL)
160
161			typedef struct parser {
162
163			/* The length of "input". */
164
165			unsigned int length;
166
167			/* The input. This is fixed at the beginning throughout
168			parsing. */
169
170			unsigned char * input;
171
172			/* The end-point of the parsing, the last parsed thing. This
173			increments through "input". */
174
175			unsigned char * end;
176
177			/* The last byte of "input", "parser->input +
178			parser->length". This is used to detect overflows. */
179
180			unsigned char * last_byte;
181
182			/* Allocated size of "buffer". */
183
184			int buffer_size;
185
186			/* Buffer to stick strings into temporarily. */
187
188			unsigned char * buffer;
189
190			/* Line number. */
191
192			int line;
193
194			/* Where the beginning of the series of unfortunate events
195			was. For example if we are parsing an array, this points to the
196			"[" at the start of the array, or if we are parsing a string,
197			this points to the byte after the '"' at the start of the
198			string. */
199
200			unsigned char * bad_beginning;
201
202			/* The bad type itself. */
203
204			json_type_t bad_type;
205
206			/* What we were expecting to see when the error occurred. */
207
208			int expected;
209
210			/* The byte which caused the parser to fail. */
211
212			unsigned char * bad_byte;
213			unsigned bad_length;
214
215			/* The type of error encountered. */
216
217			json_error_t error;
218
219			/* If we were parsing a literal and found a bad character, what
220			were we expecting? */
221
222			unsigned char literal_char;
223
224			/* The end expected. */
225
226			int end_expected;
227
228			/* Number of mallocs. */
229
230			int n_mallocs;
231
232			/* Bytes we accept. */
233
234			int valid_bytes[JSON3MAXBYTE];
235
236			/* Current depth into arrays or objects. */
237
238			int depth;
239
240			/* Maximum depth we accept. */
241
242			int max_depth;
243
244			/* Perl SV * pointers to copy for our true, false, and null
245			values. */
246			void * user_true;
247			void * user_false;
248			void * user_null;
249			/* If this is 1, we copy the literals into new SVs. */
250			unsigned int copy_literals : 1;
251			/* If this is 1, we don't die on errors. */
252			unsigned int warn_only : 1;
253			/* If this is 1, we check for hash collisions before inserting values. */
254			unsigned int detect_collisions : 1;
255			/* Don't warn the user about non-false false and untrue true
256			values, etc. */
257			unsigned int no_warn_literals : 1;
258			/* Are we tokenizing the input? */
259			unsigned int tokenizing : 1;
260
261
262			#ifdef TESTRANDOM
263
264			/* Return point for longjmp. */
265
266			jmp_buf biscuit;
267
268			char * last_error;
269
270			#endif
271
272			/* Unicode? */
273
274			unsigned int unicode : 1;
275
276			/* Force unicode. This happens when we hit "\uxyzy". */
277
278			unsigned int force_unicode : 1;
279
280			/* Upgrade the input from bytes to characters. */
281
282			unsigned int upgrade_utf8 : 1;
283
284			/* Top-level value? We need to know this for the case when we are
285			parsing a number and suddenly meet a '\0' byte. If it's a top
286			level value then we can assume that is just the end of the
287			JSON, but if it's not a top-level value then that is an error,
288			since the end array or end object at least are missing. */
289
290			unsigned int top_level_value : 1;
291
292			/* Produce diagnostics as a hash rather than a string. */
293
294			unsigned int diagnostics_hash : 1;
295
296			#ifdef TESTRANDOM
297
298			/* This is true if we are testing with random bytes. */
299
300			unsigned randomtest : 1;
301
302			#endif /* def TESTRANDOM */
303			}
304			json_parse_t;
305
306			/* Maximum depth of parsing. */
307
308			#define JSON_PARSE_DEFAULT_MAX_DEPTH 10000
309
310			static void
311	290		json_parse_init (json_parse_t * parser)
312			{
313	290		parser->max_depth = JSON_PARSE_DEFAULT_MAX_DEPTH;
314	290		}
315
316			/* Check if the user has set something different from the default, and
317			don't croak if we are still OK. */
318
319			/* Increment the parsing depth, with check. */
320
321			//#define DEBUG_DEPTH
322
323			#ifdef DEBUG_DEPTH
324			#define PRINT_DEPTH \
325			printf ("%s:%d: %d\n", __FILE__, __LINE__, parser->depth);
326			#else
327			#define PRINT_DEPTH
328			#endif
329
330			#define INCDEPTH \
331			PRINT_DEPTH; \
332			parser->depth++; \
333			if (parser->depth > parser->max_depth) { \
334			croak ("error: too many [ or {, maximum is %d", \
335			parser->max_depth); \
336			}
337
338			#define DECDEPTH \
339			parser->depth--; \
340			PRINT_DEPTH;
341
342			#ifndef NOPERL
343			static SV * error_to_hash (json_parse_t * parser, char * error_as_string);
344			#endif /* ndef NOPERL */
345
346			#ifdef __GNUC__
347			#define INLINE inline
348			#else
349			#define INLINE
350			#endif /* def __GNUC__ */
351
352			/* The size of the buffer for printing errors. */
353
354			#define ERRORMSGBUFFERSIZE 0x1000
355
356			/* Declare all bad inputs as non-returning. */
357
358			#ifdef __GNUC__
359			#if 0
360			static void failbadinput_json (json_parse_t * parser) __attribute__ ((noreturn));
361			#endif /* 0 */
362			static void failbadinput (json_parse_t * parser) __attribute__ ((noreturn));
363			static INLINE void
364			failbug (char * file, int line, json_parse_t * parser, const char * format, ...)
365			__attribute__ ((noreturn));
366			#endif
367
368			/* Assert failure handler. Coming here means there is a bug in the
369			code rather than in the JSON input. We still send it to Perl via
370			"croak". */
371
372			static INLINE void
373	0		failbug (char * file, int line, json_parse_t * parser, const char * format, ...)
374			{
375			char buffer[ERRORMSGBUFFERSIZE];
376			va_list a;
377	0		va_start (a, format);
378	0		vsnprintf (buffer, ERRORMSGBUFFERSIZE, format, a);
379	0		va_end (a);
380	0		croak ("JSON::Parse: %s:%d: Internal error at line %d: %s",
381			file, line, parser->line, buffer);
382			}
383
384			/* This is a test for whether the string has ended, which we use when
385			we catch a zero byte in an unexpected part of the input. Here we
386			use ">" rather than ">=" because "parser->end" is incremented by
387			one after each access. See the NEXTBYTE macro. */
388
389			#define STRINGEND (parser->end > parser->last_byte)
390
391			/* One of the types which demands a specific next byte. */
392
393			#define SPECIFIC(c) (((c) & XIN_LITERAL) || ((c) & XIN_SURROGATE_PAIR))
394
395			/* Make the list of valid bytes. */
396
397	0		static void make_valid_bytes (json_parse_t * parser)
398			{
399			int i;
400	0	0	for (i = 0; i < JSON3MAXBYTE; i++) {
401	0		parser->valid_bytes[i] = 0;
402			}
403	0	0	for (i = 0; i < n_expectations; i++) {
404			int X;
405	0		X = 1<<i;
406	0	0	if (SPECIFIC (X)) {
		0
407	0		continue;
408			}
409	0	0	if (parser->expected & X) {
410			int j;
411	0	0	for (j = 0; j < JSON3MAXBYTE; j++) {
412	0		parser->valid_bytes[j] |= allowed[i][j];
413			}
414			}
415			}
416	0	0	if (SPECIFIC (parser->expected)) {
		0
417	0		parser->valid_bytes[parser->literal_char] = 1;
418			}
419	0		}
420
421			/* Repeated arguments to snprintf. */
422
423			#define SNEND buffer + string_end
424			#define SNSIZE ERRORMSGBUFFERSIZE - string_end
425			/*
426
427			Disabled due to clash with Darwin compiler:
428
429			http://www.cpantesters.org/cpan/report/7c69e0f0-70c0-11e3-95aa-bcf4d95af652
430			http://www.cpantesters.org/cpan/report/6cde36da-6fd1-11e3-946f-2b87da5af652
431
432			#define SNEND, SNSIZE buffer + string_end, ERRORMSGBUFFERSIZE - string_end
433
434			*/
435
436			#define EROVERFLOW \
437			if (string_end >= ERRORMSGBUFFERSIZE - 0x100) { \
438			failbug (__FILE__, __LINE__, parser, \
439			"Error string length is %d" \
440			" of maximum %d. Bailing out.", \
441			string_end, ERRORMSGBUFFERSIZE); \
442			}
443
444
445			#if 0
446
447			/* Coming in to this routine, we have checked the error for validity
448			and converted at failbadinput. If this is called directly the bug
449			traps won't work. */
450
451			static void
452			failbadinput_json (json_parse_t * parser)
453			{
454			char buffer[ERRORMSGBUFFERSIZE];
455			int string_end;
456
457			string_end = 0;
458			string_end +=
459			snprintf (SNEND, SNSIZE,
460			"{"
461			"\"input length\":%d"
462			",\"bad type\":\"%s\""
463			",\"error\":\"%s\"",
464			parser->length,
465			type_names[parser->bad_type],
466			json_errors[parser->error]);
467			EROVERFLOW;
468			if (parser->bad_byte) {
469			int position;
470			position = (int) (parser->bad_byte - parser->input) + 1,
471
472			string_end += snprintf (SNEND, SNSIZE,
473			",\"bad byte position\":%d"
474			",\"bad byte contents\":%d",
475			position,
476			* parser->bad_byte);
477			EROVERFLOW;
478			}
479			if (parser->bad_beginning) {
480			int bcstart;
481			bcstart = (int) (parser->bad_beginning - parser->input) + 1;
482			string_end +=
483			snprintf (SNEND, SNSIZE, ",\"start of broken component\":%d",
484			bcstart);
485			EROVERFLOW;
486			}
487			if (parser->error == json_error_unexpected_character) {
488			int j;
489			make_valid_bytes (parser);
490			string_end +=
491			snprintf (SNEND, SNSIZE, ",\"valid bytes\":[%d",
492			parser->valid_bytes[0]);
493			EROVERFLOW;
494			for (j = 1; j < JSON3MAXBYTE; j++) {
495			string_end += snprintf (SNEND, SNSIZE, ",%d",
496			parser->valid_bytes[j]);
497			}
498			EROVERFLOW;
499			string_end += snprintf (SNEND, SNSIZE, "]");
500			EROVERFLOW;
501			}
502			string_end += snprintf (SNEND, SNSIZE, "}\n");
503			EROVERFLOW;
504			croak (buffer);
505			}
506
507			#endif /* 0 */
508
509			static void
510	180		failbadinput (json_parse_t * parser)
511			{
512			char buffer[ERRORMSGBUFFERSIZE];
513			int string_end;
514			int i;
515			int l;
516			const char * format;
517
518			/* If the error is "unexpected character", and we are at the end
519			of the input, change to "unexpected end of input". This is
520			probably triggered by reading a byte with value '\0', but we
521			don't check the value of "* parser->bad_byte" in the following
522			"if" statement, since it's an error to go past the expected end
523			of the string regardless of whether the byte is '\0'. */
524
525	180	100	if (parser->error == json_error_unexpected_character &&
		100
526	167		STRINGEND) {
527	15		parser->error = json_error_unexpected_end_of_input;
528			/* We don't care about what byte it was, we went past the end
529			of the string, which is already a failure. */
530	15		parser->bad_byte = 0;
531			/* It trips an assertion if "parser->expected" is set for
532			anything other than an "unexpected character" error. */
533	15		parser->expected = 0;
534			}
535			/* Array bounds check for error message. */
536	180	50	if (parser->error <= json_error_invalid &&
		0
537	0		parser->error >= json_error_overflow) {
538	0		failbug (__FILE__, __LINE__, parser,
539	0		"Bad value for parser->error: %d\n", parser->error);
540			}
541
542	180		format = json_errors[parser->error];
543	180		l = strlen (format);
544	180	50	if (l >= ERRORMSGBUFFERSIZE - 1) {
545	0		l = ERRORMSGBUFFERSIZE - 1;
546			}
547	3801	100	for (i = 0; i < l; i++) {
548	3621		buffer[i] = format[i];
549			}
550	180		buffer[l] = '\0';
551	180		string_end = l;
552
553			/* If we got an unexpected character somewhere, append the exact
554			value of the character to the error message. */
555
556	180	100	if (parser->error == json_error_unexpected_character) {
557
558			/* This contains the unexpected character itself, from the
559			"parser->bad_byte" pointer. */
560
561			unsigned char bb;
562
563			/* Make sure that we were told where the unexpected character
564			was. Unlocated unexpected characters are a bug. */
565
566	152	50	if (! parser->bad_byte) {
567	0		failbug (__FILE__, __LINE__, parser,
568			"unexpected character error but "
569			"parser->bad_byte is invalid");
570			}
571
572	152		bb = * parser->bad_byte;
573
574			/* We have to check what kind of character. For example
575			printing '\0' with %c will just give a message which
576			suddenly ends when printed to the terminal, and other
577			control characters will be invisible. So display the
578			character in a different way depending on whether it's
579			printable or not. */
580
581			/* Don't use "isprint" because on Windows it seems to think
582			that 0x80 is printable:
583			http://www.cpantesters.org/cpan/report/d6438b68-6bf4-1014-8647-737bdb05e747 */
584
585	152	100	if (bb >= 0x20 && bb < 0x7F) {
		100
586			/* Printable character, print the character itself. */
587	65		string_end += snprintf (SNEND, SNSIZE, " '%c'", bb);
588	65	50	EROVERFLOW;
589			}
590			else {
591			/* Unprintable character, print its hexadecimal value. */
592	87		string_end += snprintf (SNEND, SNSIZE, " 0x%02x", bb);
593	152	50	EROVERFLOW;
594			}
595			}
596	28	100	else if (parser->error == json_error_name_is_not_unique) {
597	3		string_end += snprintf (SNEND, SNSIZE, ": \"%.*s\"",
598			parser->bad_length,
599			parser->bad_byte);
600			}
601			/* "parser->bad_type" contains what was being parsed when the
602			error occurred. This should never be undefined. */
603	180	50	if (parser->bad_type <= json_invalid ||
		50
604	180		parser->bad_type >= json_overflow) {
605	0		failbug (__FILE__, __LINE__, parser,
606			"parsing type set to invalid value %d in error message",
607	0		parser->bad_type);
608			}
609	180		string_end += snprintf (SNEND, SNSIZE, " parsing %s",
610	180		type_names[parser->bad_type]);
611	180	50	EROVERFLOW;
612	180	100	if (parser->bad_beginning) {
613			int bad_byte;
614	87		bad_byte = (parser->bad_beginning - parser->input) + 1;
615	87		string_end += snprintf (SNEND, SNSIZE, " starting from byte %d",
616			bad_byte);
617	87	50	EROVERFLOW;
618			}
619
620			/* "parser->expected" is set for the "unexpected character" error
621			and it tells the user what kind of input was expected. It
622			contains various flags or'd together, so this goes through each
623			possible flag and prints a message for it. */
624
625	180	100	if (parser->expected) {
626	152	50	if (parser->error == json_error_unexpected_character) {
627			int i;
628			int joined;
629			unsigned char bb;
630	152		bb = * parser->bad_byte;
631
632	152		string_end += snprintf (SNEND, SNSIZE, ": expecting ");
633	152	50	EROVERFLOW;
634	152		joined = 0;
635
636	152	100	if (SPECIFIC (parser->expected)) {
		100
637	7	50	if (! parser->literal_char) {
638	0		failbug (__FILE__, __LINE__, parser,
639			"expected literal character unset");
640			}
641	7		string_end += snprintf (SNEND, SNSIZE, "'%c'", parser->literal_char);
642	7	50	EROVERFLOW;
643			}
644	3648	100	for (i = 0; i < n_expectations; i++) {
645			int X;
646	3496		X = 1<<i;
647	3496	100	if (SPECIFIC (X)) {
		100
648	304		continue;
649			}
650	3192	50	if (i == xin_literal) {
651	0		failbug (__FILE__, __LINE__, parser,
652			"Literal passed through \"if SPECIFIC(X)\" test");
653			}
654	3192	100	if (parser->expected & X) {
655
656			/* Check that this really is disallowed. */
657
658	647	50	if (allowed[i][bb]) {
659	0		failbug (__FILE__, __LINE__, parser,
660			"mismatch parsing %s: got %X "
661			"but it's allowed by %s (%d)",
662	0		type_names[parser->bad_type], bb,
663			input_expectation[i], i);
664			}
665	647	100	if (joined) {
666	502		string_end += snprintf (SNEND, SNSIZE, " or ");
667	502	50	EROVERFLOW;
668			}
669	647		string_end += snprintf (SNEND, SNSIZE, "%s", input_expectation[i]);
670	647	50	EROVERFLOW;
671	647		joined = 1;
672			}
673			}
674			}
675			else {
676	0		failbug (__FILE__, __LINE__, parser,
677			"'expected' is set but error %s != unexp. char",
678	0		json_errors[parser->error]);
679			}
680			}
681	28	50	else if (parser->error == json_error_unexpected_character) {
682	0		failbug (__FILE__, __LINE__, parser,
683			"unexpected character error for 0X%02X at byte %d "
684	0		"with no expected value set", * parser->bad_byte,
685	0		parser->bad_byte - parser->input);
686			}
687
688			#undef SNEND
689			#undef SNSIZE
690
691			#ifdef TESTRANDOM
692
693			/* Go back to where we came from. */
694
695			if (parser->randomtest) {
696			parser->last_error = buffer;
697			make_valid_bytes (parser);
698			longjmp (parser->biscuit, 1);
699			}
700
701			#endif /* def TESTRANDOM */
702
703			#ifndef NOPERL
704	180	100	if (parser->diagnostics_hash) {
705			#if PERL_VERSION > 12
706	1		croak_sv (error_to_hash (parser, buffer));
707			#endif /* PERL_VERSION > 12 */
708			}
709			#endif /* ndef NOPERL */
710
711	179	100	if (parser->length > 0) {
712	176	100	if (parser->end - parser->input > parser->length) {
713	15		croak ("JSON error at line %d: %s", parser->line,
714			buffer);
715			}
716	161	100	else if (parser->bad_byte) {
717	155		croak ("JSON error at line %d, byte %d/%d: %s",
718			parser->line,
719	155		(int) (parser->bad_byte - parser->input + 1),
720			parser->length, buffer);
721			}
722			else {
723	6		croak ("JSON error at line %d: %s",
724			parser->line, buffer);
725			}
726			}
727			else {
728	3		croak ("JSON error: %s", buffer);
729			}
730			}
731
732			#undef SPECIFIC
733
734			/* This is for failures not due to errors in the input or to bugs but
735			to exhaustion of resources, i.e. out of memory, or file errors
736			would go here if there were any C file opening things anywhere. */
737
738	0		static INLINE void failresources (json_parse_t * parser, const char * format, ...)
739			{
740			char buffer[ERRORMSGBUFFERSIZE];
741			va_list a;
742	0		va_start (a, format);
743	0		vsnprintf (buffer, ERRORMSGBUFFERSIZE, format, a);
744	0		va_end (a);
745	0		croak ("Parsing failed at line %d, byte %d/%d: %s", parser->line,
746	0		(int) (parser->end - parser->input),
747			parser->length, buffer);
748			}
749
750			#undef ERRORMSGBUFFERSIZE
751
752			/* Get more memory for "parser->buffer". */
753
754			static void
755	29		expand_buffer (json_parse_t * parser, int length)
756			{
757	29	50	if (parser->buffer_size < 2 * length + 0x100) {
758	29		parser->buffer_size = 2 * length + 0x100;
759	29	50	if (parser->buffer) {
760	0		Renew (parser->buffer, parser->buffer_size, unsigned char);
761			}
762			else {
763	29		Newx (parser->buffer, parser->buffer_size, unsigned char);
764	29		parser->n_mallocs++;
765			}
766	29	50	if (! parser->buffer) {
767	0		failresources (parser, "out of memory");
768			}
769			}
770	29		}
771
772			#define UNIFAIL(err) \
773			parser->bad_type = json_unicode_escape; \
774			parser->error = json_error_ ## err; \
775			failbadinput (parser)
776
777			/* Parse the hex bit of a \uXYZA escape. */
778
779			static INLINE int
780	33		parse_hex_bytes (json_parse_t * parser, unsigned char * p)
781			{
782			int k;
783			int unicode;
784
785	33		unicode = 0;
786
787	155	100	for (k = 0; k < strlen ("ABCD"); k++) {
788
789			unsigned char c;
790
791	126		c = p[k];
792
793	126		switch (c) {
794
795			case DIGIT:
796	70		unicode = unicode * 16 + c - '0';
797	70		break;
798
799			case UHEX:
800	21		unicode = unicode * 16 + c - 'A' + 10;
801	21		break;
802
803			case LHEX:
804	31		unicode = unicode * 16 + c - 'a' + 10;
805	31		break;
806
807			case '\0':
808	2	50	if (p + k - parser->input >= parser->length) {
809	2		UNIFAIL (unexpected_end_of_input);
810			}
811	0		break;
812
813			default:
814	2		parser->bad_byte = p + k;
815	2		parser->expected = XHEXADECIMAL_CHARACTER;
816	2		UNIFAIL (unexpected_character);
817			}
818			}
819	29		return unicode;
820			}
821
822			/* STRINGFAIL applies for any kind of failure within a string, not
823			just unexpected character errors. */
824
825			#define STRINGFAIL(err) \
826			parser->error = json_error_ ## err; \
827			parser->bad_type = json_string; \
828			failbadinput (parser)
829
830			#define FAILSURROGATEPAIR(c) \
831			parser->expected = XIN_SURROGATE_PAIR; \
832			parser->literal_char = c; \
833			parser->bad_beginning = start - 2; \
834			parser->error = json_error_unexpected_character; \
835			parser->bad_type = json_unicode_escape; \
836			parser->bad_byte = p - 1; \
837			failbadinput (parser)
838
839			static INLINE unsigned char *
840	28		do_unicode_escape (json_parse_t * parser, unsigned char * p,
841			unsigned char ** b_ptr)
842			{
843			int unicode;
844			unsigned int plus;
845			unsigned char * start;
846	28		start = p;
847	28		unicode = parse_hex_bytes (parser, p);
848	24		p += 4;
849	24		plus = ucs2_to_utf8 (unicode, *b_ptr);
850	24	50	if (plus == UTF8_BAD_LEADING_BYTE ||
		50
851			plus == UTF8_BAD_CONTINUATION_BYTE) {
852	0		failbug (__FILE__, __LINE__, parser,
853			"Failed to parse unicode input %.4s", start);
854			}
855	24	100	else if (plus == UNICODE_SURROGATE_PAIR) {
856			int unicode2;
857			int plus2;
858	12	100	if (parser->last_byte - p < 6) {
859	4		parser->bad_beginning = start - 2;
860	4		parser->bad_type = json_unicode_escape;
861	4		parser->error = json_error_unexpected_end_of_input;
862	4		failbadinput (parser);
863			}
864	8	100	if (*p++ == '\\') {
865	5	50	if (*p++ == 'u') {
866	5		unicode2 = parse_hex_bytes (parser, p);
867	5		p += 4;
868	5		plus2 = surrogate_to_utf8 (unicode, unicode2, * b_ptr);
869	5	50	if (plus2 <= 0) {
870	0	0	if (plus2 == UNICODE_NOT_SURROGATE_PAIR) {
871	0		parser->bad_byte = 0;
872	0		parser->bad_beginning = p - 4;
873	0		UNIFAIL (not_surrogate_pair);
874			}
875			else {
876	0		failbug (__FILE__, __LINE__, parser,
877			"unhandled error %d from surrogate_to_utf8",
878			plus2);
879			}
880			}
881	5		* b_ptr += plus2;
882	5		goto end;
883			}
884			else {
885	0		FAILSURROGATEPAIR ('u');
886			}
887			}
888			else {
889	3		FAILSURROGATEPAIR ('\\');
890			}
891			}
892	12	50	else if (plus <= 0) {
893	0		failbug (__FILE__, __LINE__, parser,
894			"unhandled error code %d while decoding unicode escape",
895			plus);
896			}
897	12		* b_ptr += plus;
898			end:
899	17	100	if (unicode >= 0x80 && ! parser->unicode) {
		100
900			/* Force the UTF-8 flag on for this string. */
901	8		parser->force_unicode = 1;
902			}
903	17		return p;
904			}
905
906			/* Handle backslash escapes. We can't use the NEXTBYTE macro here for
907			the reasons outlined below. */
908
909			#if 0
910
911			/* I expected a switch statement to compile to faster code, but it
912			doesn't seem to. */
913
914			#define HANDLE_ESCAPES(p,start) \
915			switch (c = * ((p)++)) { \
916			\
917			case '\\': \
918			case '/': \
919			case '"': \
920			*b++ = c; \
921			break; \
922			\
923			case 'b': \
924			*b++ = '\b'; \
925			break; \
926			\
927			case 'f': \
928			*b++ = '\f'; \
929			break; \
930			\
931			case 'n': \
932			*b++ = '\n'; \
933			break; \
934			\
935			case 'r': \
936			*b++ = '\r'; \
937			break; \
938			\
939			case 't': \
940			*b++ = '\t'; \
941			break; \
942			\
943			case 'u': \
944			p = do_unicode_escape (parser, p, & b); \
945			break; \
946			\
947			default: \
948			parser->bad_beginning = start; \
949			parser->bad_byte = p - 1; \
950			parser->expected = XESCAPE; \
951			STRINGFAIL (unexpected_character); \
952			}
953
954			#else
955
956			/* This is identical to the above macro, but it uses if statements
957			rather than a switch statement. Using the Clang compiler, this
958			results in about 2.5% faster code, for some reason or another. */
959
960			#define HANDLE_ESCAPES(p,start) \
961			c = * ((p)++); \
962			if (c == '\\' || c == '/' || c == '"') { \
963			*b++ = c; \
964			} \
965			else if (c == 'b') { \
966			*b++ = '\b'; \
967			} \
968			else if (c == 'f') { \
969			*b++ = '\f'; \
970			} \
971			else if (c == 'n') { \
972			*b++ = '\n'; \
973			} \
974			else if (c == 'r') { \
975			*b++ = '\r'; \
976			} \
977			else if (c == 't') { \
978			*b++ = '\t'; \
979			} \
980			else if (c == 'u') { \
981			p = do_unicode_escape (parser, p, & b); \
982			} \
983			else { \
984			parser->bad_beginning = start; \
985			parser->bad_byte = p - 1; \
986			parser->expected = XESCAPE; \
987			STRINGFAIL (unexpected_character); \
988			}
989			#endif
990			/* Resolve "s" by converting escapes into the appropriate things. Put
991			the result into "parser->buffer". The return value is the length of
992			the string. */
993
994			static INLINE int
995	2		resolve_string (json_parse_t * parser, string_t * s)
996			{
997			/* The pointer where we copy the string. This points into
998			"parser->buffer". */
999
1000			unsigned char * b;
1001
1002			/* "p" is the pointer into "parser->input", using "s->start" to
1003			get the start point. We don't use "parser->end" for this job
1004			because "resolve_string" is called only after the value of the
1005			object is resolved. E.g. if the object goes like
1006
1007			{"hot":{"potatoes":"tomatoes"}}
1008
1009			then this routine is called first for "potatoes" and then for
1010			"hot" as each sub-element of the hashes is resolved. We don't
1011			want to mess around with the value of "parser->end", which is
1012			always pointing to one after the last byte viewed. */
1013
1014			unsigned char * p;
1015
1016	2		p = s->start;
1017
1018			/* Ensure we have enough memory to fit the string. */
1019
1020	2		expand_buffer (parser, s->length);
1021
1022	2		b = parser->buffer;
1023
1024	62	100	while (p - s->start < s->length) {
1025			unsigned char c;
1026
1027	60		c = *p++;
1028	60	100	if (c == '\\') {
1029	4	50	HANDLE_ESCAPES (p, s->start - 1);
		50
		50
		0
		0
		0
		0
		0
		0
1030			}
1031			else {
1032	56		*b++ = c;
1033			}
1034			}
1035
1036			/* This is the length of the string in bytes. */
1037
1038	2		return b - parser->buffer;
1039			}
1040
1041			#define NEXTBYTE (c = *parser->end++)
1042
1043			/* Scan the object key which starts at "parser->end" and describe it
1044			in "key" without copying it. "key->start" is set to the first byte
			of the key, "key->length" to the number of bytes before the
			closing quote, and "key->contains_escapes" to 1 if a backslash
			escape was seen. Escapes are only checked for validity here, they
			are not decoded. On return "parser->end" is one past the closing
			quote. A byte which cannot be part of a string is reported
			through STRINGFAIL or UNIFAIL.

			The error positions use "key->start - 1", so the opening quote
			is presumably consumed by the caller before this is called. */
1045
1046			static INLINE void
1047	636		get_key_string (json_parse_t * parser, string_t * key)
1048			{
1049			unsigned char c;
1050			int i;
1051
1052	636		key->start = parser->end;
1053	636		key->contains_escapes = 0;
1054
1055			key_string_next:
1056
1057	6564		switch (NEXTBYTE) {
1058
1059			case '"':
1060			/* Go on eating bytes until we find a ". */
1061
1062	634		break;
1063
1064			case '\\':
1065			/* Mark this string as containing escapes. */
1066	4		key->contains_escapes = 1;
1067
1068	4		switch (NEXTBYTE) {
1069
1070			case '\\':
1071			case '/':
1072			case '"':
1073			case 'b':
1074			case 'f':
1075			case 'n':
1076			case 'r':
1077			case 't':
1078			/* Eat another byte. */
1079	4		goto key_string_next;
1080
1081			case 'u':
1082
1083			/* i counts the bytes, from 0 to 3. */
1084	0		i = 0;
1085			unitunes:
1086	0	0	switch (NEXTBYTE) {
1087			case DIGIT:
1088			case UHEX:
1089			case LHEX:
1090	0		i++;
			/* strlen ("ABCD") is 4: go back to the main loop once four
			hexadecimal digits have been read after the "\u". */
1091	0	0	if (i >= strlen ("ABCD")) {
1092	0		goto key_string_next;
1093			}
1094			else {
1095	0		goto unitunes;
1096			}
1097			/* not a fall through, we always "goto" above. */
1098			default:
			/* "parser->end - 1" is the offending byte, and taking off "i"
			more bytes steps back over the digits already accepted. */
1099	0		parser->bad_beginning = parser->end - 1 - i;
1100	0		parser->expected = XHEXADECIMAL_CHARACTER;
1101	0		parser->bad_byte = parser->end - 1;
1102	0		UNIFAIL (unexpected_character);
1103			}
1104			/* not a fall through, we either UNIFAIL or goto above. */
1105
1106			default:
1107	0		parser->bad_beginning = key->start - 1;
1108	0		parser->expected = XESCAPE;
1109	0		parser->bad_byte = parser->end - 1;
1110	0		STRINGFAIL (unexpected_character);
1111			}
1112			/* Not a fall through, we never arrive here. */
1113
1114			case BADBYTES:
1115
1116	2		parser->bad_beginning = key->start - 1;
1117	2		parser->expected = XSTRINGCHAR;
1118	2		parser->bad_byte = parser->end - 1;
1119	2		STRINGFAIL (unexpected_character);
1120			/* Not a fall through, STRINGFAIL does not return. */
1121
			/* The included file supplies further cases of this switch. It is
			configured by the three macros below: ADDBYTE is empty because
			the key is not copied anywhere, "string_start" is mapped onto
			this function's loop label, and "startofutf8string" is the
			start of the key. NOTE(review): how the included file uses
			these macros is not visible here - confirm against
			"utf8-byte-one.c". */
1122			#define ADDBYTE
1123			#define string_start key_string_next
1124			#define startofutf8string (key->start)
1125			#include "utf8-byte-one.c"
1126			/* Not a fall through. */
1127			default:
1128
1129	0		parser->bad_beginning = key->start - 1;
1130	0		parser->expected = XSTRINGCHAR;
1131	0		parser->bad_byte = parser->end - 1;
1132	0		STRINGFAIL (unexpected_character);
1133			}
			/* We get here only via the "break" for the closing quote.
			"parser->end" is now one past that quote, so taking off one
			more leaves the number of bytes between the quotes. */
1134	634		key->length = parser->end - key->start - 1;
1135	634		return;
1136
			/* This is included after the "return", so whatever it contains
			can only be reached by a "goto". NOTE(review): presumably it
			holds the labels jumped to from "utf8-byte-one.c" - confirm. */
1137			#include "utf8-next-byte.c"
1138			#undef startofutf8string
1139			#undef string_start
1140			#undef ADDBYTE
1141			}
1142
1143			/* Report the byte just read as one which is not allowed in a
1144			string. This needs "parser" and a pointer "start" in scope where
1145			it is used.

			"start - 1" puts the start on the " rather than after it. "start"
			is usually after the quote because the quote is eaten on the way
			here. "parser->end - 1" is the byte just read, since "parser->end"
			has already been moved past it by NEXTBYTE. */
1146
1147			#define ILLEGALBYTE \
1148			parser->bad_beginning = start - 1; \
1149			parser->bad_byte = parser->end - 1; \
1150			parser->expected = XSTRINGCHAR; \
1151			STRINGFAIL (unexpected_character)
1152
1153
1154			/* Resolve the string pointed to by "parser->end" into
1155			"parser->buffer". The return value is the length of the
1156			string. This is only called if the string has \ escapes in it.

			The length returned is the number of bytes written to
			"parser->buffer", not the number of bytes of input read. On
			return "parser->end" is one past the closing quote.

			NOTE(review): the bytes are presumably stored via ADDBYTE in
			the included "utf8-byte-one.c", and the escapes decoded into "b"
			by HANDLE_ESCAPES; neither is visible here - confirm. */
1157
1158			static INLINE int
1159	27		get_string (json_parse_t * parser)
1160			{
1161			unsigned char * b;
1162			unsigned char c;
1163			unsigned char * start;
1164
1165	27		start = parser->end;
1166
1167	27	50	if (! parser->buffer) {
1168	27		expand_buffer (parser, 0x1000);
1169			}
1170	27		b = parser->buffer;
1171
1172			string_start:
1173
			/* Grow the buffer whenever there are 0x100 or fewer bytes left
			in it. This is checked each time around the loop, before the
			next byte is read. */
1174	722	50	if (b - parser->buffer >= parser->buffer_size - 0x100) {
1175			/* Save our offset in parser->buffer, because "realloc" is
1176			called by "expand_buffer", and "b" may no longer point
1177			to a meaningful location. */
1178	0		int size = b - parser->buffer;
1179	0		expand_buffer (parser, 2 * parser->buffer_size);
1180	0		b = parser->buffer + size;
1181			}
1182	722		switch (NEXTBYTE) {
1183
1184			case '"':
1185	12		goto string_end;
1186			break;
1187
1188			case '\\':
1189	48	100	HANDLE_ESCAPES (parser->end, start - 1);
		100
		100
		100
		100
		100
		100
		100
		100
1190	33		goto string_start;
1191
			/* The included file supplies further cases of this switch. Here
			ADDBYTE stores the byte in "c" into the buffer and moves "b"
			on. NOTE(review): unlike in "get_key_string",
			"startofutf8string" is not #undef'd at the end of this
			function, so it stays defined as "start" for the rest of the
			file - confirm that this is intended. */
1192			#define ADDBYTE (* b++ = c)
1193			#define startofutf8string start
1194			#include "utf8-byte-one.c"
1195
1196			/* Not a fall through. */
1197			default:
1198			/* fall through */
1199			case BADBYTES:
1200	0		ILLEGALBYTE;
1201			}
1202
			/* NOTE(review): every case visible above leaves the switch with
			a "goto" or a failure macro, so this check looks reachable only
			from the cases in "utf8-byte-one.c", if at all - confirm. */
1203			if (STRINGEND) {
1204			STRINGFAIL (unexpected_end_of_input);
1205			}
1206
1207			string_end:
			/* "b" is one past the last byte written, so this is the number
			of bytes in the resolved string. */
1208	12		return b - parser->buffer;
1209
			/* This is included after the "return", so whatever it contains
			can only be reached by a "goto". */
1210			#include "utf8-next-byte.c"
1211			#undef ADDBYTE
1212
1213			goto string_end;
1214			}
1215
1216			static void
1217	122		parser_free (json_parse_t * parser)
1218			{
1219	122	100	if (parser->buffer) {
1220	13		Safefree (parser->buffer);
1221	13		parser->n_mallocs--;
1222			}
1223			/* There is a discrepancy between the number of things used and
1224			the number freed. */
1225	122	100	if (parser->n_mallocs != 0) {
1226			/* The tokenizing parser is freed before the tokens themselves
1227			are freed. Whether or not the tokens are freed correctly
1228			can be checked in "tokenize_free" in
1229			"json-entry-points.c". */
1230	3	50	if (! parser->tokenizing) {
1231	0		fprintf (stderr, "%s:%d: %d pieces of unfreed memory remain.\n",
1232			__FILE__, __LINE__, parser->n_mallocs);
1233			}
1234			}
1235	122		parser->buffer = 0;
1236	122		parser->buffer_size = 0;
1237	122		}
1238
1239
/* The kinds of token which the tokenizer can produce.
   "json_token_invalid" is zero, and "n_json_tokens" is not a kind of
   token but the number of kinds, used to size "token_names". */

typedef enum json_token_type {
    json_token_invalid,
    json_token_number,
    json_token_string,
    json_token_key,
    json_token_literal,
    json_token_comma,
    json_token_colon,
    json_token_object,
    json_token_array,
    n_json_tokens
}
json_token_type_t;

/* The name of each kind of token, indexed by "json_token_type_t". Each
   name is attached to its enumerator so that the table cannot get out
   of step with the order of the enumeration. */

const char * token_names[n_json_tokens] = {
    [json_token_invalid] = "invalid",
    [json_token_number]  = "number",
    [json_token_string]  = "string",
    [json_token_key]     = "key",
    [json_token_literal] = "literal",
    [json_token_comma]   = "comma",
    [json_token_colon]   = "colon",
    [json_token_object]  = "object",
    [json_token_array]   = "array"
};
1265
1266			typedef struct json_token json_token_t;
1267
			/* One token of the input, as created by "json_token_new". */
1268			struct json_token {
			/* Null in a new token; set by "json_token_set_child", which
			accepts only objects and arrays as the parent. */
1269			json_token_t * child;
			/* Null in a new token; set by "json_token_set_next". */
1270			json_token_t * next;
			/* Offset of the first byte of the token from "parser->input". */
1271			unsigned int start;
			/* Offset from "parser->input" of the byte after the last byte of
			the token, or 0 if the end of the token has not been set
			yet. */
1272			unsigned int end;
1273			json_token_type_t type;
			/* JSON_TOKEN_PARENT_INVALID in a new token. NOTE(review): what
			the other values mean is not visible from here. */
1274			unsigned int parent;
			/* NOTE(review): presumably set once the token has been handed
			over to a Perl object; this is not visible from here -
			confirm against the code which uses it. */
1275			unsigned blessed : 1;
1276			};
1277
			/* The value of "parent" in a newly-created token. */
1278			#define JSON_TOKEN_PARENT_INVALID 0
1279
1280			/* "start" is the first character of the thing. "end" is the last
1281			character of the thing. If the thing only takes one character then
1282			"start == end" should be true. */
1283
1284			static json_token_t *
1285	93		json_token_new (json_parse_t * parser, unsigned char * start,
1286			unsigned char * end, json_token_type_t type)
1287			{
1288			json_token_t * new;
1289
1290			/* Check the token in various ways. */
1291
1292	93		switch (type) {
1293			case json_token_string:
1294			case json_token_key:
1295	34	50	if (* start != '"') {
1296	0	0	if (end) {
1297	0		failbug (__FILE__, __LINE__, parser,
1298			"no quotes at start of string '%.*s'",
1299			end - start, start);
1300			}
1301			else {
1302	0		failbug (__FILE__, __LINE__, parser,
1303			"no quotes at start of string '%.10s'",
1304			start);
1305			}
1306			}
1307	34	100	if (end && * end != '"') {
		50
1308	0		failbug (__FILE__, __LINE__, parser,
1309			"'%c' is not a quote at end of string '%.*s'",
1310	0		* end, end - start, start);
1311			}
1312	34		break;
1313			case json_token_number:
1314	9	50	if (* start - '0' > 9 && * start != '-') {
		0
1315	0		failbug (__FILE__, __LINE__, parser,
1316			"bad character %c at start of number",
1317	0		* start);
1318			}
1319	9	50	if (* end - '0' > 9) {
1320	0		failbug (__FILE__, __LINE__, parser,
1321			"bad character %c at end of number",
1322	0		* end);
1323			}
1324	9		break;
1325			case json_token_object:
1326	7	50	if (* start != '{' \|\| (end && * end != '}')) {
		50
		0
1327	0	0	failbug (__FILE__, __LINE__, parser,
1328			"no { or } in object %.*s: char %X",
1329	0		end ? end - start : strlen ((char *) start),
1330	0		start, * start);
1331			}
1332	7		break;
1333			case json_token_array:
1334	1	50	if (* start != '[' \|\| (end && * end != ']')) {
		50
		0
1335	0		failbug (__FILE__, __LINE__, parser,
1336			"no [ or ] in array");
1337			}
1338	1		break;
1339			case json_token_comma:
1340	18	50	if (end - start != 0 \|\| * start != ',') {
		50
1341	0		failbug (__FILE__, __LINE__, parser,
1342			"not a comma %.*s",
1343			end - start);
1344			}
1345	18		break;
1346			case json_token_colon:
1347	23	50	if (end - start != 0 \|\| * start != ':') {
		50
1348	0		failbug (__FILE__, __LINE__, parser,
1349			"not a colon %.*s",
1350			end - start);
1351			}
1352	23		break;
1353			case json_token_literal:
1354	1		break;
1355			default:
1356	0		croak ("%s:%d: bad type %d\n", __FILE__, __LINE__, type);
1357			}
1358	93		Newx (new, 1, json_token_t);
1359			// static int nnew;
1360			// nnew++;
1361			// fprintf (stderr, "New %d %p\n", nnew, new);
1362	93		parser->n_mallocs++;
1363			#if 0
1364			fprintf (stderr, "%s:%d: parser->n_mallocs = %d\n",
1365			__FILE__, __LINE__, parser->n_mallocs);
1366			#endif /* 0 */
1367	93		new->start = start - parser->input;
1368	93	100	if (end) {
1369	62		new->end = end - parser->input + 1;
1370			}
1371			else {
1372	31		new->end = 0;
1373			}
1374	93		new->type = type;
1375	93		new->parent = JSON_TOKEN_PARENT_INVALID;
1376	93		new->child = 0;
1377	93		new->next = 0;
1378	93		return new;
1379			}
1380
1381			static void
1382	31		json_token_set_end (json_parse_t * parser, json_token_t * jt, unsigned char * end)
1383			{
1384	31	50	if (jt->end != 0) {
1385	0		int offset = (int) (end - parser->input);
1386	0		failbug (__FILE__, __LINE__, parser,
1387			"attempt to set end as %d is now %d\n",
1388			offset, jt->end);
1389			}
1390
1391	31		switch (jt->type) {
1392			case json_token_string:
1393			case json_token_key:
1394	23	50	if (* end != '"') {
1395	0		failbug (__FILE__, __LINE__, parser,
1396			"no quotes at end of string");
1397			}
1398	23		break;
1399			case json_token_object:
1400	7	50	if (* end != '}') {
1401	0		failbug (__FILE__, __LINE__, parser,
1402			"no } at end of object");
1403			}
1404	7		break;
1405			case json_token_array:
1406	1	50	if (* end != ']') {
1407	0		failbug (__FILE__, __LINE__, parser,
1408			"no ] at end of array");
1409			}
1410	1		break;
1411			default:
1412	0		failbug (__FILE__, __LINE__, parser,
1413	0		"set end for unknown type %d", jt->type);
1414			break;
1415			}
1416	31		jt->end = end - parser->input + 1;
1417	31		}
1418
1419			static json_token_t *
1420	8		json_token_set_child (json_parse_t * parser, json_token_t * parent,
1421			json_token_t * child)
1422			{
1423	8	50	switch (parent->type) {
1424			case json_token_object:
1425			case json_token_array:
1426	8		break;
1427			default:
1428	0		failbug (__FILE__, __LINE__, parser,
1429			"bad parent type %d\n",
1430	0		parent->type);
1431			}
1432	8		parent->child = child;
1433	8		return child;
1434			}
1435
1436			static json_token_t *
1437	82		json_token_set_next (json_token_t * prev, json_token_t * next)
1438			{
1439	82		prev->next = next;
1440	82		return next;
1441			}
1442