Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * jsonapi.c
4 : * JSON parser and lexer interfaces
5 : *
6 : * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : * IDENTIFICATION
10 : * src/common/jsonapi.c
11 : *
12 : *-------------------------------------------------------------------------
13 : */
14 : #ifndef FRONTEND
15 : #include "postgres.h"
16 : #else
17 : #include "postgres_fe.h"
18 : #endif
19 :
20 : #include "common/jsonapi.h"
21 : #include "mb/pg_wchar.h"
22 : #include "port/pg_lfind.h"
23 :
24 : #ifndef FRONTEND
25 : #include "miscadmin.h"
26 : #endif
27 :
28 : /*
29 : * The context of the parser is maintained by the recursive descent
30 : * mechanism, but is passed explicitly to the error reporting routine
31 : * for better diagnostics.
32 : */
typedef enum                    /* contexts of JSON parser */
{
    JSON_PARSE_VALUE,           /* expecting a value */
    JSON_PARSE_STRING,          /* expecting a string (for a field name) */
    JSON_PARSE_ARRAY_START,     /* saw '[', expecting value or ']' */
    JSON_PARSE_ARRAY_NEXT,      /* saw array element, expecting ',' or ']' */
    JSON_PARSE_OBJECT_START,    /* saw '{', expecting label or '}' */
    JSON_PARSE_OBJECT_LABEL,    /* saw object label, expecting ':' */
    JSON_PARSE_OBJECT_NEXT,     /* saw object value, expecting ',' or '}' */
    JSON_PARSE_OBJECT_COMMA,    /* saw object ',', expecting next label */
    JSON_PARSE_END,             /* saw the end of a document, expect nothing */
} JsonParseContext;
45 :
46 : static inline JsonParseErrorType json_lex_string(JsonLexContext *lex);
47 : static inline JsonParseErrorType json_lex_number(JsonLexContext *lex, char *s,
48 : bool *num_err, int *total_len);
49 : static inline JsonParseErrorType parse_scalar(JsonLexContext *lex, JsonSemAction *sem);
50 : static JsonParseErrorType parse_object_field(JsonLexContext *lex, JsonSemAction *sem);
51 : static JsonParseErrorType parse_object(JsonLexContext *lex, JsonSemAction *sem);
52 : static JsonParseErrorType parse_array_element(JsonLexContext *lex, JsonSemAction *sem);
53 : static JsonParseErrorType parse_array(JsonLexContext *lex, JsonSemAction *sem);
54 : static JsonParseErrorType report_parse_error(JsonParseContext ctx, JsonLexContext *lex);
55 :
/*
 * The null action object used for pure validation: all semantic-action
 * callbacks are NULL, so parsing only checks that the input is well-formed
 * JSON without building anything.
 */
JsonSemAction nullSemAction =
{
    NULL, NULL, NULL, NULL, NULL,
    NULL, NULL, NULL, NULL, NULL
};
62 :
63 : /* Recursive Descent parser support routines */
64 :
/*
 * lex_peek
 *
 * What is the current look-ahead token?  The lexer always keeps one token
 * of look-ahead in lex->token_type; this just reads it without consuming.
 */
static inline JsonTokenType
lex_peek(JsonLexContext *lex)
{
    return lex->token_type;
}
75 :
76 : /*
77 : * lex_expect
78 : *
79 : * move the lexer to the next token if the current look_ahead token matches
80 : * the parameter token. Otherwise, report an error.
81 : */
82 : static inline JsonParseErrorType
83 933456 : lex_expect(JsonParseContext ctx, JsonLexContext *lex, JsonTokenType token)
84 : {
85 933456 : if (lex_peek(lex) == token)
86 933348 : return json_lex(lex);
87 : else
88 108 : return report_parse_error(ctx, lex);
89 : }
90 :
91 : /* chars to consider as part of an alphanumeric token */
92 : #define JSON_ALPHANUMERIC_CHAR(c) \
93 : (((c) >= 'a' && (c) <= 'z') || \
94 : ((c) >= 'A' && (c) <= 'Z') || \
95 : ((c) >= '0' && (c) <= '9') || \
96 : (c) == '_' || \
97 : IS_HIGHBIT_SET(c))
98 :
99 : /*
100 : * Utility function to check if a string is a valid JSON number.
101 : *
102 : * str is of length len, and need not be null-terminated.
103 : */
104 : bool
105 2962 : IsValidJsonNumber(const char *str, int len)
106 : {
107 : bool numeric_error;
108 : int total_len;
109 : JsonLexContext dummy_lex;
110 :
111 2962 : if (len <= 0)
112 0 : return false;
113 :
114 : /*
115 : * json_lex_number expects a leading '-' to have been eaten already.
116 : *
117 : * having to cast away the constness of str is ugly, but there's not much
118 : * easy alternative.
119 : */
120 2962 : if (*str == '-')
121 : {
122 58 : dummy_lex.input = unconstify(char *, str) + 1;
123 58 : dummy_lex.input_length = len - 1;
124 : }
125 : else
126 : {
127 2904 : dummy_lex.input = unconstify(char *, str);
128 2904 : dummy_lex.input_length = len;
129 : }
130 :
131 2962 : json_lex_number(&dummy_lex, dummy_lex.input, &numeric_error, &total_len);
132 :
133 2962 : return (!numeric_error) && (total_len == dummy_lex.input_length);
134 : }
135 :
136 : /*
137 : * makeJsonLexContextCstringLen
138 : * Initialize the given JsonLexContext object, or create one
139 : *
140 : * If a valid 'lex' pointer is given, it is initialized. This can
141 : * be used for stack-allocated structs, saving overhead. If NULL is
142 : * given, a new struct is allocated.
143 : *
144 : * If need_escapes is true, ->strval stores the unescaped lexemes.
145 : * Unescaping is expensive, so only request it when necessary.
146 : *
147 : * If need_escapes is true or lex was given as NULL, then caller is
148 : * responsible for freeing the returned struct, either by calling
149 : * freeJsonLexContext() or (in backend environment) via memory context
150 : * cleanup.
151 : */
152 : JsonLexContext *
153 30932 : makeJsonLexContextCstringLen(JsonLexContext *lex, char *json,
154 : int len, int encoding, bool need_escapes)
155 : {
156 30932 : if (lex == NULL)
157 : {
158 5634 : lex = palloc0(sizeof(JsonLexContext));
159 5634 : lex->flags |= JSONLEX_FREE_STRUCT;
160 : }
161 : else
162 25298 : memset(lex, 0, sizeof(JsonLexContext));
163 :
164 30932 : lex->input = lex->token_terminator = lex->line_start = json;
165 30932 : lex->line_number = 1;
166 30932 : lex->input_length = len;
167 30932 : lex->input_encoding = encoding;
168 30932 : if (need_escapes)
169 : {
170 23584 : lex->strval = makeStringInfo();
171 23584 : lex->flags |= JSONLEX_FREE_STRVAL;
172 : }
173 :
174 30932 : return lex;
175 : }
176 :
/*
 * Free memory in a JsonLexContext. There's no need for this if a *lex
 * pointer was given when the object was made and need_escapes was false,
 * or (in backend environment) a memory context delete/reset is imminent.
 */
void
freeJsonLexContext(JsonLexContext *lex)
{
    /* strval (and its buffer) exist only if need_escapes was requested */
    if (lex->flags & JSONLEX_FREE_STRVAL)
    {
        pfree(lex->strval->data);
        pfree(lex->strval);
    }
    /* the struct itself is freed only if makeJsonLexContext* palloc'd it */
    if (lex->flags & JSONLEX_FREE_STRUCT)
        pfree(lex);
}
193 :
/*
 * pg_parse_json
 *
 * Publicly visible entry point for the JSON parser.
 *
 * lex is a lexing context, set up for the json to be processed by calling
 * makeJsonLexContext(). sem is a structure of function pointers to semantic
 * action routines to be called at appropriate spots during parsing, and a
 * pointer to a state object to be passed to those routines.
 *
 * Returns JSON_SUCCESS, or the first error encountered by the lexer,
 * parser, or a semantic-action callback.
 */
JsonParseErrorType
pg_parse_json(JsonLexContext *lex, JsonSemAction *sem)
{
    JsonTokenType tok;
    JsonParseErrorType result;

    /* get the initial token */
    result = json_lex(lex);
    if (result != JSON_SUCCESS)
        return result;

    tok = lex_peek(lex);

    /* parse by recursive descent */
    switch (tok)
    {
        case JSON_TOKEN_OBJECT_START:
            result = parse_object(lex, sem);
            break;
        case JSON_TOKEN_ARRAY_START:
            result = parse_array(lex, sem);
            break;
        default:
            result = parse_scalar(lex, sem);    /* json can be a bare scalar */
    }

    /* after the document, only end-of-input is acceptable */
    if (result == JSON_SUCCESS)
        result = lex_expect(JSON_PARSE_END, lex, JSON_TOKEN_END);

    return result;
}
235 :
236 : /*
237 : * json_count_array_elements
238 : *
239 : * Returns number of array elements in lex context at start of array token
240 : * until end of array token at same nesting level.
241 : *
242 : * Designed to be called from array_start routines.
243 : */
244 : JsonParseErrorType
245 6 : json_count_array_elements(JsonLexContext *lex, int *elements)
246 : {
247 : JsonLexContext copylex;
248 : int count;
249 : JsonParseErrorType result;
250 :
251 : /*
252 : * It's safe to do this with a shallow copy because the lexical routines
253 : * don't scribble on the input. They do scribble on the other pointers
254 : * etc, so doing this with a copy makes that safe.
255 : */
256 6 : memcpy(©lex, lex, sizeof(JsonLexContext));
257 6 : copylex.strval = NULL; /* not interested in values here */
258 6 : copylex.lex_level++;
259 :
260 6 : count = 0;
261 6 : result = lex_expect(JSON_PARSE_ARRAY_START, ©lex,
262 : JSON_TOKEN_ARRAY_START);
263 6 : if (result != JSON_SUCCESS)
264 0 : return result;
265 6 : if (lex_peek(©lex) != JSON_TOKEN_ARRAY_END)
266 : {
267 : while (1)
268 : {
269 48 : count++;
270 48 : result = parse_array_element(©lex, &nullSemAction);
271 48 : if (result != JSON_SUCCESS)
272 0 : return result;
273 48 : if (copylex.token_type != JSON_TOKEN_COMMA)
274 6 : break;
275 42 : result = json_lex(©lex);
276 42 : if (result != JSON_SUCCESS)
277 0 : return result;
278 : }
279 : }
280 6 : result = lex_expect(JSON_PARSE_ARRAY_NEXT, ©lex,
281 : JSON_TOKEN_ARRAY_END);
282 6 : if (result != JSON_SUCCESS)
283 0 : return result;
284 :
285 6 : *elements = count;
286 6 : return JSON_SUCCESS;
287 : }
288 :
289 : /*
290 : * Recursive Descent parse routines. There is one for each structural
291 : * element in a json document:
292 : * - scalar (string, number, true, false, null)
293 : * - array ( [ ] )
294 : * - array element
295 : * - object ( { } )
296 : * - object field
297 : */
/*
 * Parse a scalar value (string, number, true, false, or null) and invoke
 * the scalar semantic-action callback, if any, with its text.
 */
static inline JsonParseErrorType
parse_scalar(JsonLexContext *lex, JsonSemAction *sem)
{
    char       *val = NULL;
    json_scalar_action sfunc = sem->scalar;
    JsonTokenType tok = lex_peek(lex);
    JsonParseErrorType result;

    /* a scalar must be a string, a number, true, false, or null */
    if (tok != JSON_TOKEN_STRING && tok != JSON_TOKEN_NUMBER &&
        tok != JSON_TOKEN_TRUE && tok != JSON_TOKEN_FALSE &&
        tok != JSON_TOKEN_NULL)
        return report_parse_error(JSON_PARSE_VALUE, lex);

    /* if no semantic function, just consume the token */
    if (sfunc == NULL)
        return json_lex(lex);

    /* extract the de-escaped string value, or the raw lexeme */
    if (lex_peek(lex) == JSON_TOKEN_STRING)
    {
        if (lex->strval != NULL)
            val = pstrdup(lex->strval->data);
    }
    else
    {
        /* copy the raw token text and NUL-terminate it */
        int         len = (lex->token_terminator - lex->token_start);

        val = palloc(len + 1);
        memcpy(val, lex->token_start, len);
        val[len] = '\0';
    }

    /* consume the token (must happen before the callback sees 'val') */
    result = json_lex(lex);
    if (result != JSON_SUCCESS)
        return result;

    /* invoke the callback */
    result = (*sfunc) (sem->semstate, val, tok);

    return result;
}
341 :
/*
 * Parse one object member: "fieldname" : value, invoking the
 * object_field_start/end callbacks around the value, if provided.
 */
static JsonParseErrorType
parse_object_field(JsonLexContext *lex, JsonSemAction *sem)
{
    /*
     * An object field is "fieldname" : value where value can be a scalar,
     * object or array.  Note: in user-facing docs and error messages, we
     * generally call a field name a "key".
     */

    char       *fname = NULL;   /* keep compiler quiet */
    json_ofield_action ostart = sem->object_field_start;
    json_ofield_action oend = sem->object_field_end;
    bool        isnull;
    JsonTokenType tok;
    JsonParseErrorType result;

    /* the field name must be a string token */
    if (lex_peek(lex) != JSON_TOKEN_STRING)
        return report_parse_error(JSON_PARSE_STRING, lex);
    /* copy the de-escaped name only if some callback will want it */
    if ((ostart != NULL || oend != NULL) && lex->strval != NULL)
        fname = pstrdup(lex->strval->data);
    result = json_lex(lex);
    if (result != JSON_SUCCESS)
        return result;

    result = lex_expect(JSON_PARSE_OBJECT_LABEL, lex, JSON_TOKEN_COLON);
    if (result != JSON_SUCCESS)
        return result;

    tok = lex_peek(lex);
    isnull = tok == JSON_TOKEN_NULL;

    if (ostart != NULL)
    {
        result = (*ostart) (sem->semstate, fname, isnull);
        if (result != JSON_SUCCESS)
            return result;
    }

    /* the value can be any object, array or scalar */
    switch (tok)
    {
        case JSON_TOKEN_OBJECT_START:
            result = parse_object(lex, sem);
            break;
        case JSON_TOKEN_ARRAY_START:
            result = parse_array(lex, sem);
            break;
        default:
            result = parse_scalar(lex, sem);
    }
    if (result != JSON_SUCCESS)
        return result;

    if (oend != NULL)
    {
        result = (*oend) (sem->semstate, fname, isnull);
        if (result != JSON_SUCCESS)
            return result;
    }

    return JSON_SUCCESS;
}
403 :
/*
 * Parse a JSON object, invoking the object_start/object_end callbacks
 * around its fields, if provided.
 */
static JsonParseErrorType
parse_object(JsonLexContext *lex, JsonSemAction *sem)
{
    /*
     * an object is a possibly empty sequence of object fields, separated by
     * commas and surrounded by curly braces.
     */
    json_struct_action ostart = sem->object_start;
    json_struct_action oend = sem->object_end;
    JsonTokenType tok;
    JsonParseErrorType result;

#ifndef FRONTEND
    /* recursion guard: deeply nested input must not overflow the stack */
    check_stack_depth();
#endif

    if (ostart != NULL)
    {
        result = (*ostart) (sem->semstate);
        if (result != JSON_SUCCESS)
            return result;
    }

    /*
     * Data inside an object is at a higher nesting level than the object
     * itself. Note that we increment this after we call the semantic routine
     * for the object start and restore it before we call the routine for the
     * object end.
     */
    lex->lex_level++;

    Assert(lex_peek(lex) == JSON_TOKEN_OBJECT_START);
    result = json_lex(lex);
    if (result != JSON_SUCCESS)
        return result;

    tok = lex_peek(lex);
    switch (tok)
    {
        case JSON_TOKEN_STRING:
            /* one or more comma-separated fields */
            result = parse_object_field(lex, sem);
            while (result == JSON_SUCCESS && lex_peek(lex) == JSON_TOKEN_COMMA)
            {
                result = json_lex(lex);
                if (result != JSON_SUCCESS)
                    break;
                result = parse_object_field(lex, sem);
            }
            break;
        case JSON_TOKEN_OBJECT_END:
            /* empty object: fall through to consume the '}' below */
            break;
        default:
            /* case of an invalid initial token inside the object */
            result = report_parse_error(JSON_PARSE_OBJECT_START, lex);
    }
    if (result != JSON_SUCCESS)
        return result;

    result = lex_expect(JSON_PARSE_OBJECT_NEXT, lex, JSON_TOKEN_OBJECT_END);
    if (result != JSON_SUCCESS)
        return result;

    lex->lex_level--;

    if (oend != NULL)
    {
        result = (*oend) (sem->semstate);
        if (result != JSON_SUCCESS)
            return result;
    }

    return JSON_SUCCESS;
}
477 :
/*
 * Parse one array element (any object, array or scalar), invoking the
 * array_element_start/end callbacks around it, if provided.
 */
static JsonParseErrorType
parse_array_element(JsonLexContext *lex, JsonSemAction *sem)
{
    json_aelem_action astart = sem->array_element_start;
    json_aelem_action aend = sem->array_element_end;
    JsonTokenType tok = lex_peek(lex);
    JsonParseErrorType result;
    bool        isnull;

    isnull = tok == JSON_TOKEN_NULL;

    if (astart != NULL)
    {
        result = (*astart) (sem->semstate, isnull);
        if (result != JSON_SUCCESS)
            return result;
    }

    /* an array element is any object, array or scalar */
    switch (tok)
    {
        case JSON_TOKEN_OBJECT_START:
            result = parse_object(lex, sem);
            break;
        case JSON_TOKEN_ARRAY_START:
            result = parse_array(lex, sem);
            break;
        default:
            result = parse_scalar(lex, sem);
    }

    if (result != JSON_SUCCESS)
        return result;

    if (aend != NULL)
    {
        result = (*aend) (sem->semstate, isnull);
        if (result != JSON_SUCCESS)
            return result;
    }

    return JSON_SUCCESS;
}
521 :
/*
 * Parse a JSON array, invoking the array_start/array_end callbacks around
 * its elements, if provided.
 */
static JsonParseErrorType
parse_array(JsonLexContext *lex, JsonSemAction *sem)
{
    /*
     * an array is a possibly empty sequence of array elements, separated by
     * commas and surrounded by square brackets.
     */
    json_struct_action astart = sem->array_start;
    json_struct_action aend = sem->array_end;
    JsonParseErrorType result;

#ifndef FRONTEND
    /* recursion guard: deeply nested input must not overflow the stack */
    check_stack_depth();
#endif

    if (astart != NULL)
    {
        result = (*astart) (sem->semstate);
        if (result != JSON_SUCCESS)
            return result;
    }

    /*
     * Data inside an array is at a higher nesting level than the array
     * itself. Note that we increment this after we call the semantic routine
     * for the array start and restore it before we call the routine for the
     * array end.
     */
    lex->lex_level++;

    result = lex_expect(JSON_PARSE_ARRAY_START, lex, JSON_TOKEN_ARRAY_START);
    if (result == JSON_SUCCESS && lex_peek(lex) != JSON_TOKEN_ARRAY_END)
    {
        /* one or more comma-separated elements */
        result = parse_array_element(lex, sem);

        while (result == JSON_SUCCESS && lex_peek(lex) == JSON_TOKEN_COMMA)
        {
            result = json_lex(lex);
            if (result != JSON_SUCCESS)
                break;
            result = parse_array_element(lex, sem);
        }
    }
    if (result != JSON_SUCCESS)
        return result;

    result = lex_expect(JSON_PARSE_ARRAY_NEXT, lex, JSON_TOKEN_ARRAY_END);
    if (result != JSON_SUCCESS)
        return result;

    lex->lex_level--;

    if (aend != NULL)
    {
        result = (*aend) (sem->semstate);
        if (result != JSON_SUCCESS)
            return result;
    }

    return JSON_SUCCESS;
}
583 :
/*
 * Lex one token from the input stream.
 *
 * On success, lex->token_type, token_start, token_terminator and
 * prev_token_terminator describe the token just consumed.  On failure the
 * same fields are set so that error reports can point at the bad token.
 */
JsonParseErrorType
json_lex(JsonLexContext *lex)
{
    char       *s;
    char       *const end = lex->input + lex->input_length;
    JsonParseErrorType result;

    /* Skip leading whitespace. */
    s = lex->token_terminator;
    while (s < end && (*s == ' ' || *s == '\t' || *s == '\n' || *s == '\r'))
    {
        if (*s++ == '\n')
        {
            /* track line starts for error-position reporting */
            ++lex->line_number;
            lex->line_start = s;
        }
    }
    lex->token_start = s;

    /* Determine token type. */
    if (s >= end)
    {
        lex->token_start = NULL;
        lex->prev_token_terminator = lex->token_terminator;
        lex->token_terminator = s;
        lex->token_type = JSON_TOKEN_END;
    }
    else
    {
        switch (*s)
        {
                /* Single-character token, some kind of punctuation mark. */
            case '{':
                lex->prev_token_terminator = lex->token_terminator;
                lex->token_terminator = s + 1;
                lex->token_type = JSON_TOKEN_OBJECT_START;
                break;
            case '}':
                lex->prev_token_terminator = lex->token_terminator;
                lex->token_terminator = s + 1;
                lex->token_type = JSON_TOKEN_OBJECT_END;
                break;
            case '[':
                lex->prev_token_terminator = lex->token_terminator;
                lex->token_terminator = s + 1;
                lex->token_type = JSON_TOKEN_ARRAY_START;
                break;
            case ']':
                lex->prev_token_terminator = lex->token_terminator;
                lex->token_terminator = s + 1;
                lex->token_type = JSON_TOKEN_ARRAY_END;
                break;
            case ',':
                lex->prev_token_terminator = lex->token_terminator;
                lex->token_terminator = s + 1;
                lex->token_type = JSON_TOKEN_COMMA;
                break;
            case ':':
                lex->prev_token_terminator = lex->token_terminator;
                lex->token_terminator = s + 1;
                lex->token_type = JSON_TOKEN_COLON;
                break;
            case '"':
                /* string */
                result = json_lex_string(lex);
                if (result != JSON_SUCCESS)
                    return result;
                lex->token_type = JSON_TOKEN_STRING;
                break;
            case '-':
                /* Negative number. */
                result = json_lex_number(lex, s + 1, NULL, NULL);
                if (result != JSON_SUCCESS)
                    return result;
                lex->token_type = JSON_TOKEN_NUMBER;
                break;
            case '0':
            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
            case '7':
            case '8':
            case '9':
                /* Positive number. */
                result = json_lex_number(lex, s, NULL, NULL);
                if (result != JSON_SUCCESS)
                    return result;
                lex->token_type = JSON_TOKEN_NUMBER;
                break;
            default:
                {
                    char       *p;

                    /*
                     * We're not dealing with a string, number, legal
                     * punctuation mark, or end of string. The only legal
                     * tokens we might find here are true, false, and null,
                     * but for error reporting purposes we scan until we see a
                     * non-alphanumeric character. That way, we can report
                     * the whole word as an unexpected token, rather than just
                     * some unintuitive prefix thereof.
                     */
                    for (p = s; p < end && JSON_ALPHANUMERIC_CHAR(*p); p++)
                         /* skip */ ;

                    /*
                     * We got some sort of unexpected punctuation or an
                     * otherwise unexpected character, so just complain about
                     * that one character.
                     */
                    if (p == s)
                    {
                        lex->prev_token_terminator = lex->token_terminator;
                        lex->token_terminator = s + 1;
                        return JSON_INVALID_TOKEN;
                    }

                    /*
                     * We've got a real alphanumeric token here. If it
                     * happens to be true, false, or null, all is well. If
                     * not, error out.
                     */
                    lex->prev_token_terminator = lex->token_terminator;
                    lex->token_terminator = p;
                    if (p - s == 4)
                    {
                        if (memcmp(s, "true", 4) == 0)
                            lex->token_type = JSON_TOKEN_TRUE;
                        else if (memcmp(s, "null", 4) == 0)
                            lex->token_type = JSON_TOKEN_NULL;
                        else
                            return JSON_INVALID_TOKEN;
                    }
                    else if (p - s == 5 && memcmp(s, "false", 5) == 0)
                        lex->token_type = JSON_TOKEN_FALSE;
                    else
                        return JSON_INVALID_TOKEN;
                }
        }                       /* end of switch */
    }

    return JSON_SUCCESS;
}
733 :
/*
 * The next token in the input stream is known to be a string; lex it.
 *
 * If lex->strval isn't NULL, fill it with the decoded string.
 * Set lex->token_terminator to the end of the decoded input, and in
 * success cases, transfer its previous value to lex->prev_token_terminator.
 * Return JSON_SUCCESS or an error code.
 *
 * Note: be careful that all error exits advance lex->token_terminator
 * to the point after the character we detected the error on.
 */
static inline JsonParseErrorType
json_lex_string(JsonLexContext *lex)
{
    char       *s;
    char       *const end = lex->input + lex->input_length;
    int         hi_surrogate = -1;  /* pending first half of a \u pair */

    /* Convenience macros for error exits */
#define FAIL_AT_CHAR_START(code) \
    do { \
        lex->token_terminator = s; \
        return code; \
    } while (0)
#define FAIL_AT_CHAR_END(code) \
    do { \
        lex->token_terminator = \
            s + pg_encoding_mblen_bounded(lex->input_encoding, s); \
        return code; \
    } while (0)

    if (lex->strval != NULL)
        resetStringInfo(lex->strval);

    Assert(lex->input_length > 0);
    s = lex->token_start;
    for (;;)
    {
        s++;
        /* Premature end of the string. */
        if (s >= end)
            FAIL_AT_CHAR_START(JSON_INVALID_TOKEN);
        else if (*s == '"')
            break;
        else if (*s == '\\')
        {
            /* OK, we have an escape character. */
            s++;
            if (s >= end)
                FAIL_AT_CHAR_START(JSON_INVALID_TOKEN);
            else if (*s == 'u')
            {
                int         i;
                int         ch = 0;

                /* accumulate exactly four hex digits into ch */
                for (i = 1; i <= 4; i++)
                {
                    s++;
                    if (s >= end)
                        FAIL_AT_CHAR_START(JSON_INVALID_TOKEN);
                    else if (*s >= '0' && *s <= '9')
                        ch = (ch * 16) + (*s - '0');
                    else if (*s >= 'a' && *s <= 'f')
                        ch = (ch * 16) + (*s - 'a') + 10;
                    else if (*s >= 'A' && *s <= 'F')
                        ch = (ch * 16) + (*s - 'A') + 10;
                    else
                        FAIL_AT_CHAR_END(JSON_UNICODE_ESCAPE_FORMAT);
                }
                if (lex->strval != NULL)
                {
                    /*
                     * Combine surrogate pairs.
                     */
                    if (is_utf16_surrogate_first(ch))
                    {
                        if (hi_surrogate != -1)
                            FAIL_AT_CHAR_END(JSON_UNICODE_HIGH_SURROGATE);
                        hi_surrogate = ch;
                        continue;
                    }
                    else if (is_utf16_surrogate_second(ch))
                    {
                        if (hi_surrogate == -1)
                            FAIL_AT_CHAR_END(JSON_UNICODE_LOW_SURROGATE);
                        ch = surrogate_pair_to_codepoint(hi_surrogate, ch);
                        hi_surrogate = -1;
                    }

                    /* a lone high surrogate must be followed by a low one */
                    if (hi_surrogate != -1)
                        FAIL_AT_CHAR_END(JSON_UNICODE_LOW_SURROGATE);

                    /*
                     * Reject invalid cases. We can't have a value above
                     * 0xFFFF here (since we only accepted 4 hex digits
                     * above), so no need to test for out-of-range chars.
                     */
                    if (ch == 0)
                    {
                        /* We can't allow this, since our TEXT type doesn't */
                        FAIL_AT_CHAR_END(JSON_UNICODE_CODE_POINT_ZERO);
                    }

                    /*
                     * Add the represented character to lex->strval. In the
                     * backend, we can let pg_unicode_to_server_noerror()
                     * handle any required character set conversion; in
                     * frontend, we can only deal with trivial conversions.
                     */
#ifndef FRONTEND
                    {
                        char        cbuf[MAX_UNICODE_EQUIVALENT_STRING + 1];

                        if (!pg_unicode_to_server_noerror(ch, (unsigned char *) cbuf))
                            FAIL_AT_CHAR_END(JSON_UNICODE_UNTRANSLATABLE);
                        appendStringInfoString(lex->strval, cbuf);
                    }
#else
                    if (lex->input_encoding == PG_UTF8)
                    {
                        /* OK, we can map the code point to UTF8 easily */
                        char        utf8str[5];
                        int         utf8len;

                        unicode_to_utf8(ch, (unsigned char *) utf8str);
                        utf8len = pg_utf_mblen((unsigned char *) utf8str);
                        appendBinaryStringInfo(lex->strval, utf8str, utf8len);
                    }
                    else if (ch <= 0x007f)
                    {
                        /* The ASCII range is the same in all encodings */
                        appendStringInfoChar(lex->strval, (char) ch);
                    }
                    else
                        FAIL_AT_CHAR_END(JSON_UNICODE_HIGH_ESCAPE);
#endif                          /* FRONTEND */
                }
            }
            else if (lex->strval != NULL)
            {
                /* a non-\u escape may not split a surrogate pair */
                if (hi_surrogate != -1)
                    FAIL_AT_CHAR_END(JSON_UNICODE_LOW_SURROGATE);

                switch (*s)
                {
                    case '"':
                    case '\\':
                    case '/':
                        appendStringInfoChar(lex->strval, *s);
                        break;
                    case 'b':
                        appendStringInfoChar(lex->strval, '\b');
                        break;
                    case 'f':
                        appendStringInfoChar(lex->strval, '\f');
                        break;
                    case 'n':
                        appendStringInfoChar(lex->strval, '\n');
                        break;
                    case 'r':
                        appendStringInfoChar(lex->strval, '\r');
                        break;
                    case 't':
                        appendStringInfoChar(lex->strval, '\t');
                        break;
                    default:

                        /*
                         * Not a valid string escape, so signal error. We
                         * adjust token_start so that just the escape sequence
                         * is reported, not the whole string.
                         */
                        lex->token_start = s;
                        FAIL_AT_CHAR_END(JSON_ESCAPING_INVALID);
                }
            }
            else if (strchr("\"\\/bfnrt", *s) == NULL)
            {
                /*
                 * Simpler processing if we're not bothered about de-escaping
                 *
                 * It's very tempting to remove the strchr() call here and
                 * replace it with a switch statement, but testing so far has
                 * shown it's not a performance win.
                 */
                lex->token_start = s;
                FAIL_AT_CHAR_END(JSON_ESCAPING_INVALID);
            }
        }
        else
        {
            char       *p = s;

            /* an ordinary character may not split a surrogate pair either */
            if (hi_surrogate != -1)
                FAIL_AT_CHAR_END(JSON_UNICODE_LOW_SURROGATE);

            /*
             * Skip to the first byte that requires special handling, so we
             * can batch calls to appendBinaryStringInfo.
             */
            while (p < end - sizeof(Vector8) &&
                   !pg_lfind8('\\', (uint8 *) p, sizeof(Vector8)) &&
                   !pg_lfind8('"', (uint8 *) p, sizeof(Vector8)) &&
                   !pg_lfind8_le(31, (uint8 *) p, sizeof(Vector8)))
                p += sizeof(Vector8);

            for (; p < end; p++)
            {
                if (*p == '\\' || *p == '"')
                    break;
                else if ((unsigned char) *p <= 31)
                {
                    /* Per RFC4627, these characters MUST be escaped. */

                    /*
                     * Since *p isn't printable, exclude it from the context
                     * string
                     */
                    lex->token_terminator = p;
                    return JSON_ESCAPING_REQUIRED;
                }
            }

            if (lex->strval != NULL)
                appendBinaryStringInfo(lex->strval, s, p - s);

            /*
             * s will be incremented at the top of the loop, so set it to just
             * behind our lookahead position
             */
            s = p - 1;
        }
    }

    /* a high surrogate right before the closing quote is also an error */
    if (hi_surrogate != -1)
    {
        lex->token_terminator = s + 1;
        return JSON_UNICODE_LOW_SURROGATE;
    }

    /* Hooray, we found the end of the string! */
    lex->prev_token_terminator = lex->token_terminator;
    lex->token_terminator = s + 1;
    return JSON_SUCCESS;

#undef FAIL_AT_CHAR_START
#undef FAIL_AT_CHAR_END
}
981 :
982 : /*
983 : * The next token in the input stream is known to be a number; lex it.
984 : *
985 : * In JSON, a number consists of four parts:
986 : *
987 : * (1) An optional minus sign ('-').
988 : *
989 : * (2) Either a single '0', or a string of one or more digits that does not
990 : * begin with a '0'.
991 : *
992 : * (3) An optional decimal part, consisting of a period ('.') followed by
993 : * one or more digits. (Note: While this part can be omitted
994 : * completely, it's not OK to have only the decimal point without
995 : * any digits afterwards.)
996 : *
997 : * (4) An optional exponent part, consisting of 'e' or 'E', optionally
998 : * followed by '+' or '-', followed by one or more digits. (Note:
999 : * As with the decimal part, if 'e' or 'E' is present, it must be
1000 : * followed by at least one digit.)
1001 : *
1002 : * The 's' argument to this function points to the ostensible beginning
1003 : * of part 2 - i.e. the character after any optional minus sign, or the
1004 : * first character of the string if there is none.
1005 : *
1006 : * If num_err is not NULL, we return an error flag to *num_err rather than
1007 : * raising an error for a badly-formed number. Also, if total_len is not NULL
1008 : * the distance from lex->input to the token end+1 is returned to *total_len.
1009 : */
1010 : static inline JsonParseErrorType
1011 223616 : json_lex_number(JsonLexContext *lex, char *s,
1012 : bool *num_err, int *total_len)
1013 : {
1014 223616 : bool error = false;
1015 223616 : int len = s - lex->input;
1016 :
1017 : /* Part (1): leading sign indicator. */
1018 : /* Caller already did this for us; so do nothing. */
1019 :
1020 : /* Part (2): parse main digit string. */
1021 223616 : if (len < lex->input_length && *s == '0')
1022 : {
1023 33438 : s++;
1024 33438 : len++;
1025 : }
1026 190178 : else if (len < lex->input_length && *s >= '1' && *s <= '9')
1027 : {
1028 : do
1029 : {
1030 608568 : s++;
1031 608568 : len++;
1032 608568 : } while (len < lex->input_length && *s >= '0' && *s <= '9');
1033 : }
1034 : else
1035 20 : error = true;
1036 :
1037 : /* Part (3): parse optional decimal portion. */
1038 223616 : if (len < lex->input_length && *s == '.')
1039 : {
1040 37310 : s++;
1041 37310 : len++;
1042 37310 : if (len == lex->input_length || *s < '0' || *s > '9')
1043 12 : error = true;
1044 : else
1045 : {
1046 : do
1047 : {
1048 91744 : s++;
1049 91744 : len++;
1050 91744 : } while (len < lex->input_length && *s >= '0' && *s <= '9');
1051 : }
1052 : }
1053 :
1054 : /* Part (4): parse optional exponent. */
1055 223616 : if (len < lex->input_length && (*s == 'e' || *s == 'E'))
1056 : {
1057 64 : s++;
1058 64 : len++;
1059 64 : if (len < lex->input_length && (*s == '+' || *s == '-'))
1060 : {
1061 10 : s++;
1062 10 : len++;
1063 : }
1064 64 : if (len == lex->input_length || *s < '0' || *s > '9')
1065 12 : error = true;
1066 : else
1067 : {
1068 : do
1069 : {
1070 164 : s++;
1071 164 : len++;
1072 164 : } while (len < lex->input_length && *s >= '0' && *s <= '9');
1073 : }
1074 : }
1075 :
1076 : /*
1077 : * Check for trailing garbage. As in json_lex(), any alphanumeric stuff
1078 : * here should be considered part of the token for error-reporting
1079 : * purposes.
1080 : */
1081 223886 : for (; len < lex->input_length && JSON_ALPHANUMERIC_CHAR(*s); s++, len++)
1082 270 : error = true;
1083 :
1084 223616 : if (total_len != NULL)
1085 2962 : *total_len = len;
1086 :
1087 223616 : if (num_err != NULL)
1088 : {
1089 : /* let the caller handle any error */
1090 2962 : *num_err = error;
1091 : }
1092 : else
1093 : {
1094 : /* return token endpoint */
1095 220654 : lex->prev_token_terminator = lex->token_terminator;
1096 220654 : lex->token_terminator = s;
1097 : /* handle error if any */
1098 220654 : if (error)
1099 48 : return JSON_INVALID_TOKEN;
1100 : }
1101 :
1102 223568 : return JSON_SUCCESS;
1103 : }
1104 :
1105 : /*
1106 : * Report a parse error.
1107 : *
1108 : * lex->token_start and lex->token_terminator must identify the current token.
1109 : */
1110 : static JsonParseErrorType
1111 314 : report_parse_error(JsonParseContext ctx, JsonLexContext *lex)
1112 : {
1113 : /* Handle case where the input ended prematurely. */
1114 314 : if (lex->token_start == NULL || lex->token_type == JSON_TOKEN_END)
1115 128 : return JSON_EXPECTED_MORE;
1116 :
1117 : /* Otherwise choose the error type based on the parsing context. */
1118 186 : switch (ctx)
1119 : {
1120 24 : case JSON_PARSE_END:
1121 24 : return JSON_EXPECTED_END;
1122 102 : case JSON_PARSE_VALUE:
1123 102 : return JSON_EXPECTED_JSON;
1124 12 : case JSON_PARSE_STRING:
1125 12 : return JSON_EXPECTED_STRING;
1126 0 : case JSON_PARSE_ARRAY_START:
1127 0 : return JSON_EXPECTED_ARRAY_FIRST;
1128 0 : case JSON_PARSE_ARRAY_NEXT:
1129 0 : return JSON_EXPECTED_ARRAY_NEXT;
1130 12 : case JSON_PARSE_OBJECT_START:
1131 12 : return JSON_EXPECTED_OBJECT_FIRST;
1132 24 : case JSON_PARSE_OBJECT_LABEL:
1133 24 : return JSON_EXPECTED_COLON;
1134 12 : case JSON_PARSE_OBJECT_NEXT:
1135 12 : return JSON_EXPECTED_OBJECT_NEXT;
1136 0 : case JSON_PARSE_OBJECT_COMMA:
1137 0 : return JSON_EXPECTED_STRING;
1138 : }
1139 :
1140 : /*
1141 : * We don't use a default: case, so that the compiler will warn about
1142 : * unhandled enum values.
1143 : */
1144 : Assert(false);
1145 0 : return JSON_SUCCESS; /* silence stupider compilers */
1146 : }
1147 :
1148 :
1149 : #ifndef FRONTEND
1150 : /*
1151 : * Extract the current token from a lexing context, for error reporting.
1152 : */
1153 : static char *
1154 258 : extract_token(JsonLexContext *lex)
1155 : {
1156 258 : int toklen = lex->token_terminator - lex->token_start;
1157 258 : char *token = palloc(toklen + 1);
1158 :
1159 258 : memcpy(token, lex->token_start, toklen);
1160 258 : token[toklen] = '\0';
1161 258 : return token;
1162 : }
1163 :
1164 : /*
1165 : * Construct an (already translated) detail message for a JSON error.
1166 : *
1167 : * Note that the error message generated by this routine may not be
1168 : * palloc'd, making it unsafe for frontend code as there is no way to
1169 : * know if this can be safely pfree'd or not.
1170 : */
char *
json_errdetail(JsonParseErrorType error, JsonLexContext *lex)
{
	/*
	 * Cases that quote the offending input do so via extract_token(), whose
	 * result is palloc'd and interpolated by psprintf(); the bare _() cases
	 * return static (non-palloc'd) storage — hence the caveat in the header
	 * comment about freeing the result.
	 */
	switch (error)
	{
		case JSON_SUCCESS:
			/* fall through to the error code after switch */
			break;
		case JSON_ESCAPING_INVALID:
			return psprintf(_("Escape sequence \"\\%s\" is invalid."),
							extract_token(lex));
		case JSON_ESCAPING_REQUIRED:
			/*
			 * json_lex_string left token_terminator pointing at the
			 * offending character before returning this code.
			 */
			return psprintf(_("Character with value 0x%02x must be escaped."),
							(unsigned char) *(lex->token_terminator));
		case JSON_EXPECTED_END:
			return psprintf(_("Expected end of input, but found \"%s\"."),
							extract_token(lex));
		case JSON_EXPECTED_ARRAY_FIRST:
			return psprintf(_("Expected array element or \"]\", but found \"%s\"."),
							extract_token(lex));
		case JSON_EXPECTED_ARRAY_NEXT:
			return psprintf(_("Expected \",\" or \"]\", but found \"%s\"."),
							extract_token(lex));
		case JSON_EXPECTED_COLON:
			return psprintf(_("Expected \":\", but found \"%s\"."),
							extract_token(lex));
		case JSON_EXPECTED_JSON:
			return psprintf(_("Expected JSON value, but found \"%s\"."),
							extract_token(lex));
		case JSON_EXPECTED_MORE:
			return _("The input string ended unexpectedly.");
		case JSON_EXPECTED_OBJECT_FIRST:
			return psprintf(_("Expected string or \"}\", but found \"%s\"."),
							extract_token(lex));
		case JSON_EXPECTED_OBJECT_NEXT:
			return psprintf(_("Expected \",\" or \"}\", but found \"%s\"."),
							extract_token(lex));
		case JSON_EXPECTED_STRING:
			return psprintf(_("Expected string, but found \"%s\"."),
							extract_token(lex));
		case JSON_INVALID_TOKEN:
			return psprintf(_("Token \"%s\" is invalid."),
							extract_token(lex));
		case JSON_UNICODE_CODE_POINT_ZERO:
			return _("\\u0000 cannot be converted to text.");
		case JSON_UNICODE_ESCAPE_FORMAT:
			return _("\"\\u\" must be followed by four hexadecimal digits.");
		case JSON_UNICODE_HIGH_ESCAPE:
			/* note: this case is only reachable in frontend not backend */
			return _("Unicode escape values cannot be used for code point values above 007F when the encoding is not UTF8.");
		case JSON_UNICODE_UNTRANSLATABLE:
			/* note: this case is only reachable in backend not frontend */
			return psprintf(_("Unicode escape value could not be translated to the server's encoding %s."),
							GetDatabaseEncodingName());
		case JSON_UNICODE_HIGH_SURROGATE:
			return _("Unicode high surrogate must not follow a high surrogate.");
		case JSON_UNICODE_LOW_SURROGATE:
			return _("Unicode low surrogate must follow a high surrogate.");
		case JSON_SEM_ACTION_FAILED:
			/* fall through to the error code after switch */
			break;
	}

	/*
	 * We don't use a default: case, so that the compiler will warn about
	 * unhandled enum values.  But this needs to be here anyway to cover the
	 * possibility of an incorrect input.
	 */
	elog(ERROR, "unexpected json parse error type: %d", (int) error);
	return NULL;
}
1242 : #endif
|