Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * parser.c
4 : * Main entry point/driver for PostgreSQL grammar
5 : *
6 : * Note that the grammar is not allowed to perform any table access
7 : * (since we need to be able to do basic parsing even while inside an
8 : * aborted transaction). Therefore, the data structures returned by
9 : * the grammar are "raw" parsetrees that still need to be analyzed by
10 : * analyze.c and related files.
11 : *
12 : *
13 : * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
14 : * Portions Copyright (c) 1994, Regents of the University of California
15 : *
16 : * IDENTIFICATION
17 : * src/backend/parser/parser.c
18 : *
19 : *-------------------------------------------------------------------------
20 : */
21 :
22 : #include "postgres.h"
23 :
24 : #include "gramparse.h"
25 : #include "mb/pg_wchar.h"
26 : #include "parser/parser.h"
27 : #include "parser/scansup.h"
28 :
29 : static bool check_uescapechar(unsigned char escape);
30 : static char *str_udeescape(const char *str, char escape,
31 : int position, core_yyscan_t yyscanner);
32 :
33 :
34 : /*
35 : * raw_parser
36 : * Given a query in string form, do lexical and grammatical analysis.
37 : *
38 : * Returns a list of raw (un-analyzed) parse trees. The contents of the
39 : * list have the form required by the specified RawParseMode.
40 : */
41 : List *
42 487901 : raw_parser(const char *str, RawParseMode mode)
43 : {
44 : core_yyscan_t yyscanner;
45 : base_yy_extra_type yyextra;
46 : int yyresult;
47 :
48 : /* initialize the flex scanner */
49 487901 : yyscanner = scanner_init(str, &yyextra.core_yy_extra,
50 : &ScanKeywords, ScanKeywordTokens);
51 :
52 : /* base_yylex() only needs us to initialize the lookahead token, if any */
53 487901 : if (mode == RAW_PARSE_DEFAULT)
54 456806 : yyextra.have_lookahead = false;
55 : else
56 : {
57 : /* this array is indexed by RawParseMode enum */
58 : static const int mode_token[] = {
59 : [RAW_PARSE_DEFAULT] = 0,
60 : [RAW_PARSE_TYPE_NAME] = MODE_TYPE_NAME,
61 : [RAW_PARSE_PLPGSQL_EXPR] = MODE_PLPGSQL_EXPR,
62 : [RAW_PARSE_PLPGSQL_ASSIGN1] = MODE_PLPGSQL_ASSIGN1,
63 : [RAW_PARSE_PLPGSQL_ASSIGN2] = MODE_PLPGSQL_ASSIGN2,
64 : [RAW_PARSE_PLPGSQL_ASSIGN3] = MODE_PLPGSQL_ASSIGN3,
65 : };
66 :
67 31095 : yyextra.have_lookahead = true;
68 31095 : yyextra.lookahead_token = mode_token[mode];
69 31095 : yyextra.lookahead_yylloc = 0;
70 31095 : yyextra.lookahead_end = NULL;
71 : }
72 :
73 : /* initialize the bison parser */
74 487901 : parser_init(&yyextra);
75 :
76 : /* Parse! */
77 487901 : yyresult = base_yyparse(yyscanner);
78 :
79 : /* Clean up (release memory) */
80 487094 : scanner_finish(yyscanner);
81 :
82 487094 : if (yyresult) /* error */
83 0 : return NIL;
84 :
85 487094 : return yyextra.parsetree;
86 : }
87 :
88 :
89 : /*
90 : * Intermediate filter between parser and core lexer (core_yylex in scan.l).
91 : *
92 : * This filter is needed because in some cases the standard SQL grammar
93 : * requires more than one token lookahead. We reduce these cases to one-token
94 : * lookahead by replacing tokens here, in order to keep the grammar LALR(1).
95 : *
96 : * Using a filter is simpler than trying to recognize multiword tokens
97 : * directly in scan.l, because we'd have to allow for comments between the
98 : * words. Furthermore it's not clear how to do that without re-introducing
99 : * scanner backtrack, which would cost more performance than this filter
100 : * layer does.
101 : *
102 : * We also use this filter to convert UIDENT and USCONST sequences into
103 : * plain IDENT and SCONST tokens. While that could be handled by additional
104 : * productions in the main grammar, it's more efficient to do it like this.
105 : *
106 : * The filter also provides a convenient place to translate between
107 : * the core_YYSTYPE and YYSTYPE representations (which are really the
108 : * same thing anyway, but notationally they're different).
109 : */
110 : int
111 12479406 : base_yylex(YYSTYPE *lvalp, YYLTYPE *llocp, core_yyscan_t yyscanner)
112 : {
113 12479406 : base_yy_extra_type *yyextra = pg_yyget_extra(yyscanner);
114 : int cur_token;
115 : int next_token;
116 : int cur_token_length;
117 : YYLTYPE cur_yylloc;
118 :
119 : /* Get next token --- we might already have it */
120 12479406 : if (yyextra->have_lookahead)
121 : {
122 82295 : cur_token = yyextra->lookahead_token;
123 82295 : lvalp->core_yystype = yyextra->lookahead_yylval;
124 82295 : *llocp = yyextra->lookahead_yylloc;
125 82295 : if (yyextra->lookahead_end)
126 51200 : *(yyextra->lookahead_end) = yyextra->lookahead_hold_char;
127 82295 : yyextra->have_lookahead = false;
128 : }
129 : else
130 12397111 : cur_token = core_yylex(&(lvalp->core_yystype), llocp, yyscanner);
131 :
132 : /*
133 : * If this token isn't one that requires lookahead, just return it. If it
134 : * does, determine the token length. (We could get that via strlen(), but
135 : * since we have such a small set of possibilities, hardwiring seems
136 : * feasible and more efficient --- at least for the fixed-length cases.)
137 : */
138 12479266 : switch (cur_token)
139 : {
140 2246 : case FORMAT:
141 2246 : cur_token_length = 6;
142 2246 : break;
143 33241 : case NOT:
144 33241 : cur_token_length = 3;
145 33241 : break;
146 1427 : case NULLS_P:
147 1427 : cur_token_length = 5;
148 1427 : break;
149 13019 : case WITH:
150 13019 : cur_token_length = 4;
151 13019 : break;
152 390 : case UIDENT:
153 : case USCONST:
154 390 : cur_token_length = strlen(yyextra->core_yy_extra.scanbuf + *llocp);
155 390 : break;
156 953 : case WITHOUT:
157 953 : cur_token_length = 7;
158 953 : break;
159 12427990 : default:
160 12427990 : return cur_token;
161 : }
162 :
163 : /*
164 : * Identify end+1 of current token. core_yylex() has temporarily stored a
165 : * '\0' here, and will undo that when we call it again. We need to redo
166 : * it to fully revert the lookahead call for error reporting purposes.
167 : */
168 51276 : yyextra->lookahead_end = yyextra->core_yy_extra.scanbuf +
169 51276 : *llocp + cur_token_length;
170 : Assert(*(yyextra->lookahead_end) == '\0');
171 :
172 : /*
173 : * Save and restore *llocp around the call. It might look like we could
174 : * avoid this by just passing &lookahead_yylloc to core_yylex(), but that
175 : * does not work because flex actually holds onto the last-passed pointer
176 : * internally, and will use that for error reporting. We need any error
177 : * reports to point to the current token, not the next one.
178 : */
179 51276 : cur_yylloc = *llocp;
180 :
181 : /* Get next token, saving outputs into lookahead variables */
182 51276 : next_token = core_yylex(&(yyextra->lookahead_yylval), llocp, yyscanner);
183 51276 : yyextra->lookahead_token = next_token;
184 51276 : yyextra->lookahead_yylloc = *llocp;
185 :
186 51276 : *llocp = cur_yylloc;
187 :
188 : /* Now revert the un-truncation of the current token */
189 51276 : yyextra->lookahead_hold_char = *(yyextra->lookahead_end);
190 51276 : *(yyextra->lookahead_end) = '\0';
191 :
192 51276 : yyextra->have_lookahead = true;
193 :
194 : /* Replace cur_token if needed, based on lookahead */
195 51276 : switch (cur_token)
196 : {
197 2246 : case FORMAT:
198 : /* Replace FORMAT by FORMAT_LA if it's followed by JSON */
199 2246 : switch (next_token)
200 : {
201 559 : case JSON:
202 559 : cur_token = FORMAT_LA;
203 559 : break;
204 : }
205 2246 : break;
206 :
207 33241 : case NOT:
208 : /* Replace NOT by NOT_LA if it's followed by BETWEEN, IN, etc */
209 33241 : switch (next_token)
210 : {
211 2185 : case BETWEEN:
212 : case IN_P:
213 : case LIKE:
214 : case ILIKE:
215 : case SIMILAR:
216 2185 : cur_token = NOT_LA;
217 2185 : break;
218 : }
219 33241 : break;
220 :
221 1427 : case NULLS_P:
222 : /* Replace NULLS_P by NULLS_LA if it's followed by FIRST or LAST */
223 1427 : switch (next_token)
224 : {
225 1115 : case FIRST_P:
226 : case LAST_P:
227 1115 : cur_token = NULLS_LA;
228 1115 : break;
229 : }
230 1427 : break;
231 :
232 13019 : case WITH:
233 : /* Replace WITH by WITH_LA if it's followed by TIME or ORDINALITY */
234 13019 : switch (next_token)
235 : {
236 1674 : case TIME:
237 : case ORDINALITY:
238 1674 : cur_token = WITH_LA;
239 1674 : break;
240 : }
241 13019 : break;
242 :
243 953 : case WITHOUT:
244 : /* Replace WITHOUT by WITHOUT_LA if it's followed by TIME */
245 953 : switch (next_token)
246 : {
247 376 : case TIME:
248 376 : cur_token = WITHOUT_LA;
249 376 : break;
250 : }
251 953 : break;
252 :
253 390 : case UIDENT:
254 : case USCONST:
255 : /* Look ahead for UESCAPE */
256 390 : if (next_token == UESCAPE)
257 : {
258 : /* Yup, so get third token, which had better be SCONST */
259 : const char *escstr;
260 :
261 : /* Again save and restore *llocp */
262 26 : cur_yylloc = *llocp;
263 :
264 : /* Un-truncate current token so errors point to third token */
265 26 : *(yyextra->lookahead_end) = yyextra->lookahead_hold_char;
266 :
267 : /* Get third token */
268 26 : next_token = core_yylex(&(yyextra->lookahead_yylval),
269 : llocp, yyscanner);
270 :
271 : /* If we throw error here, it will point to third token */
272 26 : if (next_token != SCONST)
273 4 : scanner_yyerror("UESCAPE must be followed by a simple string literal",
274 : yyscanner);
275 :
276 22 : escstr = yyextra->lookahead_yylval.str;
277 22 : if (strlen(escstr) != 1 || !check_uescapechar(escstr[0]))
278 4 : scanner_yyerror("invalid Unicode escape character",
279 : yyscanner);
280 :
281 : /* Now restore *llocp; errors will point to first token */
282 18 : *llocp = cur_yylloc;
283 :
284 : /* Apply Unicode conversion */
285 18 : lvalp->core_yystype.str =
286 18 : str_udeescape(lvalp->core_yystype.str,
287 18 : escstr[0],
288 : *llocp,
289 : yyscanner);
290 :
291 : /*
292 : * We don't need to revert the un-truncation of UESCAPE. What
293 : * we do want to do is clear have_lookahead, thereby consuming
294 : * all three tokens.
295 : */
296 18 : yyextra->have_lookahead = false;
297 : }
298 : else
299 : {
300 : /* No UESCAPE, so convert using default escape character */
301 332 : lvalp->core_yystype.str =
302 364 : str_udeescape(lvalp->core_yystype.str,
303 : '\\',
304 : *llocp,
305 : yyscanner);
306 : }
307 :
308 350 : if (cur_token == UIDENT)
309 : {
310 : /* It's an identifier, so truncate as appropriate */
311 18 : truncate_identifier(lvalp->core_yystype.str,
312 18 : strlen(lvalp->core_yystype.str),
313 : true);
314 18 : cur_token = IDENT;
315 : }
316 332 : else if (cur_token == USCONST)
317 : {
318 332 : cur_token = SCONST;
319 : }
320 350 : break;
321 : }
322 :
323 51236 : return cur_token;
324 : }
325 :
326 : /* convert hex digit (caller should have verified that) to value */
327 : static unsigned int
328 2120 : hexval(unsigned char c)
329 : {
330 2120 : if (c >= '0' && c <= '9')
331 1802 : return c - '0';
332 318 : if (c >= 'a' && c <= 'f')
333 44 : return c - 'a' + 0xA;
334 274 : if (c >= 'A' && c <= 'F')
335 274 : return c - 'A' + 0xA;
336 0 : elog(ERROR, "invalid hexadecimal digit");
337 : return 0; /* not reached */
338 : }
339 :
340 : /* is Unicode code point acceptable? */
341 : static void
342 514 : check_unicode_value(char32_t c)
343 : {
344 514 : if (!is_valid_unicode_codepoint(c))
345 4 : ereport(ERROR,
346 : (errcode(ERRCODE_SYNTAX_ERROR),
347 : errmsg("invalid Unicode escape value")));
348 510 : }
349 :
/*
 * Decide whether "escape" may serve as the Unicode escape character in
 * UESCAPE syntax.
 *
 * Hex digits, '+', single and double quotes, and anything that
 * scanner_isspace() treats as whitespace are rejected.
 */
static bool
check_uescapechar(unsigned char escape)
{
	/* hex digits are rejected */
	if (isxdigit(escape))
		return false;

	/* so are '+' and both quote characters */
	if (escape == '+' || escape == '\'' || escape == '"')
		return false;

	/* finally, whitespace is not allowed either */
	return !scanner_isspace(escape);
}
363 :
364 : /*
365 : * Process Unicode escapes in "str", producing a palloc'd plain string
366 : *
367 : * escape: the escape character to use
368 : * position: start position of U&'' or U&"" string token
369 : * yyscanner: context information needed for error reports
370 : */
371 : static char *
372 382 : str_udeescape(const char *str, char escape,
373 : int position, core_yyscan_t yyscanner)
374 : {
375 : const char *in;
376 : char *new,
377 : *out;
378 : size_t new_len;
379 382 : char16_t pair_first = 0;
380 : ScannerCallbackState scbstate;
381 :
382 : /*
383 : * Guesstimate that result will be no longer than input, but allow enough
384 : * padding for Unicode conversion.
385 : */
386 382 : new_len = strlen(str) + MAX_UNICODE_EQUIVALENT_STRING + 1;
387 382 : new = palloc(new_len);
388 :
389 382 : in = str;
390 382 : out = new;
391 1886 : while (*in)
392 : {
393 : /* Enlarge string if needed */
394 1532 : size_t out_dist = out - new;
395 :
396 1532 : if (out_dist > new_len - (MAX_UNICODE_EQUIVALENT_STRING + 1))
397 : {
398 0 : new_len *= 2;
399 0 : new = repalloc(new, new_len);
400 0 : out = new + out_dist;
401 : }
402 :
403 1532 : if (in[0] == escape)
404 : {
405 : /*
406 : * Any errors reported while processing this escape sequence will
407 : * have an error cursor pointing at the escape.
408 : */
409 530 : setup_scanner_errposition_callback(&scbstate, yyscanner,
410 530 : in - str + position + 3); /* 3 for U&" */
411 530 : if (in[1] == escape)
412 : {
413 8 : if (pair_first)
414 4 : goto invalid_pair;
415 4 : *out++ = escape;
416 4 : in += 2;
417 : }
418 522 : else if (isxdigit((unsigned char) in[1]) &&
419 486 : isxdigit((unsigned char) in[2]) &&
420 486 : isxdigit((unsigned char) in[3]) &&
421 486 : isxdigit((unsigned char) in[4]))
422 478 : {
423 : char32_t unicode;
424 :
425 482 : unicode = (hexval(in[1]) << 12) +
426 482 : (hexval(in[2]) << 8) +
427 482 : (hexval(in[3]) << 4) +
428 482 : hexval(in[4]);
429 482 : check_unicode_value(unicode);
430 482 : if (pair_first)
431 : {
432 4 : if (is_utf16_surrogate_second(unicode))
433 : {
434 0 : unicode = surrogate_pair_to_codepoint(pair_first, unicode);
435 0 : pair_first = 0;
436 : }
437 : else
438 4 : goto invalid_pair;
439 : }
440 478 : else if (is_utf16_surrogate_second(unicode))
441 0 : goto invalid_pair;
442 :
443 478 : if (is_utf16_surrogate_first(unicode))
444 16 : pair_first = unicode;
445 : else
446 : {
447 462 : pg_unicode_to_server(unicode, (unsigned char *) out);
448 462 : out += strlen(out);
449 : }
450 478 : in += 5;
451 : }
452 40 : else if (in[1] == '+' &&
453 36 : isxdigit((unsigned char) in[2]) &&
454 36 : isxdigit((unsigned char) in[3]) &&
455 36 : isxdigit((unsigned char) in[4]) &&
456 36 : isxdigit((unsigned char) in[5]) &&
457 36 : isxdigit((unsigned char) in[6]) &&
458 32 : isxdigit((unsigned char) in[7]))
459 24 : {
460 : char32_t unicode;
461 :
462 32 : unicode = (hexval(in[2]) << 20) +
463 32 : (hexval(in[3]) << 16) +
464 32 : (hexval(in[4]) << 12) +
465 32 : (hexval(in[5]) << 8) +
466 32 : (hexval(in[6]) << 4) +
467 32 : hexval(in[7]);
468 32 : check_unicode_value(unicode);
469 28 : if (pair_first)
470 : {
471 4 : if (is_utf16_surrogate_second(unicode))
472 : {
473 0 : unicode = surrogate_pair_to_codepoint(pair_first, unicode);
474 0 : pair_first = 0;
475 : }
476 : else
477 4 : goto invalid_pair;
478 : }
479 24 : else if (is_utf16_surrogate_second(unicode))
480 0 : goto invalid_pair;
481 :
482 24 : if (is_utf16_surrogate_first(unicode))
483 4 : pair_first = unicode;
484 : else
485 : {
486 20 : pg_unicode_to_server(unicode, (unsigned char *) out);
487 20 : out += strlen(out);
488 : }
489 24 : in += 8;
490 : }
491 : else
492 8 : ereport(ERROR,
493 : (errcode(ERRCODE_SYNTAX_ERROR),
494 : errmsg("invalid Unicode escape"),
495 : errhint("Unicode escapes must be \\XXXX or \\+XXXXXX.")));
496 :
497 506 : cancel_scanner_errposition_callback(&scbstate);
498 : }
499 : else
500 : {
501 1002 : if (pair_first)
502 4 : goto invalid_pair;
503 :
504 998 : *out++ = *in++;
505 : }
506 : }
507 :
508 : /* unfinished surrogate pair? */
509 354 : if (pair_first)
510 4 : goto invalid_pair;
511 :
512 350 : *out = '\0';
513 350 : return new;
514 :
515 : /*
516 : * We might get here with the error callback active, or not. Call
517 : * scanner_errposition to make sure an error cursor appears; if the
518 : * callback is active, this is duplicative but harmless.
519 : */
520 20 : invalid_pair:
521 20 : ereport(ERROR,
522 : (errcode(ERRCODE_SYNTAX_ERROR),
523 : errmsg("invalid Unicode surrogate pair"),
524 : scanner_errposition(in - str + position + 3, /* 3 for U&" */
525 : yyscanner)));
526 : return NULL; /* keep compiler quiet */
527 : }
|