Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * parser.c
4 : * Main entry point/driver for PostgreSQL grammar
5 : *
6 : * Note that the grammar is not allowed to perform any table access
7 : * (since we need to be able to do basic parsing even while inside an
8 : * aborted transaction). Therefore, the data structures returned by
9 : * the grammar are "raw" parsetrees that still need to be analyzed by
10 : * analyze.c and related files.
11 : *
12 : *
13 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
14 : * Portions Copyright (c) 1994, Regents of the University of California
15 : *
16 : * IDENTIFICATION
17 : * src/backend/parser/parser.c
18 : *
19 : *-------------------------------------------------------------------------
20 : */
21 :
22 : #include "postgres.h"
23 :
24 : #include "gramparse.h"
25 : #include "mb/pg_wchar.h"
26 : #include "parser/parser.h"
27 : #include "parser/scansup.h"
28 :
29 : static bool check_uescapechar(unsigned char escape);
30 : static char *str_udeescape(const char *str, char escape,
31 : int position, core_yyscan_t yyscanner);
32 :
33 :
34 : /*
35 : * raw_parser
36 : * Given a query in string form, do lexical and grammatical analysis.
37 : *
38 : * Returns a list of raw (un-analyzed) parse trees. The contents of the
39 : * list have the form required by the specified RawParseMode.
40 : */
41 : List *
42 766086 : raw_parser(const char *str, RawParseMode mode)
43 : {
44 : core_yyscan_t yyscanner;
45 : base_yy_extra_type yyextra;
46 : int yyresult;
47 :
48 : /* initialize the flex scanner */
49 766086 : yyscanner = scanner_init(str, &yyextra.core_yy_extra,
50 : &ScanKeywords, ScanKeywordTokens);
51 :
52 : /* base_yylex() only needs us to initialize the lookahead token, if any */
53 766086 : if (mode == RAW_PARSE_DEFAULT)
54 715998 : yyextra.have_lookahead = false;
55 : else
56 : {
57 : /* this array is indexed by RawParseMode enum */
58 : static const int mode_token[] = {
59 : [RAW_PARSE_DEFAULT] = 0,
60 : [RAW_PARSE_TYPE_NAME] = MODE_TYPE_NAME,
61 : [RAW_PARSE_PLPGSQL_EXPR] = MODE_PLPGSQL_EXPR,
62 : [RAW_PARSE_PLPGSQL_ASSIGN1] = MODE_PLPGSQL_ASSIGN1,
63 : [RAW_PARSE_PLPGSQL_ASSIGN2] = MODE_PLPGSQL_ASSIGN2,
64 : [RAW_PARSE_PLPGSQL_ASSIGN3] = MODE_PLPGSQL_ASSIGN3,
65 : };
66 :
67 50088 : yyextra.have_lookahead = true;
68 50088 : yyextra.lookahead_token = mode_token[mode];
69 50088 : yyextra.lookahead_yylloc = 0;
70 50088 : yyextra.lookahead_end = NULL;
71 : }
72 :
73 : /* initialize the bison parser */
74 766086 : parser_init(&yyextra);
75 :
76 : /* Parse! */
77 766086 : yyresult = base_yyparse(yyscanner);
78 :
79 : /* Clean up (release memory) */
80 764838 : scanner_finish(yyscanner);
81 :
82 764838 : if (yyresult) /* error */
83 0 : return NIL;
84 :
85 764838 : return yyextra.parsetree;
86 : }
87 :
88 :
89 : /*
90 : * Intermediate filter between parser and core lexer (core_yylex in scan.l).
91 : *
92 : * This filter is needed because in some cases the standard SQL grammar
93 : * requires more than one token lookahead. We reduce these cases to one-token
94 : * lookahead by replacing tokens here, in order to keep the grammar LALR(1).
95 : *
96 : * Using a filter is simpler than trying to recognize multiword tokens
97 : * directly in scan.l, because we'd have to allow for comments between the
98 : * words. Furthermore it's not clear how to do that without re-introducing
99 : * scanner backtrack, which would cost more performance than this filter
100 : * layer does.
101 : *
102 : * We also use this filter to convert UIDENT and USCONST sequences into
103 : * plain IDENT and SCONST tokens. While that could be handled by additional
104 : * productions in the main grammar, it's more efficient to do it like this.
105 : *
106 : * The filter also provides a convenient place to translate between
107 : * the core_YYSTYPE and YYSTYPE representations (which are really the
108 : * same thing anyway, but notationally they're different).
109 : */
110 : int
111 19560552 : base_yylex(YYSTYPE *lvalp, YYLTYPE *llocp, core_yyscan_t yyscanner)
112 : {
113 19560552 : base_yy_extra_type *yyextra = pg_yyget_extra(yyscanner);
114 : int cur_token;
115 : int next_token;
116 : int cur_token_length;
117 : YYLTYPE cur_yylloc;
118 :
119 : /* Get next token --- we might already have it */
120 19560552 : if (yyextra->have_lookahead)
121 : {
122 123148 : cur_token = yyextra->lookahead_token;
123 123148 : lvalp->core_yystype = yyextra->lookahead_yylval;
124 123148 : *llocp = yyextra->lookahead_yylloc;
125 123148 : if (yyextra->lookahead_end)
126 73060 : *(yyextra->lookahead_end) = yyextra->lookahead_hold_char;
127 123148 : yyextra->have_lookahead = false;
128 : }
129 : else
130 19437404 : cur_token = core_yylex(&(lvalp->core_yystype), llocp, yyscanner);
131 :
132 : /*
133 : * If this token isn't one that requires lookahead, just return it. If it
134 : * does, determine the token length. (We could get that via strlen(), but
135 : * since we have such a small set of possibilities, hardwiring seems
136 : * feasible and more efficient --- at least for the fixed-length cases.)
137 : */
138 19560306 : switch (cur_token)
139 : {
140 3284 : case FORMAT:
141 3284 : cur_token_length = 6;
142 3284 : break;
143 43890 : case NOT:
144 43890 : cur_token_length = 3;
145 43890 : break;
146 2574 : case NULLS_P:
147 2574 : cur_token_length = 5;
148 2574 : break;
149 21342 : case WITH:
150 21342 : cur_token_length = 4;
151 21342 : break;
152 560 : case UIDENT:
153 : case USCONST:
154 560 : cur_token_length = strlen(yyextra->core_yy_extra.scanbuf + *llocp);
155 560 : break;
156 1532 : case WITHOUT:
157 1532 : cur_token_length = 7;
158 1532 : break;
159 19487124 : default:
160 19487124 : return cur_token;
161 : }
162 :
163 : /*
164 : * Identify end+1 of current token. core_yylex() has temporarily stored a
165 : * '\0' here, and will undo that when we call it again. We need to redo
166 : * it to fully revert the lookahead call for error reporting purposes.
167 : */
168 73182 : yyextra->lookahead_end = yyextra->core_yy_extra.scanbuf +
169 73182 : *llocp + cur_token_length;
170 : Assert(*(yyextra->lookahead_end) == '\0');
171 :
172 : /*
173 : * Save and restore *llocp around the call. It might look like we could
174 : * avoid this by just passing &lookahead_yylloc to core_yylex(), but that
175 : * does not work because flex actually holds onto the last-passed pointer
176 : * internally, and will use that for error reporting. We need any error
177 : * reports to point to the current token, not the next one.
178 : */
179 73182 : cur_yylloc = *llocp;
180 :
181 : /* Get next token, saving outputs into lookahead variables */
182 73182 : next_token = core_yylex(&(yyextra->lookahead_yylval), llocp, yyscanner);
183 73182 : yyextra->lookahead_token = next_token;
184 73182 : yyextra->lookahead_yylloc = *llocp;
185 :
186 73182 : *llocp = cur_yylloc;
187 :
188 : /* Now revert the un-truncation of the current token */
189 73182 : yyextra->lookahead_hold_char = *(yyextra->lookahead_end);
190 73182 : *(yyextra->lookahead_end) = '\0';
191 :
192 73182 : yyextra->have_lookahead = true;
193 :
194 : /* Replace cur_token if needed, based on lookahead */
195 73182 : switch (cur_token)
196 : {
197 3284 : case FORMAT:
198 : /* Replace FORMAT by FORMAT_LA if it's followed by JSON */
199 : switch (next_token)
200 : {
201 660 : case JSON:
202 660 : cur_token = FORMAT_LA;
203 660 : break;
204 : }
205 3284 : break;
206 :
207 43890 : case NOT:
208 : /* Replace NOT by NOT_LA if it's followed by BETWEEN, IN, etc */
209 : switch (next_token)
210 : {
211 3170 : case BETWEEN:
212 : case IN_P:
213 : case LIKE:
214 : case ILIKE:
215 : case SIMILAR:
216 3170 : cur_token = NOT_LA;
217 3170 : break;
218 : }
219 43890 : break;
220 :
221 2574 : case NULLS_P:
222 : /* Replace NULLS_P by NULLS_LA if it's followed by FIRST or LAST */
223 : switch (next_token)
224 : {
225 2076 : case FIRST_P:
226 : case LAST_P:
227 2076 : cur_token = NULLS_LA;
228 2076 : break;
229 : }
230 2574 : break;
231 :
232 21342 : case WITH:
233 : /* Replace WITH by WITH_LA if it's followed by TIME or ORDINALITY */
234 : switch (next_token)
235 : {
236 2856 : case TIME:
237 : case ORDINALITY:
238 2856 : cur_token = WITH_LA;
239 2856 : break;
240 : }
241 21342 : break;
242 :
243 1532 : case WITHOUT:
244 : /* Replace WITHOUT by WITHOUT_LA if it's followed by TIME */
245 : switch (next_token)
246 : {
247 624 : case TIME:
248 624 : cur_token = WITHOUT_LA;
249 624 : break;
250 : }
251 1532 : break;
252 :
253 560 : case UIDENT:
254 : case USCONST:
255 : /* Look ahead for UESCAPE */
256 560 : if (next_token == UESCAPE)
257 : {
258 : /* Yup, so get third token, which had better be SCONST */
259 : const char *escstr;
260 :
261 : /* Again save and restore *llocp */
262 46 : cur_yylloc = *llocp;
263 :
264 : /* Un-truncate current token so errors point to third token */
265 46 : *(yyextra->lookahead_end) = yyextra->lookahead_hold_char;
266 :
267 : /* Get third token */
268 46 : next_token = core_yylex(&(yyextra->lookahead_yylval),
269 : llocp, yyscanner);
270 :
271 : /* If we throw error here, it will point to third token */
272 46 : if (next_token != SCONST)
273 6 : scanner_yyerror("UESCAPE must be followed by a simple string literal",
274 : yyscanner);
275 :
276 40 : escstr = yyextra->lookahead_yylval.str;
277 40 : if (strlen(escstr) != 1 || !check_uescapechar(escstr[0]))
278 6 : scanner_yyerror("invalid Unicode escape character",
279 : yyscanner);
280 :
281 : /* Now restore *llocp; errors will point to first token */
282 34 : *llocp = cur_yylloc;
283 :
284 : /* Apply Unicode conversion */
285 34 : lvalp->core_yystype.str =
286 34 : str_udeescape(lvalp->core_yystype.str,
287 34 : escstr[0],
288 : *llocp,
289 : yyscanner);
290 :
291 : /*
292 : * We don't need to revert the un-truncation of UESCAPE. What
293 : * we do want to do is clear have_lookahead, thereby consuming
294 : * all three tokens.
295 : */
296 34 : yyextra->have_lookahead = false;
297 : }
298 : else
299 : {
300 : /* No UESCAPE, so convert using default escape character */
301 466 : lvalp->core_yystype.str =
302 514 : str_udeescape(lvalp->core_yystype.str,
303 : '\\',
304 : *llocp,
305 : yyscanner);
306 : }
307 :
308 500 : if (cur_token == UIDENT)
309 : {
310 : /* It's an identifier, so truncate as appropriate */
311 28 : truncate_identifier(lvalp->core_yystype.str,
312 28 : strlen(lvalp->core_yystype.str),
313 : true);
314 28 : cur_token = IDENT;
315 : }
316 472 : else if (cur_token == USCONST)
317 : {
318 472 : cur_token = SCONST;
319 : }
320 500 : break;
321 : }
322 :
323 73122 : return cur_token;
324 : }
325 :
326 : /* convert hex digit (caller should have verified that) to value */
327 : static unsigned int
328 3056 : hexval(unsigned char c)
329 : {
330 3056 : if (c >= '0' && c <= '9')
331 2614 : return c - '0';
332 442 : if (c >= 'a' && c <= 'f')
333 66 : return c - 'a' + 0xA;
334 376 : if (c >= 'A' && c <= 'F')
335 376 : return c - 'A' + 0xA;
336 0 : elog(ERROR, "invalid hexadecimal digit");
337 : return 0; /* not reached */
338 : }
339 :
340 : /* is Unicode code point acceptable? */
341 : static void
342 742 : check_unicode_value(pg_wchar c)
343 : {
344 742 : if (!is_valid_unicode_codepoint(c))
345 6 : ereport(ERROR,
346 : (errcode(ERRCODE_SYNTAX_ERROR),
347 : errmsg("invalid Unicode escape value")));
348 736 : }
349 :
/*
 * Is "escape" acceptable as a Unicode escape character (UESCAPE syntax)?
 *
 * Hexadecimal digits, the plus sign, single and double quotes, and any
 * character scanner_isspace() treats as whitespace are rejected; every
 * other character is accepted.
 */
static bool
check_uescapechar(unsigned char escape)
{
	if (isxdigit(escape))
		return false;

	if (escape == '+' || escape == '\'' || escape == '"')
		return false;

	return !scanner_isspace(escape);
}
363 :
364 : /*
365 : * Process Unicode escapes in "str", producing a palloc'd plain string
366 : *
367 : * escape: the escape character to use
368 : * position: start position of U&'' or U&"" string token
369 : * yyscanner: context information needed for error reports
370 : */
371 : static char *
372 548 : str_udeescape(const char *str, char escape,
373 : int position, core_yyscan_t yyscanner)
374 : {
375 : const char *in;
376 : char *new,
377 : *out;
378 : size_t new_len;
379 548 : pg_wchar pair_first = 0;
380 : ScannerCallbackState scbstate;
381 :
382 : /*
383 : * Guesstimate that result will be no longer than input, but allow enough
384 : * padding for Unicode conversion.
385 : */
386 548 : new_len = strlen(str) + MAX_UNICODE_EQUIVALENT_STRING + 1;
387 548 : new = palloc(new_len);
388 :
389 548 : in = str;
390 548 : out = new;
391 2742 : while (*in)
392 : {
393 : /* Enlarge string if needed */
394 2236 : size_t out_dist = out - new;
395 :
396 2236 : if (out_dist > new_len - (MAX_UNICODE_EQUIVALENT_STRING + 1))
397 : {
398 0 : new_len *= 2;
399 0 : new = repalloc(new, new_len);
400 0 : out = new + out_dist;
401 : }
402 :
403 2236 : if (in[0] == escape)
404 : {
405 : /*
406 : * Any errors reported while processing this escape sequence will
407 : * have an error cursor pointing at the escape.
408 : */
409 766 : setup_scanner_errposition_callback(&scbstate, yyscanner,
410 766 : in - str + position + 3); /* 3 for U&" */
411 766 : if (in[1] == escape)
412 : {
413 12 : if (pair_first)
414 6 : goto invalid_pair;
415 6 : *out++ = escape;
416 6 : in += 2;
417 : }
418 754 : else if (isxdigit((unsigned char) in[1]) &&
419 704 : isxdigit((unsigned char) in[2]) &&
420 704 : isxdigit((unsigned char) in[3]) &&
421 704 : isxdigit((unsigned char) in[4]))
422 692 : {
423 : pg_wchar unicode;
424 :
425 698 : unicode = (hexval(in[1]) << 12) +
426 698 : (hexval(in[2]) << 8) +
427 698 : (hexval(in[3]) << 4) +
428 698 : hexval(in[4]);
429 698 : check_unicode_value(unicode);
430 698 : if (pair_first)
431 : {
432 6 : if (is_utf16_surrogate_second(unicode))
433 : {
434 0 : unicode = surrogate_pair_to_codepoint(pair_first, unicode);
435 0 : pair_first = 0;
436 : }
437 : else
438 6 : goto invalid_pair;
439 : }
440 692 : else if (is_utf16_surrogate_second(unicode))
441 0 : goto invalid_pair;
442 :
443 692 : if (is_utf16_surrogate_first(unicode))
444 24 : pair_first = unicode;
445 : else
446 : {
447 668 : pg_unicode_to_server(unicode, (unsigned char *) out);
448 668 : out += strlen(out);
449 : }
450 692 : in += 5;
451 : }
452 56 : else if (in[1] == '+' &&
453 50 : isxdigit((unsigned char) in[2]) &&
454 50 : isxdigit((unsigned char) in[3]) &&
455 50 : isxdigit((unsigned char) in[4]) &&
456 50 : isxdigit((unsigned char) in[5]) &&
457 50 : isxdigit((unsigned char) in[6]) &&
458 44 : isxdigit((unsigned char) in[7]))
459 32 : {
460 : pg_wchar unicode;
461 :
462 44 : unicode = (hexval(in[2]) << 20) +
463 44 : (hexval(in[3]) << 16) +
464 44 : (hexval(in[4]) << 12) +
465 44 : (hexval(in[5]) << 8) +
466 44 : (hexval(in[6]) << 4) +
467 44 : hexval(in[7]);
468 44 : check_unicode_value(unicode);
469 38 : if (pair_first)
470 : {
471 6 : if (is_utf16_surrogate_second(unicode))
472 : {
473 0 : unicode = surrogate_pair_to_codepoint(pair_first, unicode);
474 0 : pair_first = 0;
475 : }
476 : else
477 6 : goto invalid_pair;
478 : }
479 32 : else if (is_utf16_surrogate_second(unicode))
480 0 : goto invalid_pair;
481 :
482 32 : if (is_utf16_surrogate_first(unicode))
483 6 : pair_first = unicode;
484 : else
485 : {
486 26 : pg_unicode_to_server(unicode, (unsigned char *) out);
487 26 : out += strlen(out);
488 : }
489 32 : in += 8;
490 : }
491 : else
492 12 : ereport(ERROR,
493 : (errcode(ERRCODE_SYNTAX_ERROR),
494 : errmsg("invalid Unicode escape"),
495 : errhint("Unicode escapes must be \\XXXX or \\+XXXXXX.")));
496 :
497 730 : cancel_scanner_errposition_callback(&scbstate);
498 : }
499 : else
500 : {
501 1470 : if (pair_first)
502 6 : goto invalid_pair;
503 :
504 1464 : *out++ = *in++;
505 : }
506 : }
507 :
508 : /* unfinished surrogate pair? */
509 506 : if (pair_first)
510 6 : goto invalid_pair;
511 :
512 500 : *out = '\0';
513 500 : return new;
514 :
515 : /*
516 : * We might get here with the error callback active, or not. Call
517 : * scanner_errposition to make sure an error cursor appears; if the
518 : * callback is active, this is duplicative but harmless.
519 : */
520 30 : invalid_pair:
521 30 : ereport(ERROR,
522 : (errcode(ERRCODE_SYNTAX_ERROR),
523 : errmsg("invalid Unicode surrogate pair"),
524 : scanner_errposition(in - str + position + 3, /* 3 for U&" */
525 : yyscanner)));
526 : return NULL; /* keep compiler quiet */
527 : }
|