Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * parser.c
4 : * Main entry point/driver for PostgreSQL grammar
5 : *
6 : * Note that the grammar is not allowed to perform any table access
7 : * (since we need to be able to do basic parsing even while inside an
8 : * aborted transaction). Therefore, the data structures returned by
9 : * the grammar are "raw" parsetrees that still need to be analyzed by
10 : * analyze.c and related files.
11 : *
12 : *
13 : * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
14 : * Portions Copyright (c) 1994, Regents of the University of California
15 : *
16 : * IDENTIFICATION
17 : * src/backend/parser/parser.c
18 : *
19 : *-------------------------------------------------------------------------
20 : */
21 :
22 : #include "postgres.h"
23 :
24 : #include "gramparse.h"
25 : #include "mb/pg_wchar.h"
26 : #include "parser/parser.h"
27 : #include "parser/scansup.h"
28 :
29 : static bool check_uescapechar(unsigned char escape);
30 : static char *str_udeescape(const char *str, char escape,
31 : int position, core_yyscan_t yyscanner);
32 :
33 :
34 : /*
35 : * raw_parser
36 : * Given a query in string form, do lexical and grammatical analysis.
37 : *
38 : * Returns a list of raw (un-analyzed) parse trees. The contents of the
39 : * list have the form required by the specified RawParseMode.
40 : */
41 : List *
42 762918 : raw_parser(const char *str, RawParseMode mode)
43 : {
44 : core_yyscan_t yyscanner;
45 : base_yy_extra_type yyextra;
46 : int yyresult;
47 :
48 : /* initialize the flex scanner */
49 762918 : yyscanner = scanner_init(str, &yyextra.core_yy_extra,
50 : &ScanKeywords, ScanKeywordTokens);
51 :
52 : /* base_yylex() only needs us to initialize the lookahead token, if any */
53 762918 : if (mode == RAW_PARSE_DEFAULT)
54 714086 : yyextra.have_lookahead = false;
55 : else
56 : {
57 : /* this array is indexed by RawParseMode enum */
58 : static const int mode_token[] = {
59 : [RAW_PARSE_DEFAULT] = 0,
60 : [RAW_PARSE_TYPE_NAME] = MODE_TYPE_NAME,
61 : [RAW_PARSE_PLPGSQL_EXPR] = MODE_PLPGSQL_EXPR,
62 : [RAW_PARSE_PLPGSQL_ASSIGN1] = MODE_PLPGSQL_ASSIGN1,
63 : [RAW_PARSE_PLPGSQL_ASSIGN2] = MODE_PLPGSQL_ASSIGN2,
64 : [RAW_PARSE_PLPGSQL_ASSIGN3] = MODE_PLPGSQL_ASSIGN3,
65 : };
66 :
67 48832 : yyextra.have_lookahead = true;
68 48832 : yyextra.lookahead_token = mode_token[mode];
69 48832 : yyextra.lookahead_yylloc = 0;
70 48832 : yyextra.lookahead_end = NULL;
71 : }
72 :
73 : /* initialize the bison parser */
74 762918 : parser_init(&yyextra);
75 :
76 : /* Parse! */
77 762918 : yyresult = base_yyparse(yyscanner);
78 :
79 : /* Clean up (release memory) */
80 761742 : scanner_finish(yyscanner);
81 :
82 761742 : if (yyresult) /* error */
83 0 : return NIL;
84 :
85 761742 : return yyextra.parsetree;
86 : }
87 :
88 :
89 : /*
90 : * Intermediate filter between parser and core lexer (core_yylex in scan.l).
91 : *
92 : * This filter is needed because in some cases the standard SQL grammar
93 : * requires more than one token lookahead. We reduce these cases to one-token
94 : * lookahead by replacing tokens here, in order to keep the grammar LALR(1).
95 : *
96 : * Using a filter is simpler than trying to recognize multiword tokens
97 : * directly in scan.l, because we'd have to allow for comments between the
98 : * words. Furthermore it's not clear how to do that without re-introducing
99 : * scanner backtrack, which would cost more performance than this filter
100 : * layer does.
101 : *
102 : * We also use this filter to convert UIDENT and USCONST sequences into
103 : * plain IDENT and SCONST tokens. While that could be handled by additional
104 : * productions in the main grammar, it's more efficient to do it like this.
105 : *
106 : * The filter also provides a convenient place to translate between
107 : * the core_YYSTYPE and YYSTYPE representations (which are really the
108 : * same thing anyway, but notationally they're different).
109 : */
110 : int
111 18291072 : base_yylex(YYSTYPE *lvalp, YYLTYPE *llocp, core_yyscan_t yyscanner)
112 : {
113 18291072 : base_yy_extra_type *yyextra = pg_yyget_extra(yyscanner);
114 : int cur_token;
115 : int next_token;
116 : int cur_token_length;
117 : YYLTYPE cur_yylloc;
118 :
119 : /* Get next token --- we might already have it */
120 18291072 : if (yyextra->have_lookahead)
121 : {
122 115234 : cur_token = yyextra->lookahead_token;
123 115234 : lvalp->core_yystype = yyextra->lookahead_yylval;
124 115234 : *llocp = yyextra->lookahead_yylloc;
125 115234 : if (yyextra->lookahead_end)
126 66402 : *(yyextra->lookahead_end) = yyextra->lookahead_hold_char;
127 115234 : yyextra->have_lookahead = false;
128 : }
129 : else
130 18175838 : cur_token = core_yylex(&(lvalp->core_yystype), llocp, yyscanner);
131 :
132 : /*
133 : * If this token isn't one that requires lookahead, just return it. If it
134 : * does, determine the token length. (We could get that via strlen(), but
135 : * since we have such a small set of possibilities, hardwiring seems
136 : * feasible and more efficient --- at least for the fixed-length cases.)
137 : */
138 18290826 : switch (cur_token)
139 : {
140 3124 : case FORMAT:
141 3124 : cur_token_length = 6;
142 3124 : break;
143 40508 : case NOT:
144 40508 : cur_token_length = 3;
145 40508 : break;
146 2222 : case NULLS_P:
147 2222 : cur_token_length = 5;
148 2222 : break;
149 18924 : case WITH:
150 18924 : cur_token_length = 4;
151 18924 : break;
152 314 : case UIDENT:
153 : case USCONST:
154 314 : cur_token_length = strlen(yyextra->core_yy_extra.scanbuf + *llocp);
155 314 : break;
156 1432 : case WITHOUT:
157 1432 : cur_token_length = 7;
158 1432 : break;
159 18224302 : default:
160 18224302 : return cur_token;
161 : }
162 :
163 : /*
164 : * Identify end+1 of current token. core_yylex() has temporarily stored a
165 : * '\0' here, and will undo that when we call it again. We need to redo
166 : * it to fully revert the lookahead call for error reporting purposes.
167 : */
168 66524 : yyextra->lookahead_end = yyextra->core_yy_extra.scanbuf +
169 66524 : *llocp + cur_token_length;
170 : Assert(*(yyextra->lookahead_end) == '\0');
171 :
172 : /*
173 : * Save and restore *llocp around the call. It might look like we could
174 : * avoid this by just passing &lookahead_yylloc to core_yylex(), but that
175 : * does not work because flex actually holds onto the last-passed pointer
176 : * internally, and will use that for error reporting. We need any error
177 : * reports to point to the current token, not the next one.
178 : */
179 66524 : cur_yylloc = *llocp;
180 :
181 : /* Get next token, saving outputs into lookahead variables */
182 66524 : next_token = core_yylex(&(yyextra->lookahead_yylval), llocp, yyscanner);
183 66524 : yyextra->lookahead_token = next_token;
184 66524 : yyextra->lookahead_yylloc = *llocp;
185 :
186 66524 : *llocp = cur_yylloc;
187 :
188 : /* Now revert the un-truncation of the current token */
189 66524 : yyextra->lookahead_hold_char = *(yyextra->lookahead_end);
190 66524 : *(yyextra->lookahead_end) = '\0';
191 :
192 66524 : yyextra->have_lookahead = true;
193 :
194 : /* Replace cur_token if needed, based on lookahead */
195 66524 : switch (cur_token)
196 : {
197 3124 : case FORMAT:
198 : /* Replace FORMAT by FORMAT_LA if it's followed by JSON */
199 : switch (next_token)
200 : {
201 660 : case JSON:
202 660 : cur_token = FORMAT_LA;
203 660 : break;
204 : }
205 3124 : break;
206 :
207 40508 : case NOT:
208 : /* Replace NOT by NOT_LA if it's followed by BETWEEN, IN, etc */
209 : switch (next_token)
210 : {
211 5344 : case BETWEEN:
212 : case IN_P:
213 : case LIKE:
214 : case ILIKE:
215 : case SIMILAR:
216 5344 : cur_token = NOT_LA;
217 5344 : break;
218 : }
219 40508 : break;
220 :
221 2222 : case NULLS_P:
222 : /* Replace NULLS_P by NULLS_LA if it's followed by FIRST or LAST */
223 : switch (next_token)
224 : {
225 2034 : case FIRST_P:
226 : case LAST_P:
227 2034 : cur_token = NULLS_LA;
228 2034 : break;
229 : }
230 2222 : break;
231 :
232 18924 : case WITH:
233 : /* Replace WITH by WITH_LA if it's followed by TIME or ORDINALITY */
234 : switch (next_token)
235 : {
236 2588 : case TIME:
237 : case ORDINALITY:
238 2588 : cur_token = WITH_LA;
239 2588 : break;
240 : }
241 18924 : break;
242 :
243 1432 : case WITHOUT:
244 : /* Replace WITHOUT by WITHOUT_LA if it's followed by TIME */
245 : switch (next_token)
246 : {
247 592 : case TIME:
248 592 : cur_token = WITHOUT_LA;
249 592 : break;
250 : }
251 1432 : break;
252 :
253 314 : case UIDENT:
254 : case USCONST:
255 : /* Look ahead for UESCAPE */
256 314 : if (next_token == UESCAPE)
257 : {
258 : /* Yup, so get third token, which had better be SCONST */
259 : const char *escstr;
260 :
261 : /* Again save and restore *llocp */
262 46 : cur_yylloc = *llocp;
263 :
264 : /* Un-truncate current token so errors point to third token */
265 46 : *(yyextra->lookahead_end) = yyextra->lookahead_hold_char;
266 :
267 : /* Get third token */
268 46 : next_token = core_yylex(&(yyextra->lookahead_yylval),
269 : llocp, yyscanner);
270 :
271 : /* If we throw error here, it will point to third token */
272 46 : if (next_token != SCONST)
273 6 : scanner_yyerror("UESCAPE must be followed by a simple string literal",
274 : yyscanner);
275 :
276 40 : escstr = yyextra->lookahead_yylval.str;
277 40 : if (strlen(escstr) != 1 || !check_uescapechar(escstr[0]))
278 6 : scanner_yyerror("invalid Unicode escape character",
279 : yyscanner);
280 :
281 : /* Now restore *llocp; errors will point to first token */
282 34 : *llocp = cur_yylloc;
283 :
284 : /* Apply Unicode conversion */
285 34 : lvalp->core_yystype.str =
286 34 : str_udeescape(lvalp->core_yystype.str,
287 34 : escstr[0],
288 : *llocp,
289 : yyscanner);
290 :
291 : /*
292 : * We don't need to revert the un-truncation of UESCAPE. What
293 : * we do want to do is clear have_lookahead, thereby consuming
294 : * all three tokens.
295 : */
296 34 : yyextra->have_lookahead = false;
297 : }
298 : else
299 : {
300 : /* No UESCAPE, so convert using default escape character */
301 220 : lvalp->core_yystype.str =
302 268 : str_udeescape(lvalp->core_yystype.str,
303 : '\\',
304 : *llocp,
305 : yyscanner);
306 : }
307 :
308 254 : if (cur_token == UIDENT)
309 : {
310 : /* It's an identifier, so truncate as appropriate */
311 28 : truncate_identifier(lvalp->core_yystype.str,
312 28 : strlen(lvalp->core_yystype.str),
313 : true);
314 28 : cur_token = IDENT;
315 : }
316 226 : else if (cur_token == USCONST)
317 : {
318 226 : cur_token = SCONST;
319 : }
320 254 : break;
321 : }
322 :
323 66464 : return cur_token;
324 : }
325 :
326 : /* convert hex digit (caller should have verified that) to value */
327 : static unsigned int
328 1808 : hexval(unsigned char c)
329 : {
330 1808 : if (c >= '0' && c <= '9')
331 1498 : return c - '0';
332 310 : if (c >= 'a' && c <= 'f')
333 60 : return c - 'a' + 0xA;
334 250 : if (c >= 'A' && c <= 'F')
335 250 : return c - 'A' + 0xA;
336 0 : elog(ERROR, "invalid hexadecimal digit");
337 : return 0; /* not reached */
338 : }
339 :
340 : /* is Unicode code point acceptable? */
341 : static void
342 430 : check_unicode_value(pg_wchar c)
343 : {
344 430 : if (!is_valid_unicode_codepoint(c))
345 6 : ereport(ERROR,
346 : (errcode(ERRCODE_SYNTAX_ERROR),
347 : errmsg("invalid Unicode escape value")));
348 424 : }
349 :
/*
 * Report whether 'escape' may serve as the Unicode escape character in
 * UESCAPE syntax.
 *
 * Hex digits, '+', single and double quotes, and whitespace are rejected.
 */
static bool
check_uescapechar(unsigned char escape)
{
	if (isxdigit(escape))
		return false;

	if (escape == '+' || escape == '\'' || escape == '"')
		return false;

	return !scanner_isspace(escape);
}
363 :
364 : /*
365 : * Process Unicode escapes in "str", producing a palloc'd plain string
366 : *
367 : * escape: the escape character to use
368 : * position: start position of U&'' or U&"" string token
369 : * yyscanner: context information needed for error reports
370 : */
371 : static char *
372 302 : str_udeescape(const char *str, char escape,
373 : int position, core_yyscan_t yyscanner)
374 : {
375 : const char *in;
376 : char *new,
377 : *out;
378 : size_t new_len;
379 302 : pg_wchar pair_first = 0;
380 : ScannerCallbackState scbstate;
381 :
382 : /*
383 : * Guesstimate that result will be no longer than input, but allow enough
384 : * padding for Unicode conversion.
385 : */
386 302 : new_len = strlen(str) + MAX_UNICODE_EQUIVALENT_STRING + 1;
387 302 : new = palloc(new_len);
388 :
389 302 : in = str;
390 302 : out = new;
391 1548 : while (*in)
392 : {
393 : /* Enlarge string if needed */
394 1288 : size_t out_dist = out - new;
395 :
396 1288 : if (out_dist > new_len - (MAX_UNICODE_EQUIVALENT_STRING + 1))
397 : {
398 0 : new_len *= 2;
399 0 : new = repalloc(new, new_len);
400 0 : out = new + out_dist;
401 : }
402 :
403 1288 : if (in[0] == escape)
404 : {
405 : /*
406 : * Any errors reported while processing this escape sequence will
407 : * have an error cursor pointing at the escape.
408 : */
409 454 : setup_scanner_errposition_callback(&scbstate, yyscanner,
410 454 : in - str + position + 3); /* 3 for U&" */
411 454 : if (in[1] == escape)
412 : {
413 12 : if (pair_first)
414 6 : goto invalid_pair;
415 6 : *out++ = escape;
416 6 : in += 2;
417 : }
418 442 : else if (isxdigit((unsigned char) in[1]) &&
419 392 : isxdigit((unsigned char) in[2]) &&
420 392 : isxdigit((unsigned char) in[3]) &&
421 392 : isxdigit((unsigned char) in[4]))
422 380 : {
423 : pg_wchar unicode;
424 :
425 386 : unicode = (hexval(in[1]) << 12) +
426 386 : (hexval(in[2]) << 8) +
427 386 : (hexval(in[3]) << 4) +
428 386 : hexval(in[4]);
429 386 : check_unicode_value(unicode);
430 386 : if (pair_first)
431 : {
432 6 : if (is_utf16_surrogate_second(unicode))
433 : {
434 0 : unicode = surrogate_pair_to_codepoint(pair_first, unicode);
435 0 : pair_first = 0;
436 : }
437 : else
438 6 : goto invalid_pair;
439 : }
440 380 : else if (is_utf16_surrogate_second(unicode))
441 0 : goto invalid_pair;
442 :
443 380 : if (is_utf16_surrogate_first(unicode))
444 24 : pair_first = unicode;
445 : else
446 : {
447 356 : pg_unicode_to_server(unicode, (unsigned char *) out);
448 356 : out += strlen(out);
449 : }
450 380 : in += 5;
451 : }
452 56 : else if (in[1] == '+' &&
453 50 : isxdigit((unsigned char) in[2]) &&
454 50 : isxdigit((unsigned char) in[3]) &&
455 50 : isxdigit((unsigned char) in[4]) &&
456 50 : isxdigit((unsigned char) in[5]) &&
457 50 : isxdigit((unsigned char) in[6]) &&
458 44 : isxdigit((unsigned char) in[7]))
459 32 : {
460 : pg_wchar unicode;
461 :
462 44 : unicode = (hexval(in[2]) << 20) +
463 44 : (hexval(in[3]) << 16) +
464 44 : (hexval(in[4]) << 12) +
465 44 : (hexval(in[5]) << 8) +
466 44 : (hexval(in[6]) << 4) +
467 44 : hexval(in[7]);
468 44 : check_unicode_value(unicode);
469 38 : if (pair_first)
470 : {
471 6 : if (is_utf16_surrogate_second(unicode))
472 : {
473 0 : unicode = surrogate_pair_to_codepoint(pair_first, unicode);
474 0 : pair_first = 0;
475 : }
476 : else
477 6 : goto invalid_pair;
478 : }
479 32 : else if (is_utf16_surrogate_second(unicode))
480 0 : goto invalid_pair;
481 :
482 32 : if (is_utf16_surrogate_first(unicode))
483 6 : pair_first = unicode;
484 : else
485 : {
486 26 : pg_unicode_to_server(unicode, (unsigned char *) out);
487 26 : out += strlen(out);
488 : }
489 32 : in += 8;
490 : }
491 : else
492 12 : ereport(ERROR,
493 : (errcode(ERRCODE_SYNTAX_ERROR),
494 : errmsg("invalid Unicode escape"),
495 : errhint("Unicode escapes must be \\XXXX or \\+XXXXXX.")));
496 :
497 418 : cancel_scanner_errposition_callback(&scbstate);
498 : }
499 : else
500 : {
501 834 : if (pair_first)
502 6 : goto invalid_pair;
503 :
504 828 : *out++ = *in++;
505 : }
506 : }
507 :
508 : /* unfinished surrogate pair? */
509 260 : if (pair_first)
510 6 : goto invalid_pair;
511 :
512 254 : *out = '\0';
513 254 : return new;
514 :
515 : /*
516 : * We might get here with the error callback active, or not. Call
517 : * scanner_errposition to make sure an error cursor appears; if the
518 : * callback is active, this is duplicative but harmless.
519 : */
520 30 : invalid_pair:
521 30 : ereport(ERROR,
522 : (errcode(ERRCODE_SYNTAX_ERROR),
523 : errmsg("invalid Unicode surrogate pair"),
524 : scanner_errposition(in - str + position + 3, /* 3 for U&" */
525 : yyscanner)));
526 : return NULL; /* keep compiler quiet */
527 : }
|