Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * pl_scanner.c
4 : * lexical scanning for PL/pgSQL
5 : *
6 : *
7 : * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
8 : * Portions Copyright (c) 1994, Regents of the University of California
9 : *
10 : *
11 : * IDENTIFICATION
12 : * src/pl/plpgsql/src/pl_scanner.c
13 : *
14 : *-------------------------------------------------------------------------
15 : */
16 : #include "postgres.h"
17 :
18 : #include "mb/pg_wchar.h"
19 : #include "parser/scanner.h"
20 :
21 : #include "plpgsql.h"
22 : #include "pl_gram.h" /* must be after parser/scanner.h */
23 :
24 :
25 : /* Klugy flag to tell scanner how to look up identifiers */
26 : IdentifierLookup plpgsql_IdentifierLookup = IDENTIFIER_LOOKUP_NORMAL;
27 :
28 : /*
29 : * A word about keywords:
30 : *
31 : * We keep reserved and unreserved keywords in separate headers. Be careful
32 : * not to put the same word in both headers. Also be sure that pl_gram.y's
33 : * unreserved_keyword production agrees with the unreserved header. The
34 : * reserved keywords are passed to the core scanner, so they will be
35 : * recognized before (and instead of) any variable name. Unreserved words
36 : * are checked for separately, usually after determining that the identifier
37 : * isn't a known variable name. If plpgsql_IdentifierLookup is DECLARE then
38 : * no variable names will be recognized, so the unreserved words always work.
39 : * (Note in particular that this helps us avoid reserving keywords that are
40 : * only needed in DECLARE sections.)
41 : *
42 : * In certain contexts it is desirable to prefer recognizing an unreserved
43 : * keyword over recognizing a variable name. In particular, at the start
44 : * of a statement we should prefer unreserved keywords unless the statement
45 : * looks like an assignment (i.e., first token is followed by ':=', '=' or '[').
46 : * This rule allows most statement-introducing keywords to be kept unreserved.
47 : * (We still have to reserve initial keywords that might follow a block
48 : * label, unfortunately, since the method used to determine if we are at
49 : * start of statement doesn't recognize such cases. We'd also have to
50 : * reserve any keyword that could legitimately be followed by ':=', '=' or '['.)
51 : * Some additional cases are handled in pl_gram.y using tok_is_keyword().
52 : *
53 : * We try to avoid reserving more keywords than we have to; but there's
54 : * little point in not reserving a word if it's reserved in the core grammar.
55 : * Currently, the following words are reserved here but not in the core:
56 : * BEGIN BY DECLARE EXECUTE FOREACH IF LOOP STRICT WHILE
57 : */
58 :
59 : /* ScanKeywordList lookup data for PL/pgSQL keywords */
60 : #include "pl_reserved_kwlist_d.h"
61 : #include "pl_unreserved_kwlist_d.h"
62 :
63 : /* Token codes for PL/pgSQL keywords */
64 : #define PG_KEYWORD(kwname, value) value,
65 :
66 : static const uint16 ReservedPLKeywordTokens[] = {
67 : #include "pl_reserved_kwlist.h"
68 : };
69 :
70 : static const uint16 UnreservedPLKeywordTokens[] = {
71 : #include "pl_unreserved_kwlist.h"
72 : };
73 :
74 : #undef PG_KEYWORD
75 :
/*
 * AT_STMT_START(prev_token)
 *
 * True when prev_token is one of the tokens that can come directly before a
 * PL/pgSQL executable statement (proc_sect or proc_stmt in the grammar).
 * The set is small, so it is simply spelled out here; it must be kept
 * complete for the start-of-statement keyword rule to work.
 *
 * Note that the argument is evaluated several times.
 */
#define AT_STMT_START(prev_token) \
	((prev_token) == ';' || \
	 (prev_token) == K_BEGIN || \
	 (prev_token) == K_THEN || \
	 (prev_token) == K_ELSE || \
	 (prev_token) == K_LOOP)
88 :
89 :
90 : /* Auxiliary data about a token (other than the token type) */
91 : typedef struct
92 : {
93 : YYSTYPE lval; /* semantic information */
94 : YYLTYPE lloc; /* offset in scanbuf */
95 : int leng; /* length in bytes */
96 : } TokenAuxData;
97 :
98 : /*
99 : * Scanner working state. At some point we might wish to fold all this
100 : * into a YY_EXTRA struct. For the moment, there is no need for plpgsql's
101 : * lexer to be re-entrant, and the notational burden of passing a yyscanner
102 : * pointer around is great enough to not want to do it without need.
103 : */
104 :
105 : /* The stuff the core lexer needs */
106 : static core_yyscan_t yyscanner = NULL;
107 : static core_yy_extra_type core_yy;
108 :
109 : /* The original input string */
110 : static const char *scanorig;
111 :
112 : /* Current token's length (corresponds to plpgsql_yylval and plpgsql_yylloc) */
113 : static int plpgsql_yyleng;
114 :
115 : /* Current token's code (corresponds to plpgsql_yylval and plpgsql_yylloc) */
116 : static int plpgsql_yytoken;
117 :
118 : /* Token pushback stack */
119 : #define MAX_PUSHBACKS 4
120 :
121 : static int num_pushbacks;
122 : static int pushback_token[MAX_PUSHBACKS];
123 : static TokenAuxData pushback_auxdata[MAX_PUSHBACKS];
124 :
125 : /* State for plpgsql_location_to_lineno() */
126 : static const char *cur_line_start;
127 : static const char *cur_line_end;
128 : static int cur_line_num;
129 :
130 : /* Internal functions */
131 : static int internal_yylex(TokenAuxData *auxdata);
132 : static void push_back_token(int token, TokenAuxData *auxdata);
133 : static void location_lineno_init(void);
134 :
135 :
136 : /*
137 : * This is the yylex routine called from the PL/pgSQL grammar.
138 : * It is a wrapper around the core lexer, with the ability to recognize
139 : * PL/pgSQL variables and return them as special T_DATUM tokens. If a
140 : * word or compound word does not match any variable name, or if matching
141 : * is turned off by plpgsql_IdentifierLookup, it is returned as
142 : * T_WORD or T_CWORD respectively, or as an unreserved keyword if it
143 : * matches one of those.
144 : */
145 : int
146 369726 : plpgsql_yylex(void)
147 : {
148 : int tok1;
149 : TokenAuxData aux1;
150 : int kwnum;
151 :
152 369726 : tok1 = internal_yylex(&aux1);
153 369726 : if (tok1 == IDENT || tok1 == PARAM)
154 : {
155 : int tok2;
156 : TokenAuxData aux2;
157 :
158 122350 : tok2 = internal_yylex(&aux2);
159 122350 : if (tok2 == '.')
160 : {
161 : int tok3;
162 : TokenAuxData aux3;
163 :
164 8276 : tok3 = internal_yylex(&aux3);
165 8276 : if (tok3 == IDENT)
166 : {
167 : int tok4;
168 : TokenAuxData aux4;
169 :
170 8076 : tok4 = internal_yylex(&aux4);
171 8076 : if (tok4 == '.')
172 : {
173 : int tok5;
174 : TokenAuxData aux5;
175 :
176 60 : tok5 = internal_yylex(&aux5);
177 60 : if (tok5 == IDENT)
178 : {
179 60 : if (plpgsql_parse_tripword(aux1.lval.str,
180 : aux3.lval.str,
181 : aux5.lval.str,
182 : &aux1.lval.wdatum,
183 : &aux1.lval.cword))
184 42 : tok1 = T_DATUM;
185 : else
186 18 : tok1 = T_CWORD;
187 : /* Adjust token length to include A.B.C */
188 60 : aux1.leng = aux5.lloc - aux1.lloc + aux5.leng;
189 : }
190 : else
191 : {
192 : /* not A.B.C, so just process A.B */
193 0 : push_back_token(tok5, &aux5);
194 0 : push_back_token(tok4, &aux4);
195 0 : if (plpgsql_parse_dblword(aux1.lval.str,
196 : aux3.lval.str,
197 : &aux1.lval.wdatum,
198 : &aux1.lval.cword))
199 0 : tok1 = T_DATUM;
200 : else
201 0 : tok1 = T_CWORD;
202 : /* Adjust token length to include A.B */
203 0 : aux1.leng = aux3.lloc - aux1.lloc + aux3.leng;
204 : }
205 : }
206 : else
207 : {
208 : /* not A.B.C, so just process A.B */
209 8016 : push_back_token(tok4, &aux4);
210 8016 : if (plpgsql_parse_dblword(aux1.lval.str,
211 : aux3.lval.str,
212 : &aux1.lval.wdatum,
213 : &aux1.lval.cword))
214 7036 : tok1 = T_DATUM;
215 : else
216 980 : tok1 = T_CWORD;
217 : /* Adjust token length to include A.B */
218 8016 : aux1.leng = aux3.lloc - aux1.lloc + aux3.leng;
219 : }
220 : }
221 : else
222 : {
223 : /* not A.B, so just process A */
224 200 : push_back_token(tok3, &aux3);
225 200 : push_back_token(tok2, &aux2);
226 200 : if (plpgsql_parse_word(aux1.lval.str,
227 200 : core_yy.scanbuf + aux1.lloc,
228 : true,
229 : &aux1.lval.wdatum,
230 : &aux1.lval.word))
231 0 : tok1 = T_DATUM;
232 400 : else if (!aux1.lval.word.quoted &&
233 200 : (kwnum = ScanKeywordLookup(aux1.lval.word.ident,
234 : &UnreservedPLKeywords)) >= 0)
235 : {
236 0 : aux1.lval.keyword = GetScanKeyword(kwnum,
237 : &UnreservedPLKeywords);
238 0 : tok1 = UnreservedPLKeywordTokens[kwnum];
239 : }
240 : else
241 200 : tok1 = T_WORD;
242 : }
243 : }
244 : else
245 : {
246 : /* not A.B, so just process A */
247 114074 : push_back_token(tok2, &aux2);
248 :
249 : /*
250 : * See if it matches a variable name, except in the context where
251 : * we are at start of statement and the next token isn't
252 : * assignment or '['. In that case, it couldn't validly be a
253 : * variable name, and skipping the lookup allows variable names to
254 : * be used that would conflict with plpgsql or core keywords that
255 : * introduce statements (e.g., "comment"). Without this special
256 : * logic, every statement-introducing keyword would effectively be
257 : * reserved in PL/pgSQL, which would be unpleasant.
258 : *
259 : * If it isn't a variable name, try to match against unreserved
260 : * plpgsql keywords. If not one of those either, it's T_WORD.
261 : *
262 : * Note: we must call plpgsql_parse_word even if we don't want to
263 : * do variable lookup, because it sets up aux1.lval.word for the
264 : * non-variable cases.
265 : */
266 114074 : if (plpgsql_parse_word(aux1.lval.str,
267 114074 : core_yy.scanbuf + aux1.lloc,
268 141008 : (!AT_STMT_START(plpgsql_yytoken) ||
269 26934 : (tok2 == '=' || tok2 == COLON_EQUALS ||
270 : tok2 == '[')),
271 : &aux1.lval.wdatum,
272 : &aux1.lval.word))
273 14972 : tok1 = T_DATUM;
274 198180 : else if (!aux1.lval.word.quoted &&
275 99078 : (kwnum = ScanKeywordLookup(aux1.lval.word.ident,
276 : &UnreservedPLKeywords)) >= 0)
277 : {
278 28906 : aux1.lval.keyword = GetScanKeyword(kwnum,
279 : &UnreservedPLKeywords);
280 28906 : tok1 = UnreservedPLKeywordTokens[kwnum];
281 : }
282 : else
283 70196 : tok1 = T_WORD;
284 : }
285 : }
286 : else
287 : {
288 : /*
289 : * Not a potential plpgsql variable name, just return the data.
290 : *
291 : * Note that we also come through here if the grammar pushed back a
292 : * T_DATUM, T_CWORD, T_WORD, or unreserved-keyword token returned by a
293 : * previous lookup cycle; thus, pushbacks do not incur extra lookup
294 : * work, since we'll never do the above code twice for the same token.
295 : * This property also makes it safe to rely on the old value of
296 : * plpgsql_yytoken in the is-this-start-of-statement test above.
297 : */
298 : }
299 :
300 369726 : plpgsql_yylval = aux1.lval;
301 369726 : plpgsql_yylloc = aux1.lloc;
302 369726 : plpgsql_yyleng = aux1.leng;
303 369726 : plpgsql_yytoken = tok1;
304 369726 : return tok1;
305 : }
306 :
307 : /*
308 : * Return the length of the token last returned by plpgsql_yylex().
309 : *
310 : * In the case of compound tokens, the length includes all the parts.
311 : */
312 : int
313 116382 : plpgsql_token_length(void)
314 : {
315 116382 : return plpgsql_yyleng;
316 : }
317 :
318 : /*
319 : * Internal yylex function. This wraps the core lexer and adds one feature:
320 : * a token pushback stack. We also make a couple of trivial single-token
321 : * translations from what the core lexer does to what we want, in particular
322 : * interfacing from the core_YYSTYPE to YYSTYPE union.
323 : */
324 : static int
325 514430 : internal_yylex(TokenAuxData *auxdata)
326 : {
327 : int token;
328 : const char *yytext;
329 :
330 514430 : if (num_pushbacks > 0)
331 : {
332 159108 : num_pushbacks--;
333 159108 : token = pushback_token[num_pushbacks];
334 159108 : *auxdata = pushback_auxdata[num_pushbacks];
335 : }
336 : else
337 : {
338 355322 : token = core_yylex(&auxdata->lval.core_yystype,
339 : &auxdata->lloc,
340 : yyscanner);
341 :
342 : /* remember the length of yytext before it gets changed */
343 355322 : yytext = core_yy.scanbuf + auxdata->lloc;
344 355322 : auxdata->leng = strlen(yytext);
345 :
346 : /* Check for << >> and #, which the core considers operators */
347 355322 : if (token == Op)
348 : {
349 2588 : if (strcmp(auxdata->lval.str, "<<") == 0)
350 100 : token = LESS_LESS;
351 2488 : else if (strcmp(auxdata->lval.str, ">>") == 0)
352 92 : token = GREATER_GREATER;
353 2396 : else if (strcmp(auxdata->lval.str, "#") == 0)
354 24 : token = '#';
355 : }
356 :
357 : /* The core returns PARAM as ival, but we treat it like IDENT */
358 352734 : else if (token == PARAM)
359 : {
360 1346 : auxdata->lval.str = pstrdup(yytext);
361 : }
362 : }
363 :
364 514430 : return token;
365 : }
366 :
367 : /*
368 : * Push back a token to be re-read by next internal_yylex() call.
369 : */
370 : static void
371 159182 : push_back_token(int token, TokenAuxData *auxdata)
372 : {
373 159182 : if (num_pushbacks >= MAX_PUSHBACKS)
374 0 : elog(ERROR, "too many tokens pushed back");
375 159182 : pushback_token[num_pushbacks] = token;
376 159182 : pushback_auxdata[num_pushbacks] = *auxdata;
377 159182 : num_pushbacks++;
378 159182 : }
379 :
380 : /*
381 : * Push back a single token to be re-read by next plpgsql_yylex() call.
382 : *
383 : * NOTE: this does not cause yylval or yylloc to "back up". Also, it
384 : * is not a good idea to push back a token code other than what you read.
385 : */
386 : void
387 30750 : plpgsql_push_back_token(int token)
388 : {
389 : TokenAuxData auxdata;
390 :
391 30750 : auxdata.lval = plpgsql_yylval;
392 30750 : auxdata.lloc = plpgsql_yylloc;
393 30750 : auxdata.leng = plpgsql_yyleng;
394 30750 : push_back_token(token, &auxdata);
395 30750 : }
396 :
397 : /*
398 : * Tell whether a token is an unreserved keyword.
399 : *
400 : * (If it is, its lowercased form was returned as the token value, so we
401 : * do not need to offer that data here.)
402 : */
403 : bool
404 76 : plpgsql_token_is_unreserved_keyword(int token)
405 : {
406 : int i;
407 :
408 6248 : for (i = 0; i < lengthof(UnreservedPLKeywordTokens); i++)
409 : {
410 6176 : if (UnreservedPLKeywordTokens[i] == token)
411 4 : return true;
412 : }
413 72 : return false;
414 : }
415 :
416 : /*
417 : * Append the function text starting at startlocation and extending to
418 : * (not including) endlocation onto the existing contents of "buf".
419 : */
420 : void
421 46470 : plpgsql_append_source_text(StringInfo buf,
422 : int startlocation, int endlocation)
423 : {
424 : Assert(startlocation <= endlocation);
425 46470 : appendBinaryStringInfo(buf, scanorig + startlocation,
426 : endlocation - startlocation);
427 46470 : }
428 :
429 : /*
430 : * Peek one token ahead in the input stream. Only the token code is
431 : * made available, not any of the auxiliary info such as location.
432 : *
433 : * NB: no variable or unreserved keyword lookup is performed here, they will
434 : * be returned as IDENT. Reserved keywords are resolved as usual.
435 : */
436 : int
437 5654 : plpgsql_peek(void)
438 : {
439 : int tok1;
440 : TokenAuxData aux1;
441 :
442 5654 : tok1 = internal_yylex(&aux1);
443 5654 : push_back_token(tok1, &aux1);
444 5654 : return tok1;
445 : }
446 :
447 : /*
448 : * Peek two tokens ahead in the input stream. The first token and its
449 : * location in the query are returned in *tok1_p and *tok1_loc, second token
450 : * and its location in *tok2_p and *tok2_loc.
451 : *
452 : * NB: no variable or unreserved keyword lookup is performed here, they will
453 : * be returned as IDENT. Reserved keywords are resolved as usual.
454 : */
455 : void
456 144 : plpgsql_peek2(int *tok1_p, int *tok2_p, int *tok1_loc, int *tok2_loc)
457 : {
458 : int tok1,
459 : tok2;
460 : TokenAuxData aux1,
461 : aux2;
462 :
463 144 : tok1 = internal_yylex(&aux1);
464 144 : tok2 = internal_yylex(&aux2);
465 :
466 144 : *tok1_p = tok1;
467 144 : if (tok1_loc)
468 144 : *tok1_loc = aux1.lloc;
469 144 : *tok2_p = tok2;
470 144 : if (tok2_loc)
471 0 : *tok2_loc = aux2.lloc;
472 :
473 144 : push_back_token(tok2, &aux2);
474 144 : push_back_token(tok1, &aux1);
475 144 : }
476 :
477 : /*
478 : * plpgsql_scanner_errposition
479 : * Report an error cursor position, if possible.
480 : *
481 : * This is expected to be used within an ereport() call. The return value
482 : * is a dummy (always 0, in fact).
483 : *
484 : * Note that this can only be used for messages emitted during initial
485 : * parsing of a plpgsql function, since it requires the scanorig string
486 : * to still be available.
487 : */
488 : int
489 174 : plpgsql_scanner_errposition(int location)
490 : {
491 : int pos;
492 :
493 174 : if (location < 0 || scanorig == NULL)
494 0 : return 0; /* no-op if location is unknown */
495 :
496 : /* Convert byte offset to character number */
497 174 : pos = pg_mbstrlen_with_len(scanorig, location) + 1;
498 : /* And pass it to the ereport mechanism */
499 174 : (void) internalerrposition(pos);
500 : /* Also pass the function body string */
501 174 : return internalerrquery(scanorig);
502 : }
503 :
504 : /*
505 : * plpgsql_yyerror
506 : * Report a lexer or grammar error.
507 : *
508 : * The message's cursor position refers to the current token (the one
509 : * last returned by plpgsql_yylex()).
510 : * This is OK for syntax error messages from the Bison parser, because Bison
511 : * parsers report error as soon as the first unparsable token is reached.
512 : * Beware of using yyerror for other purposes, as the cursor position might
513 : * be misleading!
514 : */
515 : void
516 8 : plpgsql_yyerror(const char *message)
517 : {
518 8 : char *yytext = core_yy.scanbuf + plpgsql_yylloc;
519 :
520 8 : if (*yytext == '\0')
521 : {
522 0 : ereport(ERROR,
523 : (errcode(ERRCODE_SYNTAX_ERROR),
524 : /* translator: %s is typically the translation of "syntax error" */
525 : errmsg("%s at end of input", _(message)),
526 : plpgsql_scanner_errposition(plpgsql_yylloc)));
527 : }
528 : else
529 : {
530 : /*
531 : * If we have done any lookahead then flex will have restored the
532 : * character after the end-of-token. Zap it again so that we report
533 : * only the single token here. This modifies scanbuf but we no longer
534 : * care about that.
535 : */
536 8 : yytext[plpgsql_yyleng] = '\0';
537 :
538 8 : ereport(ERROR,
539 : (errcode(ERRCODE_SYNTAX_ERROR),
540 : /* translator: first %s is typically the translation of "syntax error" */
541 : errmsg("%s at or near \"%s\"", _(message), yytext),
542 : plpgsql_scanner_errposition(plpgsql_yylloc)));
543 : }
544 : }
545 :
546 : /*
547 : * Given a location (a byte offset in the function source text),
548 : * return a line number.
549 : *
550 : * We expect that this is typically called for a sequence of increasing
551 : * location values, so optimize accordingly by tracking the endpoints
552 : * of the "current" line.
553 : */
554 : int
555 53364 : plpgsql_location_to_lineno(int location)
556 : {
557 : const char *loc;
558 :
559 53364 : if (location < 0 || scanorig == NULL)
560 0 : return 0; /* garbage in, garbage out */
561 53364 : loc = scanorig + location;
562 :
563 : /* be correct, but not fast, if input location goes backwards */
564 53364 : if (loc < cur_line_start)
565 15372 : location_lineno_init();
566 :
567 253798 : while (cur_line_end != NULL && loc > cur_line_end)
568 : {
569 200434 : cur_line_start = cur_line_end + 1;
570 200434 : cur_line_num++;
571 200434 : cur_line_end = strchr(cur_line_start, '\n');
572 : }
573 :
574 53364 : return cur_line_num;
575 : }
576 :
577 : /* initialize or reset the state for plpgsql_location_to_lineno */
578 : static void
579 24024 : location_lineno_init(void)
580 : {
581 24024 : cur_line_start = scanorig;
582 24024 : cur_line_num = 1;
583 :
584 24024 : cur_line_end = strchr(cur_line_start, '\n');
585 24024 : }
586 :
587 : /* return the most recently computed lineno */
588 : int
589 60 : plpgsql_latest_lineno(void)
590 : {
591 60 : return cur_line_num;
592 : }
593 :
594 :
595 : /*
596 : * Called before any actual parsing is done
597 : *
598 : * Note: the passed "str" must remain valid until plpgsql_scanner_finish().
599 : * Although it is not fed directly to flex, we need the original string
600 : * to cite in error messages.
601 : */
602 : void
603 8652 : plpgsql_scanner_init(const char *str)
604 : {
605 : /* Start up the core scanner */
606 8652 : yyscanner = scanner_init(str, &core_yy,
607 : &ReservedPLKeywords, ReservedPLKeywordTokens);
608 :
609 : /*
610 : * scanorig points to the original string, which unlike the scanner's
611 : * scanbuf won't be modified on-the-fly by flex. Notice that although
612 : * yytext points into scanbuf, we rely on being able to apply locations
613 : * (offsets from string start) to scanorig as well.
614 : */
615 8652 : scanorig = str;
616 :
617 : /* Other setup */
618 8652 : plpgsql_IdentifierLookup = IDENTIFIER_LOOKUP_NORMAL;
619 8652 : plpgsql_yytoken = 0;
620 :
621 8652 : num_pushbacks = 0;
622 :
623 8652 : location_lineno_init();
624 8652 : }
625 :
626 : /*
627 : * Called after parsing is done to clean up after plpgsql_scanner_init()
628 : */
629 : void
630 8474 : plpgsql_scanner_finish(void)
631 : {
632 : /* release storage */
633 8474 : scanner_finish(yyscanner);
634 : /* avoid leaving any dangling pointers */
635 8474 : yyscanner = NULL;
636 8474 : scanorig = NULL;
637 8474 : }
|