LCOV - code coverage report
Current view: top level - src/backend/parser - parser.c (source / functions) Coverage Total Hit
Test: PostgreSQL 19devel Lines: 94.8 % 211 200
Test Date: 2026-03-02 16:14:50 Functions: 100.0 % 6 6
Legend: Lines:     hit not hit

            Line data    Source code
       1              : /*-------------------------------------------------------------------------
       2              :  *
       3              :  * parser.c
       4              :  *      Main entry point/driver for PostgreSQL grammar
       5              :  *
       6              :  * Note that the grammar is not allowed to perform any table access
       7              :  * (since we need to be able to do basic parsing even while inside an
       8              :  * aborted transaction).  Therefore, the data structures returned by
       9              :  * the grammar are "raw" parsetrees that still need to be analyzed by
      10              :  * analyze.c and related files.
      11              :  *
      12              :  *
      13              :  * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
      14              :  * Portions Copyright (c) 1994, Regents of the University of California
      15              :  *
      16              :  * IDENTIFICATION
      17              :  *    src/backend/parser/parser.c
      18              :  *
      19              :  *-------------------------------------------------------------------------
      20              :  */
      21              : 
      22              : #include "postgres.h"
      23              : 
      24              : #include "gramparse.h"
      25              : #include "mb/pg_wchar.h"
      26              : #include "parser/parser.h"
      27              : #include "parser/scansup.h"
      28              : 
      29              : static bool check_uescapechar(unsigned char escape);
      30              : static char *str_udeescape(const char *str, char escape,
      31              :                            int position, core_yyscan_t yyscanner);
      32              : 
      33              : 
      34              : /*
      35              :  * raw_parser
      36              :  *      Given a query in string form, do lexical and grammatical analysis.
      37              :  *
      38              :  * Returns a list of raw (un-analyzed) parse trees.  The contents of the
      39              :  * list have the form required by the specified RawParseMode.  NIL is returned if base_yyparse() reports a nonzero result.
      40              :  */
      41              : List *
      42       411002 : raw_parser(const char *str, RawParseMode mode)
      43              : {
      44              :     core_yyscan_t yyscanner;
      45              :     base_yy_extra_type yyextra;
      46              :     int         yyresult;
      47              : 
      48              :     /* initialize the flex scanner */
      49       411002 :     yyscanner = scanner_init(str, &yyextra.core_yy_extra,
      50              :                              &ScanKeywords, ScanKeywordTokens);
      51              : 
      52              :     /* base_yylex() only needs us to initialize the lookahead token, if any */
      53       411002 :     if (mode == RAW_PARSE_DEFAULT)
      54       385012 :         yyextra.have_lookahead = false;
      55              :     else
      56              :     {
      57              :         /* this array is indexed by RawParseMode enum */
      58              :         static const int mode_token[] = {
      59              :             [RAW_PARSE_DEFAULT] = 0,
      60              :             [RAW_PARSE_TYPE_NAME] = MODE_TYPE_NAME,
      61              :             [RAW_PARSE_PLPGSQL_EXPR] = MODE_PLPGSQL_EXPR,
      62              :             [RAW_PARSE_PLPGSQL_ASSIGN1] = MODE_PLPGSQL_ASSIGN1,
      63              :             [RAW_PARSE_PLPGSQL_ASSIGN2] = MODE_PLPGSQL_ASSIGN2,
      64              :             [RAW_PARSE_PLPGSQL_ASSIGN3] = MODE_PLPGSQL_ASSIGN3,
      65              :         };
      66              : 
      67        25990 :         yyextra.have_lookahead = true;
      68        25990 :         yyextra.lookahead_token = mode_token[mode];    /* base_yylex() picks this up as its first token */
      69        25990 :         yyextra.lookahead_yylloc = 0;    /* base_yylex() copies this into *llocp for the preset token */
      70        25990 :         yyextra.lookahead_end = NULL;    /* base_yylex() skips its un-truncation step when this is NULL */
      71              :     }
      72              : 
      73              :     /* initialize the bison parser */
      74       411002 :     parser_init(&yyextra);
      75              : 
      76              :     /* Parse! */
      77       411002 :     yyresult = base_yyparse(yyscanner);    /* presumably leaves its result in yyextra.parsetree, returned below -- confirm in the grammar */
      78              : 
      79              :     /* Clean up (release memory) */
      80       410391 :     scanner_finish(yyscanner);
      81              : 
      82       410391 :     if (yyresult)               /* error */
      83            0 :         return NIL;
      84              : 
      85       410391 :     return yyextra.parsetree;
      86              : }
      87              : 
      88              : 
      89              : /*
      90              :  * Intermediate filter between parser and core lexer (core_yylex in scan.l).
      91              :  *
      92              :  * This filter is needed because in some cases the standard SQL grammar
      93              :  * requires more than one token lookahead.  We reduce these cases to one-token
      94              :  * lookahead by replacing tokens here, in order to keep the grammar LALR(1).
      95              :  *
      96              :  * Using a filter is simpler than trying to recognize multiword tokens
      97              :  * directly in scan.l, because we'd have to allow for comments between the
      98              :  * words.  Furthermore it's not clear how to do that without re-introducing
      99              :  * scanner backtrack, which would cost more performance than this filter
     100              :  * layer does.
     101              :  *
     102              :  * We also use this filter to convert UIDENT and USCONST sequences into
     103              :  * plain IDENT and SCONST tokens.  While that could be handled by additional
     104              :  * productions in the main grammar, it's more efficient to do it like this.
     105              :  *
     106              :  * The filter also provides a convenient place to translate between
     107              :  * the core_YYSTYPE and YYSTYPE representations (which are really the
     108              :  * same thing anyway, but notationally they're different).  The return value is the (possibly replaced) token code; the token's semantic value and location are stored through lvalp and llocp.
     109              :  */
     110              : int
     111     10378107 : base_yylex(YYSTYPE *lvalp, YYLTYPE *llocp, core_yyscan_t yyscanner)
     112              : {
     113     10378107 :     base_yy_extra_type *yyextra = pg_yyget_extra(yyscanner);
     114              :     int         cur_token;
     115              :     int         next_token;
     116              :     int         cur_token_length;
     117              :     YYLTYPE     cur_yylloc;
     118              : 
     119              :     /* Get next token --- we might already have it */
     120     10378107 :     if (yyextra->have_lookahead)
     121              :     {
     122        64772 :         cur_token = yyextra->lookahead_token;
     123        64772 :         lvalp->core_yystype = yyextra->lookahead_yylval;
     124        64772 :         *llocp = yyextra->lookahead_yylloc;
     125        64772 :         if (yyextra->lookahead_end)    /* NULL when raw_parser() preset the lookahead token */
     126        38782 :             *(yyextra->lookahead_end) = yyextra->lookahead_hold_char;
     127        64772 :         yyextra->have_lookahead = false;
     128              :     }
     129              :     else
     130     10313335 :         cur_token = core_yylex(&(lvalp->core_yystype), llocp, yyscanner);
     131              : 
     132              :     /*
     133              :      * If this token isn't one that requires lookahead, just return it.  If it
     134              :      * does, determine the token length.  (We could get that via strlen(), but
     135              :      * since we have such a small set of possibilities, hardwiring seems
     136              :      * feasible and more efficient --- at least for the fixed-length cases.)
     137              :      */
     138     10378002 :     switch (cur_token)
     139              :     {
     140         1679 :         case FORMAT:
     141         1679 :             cur_token_length = 6;
     142         1679 :             break;
     143        23437 :         case NOT:
     144        23437 :             cur_token_length = 3;
     145        23437 :             break;
     146         1307 :         case NULLS_P:
     147         1307 :             cur_token_length = 5;
     148         1307 :             break;
     149        11380 :         case WITH:
     150        11380 :             cur_token_length = 4;
     151        11380 :             break;
     152          295 :         case UIDENT:
     153              :         case USCONST:
     154          295 :             cur_token_length = strlen(yyextra->core_yy_extra.scanbuf + *llocp);    /* variable length: measure the token text starting at its location in scanbuf */
     155          295 :             break;
     156          742 :         case WITHOUT:
     157          742 :             cur_token_length = 7;
     158          742 :             break;
     159     10339162 :         default:
     160     10339162 :             return cur_token;    /* no lookahead needed for this token */
     161              :     }
     162              : 
     163              :     /*
     164              :      * Identify end+1 of current token.  core_yylex() has temporarily stored a
     165              :      * '\0' here, and will undo that when we call it again.  We need to redo
     166              :      * it to fully revert the lookahead call for error reporting purposes.
     167              :      */
     168        38840 :     yyextra->lookahead_end = yyextra->core_yy_extra.scanbuf +
     169        38840 :         *llocp + cur_token_length;
     170              :     Assert(*(yyextra->lookahead_end) == '\0');
     171              : 
     172              :     /*
     173              :      * Save and restore *llocp around the call.  It might look like we could
     174              :      * avoid this by just passing &lookahead_yylloc to core_yylex(), but that
     175              :      * does not work because flex actually holds onto the last-passed pointer
     176              :      * internally, and will use that for error reporting.  We need any error
     177              :      * reports to point to the current token, not the next one.
     178              :      */
     179        38840 :     cur_yylloc = *llocp;
     180              : 
     181              :     /* Get next token, saving outputs into lookahead variables */
     182        38840 :     next_token = core_yylex(&(yyextra->lookahead_yylval), llocp, yyscanner);
     183        38840 :     yyextra->lookahead_token = next_token;
     184        38840 :     yyextra->lookahead_yylloc = *llocp;
     185              : 
     186        38840 :     *llocp = cur_yylloc;    /* caller sees the current token's location, not the lookahead's */
     187              : 
     188              :     /* Now revert the un-truncation of the current token */
     189        38840 :     yyextra->lookahead_hold_char = *(yyextra->lookahead_end);
     190        38840 :     *(yyextra->lookahead_end) = '\0';
     191              : 
     192        38840 :     yyextra->have_lookahead = true;
     193              : 
     194              :     /* Replace cur_token if needed, based on lookahead */
     195        38840 :     switch (cur_token)
     196              :     {
     197         1679 :         case FORMAT:
     198              :             /* Replace FORMAT by FORMAT_LA if it's followed by JSON */
     199         1679 :             switch (next_token)    /* no default: any other next_token leaves cur_token unchanged (same for the inner switches below) */
     200              :             {
     201          330 :                 case JSON:
     202          330 :                     cur_token = FORMAT_LA;
     203          330 :                     break;
     204              :             }
     205         1679 :             break;
     206              : 
     207        23437 :         case NOT:
     208              :             /* Replace NOT by NOT_LA if it's followed by BETWEEN, IN, etc */
     209        23437 :             switch (next_token)
     210              :             {
     211         1690 :                 case BETWEEN:
     212              :                 case IN_P:
     213              :                 case LIKE:
     214              :                 case ILIKE:
     215              :                 case SIMILAR:
     216         1690 :                     cur_token = NOT_LA;
     217         1690 :                     break;
     218              :             }
     219        23437 :             break;
     220              : 
     221         1307 :         case NULLS_P:
     222              :             /* Replace NULLS_P by NULLS_LA if it's followed by FIRST or LAST */
     223         1307 :             switch (next_token)
     224              :             {
     225         1052 :                 case FIRST_P:
     226              :                 case LAST_P:
     227         1052 :                     cur_token = NULLS_LA;
     228         1052 :                     break;
     229              :             }
     230         1307 :             break;
     231              : 
     232        11380 :         case WITH:
     233              :             /* Replace WITH by WITH_LA if it's followed by TIME or ORDINALITY */
     234        11380 :             switch (next_token)
     235              :             {
     236         1424 :                 case TIME:
     237              :                 case ORDINALITY:
     238         1424 :                     cur_token = WITH_LA;
     239         1424 :                     break;
     240              :             }
     241        11380 :             break;
     242              : 
     243          742 :         case WITHOUT:
     244              :             /* Replace WITHOUT by WITHOUT_LA if it's followed by TIME */
     245          742 :             switch (next_token)
     246              :             {
     247          288 :                 case TIME:
     248          288 :                     cur_token = WITHOUT_LA;
     249          288 :                     break;
     250              :             }
     251          742 :             break;
     252              : 
     253          295 :         case UIDENT:
     254              :         case USCONST:
     255              :             /* Look ahead for UESCAPE */
     256          295 :             if (next_token == UESCAPE)
     257              :             {
     258              :                 /* Yup, so get third token, which had better be SCONST */
     259              :                 const char *escstr;
     260              : 
     261              :                 /* Again save and restore *llocp */
     262           20 :                 cur_yylloc = *llocp;
     263              : 
     264              :                 /* Un-truncate current token so errors point to third token */
     265           20 :                 *(yyextra->lookahead_end) = yyextra->lookahead_hold_char;
     266              : 
     267              :                 /* Get third token */
     268           20 :                 next_token = core_yylex(&(yyextra->lookahead_yylval),
     269              :                                         llocp, yyscanner);
     270              : 
     271              :                 /* If we throw error here, it will point to third token */
     272           20 :                 if (next_token != SCONST)
     273            3 :                     scanner_yyerror("UESCAPE must be followed by a simple string literal",
     274              :                                     yyscanner);
     275              : 
     276           17 :                 escstr = yyextra->lookahead_yylval.str;
     277           17 :                 if (strlen(escstr) != 1 || !check_uescapechar(escstr[0]))    /* escape must be exactly one character that check_uescapechar() accepts */
     278            3 :                     scanner_yyerror("invalid Unicode escape character",
     279              :                                     yyscanner);
     280              : 
     281              :                 /* Now restore *llocp; errors will point to first token */
     282           14 :                 *llocp = cur_yylloc;
     283              : 
     284              :                 /* Apply Unicode conversion */
     285           14 :                 lvalp->core_yystype.str =
     286           14 :                     str_udeescape(lvalp->core_yystype.str,
     287           14 :                                   escstr[0],
     288              :                                   *llocp,
     289              :                                   yyscanner);
     290              : 
     291              :                 /*
     292              :                  * We don't need to revert the un-truncation of UESCAPE.  What
     293              :                  * we do want to do is clear have_lookahead, thereby consuming
     294              :                  * all three tokens.
     295              :                  */
     296           14 :                 yyextra->have_lookahead = false;
     297              :             }
     298              :             else
     299              :             {
     300              :                 /* No UESCAPE, so convert using default escape character */
     301          251 :                 lvalp->core_yystype.str =
     302          275 :                     str_udeescape(lvalp->core_yystype.str,
     303              :                                   '\\',
     304              :                                   *llocp,
     305              :                                   yyscanner);
     306              :             }
     307              : 
     308          265 :             if (cur_token == UIDENT)
     309              :             {
     310              :                 /* It's an identifier, so truncate as appropriate */
     311           14 :                 truncate_identifier(lvalp->core_yystype.str,
     312           14 :                                     strlen(lvalp->core_yystype.str),
     313              :                                     true);
     314           14 :                 cur_token = IDENT;
     315              :             }
     316          251 :             else if (cur_token == USCONST)    /* the only other token that reaches this case label */
     317              :             {
     318          251 :                 cur_token = SCONST;
     319              :             }
     320          265 :             break;
     321              :     }
     322              : 
     323        38810 :     return cur_token;
     324              : }
     325              : 
     326              : /* convert hex digit (caller should have verified that) to value; accepts 0-9, a-f and A-F, and raises elog(ERROR) for any other character */
     327              : static unsigned int
     328         1606 : hexval(unsigned char c)
     329              : {
     330         1606 :     if (c >= '0' && c <= '9')
     331         1367 :         return c - '0';
     332          239 :     if (c >= 'a' && c <= 'f')
     333           33 :         return c - 'a' + 0xA;
     334          206 :     if (c >= 'A' && c <= 'F')
     335          206 :         return c - 'A' + 0xA;
     336            0 :     elog(ERROR, "invalid hexadecimal digit");    /* only reachable if the caller did not verify the digit */
     337              :     return 0;                   /* not reached */
     338              : }
     339              : 
     340              : /* is Unicode code point acceptable?  Reports a syntax error if is_valid_unicode_codepoint() rejects c; otherwise returns normally with no result value. */
     341              : static void
     342          389 : check_unicode_value(char32_t c)
     343              : {
     344          389 :     if (!is_valid_unicode_codepoint(c))
     345            3 :         ereport(ERROR,
     346              :                 (errcode(ERRCODE_SYNTAX_ERROR),
     347              :                  errmsg("invalid Unicode escape value")));
     348          386 : }
     349              : 
     350              : /* is 'escape' acceptable as Unicode escape character (UESCAPE syntax) ?  Returns false for a hex digit, plus sign, single quote, double quote, or anything scanner_isspace() accepts; true otherwise. */
     351              : static bool
     352           17 : check_uescapechar(unsigned char escape)
     353              : {
     354           17 :     if (isxdigit(escape)
     355           17 :         || escape == '+'
     356           14 :         || escape == '\''
     357           14 :         || escape == '"'
     358           14 :         || scanner_isspace(escape))
     359            3 :         return false;
     360              :     else
     361           14 :         return true;
     362              : }
     363              : 
     364              : /*
     365              :  * Process Unicode escapes in "str", producing a palloc'd plain string.  Accepted forms after the escape character are: the escape character again (one literal escape character), four hex digits, or a plus sign and six hex digits.  Anything else, an invalid code point, or a mismatched surrogate pair raises a syntax error.
     366              :  *
     367              :  * escape: the escape character to use
     368              :  * position: start position of U&'' or U&"" string token
     369              :  * yyscanner: context information needed for error reports
     370              :  */
     371              : static char *
     372          289 : str_udeescape(const char *str, char escape,
     373              :               int position, core_yyscan_t yyscanner)
     374              : {
     375              :     const char *in;
     376              :     char       *new,
     377              :                *out;
     378              :     size_t      new_len;
     379          289 :     char16_t    pair_first = 0;    /* nonzero while a first surrogate is waiting for its second half */
     380              :     ScannerCallbackState scbstate;
     381              : 
     382              :     /*
     383              :      * Guesstimate that result will be no longer than input, but allow enough
     384              :      * padding for Unicode conversion.
     385              :      */
     386          289 :     new_len = strlen(str) + MAX_UNICODE_EQUIVALENT_STRING + 1;
     387          289 :     new = palloc(new_len);
     388              : 
     389          289 :     in = str;
     390          289 :     out = new;
     391         1428 :     while (*in)
     392              :     {
     393              :         /* Enlarge string if needed */
     394         1160 :         size_t      out_dist = out - new;
     395              : 
     396         1160 :         if (out_dist > new_len - (MAX_UNICODE_EQUIVALENT_STRING + 1))    /* past this check at least MAX_UNICODE_EQUIVALENT_STRING + 1 bytes remain free after out */
     397              :         {
     398            0 :             new_len *= 2;
     399            0 :             new = repalloc(new, new_len);
     400            0 :             out = new + out_dist;    /* re-derive out from the saved offset, since new was reassigned */
     401              :         }
     402              : 
     403         1160 :         if (in[0] == escape)
     404              :         {
     405              :             /*
     406              :              * Any errors reported while processing this escape sequence will
     407              :              * have an error cursor pointing at the escape.
     408              :              */
     409          401 :             setup_scanner_errposition_callback(&scbstate, yyscanner,
     410          401 :                                                in - str + position + 3);    /* 3 for U&" */
     411          401 :             if (in[1] == escape)    /* doubled escape character: emit one literal escape character */
     412              :             {
     413            6 :                 if (pair_first)
     414            3 :                     goto invalid_pair;
     415            3 :                 *out++ = escape;
     416            3 :                 in += 2;
     417              :             }
     418          395 :             else if (isxdigit((unsigned char) in[1]) &&    /* four-hex-digit form; && short-circuits at the terminating NUL, so no read goes past the end of str */
     419          367 :                      isxdigit((unsigned char) in[2]) &&
     420          367 :                      isxdigit((unsigned char) in[3]) &&
     421          367 :                      isxdigit((unsigned char) in[4]))
     422          361 :             {
     423              :                 char32_t    unicode;
     424              : 
     425          364 :                 unicode = (hexval(in[1]) << 12) +
     426          364 :                     (hexval(in[2]) << 8) +
     427          364 :                     (hexval(in[3]) << 4) +
     428          364 :                     hexval(in[4]);
     429          364 :                 check_unicode_value(unicode);
     430          364 :                 if (pair_first)
     431              :                 {
     432            3 :                     if (is_utf16_surrogate_second(unicode))
     433              :                     {
     434            0 :                         unicode = surrogate_pair_to_codepoint(pair_first, unicode);
     435            0 :                         pair_first = 0;
     436              :                     }
     437              :                     else
     438            3 :                         goto invalid_pair;
     439              :                 }
     440          361 :                 else if (is_utf16_surrogate_second(unicode))    /* second surrogate with no pending first surrogate */
     441            0 :                     goto invalid_pair;
     442              : 
     443          361 :                 if (is_utf16_surrogate_first(unicode))
     444           12 :                     pair_first = unicode;    /* emit nothing yet; wait for the second surrogate */
     445              :                 else
     446              :                 {
     447          349 :                     pg_unicode_to_server(unicode, (unsigned char *) out);
     448          349 :                     out += strlen(out);    /* relies on pg_unicode_to_server() NUL-terminating what it writes -- confirm in its definition */
     449              :                 }
     450          361 :                 in += 5;    /* escape character plus four hex digits */
     451              :             }
     452           31 :             else if (in[1] == '+' &&    /* plus sign and six hex digits; same short-circuit protection as above */
     453           28 :                      isxdigit((unsigned char) in[2]) &&
     454           28 :                      isxdigit((unsigned char) in[3]) &&
     455           28 :                      isxdigit((unsigned char) in[4]) &&
     456           28 :                      isxdigit((unsigned char) in[5]) &&
     457           28 :                      isxdigit((unsigned char) in[6]) &&
     458           25 :                      isxdigit((unsigned char) in[7]))
     459           19 :             {
     460              :                 char32_t    unicode;
     461              : 
     462           25 :                 unicode = (hexval(in[2]) << 20) +
     463           25 :                     (hexval(in[3]) << 16) +
     464           25 :                     (hexval(in[4]) << 12) +
     465           25 :                     (hexval(in[5]) << 8) +
     466           25 :                     (hexval(in[6]) << 4) +
     467           25 :                     hexval(in[7]);
     468           25 :                 check_unicode_value(unicode);
     469           22 :                 if (pair_first)
     470              :                 {
     471            3 :                     if (is_utf16_surrogate_second(unicode))
     472              :                     {
     473            0 :                         unicode = surrogate_pair_to_codepoint(pair_first, unicode);
     474            0 :                         pair_first = 0;
     475              :                     }
     476              :                     else
     477            3 :                         goto invalid_pair;
     478              :                 }
     479           19 :                 else if (is_utf16_surrogate_second(unicode))
     480            0 :                     goto invalid_pair;
     481              : 
     482           19 :                 if (is_utf16_surrogate_first(unicode))
     483            3 :                     pair_first = unicode;
     484              :                 else
     485              :                 {
     486           16 :                     pg_unicode_to_server(unicode, (unsigned char *) out);
     487           16 :                     out += strlen(out);
     488              :                 }
     489           19 :                 in += 8;    /* escape character, plus sign, and six hex digits */
     490              :             }
     491              :             else
     492            6 :                 ereport(ERROR,
     493              :                         (errcode(ERRCODE_SYNTAX_ERROR),
     494              :                          errmsg("invalid Unicode escape"),
     495              :                          errhint("Unicode escapes must be \\XXXX or \\+XXXXXX.")));
     496              : 
     497          383 :             cancel_scanner_errposition_callback(&scbstate);
     498              :         }
     499              :         else
     500              :         {
     501          759 :             if (pair_first)    /* an ordinary character may not follow a pending first surrogate */
     502            3 :                 goto invalid_pair;
     503              : 
     504          756 :             *out++ = *in++;
     505              :         }
     506              :     }
     507              : 
     508              :     /* unfinished surrogate pair? */
     509          268 :     if (pair_first)
     510            3 :         goto invalid_pair;
     511              : 
     512          265 :     *out = '\0';
     513          265 :     return new;
     514              : 
     515              :     /*
     516              :      * We might get here with the error callback active, or not.  Call
     517              :      * scanner_errposition to make sure an error cursor appears; if the
     518              :      * callback is active, this is duplicative but harmless.
     519              :      */
     520           15 : invalid_pair:
     521           15 :     ereport(ERROR,
     522              :             (errcode(ERRCODE_SYNTAX_ERROR),
     523              :              errmsg("invalid Unicode surrogate pair"),
     524              :              scanner_errposition(in - str + position + 3,   /* 3 for U&" */
     525              :                                  yyscanner)));
     526              :     return NULL;                /* keep compiler quiet */
     527              : }
        

Generated by: LCOV version 2.0-1