LCOV - code coverage report
Current view: top level - src/backend/parser - parser.c (source / functions) Hit Total Coverage
Test: PostgreSQL 18devel Lines: 195 206 94.7 %
Date: 2024-11-21 08:14:44 Functions: 6 6 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * parser.c
       4             :  *      Main entry point/driver for PostgreSQL grammar
       5             :  *
       6             :  * Note that the grammar is not allowed to perform any table access
       7             :  * (since we need to be able to do basic parsing even while inside an
       8             :  * aborted transaction).  Therefore, the data structures returned by
       9             :  * the grammar are "raw" parsetrees that still need to be analyzed by
      10             :  * analyze.c and related files.
      11             :  *
      12             :  *
      13             :  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
      14             :  * Portions Copyright (c) 1994, Regents of the University of California
      15             :  *
      16             :  * IDENTIFICATION
      17             :  *    src/backend/parser/parser.c
      18             :  *
      19             :  *-------------------------------------------------------------------------
      20             :  */
      21             : 
      22             : #include "postgres.h"
      23             : 
      24             : #include "gramparse.h"
      25             : #include "mb/pg_wchar.h"
      26             : #include "parser/parser.h"
      27             : #include "parser/scansup.h"
      28             : 
      29             : static bool check_uescapechar(unsigned char escape);
      30             : static char *str_udeescape(const char *str, char escape,
      31             :                            int position, core_yyscan_t yyscanner);
      32             : 
      33             : 
      34             : /*
      35             :  * raw_parser
      36             :  *      Given a query in string form, do lexical and grammatical analysis.
      37             :  *
      38             :  * Returns a list of raw (un-analyzed) parse trees.  The contents of the
      39             :  * list have the form required by the specified RawParseMode.
      40             :  */
      41             : List *
      42      762918 : raw_parser(const char *str, RawParseMode mode)
      43             : {
      44             :     core_yyscan_t yyscanner;
      45             :     base_yy_extra_type yyextra;
      46             :     int         yyresult;
      47             : 
      48             :     /* initialize the flex scanner */
      49      762918 :     yyscanner = scanner_init(str, &yyextra.core_yy_extra,
      50             :                              &ScanKeywords, ScanKeywordTokens);
      51             : 
      52             :     /* base_yylex() only needs us to initialize the lookahead token, if any */
      53      762918 :     if (mode == RAW_PARSE_DEFAULT)
      54      714086 :         yyextra.have_lookahead = false;
      55             :     else
      56             :     {
      57             :         /* this array is indexed by RawParseMode enum */
      58             :         static const int mode_token[] = {
      59             :             [RAW_PARSE_DEFAULT] = 0,
      60             :             [RAW_PARSE_TYPE_NAME] = MODE_TYPE_NAME,
      61             :             [RAW_PARSE_PLPGSQL_EXPR] = MODE_PLPGSQL_EXPR,
      62             :             [RAW_PARSE_PLPGSQL_ASSIGN1] = MODE_PLPGSQL_ASSIGN1,
      63             :             [RAW_PARSE_PLPGSQL_ASSIGN2] = MODE_PLPGSQL_ASSIGN2,
      64             :             [RAW_PARSE_PLPGSQL_ASSIGN3] = MODE_PLPGSQL_ASSIGN3,
      65             :         };
      66             : 
      67       48832 :         yyextra.have_lookahead = true;
      68       48832 :         yyextra.lookahead_token = mode_token[mode];
      69       48832 :         yyextra.lookahead_yylloc = 0;
      70       48832 :         yyextra.lookahead_end = NULL;
      71             :     }
      72             : 
      73             :     /* initialize the bison parser */
      74      762918 :     parser_init(&yyextra);
      75             : 
      76             :     /* Parse! */
      77      762918 :     yyresult = base_yyparse(yyscanner);
      78             : 
      79             :     /* Clean up (release memory) */
      80      761742 :     scanner_finish(yyscanner);
      81             : 
      82      761742 :     if (yyresult)               /* error */
      83           0 :         return NIL;
      84             : 
      85      761742 :     return yyextra.parsetree;
      86             : }
      87             : 
      88             : 
      89             : /*
      90             :  * Intermediate filter between parser and core lexer (core_yylex in scan.l).
      91             :  *
      92             :  * This filter is needed because in some cases the standard SQL grammar
      93             :  * requires more than one token lookahead.  We reduce these cases to one-token
      94             :  * lookahead by replacing tokens here, in order to keep the grammar LALR(1).
      95             :  *
      96             :  * Using a filter is simpler than trying to recognize multiword tokens
      97             :  * directly in scan.l, because we'd have to allow for comments between the
      98             :  * words.  Furthermore it's not clear how to do that without re-introducing
      99             :  * scanner backtrack, which would cost more performance than this filter
     100             :  * layer does.
     101             :  *
     102             :  * We also use this filter to convert UIDENT and USCONST sequences into
     103             :  * plain IDENT and SCONST tokens.  While that could be handled by additional
     104             :  * productions in the main grammar, it's more efficient to do it like this.
     105             :  *
     106             :  * The filter also provides a convenient place to translate between
     107             :  * the core_YYSTYPE and YYSTYPE representations (which are really the
     108             :  * same thing anyway, but notationally they're different).
     109             :  */
     110             : int
     111    18291072 : base_yylex(YYSTYPE *lvalp, YYLTYPE *llocp, core_yyscan_t yyscanner)
     112             : {
     113    18291072 :     base_yy_extra_type *yyextra = pg_yyget_extra(yyscanner);
     114             :     int         cur_token;
     115             :     int         next_token;
     116             :     int         cur_token_length;
     117             :     YYLTYPE     cur_yylloc;
     118             : 
     119             :     /* Get next token --- we might already have it */
     120    18291072 :     if (yyextra->have_lookahead)
     121             :     {
     122      115234 :         cur_token = yyextra->lookahead_token;
     123      115234 :         lvalp->core_yystype = yyextra->lookahead_yylval;
     124      115234 :         *llocp = yyextra->lookahead_yylloc;
     125      115234 :         if (yyextra->lookahead_end)
     126       66402 :             *(yyextra->lookahead_end) = yyextra->lookahead_hold_char;
     127      115234 :         yyextra->have_lookahead = false;
     128             :     }
     129             :     else
     130    18175838 :         cur_token = core_yylex(&(lvalp->core_yystype), llocp, yyscanner);
     131             : 
     132             :     /*
     133             :      * If this token isn't one that requires lookahead, just return it.  If it
     134             :      * does, determine the token length.  (We could get that via strlen(), but
     135             :      * since we have such a small set of possibilities, hardwiring seems
     136             :      * feasible and more efficient --- at least for the fixed-length cases.)
     137             :      */
     138    18290826 :     switch (cur_token)
     139             :     {
     140        3124 :         case FORMAT:
     141        3124 :             cur_token_length = 6;
     142        3124 :             break;
     143       40508 :         case NOT:
     144       40508 :             cur_token_length = 3;
     145       40508 :             break;
     146        2222 :         case NULLS_P:
     147        2222 :             cur_token_length = 5;
     148        2222 :             break;
     149       18924 :         case WITH:
     150       18924 :             cur_token_length = 4;
     151       18924 :             break;
     152         314 :         case UIDENT:
     153             :         case USCONST:
     154         314 :             cur_token_length = strlen(yyextra->core_yy_extra.scanbuf + *llocp);
     155         314 :             break;
     156        1432 :         case WITHOUT:
     157        1432 :             cur_token_length = 7;
     158        1432 :             break;
     159    18224302 :         default:
     160    18224302 :             return cur_token;
     161             :     }
     162             : 
     163             :     /*
     164             :      * Identify end+1 of current token.  core_yylex() has temporarily stored a
     165             :      * '\0' here, and will undo that when we call it again.  We need to redo
     166             :      * it to fully revert the lookahead call for error reporting purposes.
     167             :      */
     168       66524 :     yyextra->lookahead_end = yyextra->core_yy_extra.scanbuf +
     169       66524 :         *llocp + cur_token_length;
     170             :     Assert(*(yyextra->lookahead_end) == '\0');
     171             : 
     172             :     /*
     173             :      * Save and restore *llocp around the call.  It might look like we could
     174             :      * avoid this by just passing &lookahead_yylloc to core_yylex(), but that
     175             :      * does not work because flex actually holds onto the last-passed pointer
     176             :      * internally, and will use that for error reporting.  We need any error
     177             :      * reports to point to the current token, not the next one.
     178             :      */
     179       66524 :     cur_yylloc = *llocp;
     180             : 
     181             :     /* Get next token, saving outputs into lookahead variables */
     182       66524 :     next_token = core_yylex(&(yyextra->lookahead_yylval), llocp, yyscanner);
     183       66524 :     yyextra->lookahead_token = next_token;
     184       66524 :     yyextra->lookahead_yylloc = *llocp;
     185             : 
     186       66524 :     *llocp = cur_yylloc;
     187             : 
     188             :     /* Now revert the un-truncation of the current token */
     189       66524 :     yyextra->lookahead_hold_char = *(yyextra->lookahead_end);
     190       66524 :     *(yyextra->lookahead_end) = '\0';
     191             : 
     192       66524 :     yyextra->have_lookahead = true;
     193             : 
     194             :     /* Replace cur_token if needed, based on lookahead */
     195       66524 :     switch (cur_token)
     196             :     {
     197        3124 :         case FORMAT:
     198             :             /* Replace FORMAT by FORMAT_LA if it's followed by JSON */
     199             :             switch (next_token)
     200             :             {
     201         660 :                 case JSON:
     202         660 :                     cur_token = FORMAT_LA;
     203         660 :                     break;
     204             :             }
     205        3124 :             break;
     206             : 
     207       40508 :         case NOT:
     208             :             /* Replace NOT by NOT_LA if it's followed by BETWEEN, IN, etc */
     209             :             switch (next_token)
     210             :             {
     211        5344 :                 case BETWEEN:
     212             :                 case IN_P:
     213             :                 case LIKE:
     214             :                 case ILIKE:
     215             :                 case SIMILAR:
     216        5344 :                     cur_token = NOT_LA;
     217        5344 :                     break;
     218             :             }
     219       40508 :             break;
     220             : 
     221        2222 :         case NULLS_P:
     222             :             /* Replace NULLS_P by NULLS_LA if it's followed by FIRST or LAST */
     223             :             switch (next_token)
     224             :             {
     225        2034 :                 case FIRST_P:
     226             :                 case LAST_P:
     227        2034 :                     cur_token = NULLS_LA;
     228        2034 :                     break;
     229             :             }
     230        2222 :             break;
     231             : 
     232       18924 :         case WITH:
     233             :             /* Replace WITH by WITH_LA if it's followed by TIME or ORDINALITY */
     234             :             switch (next_token)
     235             :             {
     236        2588 :                 case TIME:
     237             :                 case ORDINALITY:
     238        2588 :                     cur_token = WITH_LA;
     239        2588 :                     break;
     240             :             }
     241       18924 :             break;
     242             : 
     243        1432 :         case WITHOUT:
     244             :             /* Replace WITHOUT by WITHOUT_LA if it's followed by TIME */
     245             :             switch (next_token)
     246             :             {
     247         592 :                 case TIME:
     248         592 :                     cur_token = WITHOUT_LA;
     249         592 :                     break;
     250             :             }
     251        1432 :             break;
     252             : 
     253         314 :         case UIDENT:
     254             :         case USCONST:
     255             :             /* Look ahead for UESCAPE */
     256         314 :             if (next_token == UESCAPE)
     257             :             {
     258             :                 /* Yup, so get third token, which had better be SCONST */
     259             :                 const char *escstr;
     260             : 
     261             :                 /* Again save and restore *llocp */
     262          46 :                 cur_yylloc = *llocp;
     263             : 
     264             :                 /* Un-truncate current token so errors point to third token */
     265          46 :                 *(yyextra->lookahead_end) = yyextra->lookahead_hold_char;
     266             : 
     267             :                 /* Get third token */
     268          46 :                 next_token = core_yylex(&(yyextra->lookahead_yylval),
     269             :                                         llocp, yyscanner);
     270             : 
     271             :                 /* If we throw error here, it will point to third token */
     272          46 :                 if (next_token != SCONST)
     273           6 :                     scanner_yyerror("UESCAPE must be followed by a simple string literal",
     274             :                                     yyscanner);
     275             : 
     276          40 :                 escstr = yyextra->lookahead_yylval.str;
     277          40 :                 if (strlen(escstr) != 1 || !check_uescapechar(escstr[0]))
     278           6 :                     scanner_yyerror("invalid Unicode escape character",
     279             :                                     yyscanner);
     280             : 
     281             :                 /* Now restore *llocp; errors will point to first token */
     282          34 :                 *llocp = cur_yylloc;
     283             : 
     284             :                 /* Apply Unicode conversion */
     285          34 :                 lvalp->core_yystype.str =
     286          34 :                     str_udeescape(lvalp->core_yystype.str,
     287          34 :                                   escstr[0],
     288             :                                   *llocp,
     289             :                                   yyscanner);
     290             : 
     291             :                 /*
     292             :                  * We don't need to revert the un-truncation of UESCAPE.  What
     293             :                  * we do want to do is clear have_lookahead, thereby consuming
     294             :                  * all three tokens.
     295             :                  */
     296          34 :                 yyextra->have_lookahead = false;
     297             :             }
     298             :             else
     299             :             {
     300             :                 /* No UESCAPE, so convert using default escape character */
     301         220 :                 lvalp->core_yystype.str =
     302         268 :                     str_udeescape(lvalp->core_yystype.str,
     303             :                                   '\\',
     304             :                                   *llocp,
     305             :                                   yyscanner);
     306             :             }
     307             : 
     308         254 :             if (cur_token == UIDENT)
     309             :             {
     310             :                 /* It's an identifier, so truncate as appropriate */
     311          28 :                 truncate_identifier(lvalp->core_yystype.str,
     312          28 :                                     strlen(lvalp->core_yystype.str),
     313             :                                     true);
     314          28 :                 cur_token = IDENT;
     315             :             }
     316         226 :             else if (cur_token == USCONST)
     317             :             {
     318         226 :                 cur_token = SCONST;
     319             :             }
     320         254 :             break;
     321             :     }
     322             : 
     323       66464 :     return cur_token;
     324             : }
     325             : 
     326             : /* convert hex digit (caller should have verified that) to value */
     327             : static unsigned int
     328        1808 : hexval(unsigned char c)
     329             : {
     330        1808 :     if (c >= '0' && c <= '9')
     331        1498 :         return c - '0';
     332         310 :     if (c >= 'a' && c <= 'f')
     333          60 :         return c - 'a' + 0xA;
     334         250 :     if (c >= 'A' && c <= 'F')
     335         250 :         return c - 'A' + 0xA;
     336           0 :     elog(ERROR, "invalid hexadecimal digit");
     337             :     return 0;                   /* not reached */
     338             : }
     339             : 
     340             : /* is Unicode code point acceptable? */
     341             : static void
     342         430 : check_unicode_value(pg_wchar c)
     343             : {
     344         430 :     if (!is_valid_unicode_codepoint(c))
     345           6 :         ereport(ERROR,
     346             :                 (errcode(ERRCODE_SYNTAX_ERROR),
     347             :                  errmsg("invalid Unicode escape value")));
     348         424 : }
     349             : 
     350             : /* is 'escape' acceptable as Unicode escape character (UESCAPE syntax) ? */
     351             : static bool
     352          40 : check_uescapechar(unsigned char escape)
     353             : {
     354          40 :     if (isxdigit(escape)
     355          40 :         || escape == '+'
     356          34 :         || escape == '\''
     357          34 :         || escape == '"'
     358          34 :         || scanner_isspace(escape))
     359           6 :         return false;
     360             :     else
     361          34 :         return true;
     362             : }
     363             : 
     364             : /*
     365             :  * Process Unicode escapes in "str", producing a palloc'd plain string
     366             :  *
     367             :  * escape: the escape character to use
     368             :  * position: start position of U&'' or U&"" string token
     369             :  * yyscanner: context information needed for error reports
     370             :  */
     371             : static char *
     372         302 : str_udeescape(const char *str, char escape,
     373             :               int position, core_yyscan_t yyscanner)
     374             : {
     375             :     const char *in;
     376             :     char       *new,
     377             :                *out;
     378             :     size_t      new_len;
     379         302 :     pg_wchar    pair_first = 0;
     380             :     ScannerCallbackState scbstate;
     381             : 
     382             :     /*
     383             :      * Guesstimate that result will be no longer than input, but allow enough
     384             :      * padding for Unicode conversion.
     385             :      */
     386         302 :     new_len = strlen(str) + MAX_UNICODE_EQUIVALENT_STRING + 1;
     387         302 :     new = palloc(new_len);
     388             : 
     389         302 :     in = str;
     390         302 :     out = new;
     391        1548 :     while (*in)
     392             :     {
     393             :         /* Enlarge string if needed */
     394        1288 :         size_t      out_dist = out - new;
     395             : 
     396        1288 :         if (out_dist > new_len - (MAX_UNICODE_EQUIVALENT_STRING + 1))
     397             :         {
     398           0 :             new_len *= 2;
     399           0 :             new = repalloc(new, new_len);
     400           0 :             out = new + out_dist;
     401             :         }
     402             : 
     403        1288 :         if (in[0] == escape)
     404             :         {
     405             :             /*
     406             :              * Any errors reported while processing this escape sequence will
     407             :              * have an error cursor pointing at the escape.
     408             :              */
     409         454 :             setup_scanner_errposition_callback(&scbstate, yyscanner,
     410         454 :                                                in - str + position + 3);    /* 3 for U&" */
     411         454 :             if (in[1] == escape)
     412             :             {
     413          12 :                 if (pair_first)
     414           6 :                     goto invalid_pair;
     415           6 :                 *out++ = escape;
     416           6 :                 in += 2;
     417             :             }
     418         442 :             else if (isxdigit((unsigned char) in[1]) &&
     419         392 :                      isxdigit((unsigned char) in[2]) &&
     420         392 :                      isxdigit((unsigned char) in[3]) &&
     421         392 :                      isxdigit((unsigned char) in[4]))
     422         380 :             {
     423             :                 pg_wchar    unicode;
     424             : 
     425         386 :                 unicode = (hexval(in[1]) << 12) +
     426         386 :                     (hexval(in[2]) << 8) +
     427         386 :                     (hexval(in[3]) << 4) +
     428         386 :                     hexval(in[4]);
     429         386 :                 check_unicode_value(unicode);
     430         386 :                 if (pair_first)
     431             :                 {
     432           6 :                     if (is_utf16_surrogate_second(unicode))
     433             :                     {
     434           0 :                         unicode = surrogate_pair_to_codepoint(pair_first, unicode);
     435           0 :                         pair_first = 0;
     436             :                     }
     437             :                     else
     438           6 :                         goto invalid_pair;
     439             :                 }
     440         380 :                 else if (is_utf16_surrogate_second(unicode))
     441           0 :                     goto invalid_pair;
     442             : 
     443         380 :                 if (is_utf16_surrogate_first(unicode))
     444          24 :                     pair_first = unicode;
     445             :                 else
     446             :                 {
     447         356 :                     pg_unicode_to_server(unicode, (unsigned char *) out);
     448         356 :                     out += strlen(out);
     449             :                 }
     450         380 :                 in += 5;
     451             :             }
     452          56 :             else if (in[1] == '+' &&
     453          50 :                      isxdigit((unsigned char) in[2]) &&
     454          50 :                      isxdigit((unsigned char) in[3]) &&
     455          50 :                      isxdigit((unsigned char) in[4]) &&
     456          50 :                      isxdigit((unsigned char) in[5]) &&
     457          50 :                      isxdigit((unsigned char) in[6]) &&
     458          44 :                      isxdigit((unsigned char) in[7]))
     459          32 :             {
     460             :                 pg_wchar    unicode;
     461             : 
     462          44 :                 unicode = (hexval(in[2]) << 20) +
     463          44 :                     (hexval(in[3]) << 16) +
     464          44 :                     (hexval(in[4]) << 12) +
     465          44 :                     (hexval(in[5]) << 8) +
     466          44 :                     (hexval(in[6]) << 4) +
     467          44 :                     hexval(in[7]);
     468          44 :                 check_unicode_value(unicode);
     469          38 :                 if (pair_first)
     470             :                 {
     471           6 :                     if (is_utf16_surrogate_second(unicode))
     472             :                     {
     473           0 :                         unicode = surrogate_pair_to_codepoint(pair_first, unicode);
     474           0 :                         pair_first = 0;
     475             :                     }
     476             :                     else
     477           6 :                         goto invalid_pair;
     478             :                 }
     479          32 :                 else if (is_utf16_surrogate_second(unicode))
     480           0 :                     goto invalid_pair;
     481             : 
     482          32 :                 if (is_utf16_surrogate_first(unicode))
     483           6 :                     pair_first = unicode;
     484             :                 else
     485             :                 {
     486          26 :                     pg_unicode_to_server(unicode, (unsigned char *) out);
     487          26 :                     out += strlen(out);
     488             :                 }
     489          32 :                 in += 8;
     490             :             }
     491             :             else
     492          12 :                 ereport(ERROR,
     493             :                         (errcode(ERRCODE_SYNTAX_ERROR),
     494             :                          errmsg("invalid Unicode escape"),
     495             :                          errhint("Unicode escapes must be \\XXXX or \\+XXXXXX.")));
     496             : 
     497         418 :             cancel_scanner_errposition_callback(&scbstate);
     498             :         }
     499             :         else
     500             :         {
     501         834 :             if (pair_first)
     502           6 :                 goto invalid_pair;
     503             : 
     504         828 :             *out++ = *in++;
     505             :         }
     506             :     }
     507             : 
     508             :     /* unfinished surrogate pair? */
     509         260 :     if (pair_first)
     510           6 :         goto invalid_pair;
     511             : 
     512         254 :     *out = '\0';
     513         254 :     return new;
     514             : 
     515             :     /*
     516             :      * We might get here with the error callback active, or not.  Call
     517             :      * scanner_errposition to make sure an error cursor appears; if the
     518             :      * callback is active, this is duplicative but harmless.
     519             :      */
     520          30 : invalid_pair:
     521          30 :     ereport(ERROR,
     522             :             (errcode(ERRCODE_SYNTAX_ERROR),
     523             :              errmsg("invalid Unicode surrogate pair"),
     524             :              scanner_errposition(in - str + position + 3,   /* 3 for U&" */
     525             :                                  yyscanner)));
     526             :     return NULL;                /* keep compiler quiet */
     527             : }

Generated by: LCOV version 1.14