LCOV - code coverage report
Current view: top level - src/backend/parser - parser.c (source / functions) Hit Total Coverage
Test: PostgreSQL 15devel Lines: 179 190 94.2 %
Date: 2021-12-04 23:09:10 Functions: 6 6 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * parser.c
       4             :  *      Main entry point/driver for PostgreSQL grammar
       5             :  *
       6             :  * Note that the grammar is not allowed to perform any table access
       7             :  * (since we need to be able to do basic parsing even while inside an
       8             :  * aborted transaction).  Therefore, the data structures returned by
       9             :  * the grammar are "raw" parsetrees that still need to be analyzed by
      10             :  * analyze.c and related files.
      11             :  *
      12             :  *
      13             :  * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
      14             :  * Portions Copyright (c) 1994, Regents of the University of California
      15             :  *
      16             :  * IDENTIFICATION
      17             :  *    src/backend/parser/parser.c
      18             :  *
      19             :  *-------------------------------------------------------------------------
      20             :  */
      21             : 
      22             : #include "postgres.h"
      23             : 
      24             : #include "mb/pg_wchar.h"
      25             : #include "parser/gramparse.h"
      26             : #include "parser/parser.h"
      27             : #include "parser/scansup.h"
      28             : 
      29             : static bool check_uescapechar(unsigned char escape);
      30             : static char *str_udeescape(const char *str, char escape,
      31             :                            int position, core_yyscan_t yyscanner);
      32             : 
      33             : 
      34             : /*
      35             :  * raw_parser
      36             :  *      Given a query in string form, do lexical and grammatical analysis.
      37             :  *
      38             :  * Returns a list of raw (un-analyzed) parse trees.  The contents of the
      39             :  * list have the form required by the specified RawParseMode.
                     :  *
                     :  * str: the query text; it is handed to scanner_init() as-is.
                     :  * mode: RAW_PARSE_DEFAULT for ordinary parsing.  Any other value makes
                     :  *      the matching MODE_* entry of mode_token[] the first token that
                     :  *      base_yylex() hands out, via the lookahead slot in yyextra.
                     :  *
                     :  * The result is yyextra.parsetree when base_yyparse() returns zero, and
                     :  * NIL when it returns nonzero.
                     :  *
                     :  * NOTE(review): in this coverage run the "return NIL" line is never hit,
                     :  * and scanner_finish() is reached fewer times than base_yyparse() is
                     :  * entered.  Presumably syntax errors leave base_yyparse() through the
                     :  * error-reporting machinery instead of a nonzero return -- confirm
                     :  * before relying on the NIL result.
      40             :  */
      41             : List *
      42      776332 : raw_parser(const char *str, RawParseMode mode)
      43             : {
      44             :     core_yyscan_t yyscanner;
      45             :     base_yy_extra_type yyextra;
      46             :     int         yyresult;
      47             : 
      48             :     /* initialize the flex scanner */
      49      776332 :     yyscanner = scanner_init(str, &yyextra.core_yy_extra,
      50             :                              &ScanKeywords, ScanKeywordTokens);
      51             : 
      52             :     /*
                     :      * base_yylex() only needs us to initialize the lookahead token, if any.
                     :      *
                     :      * For a non-default mode the slot is filled with the mode token at
                     :      * location 0.  lookahead_end is NULL so that base_yylex() skips
                     :      * restoring a held character.  lookahead_yylval is left unset here.
                     :      * NOTE(review): presumably the MODE_* tokens carry no semantic value,
                     :      * so the unset lookahead_yylval is never inspected -- confirm.
                     :      */
      53      776332 :     if (mode == RAW_PARSE_DEFAULT)
      54      744666 :         yyextra.have_lookahead = false;
      55             :     else
      56             :     {
      57             :         /*
                     :          * this array is indexed by RawParseMode enum
                     :          *
                     :          * The entries must stay in enum order.  mode is used as the index
                     :          * below without a range check, so it has to be a valid
                     :          * RawParseMode value.
                     :          */
      58             :         static const int mode_token[] = {
      59             :             0,                  /* RAW_PARSE_DEFAULT */
      60             :             MODE_TYPE_NAME,     /* RAW_PARSE_TYPE_NAME */
      61             :             MODE_PLPGSQL_EXPR,  /* RAW_PARSE_PLPGSQL_EXPR */
      62             :             MODE_PLPGSQL_ASSIGN1,   /* RAW_PARSE_PLPGSQL_ASSIGN1 */
      63             :             MODE_PLPGSQL_ASSIGN2,   /* RAW_PARSE_PLPGSQL_ASSIGN2 */
      64             :             MODE_PLPGSQL_ASSIGN3    /* RAW_PARSE_PLPGSQL_ASSIGN3 */
      65             :         };
      66             : 
      67       31666 :         yyextra.have_lookahead = true;
      68       31666 :         yyextra.lookahead_token = mode_token[mode];
      69       31666 :         yyextra.lookahead_yylloc = 0;
      70       31666 :         yyextra.lookahead_end = NULL;
      71             :     }
      72             : 
      73             :     /* initialize the bison parser */
      74      776332 :     parser_init(&yyextra);
      75             : 
      76             :     /* Parse! */
      77      776332 :     yyresult = base_yyparse(yyscanner);
      78             : 
      79             :     /* Clean up (release memory) */
      80      775754 :     scanner_finish(yyscanner);
      81             : 
      82      775754 :     if (yyresult)               /* error */
      83           0 :         return NIL;
      84             : 
      85      775754 :     return yyextra.parsetree;
      86             : }
      87             : 
      88             : 
      89             : /*
      90             :  * Intermediate filter between parser and core lexer (core_yylex in scan.l).
      91             :  *
      92             :  * This filter is needed because in some cases the standard SQL grammar
      93             :  * requires more than one token lookahead.  We reduce these cases to one-token
      94             :  * lookahead by replacing tokens here, in order to keep the grammar LALR(1).
      95             :  *
      96             :  * Using a filter is simpler than trying to recognize multiword tokens
      97             :  * directly in scan.l, because we'd have to allow for comments between the
      98             :  * words.  Furthermore it's not clear how to do that without re-introducing
      99             :  * scanner backtrack, which would cost more performance than this filter
     100             :  * layer does.
     101             :  *
     102             :  * We also use this filter to convert UIDENT and USCONST sequences into
     103             :  * plain IDENT and SCONST tokens.  While that could be handled by additional
     104             :  * productions in the main grammar, it's more efficient to do it like this.
     105             :  *
     106             :  * The filter also provides a convenient place to translate between
     107             :  * the core_YYSTYPE and YYSTYPE representations (which are really the
     108             :  * same thing anyway, but notationally they're different).
                     :  *
                     :  * lvalp: receives the semantic value of the returned token
                     :  * llocp: receives the location of the returned token
                     :  * yyscanner: scanner state; the base_yy_extra_type that holds the
                     :  *      one-token lookahead slot is fetched from it with pg_yyget_extra()
                     :  *
                     :  * Returns the token code.  NOT, NULLS_P and WITH may come back as NOT_LA,
                     :  * NULLS_LA and WITH_LA depending on the token that follows.  UIDENT and
                     :  * USCONST always come back as IDENT and SCONST, with their Unicode
                     :  * escapes already converted.  All other tokens pass through unchanged.
     109             :  */
     110             : int
     111    35854608 : base_yylex(YYSTYPE *lvalp, YYLTYPE *llocp, core_yyscan_t yyscanner)
     112             : {
     113    35854608 :     base_yy_extra_type *yyextra = pg_yyget_extra(yyscanner);
     114             :     int         cur_token;
     115             :     int         next_token;
     116             :     int         cur_token_length;
     117             :     YYLTYPE     cur_yylloc;
     118             : 
     119             :     /*
                     :      * Get next token --- we might already have it.  When a saved lookahead
                     :      * token is consumed, the character that was overwritten with '\0' at
                     :      * lookahead_end is put back (if lookahead_end is set) and the slot is
                     :      * marked empty.
                     :      */
     120    35854608 :     if (yyextra->have_lookahead)
     121             :     {
     122      177734 :         cur_token = yyextra->lookahead_token;
     123      177734 :         lvalp->core_yystype = yyextra->lookahead_yylval;
     124      177734 :         *llocp = yyextra->lookahead_yylloc;
     125      177734 :         if (yyextra->lookahead_end)
     126      146068 :             *(yyextra->lookahead_end) = yyextra->lookahead_hold_char;
     127      177734 :         yyextra->have_lookahead = false;
     128             :     }
     129             :     else
     130    35676874 :         cur_token = core_yylex(&(lvalp->core_yystype), llocp, yyscanner);
     131             : 
     132             :     /*
     133             :      * If this token isn't one that requires lookahead, just return it.  If it
     134             :      * does, determine the token length.  (We could get that via strlen(), but
     135             :      * since we have such a small set of possibilities, hardwiring seems
     136             :      * feasible and more efficient --- at least for the fixed-length cases.)
     137             :      */
     138    35854548 :     switch (cur_token)
     139             :     {
     140       87604 :         case NOT:
     141       87604 :             cur_token_length = 3;
     142       87604 :             break;
     143        1966 :         case NULLS_P:
     144        1966 :             cur_token_length = 5;
     145        1966 :             break;
     146       56418 :         case WITH:
     147       56418 :             cur_token_length = 4;
     148       56418 :             break;
     149         164 :         case UIDENT:
     150             :         case USCONST:
     151         164 :             cur_token_length = strlen(yyextra->core_yy_extra.scanbuf + *llocp);
     152         164 :             break;
     153    35708396 :         default:
     154    35708396 :             return cur_token;
     155             :     }
     156             : 
     157             :     /*
     158             :      * Identify end+1 of current token.  core_yylex() has temporarily stored a
     159             :      * '\0' here, and will undo that when we call it again.  We need to redo
     160             :      * it to fully revert the lookahead call for error reporting purposes.
     161             :      */
     162      146152 :     yyextra->lookahead_end = yyextra->core_yy_extra.scanbuf +
     163      146152 :         *llocp + cur_token_length;
     164             :     Assert(*(yyextra->lookahead_end) == '\0');
     165             : 
     166             :     /*
     167             :      * Save and restore *llocp around the call.  It might look like we could
     168             :      * avoid this by just passing &lookahead_yylloc to core_yylex(), but that
     169             :      * does not work because flex actually holds onto the last-passed pointer
     170             :      * internally, and will use that for error reporting.  We need any error
     171             :      * reports to point to the current token, not the next one.
     172             :      */
     173      146152 :     cur_yylloc = *llocp;
     174             : 
     175             :     /* Get next token, saving outputs into lookahead variables */
     176      146152 :     next_token = core_yylex(&(yyextra->lookahead_yylval), llocp, yyscanner);
     177      146152 :     yyextra->lookahead_token = next_token;
     178      146152 :     yyextra->lookahead_yylloc = *llocp;
     179             : 
     180      146152 :     *llocp = cur_yylloc;
     181             : 
     182             :     /* Now revert the un-truncation of the current token */
     183      146152 :     yyextra->lookahead_hold_char = *(yyextra->lookahead_end);
     184      146152 :     *(yyextra->lookahead_end) = '\0';
     185             : 
     186      146152 :     yyextra->have_lookahead = true;
     187             : 
     188             :     /* Replace cur_token if needed, based on lookahead */
     189      146152 :     switch (cur_token)
     190             :     {
     191       87604 :         case NOT:
     192             :             /*
                     :              * Replace NOT by NOT_LA if it's followed by BETWEEN, IN, etc.
                     :              * The inner switch has no default: any other following token
                     :              * leaves cur_token as NOT.  The NULLS_P and WITH cases below
                     :              * work the same way.
                     :              */
     193             :             switch (next_token)
     194             :             {
     195        4502 :                 case BETWEEN:
     196             :                 case IN_P:
     197             :                 case LIKE:
     198             :                 case ILIKE:
     199             :                 case SIMILAR:
     200        4502 :                     cur_token = NOT_LA;
     201        4502 :                     break;
     202             :             }
     203       87604 :             break;
     204             : 
     205        1966 :         case NULLS_P:
     206             :             /* Replace NULLS_P by NULLS_LA if it's followed by FIRST or LAST */
     207             :             switch (next_token)
     208             :             {
     209        1434 :                 case FIRST_P:
     210             :                 case LAST_P:
     211        1434 :                     cur_token = NULLS_LA;
     212        1434 :                     break;
     213             :             }
     214        1966 :             break;
     215             : 
     216       56418 :         case WITH:
     217             :             /* Replace WITH by WITH_LA if it's followed by TIME or ORDINALITY */
     218             :             switch (next_token)
     219             :             {
     220        9742 :                 case TIME:
     221             :                 case ORDINALITY:
     222        9742 :                     cur_token = WITH_LA;
     223        9742 :                     break;
     224             :             }
     225       56418 :             break;
     226             : 
     227         164 :         case UIDENT:
     228             :         case USCONST:
     229             :             /* Look ahead for UESCAPE */
     230         164 :             if (next_token == UESCAPE)
     231             :             {
     232             :                 /* Yup, so get third token, which had better be SCONST */
     233             :                 const char *escstr;
     234             : 
     235             :                 /* Again save and restore *llocp */
     236          32 :                 cur_yylloc = *llocp;
     237             : 
     238             :                 /* Un-truncate current token so errors point to third token */
     239          32 :                 *(yyextra->lookahead_end) = yyextra->lookahead_hold_char;
     240             : 
     241             :                 /* Get third token */
     242          32 :                 next_token = core_yylex(&(yyextra->lookahead_yylval),
     243             :                                         llocp, yyscanner);
     244             : 
     245             :                 /*
                     :                  * If we throw error here, it will point to third token.
                     :                  *
                     :                  * NOTE(review): the statements after each scanner_yyerror()
                     :                  * call are written as if that call does not return;
                     :                  * otherwise escstr would be read for a token that is not
                     :                  * SCONST -- confirm against its definition.
                     :                  */
     246          32 :                 if (next_token != SCONST)
     247           4 :                     scanner_yyerror("UESCAPE must be followed by a simple string literal",
     248             :                                     yyscanner);
     249             : 
     250          28 :                 escstr = yyextra->lookahead_yylval.str;
     251          28 :                 if (strlen(escstr) != 1 || !check_uescapechar(escstr[0]))
     252           4 :                     scanner_yyerror("invalid Unicode escape character",
     253             :                                     yyscanner);
     254             : 
     255             :                 /* Now restore *llocp; errors will point to first token */
     256          24 :                 *llocp = cur_yylloc;
     257             : 
     258             :                 /* Apply Unicode conversion */
     259          24 :                 lvalp->core_yystype.str =
     260          24 :                     str_udeescape(lvalp->core_yystype.str,
     261          24 :                                   escstr[0],
     262             :                                   *llocp,
     263             :                                   yyscanner);
     264             : 
     265             :                 /*
     266             :                  * We don't need to revert the un-truncation of UESCAPE.  What
     267             :                  * we do want to do is clear have_lookahead, thereby consuming
     268             :                  * all three tokens.
     269             :                  */
     270          24 :                 yyextra->have_lookahead = false;
     271             :             }
     272             :             else
     273             :             {
     274             :                 /* No UESCAPE, so convert using default escape character */
     275         100 :                 lvalp->core_yystype.str =
     276         132 :                     str_udeescape(lvalp->core_yystype.str,
     277             :                                   '\\',
     278             :                                   *llocp,
     279             :                                   yyscanner);
     280             :             }
     281             : 
     282         124 :             if (cur_token == UIDENT)
     283             :             {
     284             :                 /* It's an identifier, so truncate as appropriate */
     285          20 :                 truncate_identifier(lvalp->core_yystype.str,
     286          20 :                                     strlen(lvalp->core_yystype.str),
     287             :                                     true);
     288          20 :                 cur_token = IDENT;
     289             :             }
     290         104 :             else if (cur_token == USCONST)
     291             :             {
     292         104 :                 cur_token = SCONST;
     293             :             }
     294         124 :             break;
     295             :     }
     296             : 
     297      146112 :     return cur_token;
     298             : }
     299             : 
     300             : /*
                     :  * hexval
                     :  *      convert hex digit (caller should have verified that) to value
                     :  *
                     :  * Maps '0'-'9' to 0-9, and 'a'-'f' or 'A'-'F' to 10-15.  Any other input
                     :  * falls through to elog(ERROR); the final "return 0" is marked as not
                     :  * reached.
                     :  */
     301             : static unsigned int
     302         984 : hexval(unsigned char c)
     303             : {
     304         984 :     if (c >= '0' && c <= '9')
     305         828 :         return c - '0';
     306         156 :     if (c >= 'a' && c <= 'f')
     307          40 :         return c - 'a' + 0xA;
     308         116 :     if (c >= 'A' && c <= 'F')
     309         116 :         return c - 'A' + 0xA;
     310           0 :     elog(ERROR, "invalid hexadecimal digit");
     311             :     return 0;                   /* not reached */
     312             : }
     313             : 
     314             : /*
                     :  * check_unicode_value
                     :  *      is Unicode code point acceptable?
                     :  *
                     :  * Returns normally when is_valid_unicode_codepoint(c) holds; otherwise
                     :  * raises ERROR with ERRCODE_SYNTAX_ERROR ("invalid Unicode escape value").
                     :  * There is no return value: acceptance is signalled by not erroring out.
                     :  */
     315             : static void
     316         232 : check_unicode_value(pg_wchar c)
     317             : {
     318         232 :     if (!is_valid_unicode_codepoint(c))
     319           4 :         ereport(ERROR,
     320             :                 (errcode(ERRCODE_SYNTAX_ERROR),
     321             :                  errmsg("invalid Unicode escape value")));
     322         228 : }
     323             : 
     324             : /*
                     :  * check_uescapechar
                     :  *      is 'escape' acceptable as Unicode escape character (UESCAPE syntax) ?
                     :  *
                     :  * Returns false for hexadecimal digits, '+', single and double quotes,
                     :  * and anything scanner_isspace() accepts; true for every other character.
                     :  * Taking the argument as unsigned char keeps the isxdigit() call
                     :  * well-defined for bytes with the high bit set.
                     :  *
                     :  * NOTE(review): hex digits and '+' are presumably excluded because
                     :  * str_udeescape() gives them meaning right after the escape character
                     :  * (the XXXX and +XXXXXX forms) -- confirm against the SQL standard.
                     :  */
     325             : static bool
     326          28 : check_uescapechar(unsigned char escape)
     327             : {
     328          28 :     if (isxdigit(escape)
     329          28 :         || escape == '+'
     330          24 :         || escape == '\''
     331          24 :         || escape == '"'
     332          24 :         || scanner_isspace(escape))
     333           4 :         return false;
     334             :     else
     335          24 :         return true;
     336             : }
     337             : 
     338             : /*
     339             :  * Process Unicode escapes in "str", producing a palloc'd plain string
     340             :  *
     341             :  * escape: the escape character to use
     342             :  * position: start position of U&'' or U&"" string token
     343             :  * yyscanner: context information needed for error reports
                     :  *
                     :  * Recognized forms, writing E for the escape character:
                     :  *      EE          a literal escape character
                     :  *      EXXXX       code point given by four hex digits
                     :  *      E+XXXXXX    code point given by six hex digits
                     :  * Anything else after E raises "invalid Unicode escape".  Characters
                     :  * other than E are copied through unchanged.
                     :  *
                     :  * A first-surrogate code point is held in pair_first and must be followed
                     :  * immediately by an escape that yields a second surrogate; the two are
                     :  * then combined with surrogate_pair_to_codepoint().  A lone second
                     :  * surrogate, or a first surrogate followed by anything else (including
                     :  * end of string), raises "invalid Unicode surrogate pair".
                     :  *
                     :  * Returns the converted, NUL-terminated string.  Every failure is
                     :  * reported with ereport(ERROR); the trailing "return NULL" only keeps the
                     :  * compiler quiet.
     344             :  */
     345             : static char *
     346         156 : str_udeescape(const char *str, char escape,
     347             :               int position, core_yyscan_t yyscanner)
     348             : {
     349             :     const char *in;
     350             :     char       *new,
     351             :                *out;
     352             :     size_t      new_len;
     353         156 :     pg_wchar    pair_first = 0;
     354             :     ScannerCallbackState scbstate;
     355             : 
     356             :     /*
     357             :      * Guesstimate that result will be no longer than input, but allow enough
     358             :      * padding for Unicode conversion.
     359             :      */
     360         156 :     new_len = strlen(str) + MAX_UNICODE_EQUIVALENT_STRING + 1;
     361         156 :     new = palloc(new_len);
     362             : 
     363         156 :     in = str;
     364         156 :     out = new;
     365         880 :     while (*in)
     366             :     {
     367             :         /*
                     :          * Enlarge string if needed: keep at least
                     :          * MAX_UNICODE_EQUIVALENT_STRING + 1 bytes free ahead of "out" at the
                     :          * top of every iteration.  "out" is rebuilt from its offset because
                     :          * repalloc() returns the (possibly different) buffer address.
                     :          *
                     :          * NOTE(review): this presumes one pg_unicode_to_server() call never
                     :          * writes more than that many bytes, terminator included -- confirm
                     :          * against its definition.
                     :          */
     368         752 :         size_t      out_dist = out - new;
     369             : 
     370         752 :         if (out_dist > new_len - (MAX_UNICODE_EQUIVALENT_STRING + 1))
     371             :         {
     372           0 :             new_len *= 2;
     373           0 :             new = repalloc(new, new_len);
     374           0 :             out = new + out_dist;
     375             :         }
     376             : 
     377         752 :         if (in[0] == escape)
     378             :         {
     379             :             /*
     380             :              * Any errors reported while processing this escape sequence will
     381             :              * have an error cursor pointing at the escape.
                     :              *
                     :              * The isxdigit() chains below cannot read past the end of the
                     :              * string: && stops at the first failing test, and the NUL
                     :              * terminator is neither a hex digit nor '+'.
     382             :              */
     383         248 :             setup_scanner_errposition_callback(&scbstate, yyscanner,
     384         248 :                                                in - str + position + 3);    /* 3 for U&" */
     385         248 :             if (in[1] == escape)
     386             :             {
     387           8 :                 if (pair_first)
     388           4 :                     goto invalid_pair;
     389           4 :                 *out++ = escape;
     390           4 :                 in += 2;
     391             :             }
     392         240 :             else if (isxdigit((unsigned char) in[1]) &&
     393         208 :                      isxdigit((unsigned char) in[2]) &&
     394         208 :                      isxdigit((unsigned char) in[3]) &&
     395         208 :                      isxdigit((unsigned char) in[4]))
     396         200 :             {
     397             :                 pg_wchar    unicode;
     398             : 
     399         204 :                 unicode = (hexval(in[1]) << 12) +
     400         204 :                     (hexval(in[2]) << 8) +
     401         204 :                     (hexval(in[3]) << 4) +
     402         204 :                     hexval(in[4]);
     403         204 :                 check_unicode_value(unicode);
     404         204 :                 if (pair_first)
     405             :                 {
     406           4 :                     if (is_utf16_surrogate_second(unicode))
     407             :                     {
     408           0 :                         unicode = surrogate_pair_to_codepoint(pair_first, unicode);
     409           0 :                         pair_first = 0;
     410             :                     }
     411             :                     else
     412           4 :                         goto invalid_pair;
     413             :                 }
     414         200 :                 else if (is_utf16_surrogate_second(unicode))
     415           0 :                     goto invalid_pair;
     416             : 
     417         200 :                 if (is_utf16_surrogate_first(unicode))
     418          16 :                     pair_first = unicode;
     419             :                 else
     420             :                 {
     421         184 :                     pg_unicode_to_server(unicode, (unsigned char *) out);
     422         184 :                     out += strlen(out);
     423             :                 }
     424         200 :                 in += 5;
     425             :             }
     426          36 :             else if (in[1] == '+' &&
     427          32 :                      isxdigit((unsigned char) in[2]) &&
     428          32 :                      isxdigit((unsigned char) in[3]) &&
     429          32 :                      isxdigit((unsigned char) in[4]) &&
     430          32 :                      isxdigit((unsigned char) in[5]) &&
     431          32 :                      isxdigit((unsigned char) in[6]) &&
     432          28 :                      isxdigit((unsigned char) in[7]))
     433          20 :             {
     434             :                 pg_wchar    unicode;
     435             : 
     436          28 :                 unicode = (hexval(in[2]) << 20) +
     437          28 :                     (hexval(in[3]) << 16) +
     438          28 :                     (hexval(in[4]) << 12) +
     439          28 :                     (hexval(in[5]) << 8) +
     440          28 :                     (hexval(in[6]) << 4) +
     441          28 :                     hexval(in[7]);
     442          28 :                 check_unicode_value(unicode);
     443          24 :                 if (pair_first)
     444             :                 {
     445           4 :                     if (is_utf16_surrogate_second(unicode))
     446             :                     {
     447           0 :                         unicode = surrogate_pair_to_codepoint(pair_first, unicode);
     448           0 :                         pair_first = 0;
     449             :                     }
     450             :                     else
     451           4 :                         goto invalid_pair;
     452             :                 }
     453          20 :                 else if (is_utf16_surrogate_second(unicode))
     454           0 :                     goto invalid_pair;
     455             : 
     456          20 :                 if (is_utf16_surrogate_first(unicode))
     457           4 :                     pair_first = unicode;
     458             :                 else
     459             :                 {
     460          16 :                     pg_unicode_to_server(unicode, (unsigned char *) out);
     461          16 :                     out += strlen(out);
     462             :                 }
     463          20 :                 in += 8;
     464             :             }
     465             :             else
     466           8 :                 ereport(ERROR,
     467             :                         (errcode(ERRCODE_SYNTAX_ERROR),
     468             :                          errmsg("invalid Unicode escape"),
     469             :                          errhint("Unicode escapes must be \\XXXX or \\+XXXXXX.")));
     470             : 
     471         224 :             cancel_scanner_errposition_callback(&scbstate);
     472             :         }
     473             :         else
     474             :         {
     475         504 :             if (pair_first)
     476           4 :                 goto invalid_pair;
     477             : 
     478         500 :             *out++ = *in++;
     479             :         }
     480             :     }
     481             : 
     482             :     /* unfinished surrogate pair? */
     483         128 :     if (pair_first)
     484           4 :         goto invalid_pair;
     485             : 
     486         124 :     *out = '\0';
     487         124 :     return new;
     488             : 
     489             :     /*
     490             :      * We might get here with the error callback active, or not.  Call
     491             :      * scanner_errposition to make sure an error cursor appears; if the
     492             :      * callback is active, this is duplicative but harmless.
     493             :      */
     494          20 : invalid_pair:
     495          20 :     ereport(ERROR,
     496             :             (errcode(ERRCODE_SYNTAX_ERROR),
     497             :              errmsg("invalid Unicode surrogate pair"),
     498             :              scanner_errposition(in - str + position + 3,   /* 3 for U&" */
     499             :                                  yyscanner)));
     500             :     return NULL;                /* keep compiler quiet */
     501             : }

Generated by: LCOV version 1.14