LCOV - code coverage report
Current view: top level - src/backend/parser - parser.c (source / functions) Hit Total Coverage
Test: PostgreSQL 13beta1 Lines: 173 184 94.0 %
Date: 2020-06-01 00:06:26 Functions: 6 6 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * parser.c
       4             :  *      Main entry point/driver for PostgreSQL grammar
       5             :  *
       6             :  * Note that the grammar is not allowed to perform any table access
       7             :  * (since we need to be able to do basic parsing even while inside an
       8             :  * aborted transaction).  Therefore, the data structures returned by
       9             :  * the grammar are "raw" parsetrees that still need to be analyzed by
      10             :  * analyze.c and related files.
      11             :  *
      12             :  *
      13             :  * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
      14             :  * Portions Copyright (c) 1994, Regents of the University of California
      15             :  *
      16             :  * IDENTIFICATION
      17             :  *    src/backend/parser/parser.c
      18             :  *
      19             :  *-------------------------------------------------------------------------
      20             :  */
      21             : 
      22             : #include "postgres.h"
      23             : 
      24             : #include "mb/pg_wchar.h"
      25             : #include "parser/gramparse.h"
      26             : #include "parser/parser.h"
      27             : #include "parser/scansup.h"
      28             : 
      29             : static bool check_uescapechar(unsigned char escape);
      30             : static char *str_udeescape(const char *str, char escape,
      31             :                            int position, core_yyscan_t yyscanner);
      32             : 
      33             : 
      34             : /*
      35             :  * raw_parser
      36             :  *      Given a query in string form, do lexical and grammatical analysis.
      37             :  *
      38             :  * Returns a list of raw (un-analyzed) parse trees.  The immediate elements
      39             :  * of the list are always RawStmt nodes.
                     :  *
                     :  * "str" is handed to scanner_init() to set up the flex scanner.  If
                     :  * base_yyparse() returns nonzero, the result is NIL; otherwise it is
                     :  * yyextra.parsetree (presumably filled in by the grammar's actions).
                     :  * scanner_finish() runs before either return to release scanner memory.
      40             :  */
      41             : List *
      42      535592 : raw_parser(const char *str)
      43             : {
      44             :     core_yyscan_t yyscanner;
      45             :     base_yy_extra_type yyextra;
      46             :     int         yyresult;
      47             : 
      48             :     /* initialize the flex scanner */
      49      535592 :     yyscanner = scanner_init(str, &yyextra.core_yy_extra,
      50             :                              &ScanKeywords, ScanKeywordTokens);
      51             : 
      52             :     /* base_yylex() only needs this much initialization */
      53      535592 :     yyextra.have_lookahead = false;
      54             : 
      55             :     /* initialize the bison parser */
      56      535592 :     parser_init(&yyextra);
      57             : 
      58             :     /* Parse! */
      59      535592 :     yyresult = base_yyparse(yyscanner);
      60             : 
      61             :     /* Clean up (release memory) */
      62      535018 :     scanner_finish(yyscanner);
      63             : 
      64      535018 :     if (yyresult)               /* error */
      65           0 :         return NIL;
      66             : 
      67      535018 :     return yyextra.parsetree;
      68             : }
      69             : 
      70             : 
      71             : /*
      72             :  * Intermediate filter between parser and core lexer (core_yylex in scan.l).
      73             :  *
      74             :  * This filter is needed because in some cases the standard SQL grammar
      75             :  * requires more than one token lookahead.  We reduce these cases to one-token
      76             :  * lookahead by replacing tokens here, in order to keep the grammar LALR(1).
      77             :  *
      78             :  * Using a filter is simpler than trying to recognize multiword tokens
      79             :  * directly in scan.l, because we'd have to allow for comments between the
      80             :  * words.  Furthermore it's not clear how to do that without re-introducing
      81             :  * scanner backtrack, which would cost more performance than this filter
      82             :  * layer does.
      83             :  *
      84             :  * We also use this filter to convert UIDENT and USCONST sequences into
      85             :  * plain IDENT and SCONST tokens.  While that could be handled by additional
      86             :  * productions in the main grammar, it's more efficient to do it like this.
      87             :  *
      88             :  * The filter also provides a convenient place to translate between
      89             :  * the core_YYSTYPE and YYSTYPE representations (which are really the
      90             :  * same thing anyway, but notationally they're different).
                     :  *
                     :  * Summary of the processing below:
                     :  *  - NOT, NULLS_P and WITH are returned as NOT_LA, NULLS_LA and WITH_LA
                     :  *    respectively when the following token is one of a fixed set
                     :  *    (BETWEEN/IN_P/LIKE/ILIKE/SIMILAR; FIRST_P/LAST_P; TIME/ORDINALITY).
                     :  *  - UIDENT and USCONST have their Unicode escapes expanded by
                     :  *    str_udeescape(), using the character given by a following
                     :  *    UESCAPE SCONST pair if present and backslash otherwise, and are
                     :  *    returned as IDENT and SCONST.
                     :  *  - Any other token is returned unchanged.
                     :  *
                     :  * The token code is the return value; its semantic value and location
                     :  * are delivered through *lvalp and *llocp.  A token fetched as lookahead
                     :  * but not consumed is kept in yyextra and handed out by the next call.
      91             :  */
      92             : int
      93    20080280 : base_yylex(YYSTYPE *lvalp, YYLTYPE *llocp, core_yyscan_t yyscanner)
      94             : {
      95    20080280 :     base_yy_extra_type *yyextra = pg_yyget_extra(yyscanner);
      96             :     int         cur_token;
      97             :     int         next_token;
      98             :     int         cur_token_length;
      99             :     YYLTYPE     cur_yylloc;
     100             : 
     101             :     /* Get next token --- we might already have it */
     102    20080280 :     if (yyextra->have_lookahead)
     103             :     {
     104       89884 :         cur_token = yyextra->lookahead_token;
     105       89884 :         lvalp->core_yystype = yyextra->lookahead_yylval;
     106       89884 :         *llocp = yyextra->lookahead_yylloc;
     107       89884 :         *(yyextra->lookahead_end) = yyextra->lookahead_hold_char;
     108       89884 :         yyextra->have_lookahead = false;
     109             :     }
     110             :     else
     111    19990396 :         cur_token = core_yylex(&(lvalp->core_yystype), llocp, yyscanner);
     112             : 
     113             :     /*
     114             :      * If this token isn't one that requires lookahead, just return it.  If it
     115             :      * does, determine the token length.  (We could get that via strlen(), but
     116             :      * since we have such a small set of possibilities, hardwiring seems
     117             :      * feasible and more efficient --- at least for the fixed-length cases.)
     118             :      */
     119    20080220 :     switch (cur_token)
     120             :     {
     121       52430 :         case NOT:
     122       52430 :             cur_token_length = 3;
     123       52430 :             break;
     124        1668 :         case NULLS_P:
     125        1668 :             cur_token_length = 5;
     126        1668 :             break;
     127       35706 :         case WITH:
     128       35706 :             cur_token_length = 4;
     129       35706 :             break;
     130         164 :         case UIDENT:
     131             :         case USCONST:
     132         164 :             cur_token_length = strlen(yyextra->core_yy_extra.scanbuf + *llocp);
     133         164 :             break;
     134    19990252 :         default:
     135    19990252 :             return cur_token;
     136             :     }
     137             : 
     138             :     /*
     139             :      * Identify end+1 of current token.  core_yylex() has temporarily stored a
     140             :      * '\0' here, and will undo that when we call it again.  We need to redo
     141             :      * it to fully revert the lookahead call for error reporting purposes.
     142             :      */
     143      179936 :     yyextra->lookahead_end = yyextra->core_yy_extra.scanbuf +
     144       89968 :         *llocp + cur_token_length;
     145             :     Assert(*(yyextra->lookahead_end) == '\0');
     146             : 
     147             :     /*
     148             :      * Save and restore *llocp around the call.  It might look like we could
     149             :      * avoid this by just passing &lookahead_yylloc to core_yylex(), but that
     150             :      * does not work because flex actually holds onto the last-passed pointer
     151             :      * internally, and will use that for error reporting.  We need any error
     152             :      * reports to point to the current token, not the next one.
     153             :      */
     154       89968 :     cur_yylloc = *llocp;
     155             : 
     156             :     /* Get next token, saving outputs into lookahead variables */
     157       89968 :     next_token = core_yylex(&(yyextra->lookahead_yylval), llocp, yyscanner);
     158       89968 :     yyextra->lookahead_token = next_token;
     159       89968 :     yyextra->lookahead_yylloc = *llocp;
     160             : 
     161       89968 :     *llocp = cur_yylloc;
     162             : 
     163             :     /* Now revert the un-truncation of the current token */
     164       89968 :     yyextra->lookahead_hold_char = *(yyextra->lookahead_end);
     165       89968 :     *(yyextra->lookahead_end) = '\0';
     166             : 
     167       89968 :     yyextra->have_lookahead = true;
     168             : 
     169             :     /* Replace cur_token if needed, based on lookahead */
     170       89968 :     switch (cur_token)
     171             :     {
     172       52430 :         case NOT:
     173             :             /* Replace NOT by NOT_LA if it's followed by BETWEEN, IN, etc */
     174             :             switch (next_token)
     175             :             {
     176        3372 :                 case BETWEEN:
     177             :                 case IN_P:
     178             :                 case LIKE:
     179             :                 case ILIKE:
     180             :                 case SIMILAR:
     181        3372 :                     cur_token = NOT_LA;
     182        3372 :                     break;
     183             :             }
     184       52430 :             break;
     185             : 
     186        1668 :         case NULLS_P:
     187             :             /* Replace NULLS_P by NULLS_LA if it's followed by FIRST or LAST */
     188             :             switch (next_token)
     189             :             {
     190        1286 :                 case FIRST_P:
     191             :                 case LAST_P:
     192        1286 :                     cur_token = NULLS_LA;
     193        1286 :                     break;
     194             :             }
     195        1668 :             break;
     196             : 
     197       35706 :         case WITH:
     198             :             /* Replace WITH by WITH_LA if it's followed by TIME or ORDINALITY */
     199             :             switch (next_token)
     200             :             {
     201        7638 :                 case TIME:
     202             :                 case ORDINALITY:
     203        7638 :                     cur_token = WITH_LA;
     204        7638 :                     break;
     205             :             }
     206       35706 :             break;
     207             : 
     208         164 :         case UIDENT:
     209             :         case USCONST:
     210             :             /* Look ahead for UESCAPE */
     211         164 :             if (next_token == UESCAPE)
     212             :             {
     213             :                 /* Yup, so get third token, which had better be SCONST */
     214             :                 const char *escstr;
     215             : 
     216             :                 /* Again save and restore *llocp */
     217          32 :                 cur_yylloc = *llocp;
     218             : 
     219             :                 /* Un-truncate current token so errors point to third token */
     220          32 :                 *(yyextra->lookahead_end) = yyextra->lookahead_hold_char;
     221             : 
     222             :                 /* Get third token */
     223          32 :                 next_token = core_yylex(&(yyextra->lookahead_yylval),
     224             :                                         llocp, yyscanner);
     225             : 
     226             :                 /* If we throw error here, it will point to third token */
     227          32 :                 if (next_token != SCONST)
     228           4 :                     scanner_yyerror("UESCAPE must be followed by a simple string literal",
     229             :                                     yyscanner);
     230             : 
     231          28 :                 escstr = yyextra->lookahead_yylval.str;
     232          28 :                 if (strlen(escstr) != 1 || !check_uescapechar(escstr[0]))
     233           4 :                     scanner_yyerror("invalid Unicode escape character",
     234             :                                     yyscanner);
     235             : 
     236             :                 /* Now restore *llocp; errors will point to first token */
     237          24 :                 *llocp = cur_yylloc;
     238             : 
     239             :                 /* Apply Unicode conversion */
     240          24 :                 lvalp->core_yystype.str =
     241          48 :                     str_udeescape(lvalp->core_yystype.str,
     242          24 :                                   escstr[0],
     243             :                                   *llocp,
     244             :                                   yyscanner);
     245             : 
     246             :                 /*
     247             :                  * We don't need to revert the un-truncation of UESCAPE.  What
     248             :                  * we do want to do is clear have_lookahead, thereby consuming
     249             :                  * all three tokens.
     250             :                  */
     251          24 :                 yyextra->have_lookahead = false;
     252             :             }
     253             :             else
     254             :             {
     255             :                 /* No UESCAPE, so convert using default escape character */
     256         100 :                 lvalp->core_yystype.str =
     257         132 :                     str_udeescape(lvalp->core_yystype.str,
     258             :                                   '\\',
     259             :                                   *llocp,
     260             :                                   yyscanner);
     261             :             }
     262             : 
     263         124 :             if (cur_token == UIDENT)
     264             :             {
     265             :                 /* It's an identifier, so truncate as appropriate */
     266          20 :                 truncate_identifier(lvalp->core_yystype.str,
     267          20 :                                     strlen(lvalp->core_yystype.str),
     268             :                                     true);
     269          20 :                 cur_token = IDENT;
     270             :             }
     271         104 :             else if (cur_token == USCONST)
     272             :             {
     273         104 :                 cur_token = SCONST;
     274             :             }
     275         124 :             break;
     276             :     }
     277             : 
     278       89928 :     return cur_token;
     279             : }
     280             : 
     281             : /*
                     :  * Convert hex digit (caller should have verified that) to value.
                     :  *
                     :  * Accepts '0'-'9', 'a'-'f' and 'A'-'F' and returns the digit's numeric
                     :  * value.  Any other input is reported with elog(ERROR); the trailing
                     :  * "return 0" is marked as not reached.
                     :  */
     282             : static unsigned int
     283         984 : hexval(unsigned char c)
     284             : {
     285         984 :     if (c >= '0' && c <= '9')
     286         828 :         return c - '0';
     287         156 :     if (c >= 'a' && c <= 'f')
     288          40 :         return c - 'a' + 0xA;
     289         116 :     if (c >= 'A' && c <= 'F')
     290         116 :         return c - 'A' + 0xA;
     291           0 :     elog(ERROR, "invalid hexadecimal digit");
     292             :     return 0;                   /* not reached */
     293             : }
     294             : 
     295             : /*
                     :  * Is Unicode code point acceptable?
                     :  *
                     :  * Returns normally if is_valid_unicode_codepoint() accepts c; otherwise
                     :  * raises a syntax error ("invalid Unicode escape value").  There is no
                     :  * return value for the caller to test.
                     :  */
     296             : static void
     297         232 : check_unicode_value(pg_wchar c)
     298             : {
     299         232 :     if (!is_valid_unicode_codepoint(c))
     300           4 :         ereport(ERROR,
     301             :                 (errcode(ERRCODE_SYNTAX_ERROR),
     302             :                  errmsg("invalid Unicode escape value")));
     303         228 : }
     304             : 
     305             : /*
                     :  * Is 'escape' acceptable as Unicode escape character (UESCAPE syntax)?
                     :  *
                     :  * Returns false for hex digits, '+', single and double quotes, and
                     :  * anything scanner_isspace() accepts; returns true for every other
                     :  * character.  (Hex digits and '+' can directly follow the escape
                     :  * character in an escape sequence, so presumably they are excluded to
                     :  * keep escapes unambiguous.)
                     :  */
     306             : static bool
     307          28 : check_uescapechar(unsigned char escape)
     308             : {
     309          28 :     if (isxdigit(escape)
     310          28 :         || escape == '+'
     311          24 :         || escape == '\''
     312          24 :         || escape == '"'
     313          24 :         || scanner_isspace(escape))
     314           4 :         return false;
     315             :     else
     316          24 :         return true;
     317             : }
     318             : 
     319             : /*
     320             :  * Process Unicode escapes in "str", producing a palloc'd plain string
     321             :  *
     322             :  * escape: the escape character to use
     323             :  * position: start position of U&'' or U&"" string token
     324             :  * yyscanner: context information needed for error reports
                     :  *
                     :  * Three escape forms are recognized after the escape character: a second
                     :  * escape character (yielding one literal escape character), four hex
                     :  * digits, or '+' followed by six hex digits.  Anything else following the
                     :  * escape character is reported as "invalid Unicode escape".
                     :  *
                     :  * A first-half UTF-16 surrogate is held in pair_first and must be
                     :  * followed immediately by an escape giving a second-half surrogate; the
                     :  * two are then combined into a single code point.  Any other arrangement
                     :  * is reported as "invalid Unicode surrogate pair".
                     :  *
                     :  * Code points are checked with check_unicode_value() and written to the
                     :  * output with pg_unicode_to_server().  All errors are raised via ereport.
     325             :  */
     326             : static char *
     327         156 : str_udeescape(const char *str, char escape,
     328             :               int position, core_yyscan_t yyscanner)
     329             : {
     330             :     const char *in;
     331             :     char       *new,
     332             :                *out;
     333             :     size_t      new_len;
     334         156 :     pg_wchar    pair_first = 0;
     335             :     ScannerCallbackState scbstate;
     336             : 
     337             :     /*
     338             :      * Guesstimate that result will be no longer than input, but allow enough
     339             :      * padding for Unicode conversion.
     340             :      */
     341         156 :     new_len = strlen(str) + MAX_UNICODE_EQUIVALENT_STRING + 1;
     342         156 :     new = palloc(new_len);
     343             : 
     344         156 :     in = str;
     345         156 :     out = new;
     346         864 :     while (*in)
     347             :     {
     348             :         /* Enlarge string if needed */
     349         736 :         size_t      out_dist = out - new;
     350             : 
     351         736 :         if (out_dist > new_len - (MAX_UNICODE_EQUIVALENT_STRING + 1))
     352             :         {
     353           0 :             new_len *= 2;
     354           0 :             new = repalloc(new, new_len);
     355           0 :             out = new + out_dist;
     356             :         }
     357             : 
     358         736 :         if (in[0] == escape)
     359             :         {
     360             :             /*
     361             :              * Any errors reported while processing this escape sequence will
     362             :              * have an error cursor pointing at the escape.
     363             :              */
     364         248 :             setup_scanner_errposition_callback(&scbstate, yyscanner,
     365         248 :                                                in - str + position + 3);    /* 3 for U&" */
     366         248 :             if (in[1] == escape)
     367             :             {
     368           8 :                 if (pair_first)
     369           4 :                     goto invalid_pair;
     370           4 :                 *out++ = escape;
     371           4 :                 in += 2;
     372             :             }
     373         240 :             else if (isxdigit((unsigned char) in[1]) &&
     374         208 :                      isxdigit((unsigned char) in[2]) &&
     375         208 :                      isxdigit((unsigned char) in[3]) &&
     376         208 :                      isxdigit((unsigned char) in[4]))
     377         200 :             {
     378             :                 pg_wchar    unicode;
     379             : 
     380         204 :                 unicode = (hexval(in[1]) << 12) +
     381         204 :                     (hexval(in[2]) << 8) +
     382         204 :                     (hexval(in[3]) << 4) +
     383         204 :                     hexval(in[4]);
     384         204 :                 check_unicode_value(unicode);
     385         204 :                 if (pair_first)
     386             :                 {
     387           4 :                     if (is_utf16_surrogate_second(unicode))
     388             :                     {
     389           0 :                         unicode = surrogate_pair_to_codepoint(pair_first, unicode);
     390           0 :                         pair_first = 0;
     391             :                     }
     392             :                     else
     393           4 :                         goto invalid_pair;
     394             :                 }
     395         200 :                 else if (is_utf16_surrogate_second(unicode))
     396           0 :                     goto invalid_pair;
     397             : 
     398         200 :                 if (is_utf16_surrogate_first(unicode))
     399          16 :                     pair_first = unicode;
     400             :                 else
     401             :                 {
     402         184 :                     pg_unicode_to_server(unicode, (unsigned char *) out);
     403         184 :                     out += strlen(out);
     404             :                 }
     405         200 :                 in += 5;
     406             :             }
     407          36 :             else if (in[1] == '+' &&
     408          32 :                      isxdigit((unsigned char) in[2]) &&
     409          32 :                      isxdigit((unsigned char) in[3]) &&
     410          32 :                      isxdigit((unsigned char) in[4]) &&
     411          32 :                      isxdigit((unsigned char) in[5]) &&
     412          32 :                      isxdigit((unsigned char) in[6]) &&
     413          28 :                      isxdigit((unsigned char) in[7]))
     414          20 :             {
     415             :                 pg_wchar    unicode;
     416             : 
     417          28 :                 unicode = (hexval(in[2]) << 20) +
     418          28 :                     (hexval(in[3]) << 16) +
     419          28 :                     (hexval(in[4]) << 12) +
     420          28 :                     (hexval(in[5]) << 8) +
     421          28 :                     (hexval(in[6]) << 4) +
     422          28 :                     hexval(in[7]);
     423          28 :                 check_unicode_value(unicode);
     424          24 :                 if (pair_first)
     425             :                 {
     426           4 :                     if (is_utf16_surrogate_second(unicode))
     427             :                     {
     428           0 :                         unicode = surrogate_pair_to_codepoint(pair_first, unicode);
     429           0 :                         pair_first = 0;
     430             :                     }
     431             :                     else
     432           4 :                         goto invalid_pair;
     433             :                 }
     434          20 :                 else if (is_utf16_surrogate_second(unicode))
     435           0 :                     goto invalid_pair;
     436             : 
     437          20 :                 if (is_utf16_surrogate_first(unicode))
     438           4 :                     pair_first = unicode;
     439             :                 else
     440             :                 {
     441          16 :                     pg_unicode_to_server(unicode, (unsigned char *) out);
     442          16 :                     out += strlen(out);
     443             :                 }
     444          20 :                 in += 8;
     445             :             }
     446             :             else
     447           8 :                 ereport(ERROR,
     448             :                         (errcode(ERRCODE_SYNTAX_ERROR),
     449             :                          errmsg("invalid Unicode escape"),
     450             :                          errhint("Unicode escapes must be \\XXXX or \\+XXXXXX.")));
     451             : 
     452         224 :             cancel_scanner_errposition_callback(&scbstate);
     453             :         }
     454             :         else
     455             :         {
     456         488 :             if (pair_first)
     457           4 :                 goto invalid_pair;
     458             : 
     459         484 :             *out++ = *in++;
     460             :         }
     461             :     }
     462             : 
     463             :     /* unfinished surrogate pair? */
     464         128 :     if (pair_first)
     465           4 :         goto invalid_pair;
     466             : 
     467         124 :     *out = '\0';
     468         124 :     return new;
     469             : 
     470             :     /*
     471             :      * We might get here with the error callback active, or not.  Call
     472             :      * scanner_errposition to make sure an error cursor appears; if the
     473             :      * callback is active, this is duplicative but harmless.
     474             :      */
     475          20 : invalid_pair:
     476          20 :     ereport(ERROR,
     477             :             (errcode(ERRCODE_SYNTAX_ERROR),
     478             :              errmsg("invalid Unicode surrogate pair"),
     479             :              scanner_errposition(in - str + position + 3,   /* 3 for U&" */
     480             :                                  yyscanner)));
     481             :     return NULL;                /* keep compiler quiet */
     482             : }

Generated by: LCOV version 1.13