LCOV - code coverage report
Current view: top level - src/interfaces/ecpg/preproc - parser.c (source / functions) Hit Total Coverage
Test: PostgreSQL 18devel Lines: 95 102 93.1 %
Date: 2025-01-18 04:15:08 Functions: 4 4 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * parser.c
       4             :  *      Main entry point/driver for PostgreSQL grammar
       5             :  *
       6             :  * This should match src/backend/parser/parser.c, except that we do not
       7             :  * need to bother with re-entrant interfaces.
       8             :  *
       9             :  * Note: ECPG doesn't report error location like the backend does.
      10             :  * This file will need work if we ever want it to.
      11             :  *
      12             :  *
      13             :  * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
      14             :  * Portions Copyright (c) 1994, Regents of the University of California
      15             :  *
      16             :  * IDENTIFICATION
      17             :  *    src/interfaces/ecpg/preproc/parser.c
      18             :  *
      19             :  *-------------------------------------------------------------------------
      20             :  */
      21             : 
      22             : #include "postgres_fe.h"
      23             : 
      24             : #include "preproc_extern.h"
      25             : #include "preproc.h"
      26             : 
      27             : 
      28             : static bool have_lookahead;     /* is lookahead info valid? */
      29             : static int  lookahead_token;    /* one-token lookahead */
      30             : static YYSTYPE lookahead_yylval;    /* yylval for lookahead token */
      31             : static YYLTYPE lookahead_yylloc;    /* yylloc for lookahead token */
      32             : static char *lookahead_yytext;  /* yytext for lookahead token */
      33             : 
      34             : static int  base_yylex_location(void);
      35             : static bool check_uescapechar(unsigned char escape);
      36             : static bool ecpg_isspace(char ch);
      37             : 
      38             : 
      39             : /*
      40             :  * Intermediate filter between parser and base lexer (base_yylex in pgc.l).
      41             :  *
      42             :  * This filter is needed because in some cases the standard SQL grammar
      43             :  * requires more than one token lookahead.  We reduce these cases to one-token
      44             :  * lookahead by replacing tokens here, in order to keep the grammar LALR(1).
      45             :  *
      46             :  * Using a filter is simpler than trying to recognize multiword tokens
      47             :  * directly in pgc.l, because we'd have to allow for comments between the
      48             :  * words.  Furthermore it's not clear how to do that without re-introducing
      49             :  * scanner backtrack, which would cost more performance than this filter
      50             :  * layer does.
      51             :  *
      52             :  * We also use this filter to convert UIDENT and USCONST sequences into
      53             :  * plain IDENT and SCONST tokens.  While that could be handled by additional
      54             :  * productions in the main grammar, it's more efficient to do it like this.
      55             :  */
      56             : int
      57       71144 : filtered_base_yylex(void)
      58             : {
      59             :     int         cur_token;
      60             :     int         next_token;
      61             :     YYSTYPE     cur_yylval;
      62             :     YYLTYPE     cur_yylloc;
      63             :     char       *cur_yytext;
      64             : 
      65             :     /* Get next token --- we might already have it */
      66       71144 :     if (have_lookahead)
      67             :     {
      68         114 :         cur_token = lookahead_token;
      69         114 :         base_yylval = lookahead_yylval;
      70         114 :         base_yylloc = lookahead_yylloc;
      71         114 :         base_yytext = lookahead_yytext;
      72         114 :         have_lookahead = false;
      73             :     }
      74             :     else
      75       71030 :         cur_token = base_yylex_location();
      76             : 
      77             :     /*
      78             :      * If this token isn't one that requires lookahead, just return it.
      79             :      */
      80       71144 :     switch (cur_token)
      81             :     {
      82         116 :         case FORMAT:
      83             :         case NOT:
      84             :         case NULLS_P:
      85             :         case WITH:
      86             :         case WITHOUT:
      87             :         case UIDENT:
      88             :         case USCONST:
      89         116 :             break;
      90       71028 :         default:
      91       71028 :             return cur_token;
      92             :     }
      93             : 
      94             :     /* Save and restore lexer output variables around the call */
      95         116 :     cur_yylval = base_yylval;
      96         116 :     cur_yylloc = base_yylloc;
      97         116 :     cur_yytext = base_yytext;
      98             : 
      99             :     /* Get next token, saving outputs into lookahead variables */
     100         116 :     next_token = base_yylex_location();
     101             : 
     102         116 :     lookahead_token = next_token;
     103         116 :     lookahead_yylval = base_yylval;
     104         116 :     lookahead_yylloc = base_yylloc;
     105         116 :     lookahead_yytext = base_yytext;
     106             : 
     107         116 :     base_yylval = cur_yylval;
     108         116 :     base_yylloc = cur_yylloc;
     109         116 :     base_yytext = cur_yytext;
     110             : 
     111         116 :     have_lookahead = true;
     112             : 
     113             :     /* Replace cur_token if needed, based on lookahead */
     114         116 :     switch (cur_token)
     115             :     {
     116          10 :         case FORMAT:
     117             :             /* Replace FORMAT by FORMAT_LA if it's followed by JSON */
     118             :             switch (next_token)
     119             :             {
     120          10 :                 case JSON:
     121          10 :                     cur_token = FORMAT_LA;
     122          10 :                     break;
     123             :             }
     124          10 :             break;
     125             : 
     126          72 :         case NOT:
     127             :             /* Replace NOT by NOT_LA if it's followed by BETWEEN, IN, etc */
     128             :             switch (next_token)
     129             :             {
     130           0 :                 case BETWEEN:
     131             :                 case IN_P:
     132             :                 case LIKE:
     133             :                 case ILIKE:
     134             :                 case SIMILAR:
     135           0 :                     cur_token = NOT_LA;
     136           0 :                     break;
     137             :             }
     138          72 :             break;
     139             : 
     140           4 :         case NULLS_P:
     141             :             /* Replace NULLS_P by NULLS_LA if it's followed by FIRST or LAST */
     142             :             switch (next_token)
     143             :             {
     144           4 :                 case FIRST_P:
     145             :                 case LAST_P:
     146           4 :                     cur_token = NULLS_LA;
     147           4 :                     break;
     148             :             }
     149           4 :             break;
     150             : 
     151          16 :         case WITH:
     152             :             /* Replace WITH by WITH_LA if it's followed by TIME or ORDINALITY */
     153             :             switch (next_token)
     154             :             {
     155           2 :                 case TIME:
     156             :                 case ORDINALITY:
     157           2 :                     cur_token = WITH_LA;
     158           2 :                     break;
     159             :             }
     160          16 :             break;
     161             : 
     162           8 :         case WITHOUT:
     163             :             /* Replace WITHOUT by WITHOUT_LA if it's followed by TIME */
     164             :             switch (next_token)
     165             :             {
     166           2 :                 case TIME:
     167           2 :                     cur_token = WITHOUT_LA;
     168           2 :                     break;
     169             :             }
     170           8 :             break;
     171           6 :         case UIDENT:
     172             :         case USCONST:
     173             :             /* Look ahead for UESCAPE */
     174           6 :             if (next_token == UESCAPE)
     175             :             {
     176             :                 /* Yup, so get third token, which had better be SCONST */
     177             :                 const char *escstr;
     178             : 
     179             :                 /*
     180             :                  * Again save and restore lexer output variables around the
     181             :                  * call
     182             :                  */
     183           2 :                 cur_yylval = base_yylval;
     184           2 :                 cur_yylloc = base_yylloc;
     185           2 :                 cur_yytext = base_yytext;
     186             : 
     187             :                 /* Get third token */
     188           2 :                 next_token = base_yylex_location();
     189             : 
     190           2 :                 if (next_token != SCONST)
     191           0 :                     mmerror(PARSE_ERROR, ET_ERROR, "UESCAPE must be followed by a simple string literal");
     192             : 
     193             :                 /*
     194             :                  * Save and check escape string.  The scanner returns it with
     195             :                  * quotes, so it must be exactly quote + one char + quote.
     196             :                  */
     197           2 :                 escstr = base_yylval.str;
     198           2 :                 if (strlen(escstr) != 3 || !check_uescapechar(escstr[1]))
     199           0 :                     mmerror(PARSE_ERROR, ET_ERROR, "invalid Unicode escape character");
     200             : 
     201           2 :                 base_yylval = cur_yylval;
     202           2 :                 base_yylloc = cur_yylloc;
     203           2 :                 base_yytext = cur_yytext;
     204             : 
     205             :                 /* Combine 3 tokens into 1 */
     206           2 :                 base_yylval.str = make3_str(base_yylval.str,
     207             :                                             " UESCAPE ",
     208             :                                             escstr);
     209           2 :                 base_yylloc = loc_strdup(base_yylval.str);
     210             : 
     211             :                 /* Clear have_lookahead, thereby consuming all three tokens */
     212           2 :                 have_lookahead = false;
     213             :             }
     214             : 
     215           6 :             if (cur_token == UIDENT)
     216           2 :                 cur_token = IDENT;
     217           4 :             else if (cur_token == USCONST)
     218           4 :                 cur_token = SCONST;
     219           6 :             break;
     220             :     }
     221             : 
     222         116 :     return cur_token;
     223             : }
     224             : 
     225             : /*
     226             :  * Call base_yylex() and fill in base_yylloc; returns base_yylex()'s result.
     227             :  *
     228             :  * pgc.l does not worry about setting yylloc, and given what we want for
     229             :  * that, trying to set it there would be pretty inconvenient.  What we
     230             :  * want is: if the returned token has type <str>, then duplicate its
     231             :  * string value as yylloc; otherwise, make a downcased copy of yytext.
     232             :  * The downcasing is ASCII-only because all that we care about there
     233             :  * is producing uniformly-cased output of keywords.  (That's mostly
     234             :  * cosmetic, but there are places in ecpglib that expect to receive
     235             :  * downcased keywords, plus it keeps us regression-test-compatible
     236             :  * with the pre-v18 implementation of ecpg.)
     237             :  */
     238             : static int
     239       71148 : base_yylex_location(void)
     240             : {
     241       71148 :     int         token = base_yylex();
     242             : 
     243       71148 :     switch (token)
     244             :     {
     245             :             /* List a token here if pgc.l assigns to base_yylval.str for it */
     246       23780 :         case Op:
     247             :         case CSTRING:
     248             :         case CPP_LINE:
     249             :         case CVARIABLE:
     250             :         case BCONST:
     251             :         case SCONST:
     252             :         case USCONST:
     253             :         case XCONST:
     254             :         case FCONST:
     255             :         case IDENT:
     256             :         case UIDENT:
     257             :         case IP:
     258             :             /* Duplicate the <str> value */
     259       23780 :             base_yylloc = loc_strdup(base_yylval.str);
     260       23780 :             break;
     261       47368 :         default:
     262             :             /* Else just use the input, i.e., yytext */
     263       47368 :             base_yylloc = loc_strdup(base_yytext);
     264             :             /* Apply an ASCII-only downcasing */
     265      158734 :             for (unsigned char *ptr = (unsigned char *) base_yylloc; *ptr; ptr++)
     266             :             {
     267      111366 :                 if (*ptr >= 'A' && *ptr <= 'Z')
     268       22074 :                     *ptr += 'a' - 'A';
     269             :             }
     270       47368 :             break;
     271             :     }
     272       71148 :     return token;
     273             : }
     274             : 
     275             : /*
     276             :  * check_uescapechar() and ecpg_isspace() should match their equivalents
     277             :  * in pgc.l.
     278             :  */
     279             : 
     280             : /* Is 'escape' acceptable as a Unicode escape character (UESCAPE syntax)?  Hex digits, '+', quotes and whitespace are not. */
     281             : static bool
     282           2 : check_uescapechar(unsigned char escape)
     283             : {
     284           2 :     if (isxdigit(escape)
     285           2 :         || escape == '+'
     286           2 :         || escape == '\''
     287           2 :         || escape == '"'
     288           2 :         || ecpg_isspace(escape))
     289           0 :         return false;
     290             :     else
     291           2 :         return true;
     292             : }
     293             : 
     294             : /*
     295             :  * ecpg_isspace() --- return true if flex scanner considers char whitespace (' ', \t, \n, \r or \f)
     296             :  */
     297             : static bool
     298           2 : ecpg_isspace(char ch)
     299             : {
     300           2 :     if (ch == ' ' ||
     301           2 :         ch == '\t' ||
     302           2 :         ch == '\n' ||
     303           2 :         ch == '\r' ||
     304             :         ch == '\f')
     305           0 :         return true;
     306           2 :     return false;
     307             : }

Generated by: LCOV version 1.14