Line data Source code
1 : /*------------------------------------------------------------------------- 2 : * 3 : * parser.c 4 : * Main entry point/driver for PostgreSQL grammar 5 : * 6 : * This should match src/backend/parser/parser.c, except that we do not 7 : * need to bother with re-entrant interfaces. 8 : * 9 : * Note: ECPG doesn't report error location like the backend does. 10 : * This file will need work if we ever want it to. 11 : * 12 : * 13 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group 14 : * Portions Copyright (c) 1994, Regents of the University of California 15 : * 16 : * IDENTIFICATION 17 : * src/interfaces/ecpg/preproc/parser.c 18 : * 19 : *------------------------------------------------------------------------- 20 : */ 21 : 22 : #include "postgres_fe.h" 23 : 24 : #include "preproc_extern.h" 25 : #include "preproc.h" 26 : 27 : 28 : static bool have_lookahead; /* is lookahead info valid? */ 29 : static int lookahead_token; /* one-token lookahead */ 30 : static YYSTYPE lookahead_yylval; /* yylval for lookahead token */ 31 : static YYLTYPE lookahead_yylloc; /* yylloc for lookahead token */ 32 : static char *lookahead_yytext; /* start current token */ 33 : 34 : static int base_yylex_location(void); 35 : static bool check_uescapechar(unsigned char escape); 36 : static bool ecpg_isspace(char ch); 37 : 38 : 39 : /* 40 : * Intermediate filter between parser and base lexer (base_yylex in scan.l). 41 : * 42 : * This filter is needed because in some cases the standard SQL grammar 43 : * requires more than one token lookahead. We reduce these cases to one-token 44 : * lookahead by replacing tokens here, in order to keep the grammar LALR(1). 45 : * 46 : * Using a filter is simpler than trying to recognize multiword tokens 47 : * directly in scan.l, because we'd have to allow for comments between the 48 : * words. 
Furthermore it's not clear how to do that without re-introducing 49 : * scanner backtrack, which would cost more performance than this filter 50 : * layer does. 51 : * 52 : * We also use this filter to convert UIDENT and USCONST sequences into 53 : * plain IDENT and SCONST tokens. While that could be handled by additional 54 : * productions in the main grammar, it's more efficient to do it like this. 55 : */ 56 : int 57 71144 : filtered_base_yylex(void) 58 : { 59 : int cur_token; 60 : int next_token; 61 : YYSTYPE cur_yylval; 62 : YYLTYPE cur_yylloc; 63 : char *cur_yytext; 64 : 65 : /* Get next token --- we might already have it */ 66 71144 : if (have_lookahead) 67 : { 68 114 : cur_token = lookahead_token; 69 114 : base_yylval = lookahead_yylval; 70 114 : base_yylloc = lookahead_yylloc; 71 114 : base_yytext = lookahead_yytext; 72 114 : have_lookahead = false; 73 : } 74 : else 75 71030 : cur_token = base_yylex_location(); 76 : 77 : /* 78 : * If this token isn't one that requires lookahead, just return it. 
79 : */ 80 71144 : switch (cur_token) 81 : { 82 116 : case FORMAT: 83 : case NOT: 84 : case NULLS_P: 85 : case WITH: 86 : case WITHOUT: 87 : case UIDENT: 88 : case USCONST: 89 116 : break; 90 71028 : default: 91 71028 : return cur_token; 92 : } 93 : 94 : /* Save and restore lexer output variables around the call */ 95 116 : cur_yylval = base_yylval; 96 116 : cur_yylloc = base_yylloc; 97 116 : cur_yytext = base_yytext; 98 : 99 : /* Get next token, saving outputs into lookahead variables */ 100 116 : next_token = base_yylex_location(); 101 : 102 116 : lookahead_token = next_token; 103 116 : lookahead_yylval = base_yylval; 104 116 : lookahead_yylloc = base_yylloc; 105 116 : lookahead_yytext = base_yytext; 106 : 107 116 : base_yylval = cur_yylval; 108 116 : base_yylloc = cur_yylloc; 109 116 : base_yytext = cur_yytext; 110 : 111 116 : have_lookahead = true; 112 : 113 : /* Replace cur_token if needed, based on lookahead */ 114 116 : switch (cur_token) 115 : { 116 10 : case FORMAT: 117 : /* Replace FORMAT by FORMAT_LA if it's followed by JSON */ 118 : switch (next_token) 119 : { 120 10 : case JSON: 121 10 : cur_token = FORMAT_LA; 122 10 : break; 123 : } 124 10 : break; 125 : 126 72 : case NOT: 127 : /* Replace NOT by NOT_LA if it's followed by BETWEEN, IN, etc */ 128 : switch (next_token) 129 : { 130 0 : case BETWEEN: 131 : case IN_P: 132 : case LIKE: 133 : case ILIKE: 134 : case SIMILAR: 135 0 : cur_token = NOT_LA; 136 0 : break; 137 : } 138 72 : break; 139 : 140 4 : case NULLS_P: 141 : /* Replace NULLS_P by NULLS_LA if it's followed by FIRST or LAST */ 142 : switch (next_token) 143 : { 144 4 : case FIRST_P: 145 : case LAST_P: 146 4 : cur_token = NULLS_LA; 147 4 : break; 148 : } 149 4 : break; 150 : 151 16 : case WITH: 152 : /* Replace WITH by WITH_LA if it's followed by TIME or ORDINALITY */ 153 : switch (next_token) 154 : { 155 2 : case TIME: 156 : case ORDINALITY: 157 2 : cur_token = WITH_LA; 158 2 : break; 159 : } 160 16 : break; 161 : 162 8 : case WITHOUT: 163 : /* 
Replace WITHOUT by WITHOUT_LA if it's followed by TIME */ 164 : switch (next_token) 165 : { 166 2 : case TIME: 167 2 : cur_token = WITHOUT_LA; 168 2 : break; 169 : } 170 8 : break; 171 6 : case UIDENT: 172 : case USCONST: 173 : /* Look ahead for UESCAPE */ 174 6 : if (next_token == UESCAPE) 175 : { 176 : /* Yup, so get third token, which had better be SCONST */ 177 : const char *escstr; 178 : 179 : /* 180 : * Again save and restore lexer output variables around the 181 : * call 182 : */ 183 2 : cur_yylval = base_yylval; 184 2 : cur_yylloc = base_yylloc; 185 2 : cur_yytext = base_yytext; 186 : 187 : /* Get third token */ 188 2 : next_token = base_yylex_location(); 189 : 190 2 : if (next_token != SCONST) 191 0 : mmerror(PARSE_ERROR, ET_ERROR, "UESCAPE must be followed by a simple string literal"); 192 : 193 : /* 194 : * Save and check escape string, which the scanner returns 195 : * with quotes 196 : */ 197 2 : escstr = base_yylval.str; 198 2 : if (strlen(escstr) != 3 || !check_uescapechar(escstr[1])) 199 0 : mmerror(PARSE_ERROR, ET_ERROR, "invalid Unicode escape character"); 200 : 201 2 : base_yylval = cur_yylval; 202 2 : base_yylloc = cur_yylloc; 203 2 : base_yytext = cur_yytext; 204 : 205 : /* Combine 3 tokens into 1 */ 206 2 : base_yylval.str = make3_str(base_yylval.str, 207 : " UESCAPE ", 208 : escstr); 209 2 : base_yylloc = loc_strdup(base_yylval.str); 210 : 211 : /* Clear have_lookahead, thereby consuming all three tokens */ 212 2 : have_lookahead = false; 213 : } 214 : 215 6 : if (cur_token == UIDENT) 216 2 : cur_token = IDENT; 217 4 : else if (cur_token == USCONST) 218 4 : cur_token = SCONST; 219 6 : break; 220 : } 221 : 222 116 : return cur_token; 223 : } 224 : 225 : /* 226 : * Call base_yylex() and fill in base_yylloc. 227 : * 228 : * pgc.l does not worry about setting yylloc, and given what we want for 229 : * that, trying to set it there would be pretty inconvenient. 
What we 230 : * want is: if the returned token has type <str>, then duplicate its 231 : * string value as yylloc; otherwise, make a downcased copy of yytext. 232 : * The downcasing is ASCII-only because all that we care about there 233 : * is producing uniformly-cased output of keywords. (That's mostly 234 : * cosmetic, but there are places in ecpglib that expect to receive 235 : * downcased keywords, plus it keeps us regression-test-compatible 236 : * with the pre-v18 implementation of ecpg.) 237 : */ 238 : static int 239 71148 : base_yylex_location(void) 240 : { 241 71148 : int token = base_yylex(); 242 : 243 71148 : switch (token) 244 : { 245 : /* List a token here if pgc.l assigns to base_yylval.str for it */ 246 23780 : case Op: 247 : case CSTRING: 248 : case CPP_LINE: 249 : case CVARIABLE: 250 : case BCONST: 251 : case SCONST: 252 : case USCONST: 253 : case XCONST: 254 : case FCONST: 255 : case IDENT: 256 : case UIDENT: 257 : case IP: 258 : /* Duplicate the <str> value */ 259 23780 : base_yylloc = loc_strdup(base_yylval.str); 260 23780 : break; 261 47368 : default: 262 : /* Else just use the input, i.e., yytext */ 263 47368 : base_yylloc = loc_strdup(base_yytext); 264 : /* Apply an ASCII-only downcasing */ 265 158734 : for (unsigned char *ptr = (unsigned char *) base_yylloc; *ptr; ptr++) 266 : { 267 111366 : if (*ptr >= 'A' && *ptr <= 'Z') 268 22074 : *ptr += 'a' - 'A'; 269 : } 270 47368 : break; 271 : } 272 71148 : return token; 273 : } 274 : 275 : /* 276 : * check_uescapechar() and ecpg_isspace() should match their equivalents 277 : * in pgc.l. 278 : */ 279 : 280 : /* is 'escape' acceptable as Unicode escape character (UESCAPE syntax) ? 
/*
 * ecpg_isspace() --- return true if flex scanner considers char whitespace
 *
 * This must stay in step with the same-named helper (and the {space}
 * character class) in pgc.l.  Vertical tab is included on that basis.
 * NOTE(review): pgc.l is not visible from this file; confirm that its
 * whitespace set contains '\v'.
 */
static bool
ecpg_isspace(char ch)
{
	switch (ch)
	{
		case ' ':
		case '\t':
		case '\n':
		case '\r':
		case '\f':
		case '\v':
			return true;
		default:
			return false;
	}
}

/*
 * is 'escape' acceptable as Unicode escape character (UESCAPE syntax) ?
 *
 * Returns false for a hexadecimal digit, '+', a single or double quote,
 * or any character ecpg_isspace() reports as whitespace; true for
 * everything else.  This must match its equivalent in pgc.l.
 */
static bool
check_uescapechar(unsigned char escape)
{
	if (isxdigit(escape)
		|| escape == '+'
		|| escape == '\''
		|| escape == '"'
		|| ecpg_isspace(escape))
		return false;

	return true;
}