Line data Source code
1 : /*------------------------------------------------------------------------- 2 : * 3 : * parser.c 4 : * Main entry point/driver for PostgreSQL grammar 5 : * 6 : * This should match src/backend/parser/parser.c, except that we do not 7 : * need to bother with re-entrant interfaces. 8 : * 9 : * Note: ECPG doesn't report error location like the backend does. 10 : * This file will need work if we ever want it to. 11 : * 12 : * 13 : * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group 14 : * Portions Copyright (c) 1994, Regents of the University of California 15 : * 16 : * IDENTIFICATION 17 : * src/interfaces/ecpg/preproc/parser.c 18 : * 19 : *------------------------------------------------------------------------- 20 : */ 21 : 22 : #include "postgres_fe.h" 23 : 24 : #include "preproc_extern.h" 25 : #include "preproc.h" 26 : 27 : 28 : static bool have_lookahead; /* is lookahead info valid? */ 29 : static int lookahead_token; /* one-token lookahead */ 30 : static YYSTYPE lookahead_yylval; /* yylval for lookahead token */ 31 : static YYLTYPE lookahead_yylloc; /* yylloc for lookahead token */ 32 : static char *lookahead_yytext; /* start current token */ 33 : 34 : static int base_yylex_location(void); 35 : static bool check_uescapechar(unsigned char escape); 36 : static bool ecpg_isspace(char ch); 37 : 38 : 39 : /* 40 : * Intermediate filter between parser and base lexer (base_yylex in scan.l). 41 : * 42 : * This filter is needed because in some cases the standard SQL grammar 43 : * requires more than one token lookahead. We reduce these cases to one-token 44 : * lookahead by replacing tokens here, in order to keep the grammar LALR(1). 45 : * 46 : * Using a filter is simpler than trying to recognize multiword tokens 47 : * directly in scan.l, because we'd have to allow for comments between the 48 : * words. Furthermore it's not clear how to do that without re-introducing 49 : * scanner backtrack, which would cost more performance than this filter 50 : * layer does. 51 : * 52 : * We also use this filter to convert UIDENT and USCONST sequences into 53 : * plain IDENT and SCONST tokens. While that could be handled by additional 54 : * productions in the main grammar, it's more efficient to do it like this. 55 : */ 56 : int 57 71150 : filtered_base_yylex(void) 58 : { 59 : int cur_token; 60 : int next_token; 61 : YYSTYPE cur_yylval; 62 : YYLTYPE cur_yylloc; 63 : char *cur_yytext; 64 : 65 : /* Get next token --- we might already have it */ 66 71150 : if (have_lookahead) 67 : { 68 114 : cur_token = lookahead_token; 69 114 : base_yylval = lookahead_yylval; 70 114 : base_yylloc = lookahead_yylloc; 71 114 : base_yytext = lookahead_yytext; 72 114 : have_lookahead = false; 73 : } 74 : else 75 71036 : cur_token = base_yylex_location(); 76 : 77 : /* 78 : * If this token isn't one that requires lookahead, just return it. 79 : */ 80 71150 : switch (cur_token) 81 : { 82 116 : case FORMAT: 83 : case NOT: 84 : case NULLS_P: 85 : case WITH: 86 : case WITHOUT: 87 : case UIDENT: 88 : case USCONST: 89 116 : break; 90 71034 : default: 91 71034 : return cur_token; 92 : } 93 : 94 : /* Save and restore lexer output variables around the call */ 95 116 : cur_yylval = base_yylval; 96 116 : cur_yylloc = base_yylloc; 97 116 : cur_yytext = base_yytext; 98 : 99 : /* Get next token, saving outputs into lookahead variables */ 100 116 : next_token = base_yylex_location(); 101 : 102 116 : lookahead_token = next_token; 103 116 : lookahead_yylval = base_yylval; 104 116 : lookahead_yylloc = base_yylloc; 105 116 : lookahead_yytext = base_yytext; 106 : 107 116 : base_yylval = cur_yylval; 108 116 : base_yylloc = cur_yylloc; 109 116 : base_yytext = cur_yytext; 110 : 111 116 : have_lookahead = true; 112 : 113 : /* Replace cur_token if needed, based on lookahead */ 114 116 : switch (cur_token) 115 : { 116 10 : case FORMAT: 117 : /* Replace FORMAT by FORMAT_LA if it's followed by JSON */ 118 : switch (next_token) 119 : { 120 10 : case JSON: 121 10 : cur_token = FORMAT_LA; 122 10 : break; 123 : } 124 10 : break; 125 : 126 72 : case NOT: 127 : /* Replace NOT by NOT_LA if it's followed by BETWEEN, IN, etc */ 128 : switch (next_token) 129 : { 130 0 : case BETWEEN: 131 : case IN_P: 132 : case LIKE: 133 : case ILIKE: 134 : case SIMILAR: 135 0 : cur_token = NOT_LA; 136 0 : break; 137 : } 138 72 : break; 139 : 140 4 : case NULLS_P: 141 : /* Replace NULLS_P by NULLS_LA if it's followed by FIRST or LAST */ 142 : switch (next_token) 143 : { 144 4 : case FIRST_P: 145 : case LAST_P: 146 4 : cur_token = NULLS_LA; 147 4 : break; 148 : } 149 4 : break; 150 : 151 16 : case WITH: 152 : /* Replace WITH by WITH_LA if it's followed by TIME or ORDINALITY */ 153 : switch (next_token) 154 : { 155 2 : case TIME: 156 : case ORDINALITY: 157 2 : cur_token = WITH_LA; 158 2 : break; 159 : } 160 16 : break; 161 : 162 8 : case WITHOUT: 163 : /* Replace WITHOUT by WITHOUT_LA if it's followed by TIME */ 164 : switch (next_token) 165 : { 166 2 : case TIME: 167 2 : cur_token = WITHOUT_LA; 168 2 : break; 169 : } 170 8 : break; 171 6 : case UIDENT: 172 : case USCONST: 173 : /* Look ahead for UESCAPE */ 174 6 : if (next_token == UESCAPE) 175 : { 176 : /* Yup, so get third token, which had better be SCONST */ 177 : const char *escstr; 178 : 179 : /* 180 : * Again save and restore lexer output variables around the 181 : * call 182 : */ 183 2 : cur_yylval = base_yylval; 184 2 : cur_yylloc = base_yylloc; 185 2 : cur_yytext = base_yytext; 186 : 187 : /* Get third token */ 188 2 : next_token = base_yylex_location(); 189 : 190 2 : if (next_token != SCONST) 191 0 : mmerror(PARSE_ERROR, ET_ERROR, "UESCAPE must be followed by a simple string literal"); 192 : 193 : /* 194 : * Save and check escape string, which the scanner returns 195 : * with quotes 196 : */ 197 2 : escstr = base_yylval.str; 198 2 : if (strlen(escstr) != 3 || !check_uescapechar(escstr[1])) 199 0 : mmerror(PARSE_ERROR, ET_ERROR, "invalid Unicode escape character"); 200 : 201 2 : base_yylval = cur_yylval; 202 2 : base_yylloc = cur_yylloc; 203 2 : base_yytext = cur_yytext; 204 : 205 : /* Combine 3 tokens into 1 */ 206 2 : base_yylval.str = psprintf("%s UESCAPE %s", base_yylval.str, escstr); 207 2 : base_yylloc = loc_strdup(base_yylval.str); 208 : 209 : /* Clear have_lookahead, thereby consuming all three tokens */ 210 2 : have_lookahead = false; 211 : } 212 : 213 6 : if (cur_token == UIDENT) 214 2 : cur_token = IDENT; 215 4 : else if (cur_token == USCONST) 216 4 : cur_token = SCONST; 217 6 : break; 218 : } 219 : 220 116 : return cur_token; 221 : } 222 : 223 : /* 224 : * Call base_yylex() and fill in base_yylloc. 225 : * 226 : * pgc.l does not worry about setting yylloc, and given what we want for 227 : * that, trying to set it there would be pretty inconvenient. What we 228 : * want is: if the returned token has type <str>, then duplicate its 229 : * string value as yylloc; otherwise, make a downcased copy of yytext. 230 : * The downcasing is ASCII-only because all that we care about there 231 : * is producing uniformly-cased output of keywords. (That's mostly 232 : * cosmetic, but there are places in ecpglib that expect to receive 233 : * downcased keywords, plus it keeps us regression-test-compatible 234 : * with the pre-v18 implementation of ecpg.) 235 : */ 236 : static int 237 71154 : base_yylex_location(void) 238 : { 239 71154 : int token = base_yylex(); 240 : 241 71154 : switch (token) 242 : { 243 : /* List a token here if pgc.l assigns to base_yylval.str for it */ 244 23784 : case Op: 245 : case CSTRING: 246 : case CPP_LINE: 247 : case CVARIABLE: 248 : case BCONST: 249 : case SCONST: 250 : case USCONST: 251 : case XCONST: 252 : case FCONST: 253 : case IDENT: 254 : case UIDENT: 255 : case IP: 256 : /* Duplicate the <str> value */ 257 23784 : base_yylloc = loc_strdup(base_yylval.str); 258 23784 : break; 259 47370 : default: 260 : /* Else just use the input, i.e., yytext */ 261 47370 : base_yylloc = loc_strdup(base_yytext); 262 : /* Apply an ASCII-only downcasing */ 263 158738 : for (unsigned char *ptr = (unsigned char *) base_yylloc; *ptr; ptr++) 264 : { 265 111368 : if (*ptr >= 'A' && *ptr <= 'Z') 266 22074 : *ptr += 'a' - 'A'; 267 : } 268 47370 : break; 269 : } 270 71154 : return token; 271 : } 272 : 273 : /* 274 : * check_uescapechar() and ecpg_isspace() should match their equivalents 275 : * in pgc.l. 276 : */ 277 : 278 : /* is 'escape' acceptable as Unicode escape character (UESCAPE syntax) ? */ 279 : static bool 280 2 : check_uescapechar(unsigned char escape) 281 : { 282 2 : if (isxdigit(escape) 283 2 : || escape == '+' 284 2 : || escape == '\'' 285 2 : || escape == '"' 286 2 : || ecpg_isspace(escape)) 287 0 : return false; 288 : else 289 2 : return true; 290 : } 291 : 292 : /* 293 : * ecpg_isspace() --- return true if flex scanner considers char whitespace 294 : */ 295 : static bool 296 2 : ecpg_isspace(char ch) 297 : { 298 2 : if (ch == ' ' || 299 2 : ch == '\t' || 300 2 : ch == '\n' || 301 2 : ch == '\r' || 302 : ch == '\f') 303 0 : return true; 304 2 : return false; 305 : }