Line data Source code
1 : /*------------------------------------------------------------------------- 2 : * 3 : * parser.c 4 : * Main entry point/driver for PostgreSQL grammar 5 : * 6 : * This should match src/backend/parser/parser.c, except that we do not 7 : * need to bother with re-entrant interfaces. 8 : * 9 : * Note: ECPG doesn't report error location like the backend does. 10 : * This file will need work if we ever want it to. 11 : * 12 : * 13 : * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group 14 : * Portions Copyright (c) 1994, Regents of the University of California 15 : * 16 : * IDENTIFICATION 17 : * src/interfaces/ecpg/preproc/parser.c 18 : * 19 : *------------------------------------------------------------------------- 20 : */ 21 : 22 : #include "postgres_fe.h" 23 : 24 : #include "preproc_extern.h" 25 : #include "preproc.h" 26 : 27 : 28 : static bool have_lookahead; /* is lookahead info valid? */ 29 : static int lookahead_token; /* one-token lookahead */ 30 : static YYSTYPE lookahead_yylval; /* yylval for lookahead token */ 31 : static YYLTYPE lookahead_yylloc; /* yylloc for lookahead token */ 32 : static char *lookahead_yytext; /* start current token */ 33 : 34 : static bool check_uescapechar(unsigned char escape); 35 : static bool ecpg_isspace(char ch); 36 : 37 : 38 : /* 39 : * Intermediate filter between parser and base lexer (base_yylex in scan.l). 40 : * 41 : * This filter is needed because in some cases the standard SQL grammar 42 : * requires more than one token lookahead. We reduce these cases to one-token 43 : * lookahead by replacing tokens here, in order to keep the grammar LALR(1). 44 : * 45 : * Using a filter is simpler than trying to recognize multiword tokens 46 : * directly in scan.l, because we'd have to allow for comments between the 47 : * words. Furthermore it's not clear how to do that without re-introducing 48 : * scanner backtrack, which would cost more performance than this filter 49 : * layer does. 50 : * 51 : * We also use this filter to convert UIDENT and USCONST sequences into 52 : * plain IDENT and SCONST tokens. While that could be handled by additional 53 : * productions in the main grammar, it's more efficient to do it like this. 54 : */ 55 : int 56 70238 : filtered_base_yylex(void) 57 : { 58 : int cur_token; 59 : int next_token; 60 : YYSTYPE cur_yylval; 61 : YYLTYPE cur_yylloc; 62 : char *cur_yytext; 63 : 64 : /* Get next token --- we might already have it */ 65 70238 : if (have_lookahead) 66 : { 67 114 : cur_token = lookahead_token; 68 114 : base_yylval = lookahead_yylval; 69 114 : base_yylloc = lookahead_yylloc; 70 114 : base_yytext = lookahead_yytext; 71 114 : have_lookahead = false; 72 : } 73 : else 74 70124 : cur_token = base_yylex(); 75 : 76 : /* 77 : * If this token isn't one that requires lookahead, just return it. 78 : */ 79 70238 : switch (cur_token) 80 : { 81 116 : case FORMAT: 82 : case NOT: 83 : case NULLS_P: 84 : case WITH: 85 : case WITHOUT: 86 : case UIDENT: 87 : case USCONST: 88 116 : break; 89 70122 : default: 90 70122 : return cur_token; 91 : } 92 : 93 : /* Save and restore lexer output variables around the call */ 94 116 : cur_yylval = base_yylval; 95 116 : cur_yylloc = base_yylloc; 96 116 : cur_yytext = base_yytext; 97 : 98 : /* Get next token, saving outputs into lookahead variables */ 99 116 : next_token = base_yylex(); 100 : 101 116 : lookahead_token = next_token; 102 116 : lookahead_yylval = base_yylval; 103 116 : lookahead_yylloc = base_yylloc; 104 116 : lookahead_yytext = base_yytext; 105 : 106 116 : base_yylval = cur_yylval; 107 116 : base_yylloc = cur_yylloc; 108 116 : base_yytext = cur_yytext; 109 : 110 116 : have_lookahead = true; 111 : 112 : /* Replace cur_token if needed, based on lookahead */ 113 116 : switch (cur_token) 114 : { 115 10 : case FORMAT: 116 : /* Replace FORMAT by FORMAT_LA if it's followed by JSON */ 117 : switch (next_token) 118 : { 119 10 : case JSON: 120 10 : cur_token = FORMAT_LA; 121 10 : break; 122 : } 123 10 : break; 124 : 125 72 : case NOT: 126 : /* Replace NOT by NOT_LA if it's followed by BETWEEN, IN, etc */ 127 : switch (next_token) 128 : { 129 0 : case BETWEEN: 130 : case IN_P: 131 : case LIKE: 132 : case ILIKE: 133 : case SIMILAR: 134 0 : cur_token = NOT_LA; 135 0 : break; 136 : } 137 72 : break; 138 : 139 4 : case NULLS_P: 140 : /* Replace NULLS_P by NULLS_LA if it's followed by FIRST or LAST */ 141 : switch (next_token) 142 : { 143 4 : case FIRST_P: 144 : case LAST_P: 145 4 : cur_token = NULLS_LA; 146 4 : break; 147 : } 148 4 : break; 149 : 150 16 : case WITH: 151 : /* Replace WITH by WITH_LA if it's followed by TIME or ORDINALITY */ 152 : switch (next_token) 153 : { 154 2 : case TIME: 155 : case ORDINALITY: 156 2 : cur_token = WITH_LA; 157 2 : break; 158 : } 159 16 : break; 160 : 161 8 : case WITHOUT: 162 : /* Replace WITHOUT by WITHOUT_LA if it's followed by TIME */ 163 : switch (next_token) 164 : { 165 2 : case TIME: 166 2 : cur_token = WITHOUT_LA; 167 2 : break; 168 : } 169 8 : break; 170 6 : case UIDENT: 171 : case USCONST: 172 : /* Look ahead for UESCAPE */ 173 6 : if (next_token == UESCAPE) 174 : { 175 : /* Yup, so get third token, which had better be SCONST */ 176 : const char *escstr; 177 : 178 : /* 179 : * Again save and restore lexer output variables around the 180 : * call 181 : */ 182 2 : cur_yylval = base_yylval; 183 2 : cur_yylloc = base_yylloc; 184 2 : cur_yytext = base_yytext; 185 : 186 : /* Get third token */ 187 2 : next_token = base_yylex(); 188 : 189 2 : if (next_token != SCONST) 190 0 : mmerror(PARSE_ERROR, ET_ERROR, "UESCAPE must be followed by a simple string literal"); 191 : 192 : /* 193 : * Save and check escape string, which the scanner returns 194 : * with quotes 195 : */ 196 2 : escstr = base_yylval.str; 197 2 : if (strlen(escstr) != 3 || !check_uescapechar(escstr[1])) 198 0 : mmerror(PARSE_ERROR, ET_ERROR, "invalid Unicode escape character"); 199 : 200 2 : base_yylval = cur_yylval; 201 2 : base_yylloc = cur_yylloc; 202 2 : base_yytext = cur_yytext; 203 : 204 : /* Combine 3 tokens into 1 */ 205 2 : base_yylval.str = psprintf("%s UESCAPE %s", base_yylval.str, escstr); 206 : 207 : /* Clear have_lookahead, thereby consuming all three tokens */ 208 2 : have_lookahead = false; 209 : } 210 : 211 6 : if (cur_token == UIDENT) 212 2 : cur_token = IDENT; 213 4 : else if (cur_token == USCONST) 214 4 : cur_token = SCONST; 215 6 : break; 216 : } 217 : 218 116 : return cur_token; 219 : } 220 : 221 : /* 222 : * check_uescapechar() and ecpg_isspace() should match their equivalents 223 : * in pgc.l. 224 : */ 225 : 226 : /* is 'escape' acceptable as Unicode escape character (UESCAPE syntax) ? */ 227 : static bool 228 2 : check_uescapechar(unsigned char escape) 229 : { 230 2 : if (isxdigit(escape) 231 2 : || escape == '+' 232 2 : || escape == '\'' 233 2 : || escape == '"' 234 2 : || ecpg_isspace(escape)) 235 0 : return false; 236 : else 237 2 : return true; 238 : } 239 : 240 : /* 241 : * ecpg_isspace() --- return true if flex scanner considers char whitespace 242 : */ 243 : static bool 244 2 : ecpg_isspace(char ch) 245 : { 246 2 : if (ch == ' ' || 247 2 : ch == '\t' || 248 2 : ch == '\n' || 249 2 : ch == '\r' || 250 : ch == '\f') 251 0 : return true; 252 2 : return false; 253 : }