Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * parser.c
4 : * Main entry point/driver for PostgreSQL grammar
5 : *
6 : * This should match src/backend/parser/parser.c, except that we do not
7 : * need to bother with re-entrant interfaces.
8 : *
9 : * Note: ECPG doesn't report error location like the backend does.
10 : * This file will need work if we ever want it to.
11 : *
12 : *
13 : * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
14 : * Portions Copyright (c) 1994, Regents of the University of California
15 : *
16 : * IDENTIFICATION
17 : * src/interfaces/ecpg/preproc/parser.c
18 : *
19 : *-------------------------------------------------------------------------
20 : */
21 :
22 : #include "postgres_fe.h"
23 :
24 : #include "preproc_extern.h"
25 : #include "preproc.h"
26 :
27 :
/*
 * One-token lookahead buffer used by filtered_base_yylex().  When
 * have_lookahead is true, the remaining variables hold the token code and
 * the lexer output variables (base_yylval, base_yylloc, base_yytext) that
 * were captured for a token already read from the lexer but not yet
 * returned to the parser.
 */
28 : static bool have_lookahead; /* is lookahead info valid? */
29 : static int lookahead_token; /* one-token lookahead */
30 : static YYSTYPE lookahead_yylval; /* yylval for lookahead token */
31 : static YYLTYPE lookahead_yylloc; /* yylloc for lookahead token */
32 : static char *lookahead_yytext; /* yytext for lookahead token */
33 :
/* Forward declarations of file-local helpers defined later in this file */
34 : static int base_yylex_location(void);
35 : static bool check_uescapechar(unsigned char escape);
36 : static bool ecpg_isspace(char ch);
37 :
38 :
39 : /*
40 : * Intermediate filter between parser and base lexer (base_yylex in scan.l).
41 : *
42 : * This filter is needed because in some cases the standard SQL grammar
43 : * requires more than one token lookahead. We reduce these cases to one-token
44 : * lookahead by replacing tokens here, in order to keep the grammar LALR(1).
45 : *
46 : * Using a filter is simpler than trying to recognize multiword tokens
47 : * directly in scan.l, because we'd have to allow for comments between the
48 : * words. Furthermore it's not clear how to do that without re-introducing
49 : * scanner backtrack, which would cost more performance than this filter
50 : * layer does.
51 : *
52 : * We also use this filter to convert UIDENT and USCONST sequences into
53 : * plain IDENT and SCONST tokens. While that could be handled by additional
54 : * productions in the main grammar, it's more efficient to do it like this.
55 : */
56 : int
57 36044 : filtered_base_yylex(void)
58 : {
59 : int cur_token;
60 : int next_token;
61 : YYSTYPE cur_yylval;
62 : YYLTYPE cur_yylloc;
63 : char *cur_yytext;
64 :
65 : /* Get next token --- we might already have it */
66 36044 : if (have_lookahead)
67 : {
68 58 : cur_token = lookahead_token;
69 58 : base_yylval = lookahead_yylval;
70 58 : base_yylloc = lookahead_yylloc;
71 58 : base_yytext = lookahead_yytext;
72 58 : have_lookahead = false;
73 : }
74 : else
75 35986 : cur_token = base_yylex_location();
76 :
77 : /*
78 : * If this token isn't one that requires lookahead, just return it.
79 : */
80 36044 : switch (cur_token)
81 : {
82 59 : case FORMAT:
83 : case NOT:
84 : case NULLS_P:
85 : case WITH:
86 : case WITHOUT:
87 : case UIDENT:
88 : case USCONST:
89 59 : break;
90 35985 : default:
91 35985 : return cur_token;
92 : }
93 :
94 : /* Save and restore lexer output variables around the call */
95 59 : cur_yylval = base_yylval;
96 59 : cur_yylloc = base_yylloc;
97 59 : cur_yytext = base_yytext;
98 :
99 : /* Get next token, saving outputs into lookahead variables */
100 59 : next_token = base_yylex_location();
101 :
102 59 : lookahead_token = next_token;
103 59 : lookahead_yylval = base_yylval;
104 59 : lookahead_yylloc = base_yylloc;
105 59 : lookahead_yytext = base_yytext;
106 :
107 59 : base_yylval = cur_yylval;
108 59 : base_yylloc = cur_yylloc;
109 59 : base_yytext = cur_yytext;
110 :
111 59 : have_lookahead = true;
112 :
113 : /* Replace cur_token if needed, based on lookahead */
114 59 : switch (cur_token)
115 : {
116 5 : case FORMAT:
117 : /* Replace FORMAT by FORMAT_LA if it's followed by JSON */
118 5 : switch (next_token)
119 : {
120 5 : case JSON:
121 5 : cur_token = FORMAT_LA;
122 5 : break;
123 : }
124 5 : break;
125 :
126 37 : case NOT:
127 : /* Replace NOT by NOT_LA if it's followed by BETWEEN, IN, etc */
128 37 : switch (next_token)
129 : {
130 0 : case BETWEEN:
131 : case IN_P:
132 : case LIKE:
133 : case ILIKE:
134 : case SIMILAR:
135 0 : cur_token = NOT_LA;
136 0 : break;
137 : }
138 37 : break;
139 :
140 2 : case NULLS_P:
141 : /* Replace NULLS_P by NULLS_LA if it's followed by FIRST or LAST */
142 2 : switch (next_token)
143 : {
144 2 : case FIRST_P:
145 : case LAST_P:
146 2 : cur_token = NULLS_LA;
147 2 : break;
148 : }
149 2 : break;
150 :
151 8 : case WITH:
152 : /* Replace WITH by WITH_LA if it's followed by TIME or ORDINALITY */
153 8 : switch (next_token)
154 : {
155 1 : case TIME:
156 : case ORDINALITY:
157 1 : cur_token = WITH_LA;
158 1 : break;
159 : }
160 8 : break;
161 :
162 4 : case WITHOUT:
163 : /* Replace WITHOUT by WITHOUT_LA if it's followed by TIME */
164 4 : switch (next_token)
165 : {
166 1 : case TIME:
167 1 : cur_token = WITHOUT_LA;
168 1 : break;
169 : }
170 4 : break;
171 3 : case UIDENT:
172 : case USCONST:
173 : /* Look ahead for UESCAPE */
174 3 : if (next_token == UESCAPE)
175 : {
176 : /* Yup, so get third token, which had better be SCONST */
177 : const char *escstr;
178 :
179 : /*
180 : * Again save and restore lexer output variables around the
181 : * call
182 : */
183 1 : cur_yylval = base_yylval;
184 1 : cur_yylloc = base_yylloc;
185 1 : cur_yytext = base_yytext;
186 :
187 : /* Get third token */
188 1 : next_token = base_yylex_location();
189 :
190 1 : if (next_token != SCONST)
191 0 : mmerror(PARSE_ERROR, ET_ERROR, "UESCAPE must be followed by a simple string literal");
192 :
193 : /*
194 : * Save and check escape string, which the scanner returns
195 : * with quotes
196 : */
197 1 : escstr = base_yylval.str;
198 1 : if (strlen(escstr) != 3 || !check_uescapechar(escstr[1]))
199 0 : mmerror(PARSE_ERROR, ET_ERROR, "invalid Unicode escape character");
200 :
201 1 : base_yylval = cur_yylval;
202 1 : base_yylloc = cur_yylloc;
203 1 : base_yytext = cur_yytext;
204 :
205 : /* Combine 3 tokens into 1 */
206 1 : base_yylval.str = make3_str(base_yylval.str,
207 : " UESCAPE ",
208 : escstr);
209 1 : base_yylloc = loc_strdup(base_yylval.str);
210 :
211 : /* Clear have_lookahead, thereby consuming all three tokens */
212 1 : have_lookahead = false;
213 : }
214 :
215 3 : if (cur_token == UIDENT)
216 1 : cur_token = IDENT;
217 2 : else if (cur_token == USCONST)
218 2 : cur_token = SCONST;
219 3 : break;
220 : }
221 :
222 59 : return cur_token;
223 : }
224 :
225 : /*
226 : * Call base_yylex() and fill in base_yylloc.
227 : *
228 : * pgc.l does not worry about setting yylloc, and given what we want for
229 : * that, trying to set it there would be pretty inconvenient. What we
230 : * want is: if the returned token has type <str>, then duplicate its
231 : * string value as yylloc; otherwise, make a downcased copy of yytext.
232 : * The downcasing is ASCII-only because all that we care about there
233 : * is producing uniformly-cased output of keywords. (That's mostly
234 : * cosmetic, but there are places in ecpglib that expect to receive
235 : * downcased keywords, plus it keeps us regression-test-compatible
236 : * with the pre-v18 implementation of ecpg.)
237 : */
238 : static int
239 36046 : base_yylex_location(void)
240 : {
241 36046 : int token = base_yylex();
242 :
243 36046 : switch (token)
244 : {
245 : /* List a token here if pgc.l assigns to base_yylval.str for it */
246 12023 : case Op:
247 : case CSTRING:
248 : case CPP_LINE:
249 : case CVARIABLE:
250 : case BCONST:
251 : case SCONST:
252 : case USCONST:
253 : case XCONST:
254 : case FCONST:
255 : case IDENT:
256 : case UIDENT:
257 : case IP:
258 : /* Duplicate the <str> value */
259 12023 : base_yylloc = loc_strdup(base_yylval.str);
260 12023 : break;
261 24023 : default:
262 : /* Else just use the input, i.e., yytext */
263 : {
264 : char *tmp;
265 :
266 24023 : tmp = loc_strdup(base_yytext);
267 : /* Apply an ASCII-only downcasing */
268 80469 : for (unsigned char *ptr = (unsigned char *) tmp; *ptr; ptr++)
269 : {
270 56446 : if (*ptr >= 'A' && *ptr <= 'Z')
271 11416 : *ptr += 'a' - 'A';
272 : }
273 24023 : base_yylloc = tmp;
274 24023 : break;
275 : }
276 : }
277 36046 : return token;
278 : }
279 :
280 : /*
281 : * check_uescapechar() and ecpg_isspace() should match their equivalents
282 : * in pgc.l.
283 : */
284 :
/*
 * check_uescapechar() --- may 'escape' serve as the Unicode escape
 * character in UESCAPE syntax?
 *
 * Returns false for hexadecimal digits, '+', single and double quotes, and
 * anything ecpg_isspace() reports as whitespace; true for every other
 * character.
 */
static bool
check_uescapechar(unsigned char escape)
{
	/* Hex digits are never acceptable */
	if (isxdigit(escape))
		return false;

	/* Neither are the plus sign and either kind of quote */
	if (escape == '+' || escape == '\'' || escape == '"')
		return false;

	/* What remains is acceptable unless it is whitespace */
	return !ecpg_isspace(escape);
}
298 :
/*
 * ecpg_isspace() --- return true if flex scanner considers char whitespace
 *
 * Recognizes space, tab, newline, carriage return, form feed and vertical
 * tab.
 *
 * NOTE(review): '\v' is included on the assumption that the scanner's
 * whitespace class contains vertical tab (as the backend's scanner does);
 * this function is meant to stay in sync with its twin in pgc.l, so confirm
 * against that file.
 */
static bool
ecpg_isspace(char ch)
{
	switch (ch)
	{
		case ' ':
		case '\t':
		case '\n':
		case '\r':
		case '\f':
		case '\v':
			return true;
		default:
			return false;
	}
}
|