Line data Source code
1 : %top{
2 : /*-------------------------------------------------------------------------
3 : *
4 : * scan.l
5 : * lexical scanner for PostgreSQL
6 : *
7 : * NOTE NOTE NOTE:
8 : *
9 : * The rules in this file must be kept in sync with src/fe_utils/psqlscan.l
10 : * and src/interfaces/ecpg/preproc/pgc.l!
11 : *
12 : * The rules are designed so that the scanner never has to backtrack,
13 : * in the sense that there is always a rule that can match the input
14 : * consumed so far (the rule action may internally throw back some input
15 : * with yyless(), however). As explained in the flex manual, this makes
16 : * for a useful speed increase --- several percent faster when measuring
17 : * raw parsing (Flex + Bison). The extra complexity is mostly in the rules
18 : * for handling float numbers and continued string literals. If you change
19 : * the lexical rules, verify that you haven't broken the no-backtrack
20 : * property by running flex with the "-b" option and checking that the
21 : * resulting "lex.backup" file says that no backing up is needed. (As of
22 : * Postgres 9.2, this check is made automatically by the Makefile.)
23 : *
24 : *
25 : * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
26 : * Portions Copyright (c) 1994, Regents of the University of California
27 : *
28 : * IDENTIFICATION
29 : * src/backend/parser/scan.l
30 : *
31 : *-------------------------------------------------------------------------
32 : */
33 : #include "postgres.h"
34 :
35 : #include <ctype.h>
36 : #include <unistd.h>
37 :
38 : #include "common/string.h"
39 : #include "gramparse.h"
40 : #include "nodes/miscnodes.h"
41 : #include "parser/parser.h" /* only needed for GUC variables */
42 : #include "parser/scansup.h"
43 : #include "port/pg_bitutils.h"
44 : #include "mb/pg_wchar.h"
45 : #include "utils/builtins.h"
46 : }
47 :
48 : %{
49 :
50 : /* LCOV_EXCL_START */
51 :
52 : /* Avoid exit() on fatal scanner errors (a bit ugly -- see yy_fatal_error) */
53 : #undef fprintf
54 : #define fprintf(file, fmt, msg) fprintf_to_ereport(fmt, msg)
55 :
56 : static void
57 0 : fprintf_to_ereport(const char *fmt, const char *msg)
58 : { /* Target of the fprintf macro defined above: fmt is ignored and msg is raised through ereport(ERROR). */
59 0 : ereport(ERROR, (errmsg_internal("%s", msg)));
60 : }
61 :
62 : /*
63 : * GUC variable. This is a DIRECT violation of the warning given at the
64 : * head of gram.y, ie flex/bison code must not depend on any GUC variables;
65 : * as such, changing its value can induce very unintuitive behavior.
66 : * In practice, backslash_quote is not too awful since it only controls
67 : * whether to throw an error: it cannot change non-error results.
68 : */
69 : int backslash_quote = BACKSLASH_QUOTE_SAFE_ENCODING;
70 :
71 : /*
72 : * Constant data exported from this file. This array maps from the
73 : * zero-based keyword numbers returned by ScanKeywordLookup to the
74 : * Bison token numbers needed by gram.y. This is exported because
75 : * callers need to pass it to scanner_init, if they are using the
76 : * standard keyword list ScanKeywords.
77 : */
78 : #define PG_KEYWORD(kwname, value, category, collabel) value,
79 :
80 : const uint16 ScanKeywordTokens[] = {
81 : #include "parser/kwlist.h"
82 : };
83 :
84 : #undef PG_KEYWORD
85 :
86 : /*
87 : * Set the type of YYSTYPE.
88 : */
89 : #define YYSTYPE core_YYSTYPE
90 :
91 : /*
92 : * Each call to yylex must set yylloc to the location of the found token
93 : * (expressed as a byte offset from the start of the input text).
94 : * When we parse a token that requires multiple lexer rules to process,
95 : * this should be done in the first such rule, else yylloc will point
96 : * into the middle of the token.
97 : */
98 : #define SET_YYLLOC() (*(yylloc) = yytext - yyextra->scanbuf)
99 :
100 : /*
101 : * Advance yylloc by the given number of bytes.
102 : */
103 : #define ADVANCE_YYLLOC(delta) ( *(yylloc) += (delta) )
104 :
105 : /*
106 : * Sometimes, we do want yylloc to point into the middle of a token; this is
107 : * useful for instance to throw an error about an escape sequence within a
108 : * string literal. But if we find no error there, we want to revert yylloc
109 : * to the token start, so that that's the location reported to the parser.
110 : * Use PUSH_YYLLOC/POP_YYLLOC to save/restore yylloc around such code.
111 : * (Currently the implied "stack" is just one location, but someday we might
112 : * need to nest these.)
113 : */
114 : #define PUSH_YYLLOC() (yyextra->save_yylloc = *(yylloc))
115 : #define POP_YYLLOC() (*(yylloc) = yyextra->save_yylloc)
116 :
117 : #define startlit() ( yyextra->literallen = 0 )
118 : static void addlit(char *ytext, int yleng, core_yyscan_t yyscanner);
119 : static void addlitchar(unsigned char ychar, core_yyscan_t yyscanner);
120 : static char *litbufdup(core_yyscan_t yyscanner);
121 : static unsigned char unescape_single_char(unsigned char c, core_yyscan_t yyscanner);
122 : static int process_integer_literal(const char *token, YYSTYPE *lval, int base);
123 : static void addunicode(char32_t c, yyscan_t yyscanner);
124 :
125 : #define yyerror(msg) scanner_yyerror(msg, yyscanner)
126 :
127 : #define lexer_errposition() scanner_errposition(*(yylloc), yyscanner)
128 :
129 : %}
130 :
131 : %option reentrant
132 : %option bison-bridge
133 : %option bison-locations
134 : %option 8bit
135 : %option never-interactive
136 : %option nodefault
137 : %option noinput
138 : %option nounput
139 : %option noyywrap
140 : %option noyyalloc
141 : %option noyyrealloc
142 : %option noyyfree
143 : %option warn
144 : %option prefix="core_yy"
145 : %option extra-type="core_yy_extra_type *"
146 :
147 : /*
148 : * OK, here is a short description of lex/flex rules behavior.
149 : * The longest pattern which matches an input string is always chosen.
150 : * For equal-length patterns, the first occurring in the rules list is chosen.
151 : * INITIAL is the starting state, to which all non-conditional rules apply.
152 : * Exclusive states change parsing rules while the state is active. When in
153 : * an exclusive state, only those rules defined for that state apply.
154 : *
155 : * We use exclusive states for quoted strings, extended comments,
156 : * and to eliminate parsing troubles for numeric strings.
157 : * Exclusive states:
158 : * <xb> bit string literal
159 : * <xc> extended C-style comments
160 : * <xd> delimited identifiers (double-quoted identifiers)
161 : * <xh> hexadecimal byte string
162 : * <xq> standard quoted strings
163 : * <xqs> quote stop (detect continued strings)
164 : * <xe> extended quoted strings (support backslash escape sequences)
165 : * <xdolq> $foo$ quoted strings
166 : * <xui> quoted identifier with Unicode escapes
167 : * <xus> quoted string with Unicode escapes
168 : * <xeu> Unicode surrogate pair in extended quoted string
169 : *
170 : * Remember to add an <<EOF>> case whenever you add a new exclusive state!
171 : * The default one is probably not the right thing.
172 : */
173 :
174 : %x xb
175 : %x xc
176 : %x xd
177 : %x xh
178 : %x xq
179 : %x xqs
180 : %x xe
181 : %x xdolq
182 : %x xui
183 : %x xus
184 : %x xeu
185 :
186 : /*
187 : * In order to make the world safe for Windows and Mac clients as well as
188 : * Unix ones, we accept either \n or \r as a newline. A DOS-style \r\n
189 : * sequence will be seen as two successive newlines, but that doesn't cause
190 : * any problems. Comments that start with -- and extend to the next
191 : * newline are treated as equivalent to a single whitespace character.
192 : *
193 : * NOTE a fine point: if there is no newline following --, we will absorb
194 : * everything to the end of the input as a comment. This is correct. Older
195 : * versions of Postgres failed to recognize -- as a comment if the input
196 : * did not end with a newline.
197 : *
198 : * non_newline_space tracks all the other space characters except newlines.
199 : *
200 : * XXX if you change the set of whitespace characters, fix scanner_isspace()
201 : * to agree.
202 : */
203 :
204 : space [ \t\n\r\f\v]
205 : non_newline_space [ \t\f\v]
206 : newline [\n\r]
207 : non_newline [^\n\r]
208 :
209 : comment ("--"{non_newline}*)
210 :
211 : whitespace ({space}+|{comment})
212 :
213 : /*
214 : * SQL requires at least one newline in the whitespace separating
215 : * string literals that are to be concatenated. Silly, but who are we
216 : * to argue? Note that {whitespace_with_newline} should not have * after
217 : * it, whereas {whitespace} should generally have a * after it...
218 : */
219 :
220 : special_whitespace ({space}+|{comment}{newline})
221 : non_newline_whitespace ({non_newline_space}|{comment})
222 : whitespace_with_newline ({non_newline_whitespace}*{newline}{special_whitespace}*)
223 :
224 : quote '
225 : /* If we see {quote} then {quotecontinue}, the quoted string continues */
226 : quotecontinue {whitespace_with_newline}{quote}
227 :
228 : /*
229 : * {quotecontinuefail} is needed to avoid lexer backup when we fail to match
230 : * {quotecontinue}. It might seem that this could just be {whitespace}*,
231 : * but if there's a dash after {whitespace_with_newline}, it must be consumed
232 : * to see if there's another dash --- which would start a {comment} and thus
233 : * allow continuation of the {quotecontinue} token.
234 : */
235 : quotecontinuefail {whitespace}*"-"?
236 :
237 : /* Bit string
238 : * It is tempting to scan the string for only those characters
239 : * which are allowed. However, this leads to silently swallowed
240 : * characters if illegal characters are included in the string.
241 : * For example, if xbinside is [01] then B'ABCD' is interpreted
242 : * as a zero-length string, and the ABCD' is lost!
243 : * Better to pass the string forward and let the input routines
244 : * validate the contents.
245 : */
246 : xbstart [bB]{quote}
247 : xbinside [^']*
248 :
249 : /* Hexadecimal byte string */
250 : xhstart [xX]{quote}
251 : xhinside [^']*
252 :
253 : /* National character */
254 : xnstart [nN]{quote}
255 :
256 : /* Quoted string that allows backslash escapes */
257 : xestart [eE]{quote}
258 : xeinside [^\\']+
259 : xeescape [\\][^0-7]
260 : xeoctesc [\\][0-7]{1,3}
261 : xehexesc [\\]x[0-9A-Fa-f]{1,2}
262 : xeunicode [\\](u[0-9A-Fa-f]{4}|U[0-9A-Fa-f]{8})
263 : xeunicodefail [\\](u[0-9A-Fa-f]{0,3}|U[0-9A-Fa-f]{0,7})
264 :
265 : /* Extended quote
266 : * xqdouble implements embedded quote, ''''
267 : */
268 : xqstart {quote}
269 : xqdouble {quote}{quote}
270 : xqinside [^']+
271 :
272 : /* $foo$ style quotes ("dollar quoting")
273 : * The quoted string starts with $foo$ where "foo" is an optional string
274 : * in the form of an identifier, except that it may not contain "$",
275 : * and extends to the first occurrence of an identical string.
276 : * There is *no* processing of the quoted text.
277 : *
278 : * {dolqfailed} is an error rule to avoid scanner backup when {dolqdelim}
279 : * fails to match its trailing "$".
280 : */
281 : dolq_start [A-Za-z\200-\377_]
282 : dolq_cont [A-Za-z\200-\377_0-9]
283 : dolqdelim \$({dolq_start}{dolq_cont}*)?\$
284 : dolqfailed \${dolq_start}{dolq_cont}*
285 : dolqinside [^$]+
286 :
287 : /* Double quote
288 : * Allows embedded spaces and other special characters into identifiers.
289 : */
290 : dquote \"
291 : xdstart {dquote}
292 : xdstop {dquote}
293 : xddouble {dquote}{dquote}
294 : xdinside [^"]+
295 :
296 : /* Quoted identifier with Unicode escapes */
297 : xuistart [uU]&{dquote}
298 :
299 : /* Quoted string with Unicode escapes */
300 : xusstart [uU]&{quote}
301 :
302 : /* error rule to avoid backup */
303 : xufailed [uU]&
304 :
305 :
306 : /* C-style comments
307 : *
308 : * The "extended comment" syntax closely resembles allowable operator syntax.
309 : * The tricky part here is to get lex to recognize a string starting with
310 : * slash-star as a comment, when interpreting it as an operator would produce
311 : * a longer match --- remember lex will prefer a longer match! Also, if we
312 : * have something like plus-slash-star, lex will think this is a 3-character
313 : * operator whereas we want to see it as a + operator and a comment start.
314 : * The solution is two-fold:
315 : * 1. append {op_chars}* to xcstart so that it matches as much text as
316 : * {operator} would. Then the tie-breaker (first matching rule of same
317 : * length) ensures xcstart wins. We put back the extra stuff with yyless()
318 : * in case it contains a star-slash that should terminate the comment.
319 : * 2. In the operator rule, check for slash-star within the operator, and
320 : * if found throw it back with yyless(). This handles the plus-slash-star
321 : * problem.
322 : * Dash-dash comments have similar interactions with the operator rule.
323 : */
324 : xcstart \/\*{op_chars}*
325 : xcstop \*+\/
326 : xcinside [^*/]+
327 :
328 : ident_start [A-Za-z\200-\377_]
329 : ident_cont [A-Za-z\200-\377_0-9\$]
330 :
331 : identifier {ident_start}{ident_cont}*
332 :
333 : /* Assorted special-case operators and operator-like tokens */
334 : typecast "::"
335 : dot_dot \.\.
336 : colon_equals ":="
337 :
338 : /*
339 : * These operator-like tokens (unlike the above ones) also match the {operator}
340 : * rule, which means that they might be overridden by a longer match if they
341 : * are followed by a comment start or a + or - character. Accordingly, if you
342 : * add to this list, you must also add corresponding code to the {operator}
343 : * block to return the correct token in such cases. (This is not needed in
344 : * psqlscan.l since the token value is ignored there.)
345 : */
346 : equals_greater "=>"
347 : less_equals "<="
348 : greater_equals ">="
349 : less_greater "<>"
350 : not_equals "!="
351 : /* Note there is no need for left_arrow, since "<-" is not a single operator. */
352 : right_arrow "->"
353 :
354 : /*
355 : * "self" is the set of chars that should be returned as single-character
356 : * tokens. "op_chars" is the set of chars that can make up "Op" tokens,
357 : * which can be one or more characters long (but if a single-char token
358 : * appears in the "self" set, it is not to be returned as an Op). Note
359 : * that the sets overlap, but each has some chars that are not in the other.
360 : *
361 : * If you change either set, adjust the character lists appearing in the
362 : * rule for "operator"!
363 : */
364 : self [,()\[\].;\:\|\+\-\*\/\%\^\<\>\=]
365 : op_chars [\~\!\@\#\^\&\|\`\?\+\-\*\/\%\<\>\=]
366 : operator {op_chars}+
367 :
368 : /*
369 : * Numbers
370 : *
371 : * Unary minus is not part of a number here. Instead we pass it separately to
372 : * the parser, and there it gets coerced via doNegate().
373 : *
374 : * {numericfail} is used because we would like "1..10" to lex as 1, dot_dot, 10.
375 : *
376 : * {realfail} is added to prevent the need for scanner
377 : * backup when the {real} rule fails to match completely.
378 : */
379 : decdigit [0-9]
380 : hexdigit [0-9A-Fa-f]
381 : octdigit [0-7]
382 : bindigit [0-1]
383 :
384 : decinteger {decdigit}(_?{decdigit})*
385 : hexinteger 0[xX](_?{hexdigit})+
386 : octinteger 0[oO](_?{octdigit})+
387 : bininteger 0[bB](_?{bindigit})+
388 :
389 : hexfail 0[xX]_?
390 : octfail 0[oO]_?
391 : binfail 0[bB]_?
392 :
393 : numeric (({decinteger}\.{decinteger}?)|(\.{decinteger}))
394 : numericfail {decinteger}\.\.
395 :
396 : real ({decinteger}|{numeric})[Ee][-+]?{decinteger}
397 : realfail ({decinteger}|{numeric})[Ee][-+]
398 :
399 : /* Positional parameters don't accept underscores. */
400 : param \${decdigit}+
401 :
402 : /*
403 : * An identifier immediately following an integer literal is disallowed because
404 : * in some cases it's ambiguous what is meant: for example, 0x1234 could be
405 : * either a hexinteger or a decinteger "0" and an identifier "x1234". We can
406 : * detect such problems by seeing if integer_junk matches a longer substring
407 : * than any of the XXXinteger patterns (decinteger, hexinteger, octinteger,
408 : * bininteger). One "junk" pattern is sufficient because
409 : * {decinteger}{identifier} will match all the same strings we'd match with
410 : * {hexinteger}{identifier} etc.
411 : *
412 : * Note that the rule for integer_junk must appear after the ones for
413 : * XXXinteger to make this work correctly: 0x1234 will match both hexinteger
414 : * and integer_junk, and we need hexinteger to be chosen in that case.
415 : *
416 : * Also disallow strings matched by numeric_junk, real_junk and param_junk
417 : * for consistency.
418 : */
419 : integer_junk {decinteger}{identifier}
420 : numeric_junk {numeric}{identifier}
421 : real_junk {real}{identifier}
422 : param_junk \${decdigit}+{identifier}
423 :
424 : other .
425 :
426 : /*
427 : * Dollar quoted strings are totally opaque, and no escaping is done on them.
428 : * Other quoted strings must allow some special characters such as single-quote
429 : * and newline.
430 : * Embedded single-quotes are implemented both in the SQL standard
431 : * style of two adjacent single quotes "''" and in the Postgres/Java style
432 : * of escaped-quote "\'".
433 : * Other embedded escaped characters are matched explicitly and the leading
434 : * backslash is dropped from the string.
435 : * Note that xcstart must appear before operator, as explained above!
436 : * Also whitespace (comment) must appear before operator.
437 : */
438 :
439 : %%
440 :
441 : {whitespace} {
442 : /* ignore: runs of space characters and dash-dash comments yield no token */
443 : }
444 6517692 :
445 10018 : {xcstart} {
446 : /* Set location in case of syntax error in comment */
447 10018 : SET_YYLLOC();
448 10018 : yyextra->xcdepth = 0; /* outermost comment, so no nesting yet */
449 10018 : BEGIN(xc); /* switch to the exclusive comment state */
450 : /* Put back any characters past slash-star; see above */
451 10018 : yyless(2);
452 : }
453 10018 :
454 : <xc>{
455 12 : {xcstart} {
456 12 : (yyextra->xcdepth)++; /* nested comment opener: one level deeper */
457 : /* Put back any characters past slash-star; see above */
458 12 : yyless(2);
459 : }
460 12 :
461 10030 : {xcstop} {
462 10030 : if (yyextra->xcdepth <= 0)
463 10018 : BEGIN(INITIAL); /* closed the outermost comment */
464 : else
465 12 : (yyextra->xcdepth)--; /* closed a nested comment only */
466 : }
467 10030 :
468 55815 : {xcinside} {
469 : /* ignore: comment text other than star and slash */
470 : }
471 55815 :
472 45818 : {op_chars} {
473 : /* ignore: a single operator character inside the comment */
474 : }
475 45818 :
476 0 : \*+ {
477 : /* ignore: stars that are not followed by a slash */
478 : }
479 0 :
480 0 : <<EOF>> {
481 0 : yyerror("unterminated /* comment");
482 : }
483 : } /* <xc> */
484 :
485 534 : {xbstart} {
486 : /* Binary bit type.
487 : * At some point we should simply pass the string
488 : * forward to the parser and label it there.
489 : * In the meantime, place a leading "b" on the string
490 : * to mark it for the input routine as a binary string.
491 : */
492 534 : SET_YYLLOC();
493 534 : BEGIN(xb);
494 534 : startlit(); /* reset the literal length to zero */
495 534 : addlitchar('b', yyscanner);
496 : }
497 534 : <xh>{xhinside} |
498 3255 : <xb>{xbinside} {
499 3255 : addlit(yytext, yyleng, yyscanner); /* contents are not validated here; that is left to the input routines */
500 : }
501 3255 : <xb><<EOF>> { yyerror("unterminated bit string literal"); }
502 0 :
503 2742 : {xhstart} {
504 : /* Hexadecimal bit type.
505 : * At some point we should simply pass the string
506 : * forward to the parser and label it there.
507 : * In the meantime, place a leading "x" on the string
508 : * to mark it for the input routine as a hex string.
509 : */
510 2742 : SET_YYLLOC();
511 2742 : BEGIN(xh); /* body text is collected by the rule shared between xhinside and xbinside */
512 2742 : startlit(); /* reset the literal length to zero */
513 2742 : addlitchar('x', yyscanner);
514 : }
515 2742 : <xh><<EOF>> { yyerror("unterminated hexadecimal string literal"); }
516 0 :
517 2 : {xnstart} {
518 : /* National character.
519 : * We will pass this along as a normal character string,
520 : * but preceded with an internally-generated "NCHAR".
521 : */
522 : int kwnum;
523 :
524 2 : SET_YYLLOC();
525 2 : yyless(1); /* eat only 'n' this time; the quote is put back and scanned again */
526 :
527 2 : kwnum = ScanKeywordLookup("nchar",
528 2 : yyextra->keywordlist);
529 2 : if (kwnum >= 0) /* nchar is present in the active keyword list */
530 : {
531 4 : yylval->keyword = GetScanKeyword(kwnum,
532 2 : yyextra->keywordlist);
533 2 : return yyextra->keyword_tokens[kwnum]; /* token code registered for that keyword */
534 : }
535 : else
536 : {
537 : /* If NCHAR isn't a keyword, just return "n" */
538 0 : yylval->str = pstrdup("n");
539 0 : return IDENT;
540 : }
541 : }
542 :
543 507381 : {xqstart} {
544 507381 : yyextra->saw_non_ascii = false; /* consulted when the string ends to decide whether to run pg_verifymbstr */
545 507381 : SET_YYLLOC();
546 507381 : BEGIN(xq); /* standard quoted string state */
547 507381 : startlit(); /* reset the literal length to zero */
548 : }
549 507381 : {xestart} {
550 3084 : yyextra->saw_non_ascii = false; /* set later by the octal and hex escape rules */
551 3084 : SET_YYLLOC();
552 3084 : BEGIN(xe); /* extended string state, where the backslash escape rules apply */
553 3084 : startlit(); /* reset the literal length to zero */
554 : }
555 3084 : {xusstart} {
556 406 : SET_YYLLOC();
557 406 : BEGIN(xus); /* string with Unicode escapes; handed back as USCONST when it ends */
558 406 : startlit(); /* reset the literal length to zero */
559 : }
560 406 :
561 514138 : <xb,xh,xq,xe,xus>{quote} {
562 : /*
563 : * When we are scanning a quoted string and see an end
564 : * quote, we must look ahead for a possible continuation.
565 : * If we don't see one, we know the end quote was in fact
566 : * the end of the string. To reduce the lexer table size,
567 : * we use a single "xqs" state to do the lookahead for all
568 : * types of strings.
569 : */
570 514138 : yyextra->state_before_str_stop = YYSTATE; /* remember which kind of literal we were in */
571 514138 : BEGIN(xqs);
572 : }
573 514138 : <xqs>{quotecontinue} {
574 23 : /*
575 : * Found a quote continuation, so return to the in-quote
576 : * state and continue scanning the literal. Nothing is
577 : * added to the literal's contents.
578 : */
579 23 : BEGIN(yyextra->state_before_str_stop); /* the state saved when the closing quote was seen */
580 : }
581 23 : <xqs>{quotecontinuefail} |
582 514115 : <xqs>{other} |
583 : <xqs><<EOF>> {
584 : /*
585 : * Failed to see a quote continuation. Throw back
586 : * everything after the end quote, and handle the string
587 : * according to the state we were in previously.
588 : */
589 514115 : yyless(0);
590 514115 : BEGIN(INITIAL);
591 :
592 514115 : switch (yyextra->state_before_str_stop)
593 : {
594 534 : case xb: /* bit string; the buffer begins with the b marker added at its start rule */
595 534 : yylval->str = litbufdup(yyscanner);
596 534 : return BCONST;
597 2742 : case xh: /* hex string; the buffer begins with the x marker added at its start rule */
598 2742 : yylval->str = litbufdup(yyscanner);
599 2742 : return XCONST;
600 510433 : case xq:
601 : case xe:
602 : /*
603 : * Check that the data remains valid, if it might
604 : * have been made invalid by unescaping any chars.
605 : */
606 510433 : if (yyextra->saw_non_ascii)
607 3 : pg_verifymbstr(yyextra->literalbuf,
608 3 : yyextra->literallen,
609 : false);
610 510433 : yylval->str = litbufdup(yyscanner);
611 510433 : return SCONST;
612 406 : case xus: /* only the doubled-quote and plain-text rules ran in this state */
613 406 : yylval->str = litbufdup(yyscanner);
614 406 : return USCONST;
615 0 : default: /* xqs is entered only from the five states handled above */
616 0 : yyerror("unhandled previous state in xqs");
617 : }
618 : }
619 :
620 4437 : <xq,xe,xus>{xqdouble} { /* a doubled quote inside the literal stands for one quote character */
621 4437 : addlitchar('\'', yyscanner);
622 : }
623 4437 : <xq,xus>{xqinside} {
624 498556 : addlit(yytext, yyleng, yyscanner); /* any run of characters other than the quote */
625 : }
626 498556 : <xe>{xeinside} {
627 3402 : addlit(yytext, yyleng, yyscanner); /* a run containing neither backslash nor quote */
628 : }
629 3402 : <xe>{xeunicode} {
630 116 : char32_t c = strtoul(yytext + 2, NULL, 16); /* skip the backslash and the u or U, then read the hex digits */
631 :
632 : /* Remember start of overall string token ... */
633 116 : PUSH_YYLLOC();
634 : /* ... and set the error cursor to point at this esc seq */
635 116 : SET_YYLLOC();
636 :
637 116 : if (is_utf16_surrogate_first(c))
638 : {
639 20 : yyextra->utf16_first_part = c; /* kept until the second half arrives */
640 20 : BEGIN(xeu); /* the next thing scanned must be the second half of the pair */
641 : }
642 96 : else if (is_utf16_surrogate_second(c))
643 0 : yyerror("invalid Unicode surrogate pair"); /* second half seen without a first half */
644 : else
645 96 : addunicode(c, yyscanner);
646 :
647 : /* Restore yylloc to be start of string token */
648 112 : POP_YYLLOC();
649 : }
650 112 : <xeu>{xeunicode} {
651 8 : char32_t c = strtoul(yytext + 2, NULL, 16); /* skip the backslash and the u or U, then read the hex digits */
652 :
653 : /* Remember start of overall string token ... */
654 8 : PUSH_YYLLOC();
655 : /* ... and set the error cursor to point at this esc seq */
656 8 : SET_YYLLOC();
657 :
658 8 : if (!is_utf16_surrogate_second(c))
659 8 : yyerror("invalid Unicode surrogate pair");
660 :
661 0 : c = surrogate_pair_to_codepoint(yyextra->utf16_first_part, c); /* combine with the first half saved by the xe rule */
662 :
663 0 : addunicode(c, yyscanner);
664 :
665 : /* Restore yylloc to be start of string token */
666 0 : POP_YYLLOC();
667 :
668 0 : BEGIN(xe); /* pair complete: resume ordinary extended-string scanning */
669 : }
670 0 : <xeu>. |
671 12 : <xeu>\n |
672 : <xeu><<EOF>> {
673 : /* Set the error cursor to point at missing esc seq */
674 12 : SET_YYLLOC();
675 12 : yyerror("invalid Unicode surrogate pair"); /* a first-half surrogate was not followed by a Unicode escape */
676 : }
677 : <xe,xeu>{xeunicodefail} {
678 8 : /* Set the error cursor to point at malformed esc seq (too few hex digits after the u or U) */
679 8 : SET_YYLLOC();
680 8 : ereport(ERROR,
681 : (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
682 : errmsg("invalid Unicode escape"),
683 : errhint("Unicode escapes must be \\uXXXX or \\UXXXXXXXX."),
684 : lexer_errposition()));
685 : }
686 : <xe>{xeescape} {
687 1885 : if (yytext[1] == '\'')
688 : { /* backslash-quote: rejected when the backslash_quote setting is off, or safe_encoding with a client-only encoding */
689 0 : if (yyextra->backslash_quote == BACKSLASH_QUOTE_OFF ||
690 0 : (yyextra->backslash_quote == BACKSLASH_QUOTE_SAFE_ENCODING &&
691 0 : PG_ENCODING_IS_CLIENT_ONLY(pg_get_client_encoding())))
692 0 : ereport(ERROR,
693 : (errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER),
694 : errmsg("unsafe use of \\' in a string literal"),
695 : errhint("Use '' to write quotes in strings. \\' is insecure in client-only encodings."),
696 : lexer_errposition()));
697 : }
698 1885 : addlitchar(unescape_single_char(yytext[1], yyscanner),
699 : yyscanner); /* store whatever unescape_single_char returns for the escaped character */
700 : }
701 1885 : <xe>{xeoctesc} {
702 30 : unsigned char c = strtoul(yytext + 1, NULL, 8); /* skip the backslash; the value is narrowed to one byte */
703 :
704 30 : addlitchar(c, yyscanner);
705 30 : if (c == '\0' || IS_HIGHBIT_SET(c))
706 0 : yyextra->saw_non_ascii = true; /* triggers pg_verifymbstr when the string ends */
707 : }
708 30 : <xe>{xehexesc} {
709 6 : unsigned char c = strtoul(yytext + 2, NULL, 16); /* skip the backslash and the x */
710 :
711 6 : addlitchar(c, yyscanner);
712 6 : if (c == '\0' || IS_HIGHBIT_SET(c))
713 5 : yyextra->saw_non_ascii = true; /* triggers pg_verifymbstr when the string ends */
714 : }
715 6 : <xe>. {
716 0 : /* This is only needed for \ just before EOF */
717 0 : addlitchar(yytext[0], yyscanner); /* keep the character as it is */
718 : }
719 0 : <xq,xe,xus><<EOF>> { yyerror("unterminated quoted string"); }
720 0 :
721 5852 : {dolqdelim} {
722 5852 : SET_YYLLOC();
723 5852 : yyextra->dolqstart = pstrdup(yytext); /* copy of the opening delimiter, compared against candidates for the closing one */
724 5852 : BEGIN(xdolq);
725 5852 : startlit(); /* reset the literal length to zero */
726 : }
727 5852 : {dolqfailed} {
728 0 : SET_YYLLOC();
729 : /* throw back all but the initial "$" */
730 0 : yyless(1);
731 : /* and treat it as {other} */
732 0 : return yytext[0]; /* the dollar sign itself is the token */
733 : }
734 : <xdolq>{dolqdelim} {
735 6132 : if (strcmp(yytext, yyextra->dolqstart) == 0) /* identical to the opening delimiter: the string ends here */
736 : {
737 5852 : pfree(yyextra->dolqstart); /* the saved copy of the opener is no longer needed */
738 5852 : yyextra->dolqstart = NULL;
739 5852 : BEGIN(INITIAL);
740 5852 : yylval->str = litbufdup(yyscanner);
741 5852 : return SCONST;
742 : }
743 : else
744 : {
745 : /*
746 : * When we fail to match $...$ to dolqstart, transfer
747 : * the $... part to the output, but put back the final
748 : * $ for rescanning. Consider $delim$...$junk$delim$
749 : */
750 280 : addlit(yytext, yyleng - 1, yyscanner);
751 280 : yyless(yyleng - 1);
752 : }
753 : }
754 280 : <xdolq>{dolqinside} {
755 8656 : addlit(yytext, yyleng, yyscanner); /* a run of text containing no dollar sign */
756 : }
757 8656 : <xdolq>{dolqfailed} {
758 580 : addlit(yytext, yyleng, yyscanner); /* dollar sign plus identifier-like text with no closing dollar sign: plain content */
759 : }
760 580 : <xdolq>. {
761 2224 : /* This is only needed for $ inside the quoted text */
762 2224 : addlitchar(yytext[0], yyscanner);
763 : }
764 2224 : <xdolq><<EOF>> { yyerror("unterminated dollar-quoted string"); }
765 0 :
766 71021 : {xdstart} {
767 71021 : SET_YYLLOC();
768 71021 : BEGIN(xd); /* double-quoted identifier state */
769 71021 : startlit(); /* reset the literal length to zero */
770 : }
771 71021 : {xuistart} {
772 18 : SET_YYLLOC();
773 18 : BEGIN(xui); /* quoted identifier with Unicode escapes; ends as UIDENT */
774 18 : startlit(); /* reset the literal length to zero */
775 : }
776 18 : <xd>{xdstop} {
777 71021 : char *ident;
778 :
779 71021 : BEGIN(INITIAL);
780 71021 : if (yyextra->literallen == 0) /* nothing between the two double quotes */
781 4 : yyerror("zero-length delimited identifier");
782 71017 : ident = litbufdup(yyscanner);
783 71017 : if (yyextra->literallen >= NAMEDATALEN) /* too long for a name, so hand it to truncate_identifier */
784 0 : truncate_identifier(ident, yyextra->literallen, true);
785 71017 : yylval->str = ident;
786 71017 : return IDENT;
787 : }
788 : <xui>{dquote} {
789 18 : BEGIN(INITIAL);
790 18 : if (yyextra->literallen == 0) /* nothing between the two double quotes */
791 0 : yyerror("zero-length delimited identifier");
792 : /* can't truncate till after we de-escape the ident */
793 18 : yylval->str = litbufdup(yyscanner);
794 18 : return UIDENT; /* a different token from the IDENT returned by the xd rule */
795 : }
796 : <xd,xui>{xddouble} { /* a doubled double-quote inside the identifier stands for one */
797 89 : addlitchar('"', yyscanner);
798 : }
799 89 : <xd,xui>{xdinside} {
800 71118 : addlit(yytext, yyleng, yyscanner);
801 : }
802 71118 : <xd,xui><<EOF>> { yyerror("unterminated quoted identifier"); }
803 0 :
804 0 : {xufailed} {
805 : char *ident;
806 :
807 0 : SET_YYLLOC();
808 : /* throw back all but the initial u/U */
809 0 : yyless(1); /* the ampersand goes back to the input and is scanned again */
810 : /* and treat it as {identifier} */
811 0 : ident = downcase_truncate_identifier(yytext, yyleng, true);
812 0 : yylval->str = ident;
813 0 : return IDENT;
814 : }
815 :
816 150761 : {typecast} {
817 150761 : SET_YYLLOC(); /* token location is its byte offset within the scan buffer */
818 150761 : return TYPECAST; /* the double-colon token */
819 : }
820 :
821 445 : {dot_dot} {
822 445 : SET_YYLLOC();
823 445 : return DOT_DOT; /* two consecutive dots */
824 : }
825 :
826 29855 : {colon_equals} {
827 29855 : SET_YYLLOC();
828 29855 : return COLON_EQUALS; /* colon followed by equals sign */
829 : }
830 :
831 1368 : {equals_greater} {
832 1368 : SET_YYLLOC();
833 1368 : return EQUALS_GREATER; /* the operator rule returns this too when it trims a longer match down to these two chars */
834 : }
835 :
836 3670 : {less_equals} {
837 3670 : SET_YYLLOC();
838 3670 : return LESS_EQUALS; /* likewise duplicated in the operator rule */
839 : }
840 :
841 8101 : {greater_equals} {
842 8101 : SET_YYLLOC();
843 8101 : return GREATER_EQUALS; /* likewise duplicated in the operator rule */
844 : }
845 :
846 8144 : {less_greater} {
847 : /* We accept both "<>" and "!=" as meaning NOT_EQUALS */
848 8144 : SET_YYLLOC();
849 8144 : return NOT_EQUALS;
850 : }
851 :
852 18039 : {not_equals} {
853 : /* We accept both "<>" and "!=" as meaning NOT_EQUALS */
854 18039 : SET_YYLLOC();
855 18039 : return NOT_EQUALS;
856 : }
857 :
858 1043 : {right_arrow} {
859 1043 : SET_YYLLOC();
860 1043 : return RIGHT_ARROW; /* likewise duplicated in the operator rule */
861 : }
862 :
863 4274064 : {self} {
864 4274064 : SET_YYLLOC();
865 4274064 : return yytext[0]; /* the character code itself serves as the token */
866 : }
867 :
868 38866 : {operator} {
869 : /*
870 : * Check for embedded slash-star or dash-dash; those
871 : * are comment starts, so operator must stop there.
872 : * Note that slash-star or dash-dash at the first
873 : * character will match a prior rule, not this one.
874 : */
875 38866 : int nchars = yyleng;
876 38866 : char *slashstar = strstr(yytext, "/*");
877 38866 : char *dashdash = strstr(yytext, "--");
878 :
879 38866 : if (slashstar && dashdash)
880 : {
881 : /* if both appear, take the first one */
882 0 : if (slashstar > dashdash)
883 0 : slashstar = dashdash;
884 : }
885 38866 : else if (!slashstar)
886 38821 : slashstar = dashdash;
887 38866 : if (slashstar)
888 70 : nchars = slashstar - yytext;
889 :
890 : /*
891 : * For SQL compatibility, '+' and '-' cannot be the
892 : * last char of a multi-char operator unless the operator
893 : * contains chars that are not in SQL operators.
894 : * The idea is to lex '=-' as two operators, but not
895 : * to forbid operator names like '?-' that could not be
896 : * sequences of SQL operators.
897 : */
898 38866 : if (nchars > 1 &&
899 31314 : (yytext[nchars - 1] == '+' ||
900 31309 : yytext[nchars - 1] == '-'))
901 : {
902 : int ic;
903 :
904 2391 : for (ic = nchars - 2; ic >= 0; ic--) /* scan backwards over the chars before the trailing sign */
905 : {
906 1348 : char c = yytext[ic];
907 1348 : if (c == '~' || c == '!' || c == '@' ||
908 1284 : c == '#' || c == '^' || c == '&' ||
909 1099 : c == '|' || c == '`' || c == '?' ||
910 : c == '%')
911 : break; /* found a qualifying character, so the trailing sign may stay */
912 : }
913 1305 : if (ic < 0)
914 : {
915 : /*
916 : * didn't find a qualifying character, so remove
917 : * all trailing [+-]
918 : */
919 : do {
920 1043 : nchars--;
921 1043 : } while (nchars > 1 &&
922 28 : (yytext[nchars - 1] == '+' ||
923 28 : yytext[nchars - 1] == '-'));
924 : }
925 : }
926 :
927 38866 : SET_YYLLOC();
928 :
929 38866 : if (nchars < yyleng)
930 : {
931 : /* Strip the unwanted chars from the token */
932 1113 : yyless(nchars);
933 : /*
934 : * If what we have left is only one char, and it's
935 : * one of the characters matching "self", then
936 : * return it as a character token the same way
937 : * that the "self" rule would have.
938 : */
939 1113 : if (nchars == 1 &&
940 1015 : strchr(",()[].;:|+-*/%^<>=", yytext[0]))
941 1015 : return yytext[0];
942 : /*
943 : * Likewise, if what we have left is two chars, and
944 : * those match the tokens ">=", "<=", "=>", "<>", "!="
945 : * or "->", then we must return the appropriate token
946 : * rather than the generic Op.
947 : */
948 98 : if (nchars == 2)
949 : {
950 98 : if (yytext[0] == '=' && yytext[1] == '>')
951 30 : return EQUALS_GREATER;
952 68 : if (yytext[0] == '>' && yytext[1] == '=')
953 14 : return GREATER_EQUALS;
954 54 : if (yytext[0] == '<' && yytext[1] == '=')
955 14 : return LESS_EQUALS;
956 40 : if (yytext[0] == '<' && yytext[1] == '>')
957 18 : return NOT_EQUALS;
958 22 : if (yytext[0] == '!' && yytext[1] == '=')
959 19 : return NOT_EQUALS;
960 3 : if (yytext[0] == '-' && yytext[1] == '>')
961 3 : return RIGHT_ARROW;
962 : }
963 : }
964 :
965 : /*
966 : * Complain if operator is too long. Unlike the case
967 : * for identifiers, we make this an error not a notice-
968 : * and-truncate, because the odds are we are looking at
969 : * a syntactic mistake anyway.
970 : */
971 37753 : if (nchars >= NAMEDATALEN)
972 0 : yyerror("operator too long");
973 :
974 37753 : yylval->str = pstrdup(yytext); /* any trimming was already applied to yytext by the yyless call above */
975 37753 : return Op;
976 : }
977 :
978 27610 : {param} {
979 27610 : ErrorSaveContext escontext = {T_ErrorSaveContext};
980 : int32 val;
981 :
982 27610 : SET_YYLLOC();
983 27610 : val = pg_strtoint32_safe(yytext + 1, (Node *) &escontext);
984 27610 : if (escontext.error_occurred)
985 4 : yyerror("parameter number too large");
986 27606 : yylval->ival = val;
987 27606 : return PARAM;
988 : }
989 : {param_junk} {
990 8 : SET_YYLLOC();
991 8 : yyerror("trailing junk after parameter");
992 : }
993 :
994 293532 : {decinteger} {
995 293532 : SET_YYLLOC();
996 293532 : return process_integer_literal(yytext, yylval, 10);
997 : }
998 : {hexinteger} {
999 494 : SET_YYLLOC();
1000 494 : return process_integer_literal(yytext, yylval, 16);
1001 : }
1002 : {octinteger} {
1003 40 : SET_YYLLOC();
1004 40 : return process_integer_literal(yytext, yylval, 8);
1005 : }
1006 : {bininteger} {
1007 41 : SET_YYLLOC();
1008 41 : return process_integer_literal(yytext, yylval, 2);
1009 : }
1010 : {hexfail} {
1011 4 : SET_YYLLOC();
1012 4 : yyerror("invalid hexadecimal integer");
1013 : }
1014 : {octfail} {
1015 4 : SET_YYLLOC();
1016 4 : yyerror("invalid octal integer");
1017 : }
1018 : {binfail} {
1019 4 : SET_YYLLOC();
1020 4 : yyerror("invalid binary integer");
1021 : }
1022 : {numeric} {
1023 7175 : SET_YYLLOC();
1024 7175 : yylval->str = pstrdup(yytext);
1025 7175 : return FCONST;
1026 : }
1027 : {numericfail} {
1028 59 : /* throw back the .., and treat as integer */
1029 59 : yyless(yyleng - 2);
1030 59 : SET_YYLLOC();
1031 59 : return process_integer_literal(yytext, yylval, 10);
1032 : }
1033 : {real} {
1034 783 : SET_YYLLOC();
1035 783 : yylval->str = pstrdup(yytext);
1036 783 : return FCONST;
1037 : }
1038 : {realfail} {
1039 4 : SET_YYLLOC();
1040 4 : yyerror("trailing junk after numeric literal");
1041 : }
1042 : {integer_junk} {
1043 44 : SET_YYLLOC();
1044 44 : yyerror("trailing junk after numeric literal");
1045 : }
1046 : {numeric_junk} {
1047 32 : SET_YYLLOC();
1048 32 : yyerror("trailing junk after numeric literal");
1049 : }
1050 : {real_junk} {
1051 0 : SET_YYLLOC();
1052 0 : yyerror("trailing junk after numeric literal");
1053 : }
1054 :
1055 7005885 :
1056 : {identifier} {
1057 : int kwnum;
1058 : char *ident;
1059 :
1060 7005885 : SET_YYLLOC();
1061 :
1062 : /* Is it a keyword? */
1063 7005885 : kwnum = ScanKeywordLookup(yytext,
1064 7005885 : yyextra->keywordlist);
1065 7005885 : if (kwnum >= 0)
1066 : {
1067 6230870 : yylval->keyword = GetScanKeyword(kwnum,
1068 3115435 : yyextra->keywordlist);
1069 3115435 : return yyextra->keyword_tokens[kwnum];
1070 : }
1071 :
1072 : /*
1073 : * No. Convert the identifier to lower case, and truncate
1074 : * if necessary.
1075 : */
1076 3890450 : ident = downcase_truncate_identifier(yytext, yyleng, true);
1077 3890450 : yylval->str = ident;
1078 3890450 : return IDENT;
1079 : }
1080 :
1081 8 : {other} {
1082 8 : SET_YYLLOC();
1083 8 : return yytext[0];
1084 : }
1085 :
1086 493849 : <<EOF>> {
1087 493849 : SET_YYLLOC();
1088 493849 : yyterminate();
1089 : }
1090 :
1091 0 : %%
1092 :
1093 : /* LCOV_EXCL_STOP */
1094 :
1095 : /*
1096 : * Arrange access to yyextra for subroutines of the main yylex() function.
1097 : * We expect each subroutine to have a yyscanner parameter. Rather than
1098 : * use the yyget_xxx functions, which might or might not get inlined by the
1099 : * compiler, we cheat just a bit and cast yyscanner to the right type.
1100 : */
1101 : #undef yyextra
1102 : #define yyextra (((struct yyguts_t *) yyscanner)->yyextra_r)
1103 :
1104 : /* Likewise for a couple of other things we need. */
1105 : #undef yylloc
1106 : #define yylloc (((struct yyguts_t *) yyscanner)->yylloc_r)
1107 : #undef yyleng
1108 : #define yyleng (((struct yyguts_t *) yyscanner)->yyleng_r)
1109 :
1110 :
1111 : /*
1112 : * scanner_errposition
1113 : * Report a lexer or grammar error cursor position, if possible.
1114 : *
1115 : * This is expected to be used within an ereport() call, or via an error
1116 : * callback such as setup_scanner_errposition_callback(). The return value
1117 : * is a dummy (always 0, in fact).
1118 : *
1119 : * Note that this can only be used for messages emitted during raw parsing
1120 : * (essentially, scan.l, parser.c, and gram.y), since it requires the
1121 : * yyscanner struct to still be available.
1122 : */
1123 : int
1124 819 : scanner_errposition(int location, core_yyscan_t yyscanner)
1125 : {
1126 : int pos;
1127 :
1128 819 : if (location < 0)
1129 0 : return 0; /* no-op if location is unknown */
1130 :
1131 : /* Convert byte offset to character number */
1132 819 : pos = pg_mbstrlen_with_len(yyextra->scanbuf, location) + 1;
1133 : /* And pass it to the ereport mechanism */
1134 819 : return errposition(pos);
1135 : }
1136 :
1137 : /*
1138 : * Error context callback for inserting scanner error location.
1139 : *
1140 : * Note that this will be called for *any* error occurring while the
1141 : * callback is installed. We avoid inserting an irrelevant error location
1142 : * if the error is a query cancel --- are there any other important cases?
1143 : */
1144 : static void
1145 24 : scb_error_callback(void *arg)
1146 : {
1147 24 : ScannerCallbackState *scbstate = (ScannerCallbackState *) arg;
1148 :
1149 24 : if (geterrcode() != ERRCODE_QUERY_CANCELED)
1150 24 : (void) scanner_errposition(scbstate->location, scbstate->yyscanner);
1151 24 : }
1152 :
1153 : /*
1154 : * setup_scanner_errposition_callback
1155 : * Arrange for non-scanner errors to report an error position
1156 : *
1157 : * Sometimes the scanner calls functions that aren't part of the scanner
1158 : * subsystem and can't reasonably be passed the yyscanner pointer; yet
1159 : * we would like any errors thrown in those functions to be tagged with an
1160 : * error location. Use this function to set up an error context stack
1161 : * entry that will accomplish that. Usage pattern:
1162 : *
1163 : * declare a local variable "ScannerCallbackState scbstate"
1164 : * ...
1165 : * setup_scanner_errposition_callback(&scbstate, yyscanner, location);
1166 : * call function that might throw error;
1167 : * cancel_scanner_errposition_callback(&scbstate);
1168 : */
1169 : void
1170 622 : setup_scanner_errposition_callback(ScannerCallbackState *scbstate,
1171 : core_yyscan_t yyscanner,
1172 : int location)
1173 : {
1174 : /* Setup error traceback support for ereport() */
1175 622 : scbstate->yyscanner = yyscanner;
1176 622 : scbstate->location = location;
1177 622 : scbstate->errcallback.callback = scb_error_callback;
1178 622 : scbstate->errcallback.arg = scbstate;
1179 622 : scbstate->errcallback.previous = error_context_stack;
1180 622 : error_context_stack = &scbstate->errcallback;
1181 622 : }
1182 :
1183 : /*
1184 : * Cancel a previously-set-up errposition callback.
1185 : */
1186 : void
1187 598 : cancel_scanner_errposition_callback(ScannerCallbackState *scbstate)
1188 : {
1189 : /* Pop the error context stack */
1190 598 : error_context_stack = scbstate->errcallback.previous;
1191 598 : }
1192 :
1193 : /*
1194 : * scanner_yyerror
1195 : * Report a lexer or grammar error.
1196 : *
1197 : * The message's cursor position is whatever YYLLOC was last set to,
1198 : * ie, the start of the current token if called within yylex(), or the
1199 : * most recently lexed token if called from the grammar.
1200 : * This is OK for syntax error messages from the Bison parser, because Bison
1201 : * parsers report error as soon as the first unparsable token is reached.
1202 : * Beware of using yyerror for other purposes, as the cursor position might
1203 : * be misleading!
1204 : */
1205 : void
1206 613 : scanner_yyerror(const char *message, core_yyscan_t yyscanner)
1207 : {
1208 613 : const char *loc = yyextra->scanbuf + *yylloc;
1209 :
1210 613 : if (*loc == YY_END_OF_BUFFER_CHAR)
1211 : {
1212 12 : ereport(ERROR,
1213 : (errcode(ERRCODE_SYNTAX_ERROR),
1214 : /* translator: %s is typically the translation of "syntax error" */
1215 : errmsg("%s at end of input", _(message)),
1216 : lexer_errposition()));
1217 : }
1218 : else
1219 : {
1220 601 : ereport(ERROR,
1221 : (errcode(ERRCODE_SYNTAX_ERROR),
1222 : /* translator: first %s is typically the translation of "syntax error" */
1223 : errmsg("%s at or near \"%s\"", _(message), loc),
1224 : lexer_errposition()));
1225 : }
1226 : }
1227 :
1228 :
1229 : /*
1230 : * Called before any actual parsing is done
1231 : */
1232 : core_yyscan_t
1233 505956 : scanner_init(const char *str,
1234 : core_yy_extra_type *yyext,
1235 : const ScanKeywordList *keywordlist,
1236 : const uint16 *keyword_tokens)
1237 : {
1238 505956 : Size slen = strlen(str);
1239 : yyscan_t scanner;
1240 :
1241 505956 : if (yylex_init(&scanner) != 0)
1242 0 : elog(ERROR, "yylex_init() failed: %m");
1243 :
1244 505956 : core_yyset_extra(yyext, scanner);
1245 :
1246 505956 : yyext->keywordlist = keywordlist;
1247 505956 : yyext->keyword_tokens = keyword_tokens;
1248 :
1249 505956 : yyext->backslash_quote = backslash_quote;
1250 :
1251 : /*
1252 : * Make a scan buffer with special termination needed by flex.
1253 : */
1254 505956 : yyext->scanbuf = (char *) palloc(slen + 2);
1255 505956 : yyext->scanbuflen = slen;
1256 505956 : memcpy(yyext->scanbuf, str, slen);
1257 505956 : yyext->scanbuf[slen] = yyext->scanbuf[slen + 1] = YY_END_OF_BUFFER_CHAR;
1258 505956 : yy_scan_buffer(yyext->scanbuf, slen + 2, scanner);
1259 :
1260 : /* initialize literal buffer to a reasonable but expansible size */
1261 505956 : yyext->literalalloc = 1024;
1262 505956 : yyext->literalbuf = (char *) palloc(yyext->literalalloc);
1263 505956 : yyext->literallen = 0;
1264 :
1265 505956 : return scanner;
1266 : }
1267 :
1268 :
1269 : /*
1270 : * Called after parsing is done to clean up after scanner_init()
1271 : */
1272 : void
1273 505042 : scanner_finish(core_yyscan_t yyscanner)
1274 : {
1275 : /*
1276 : * We don't bother to call yylex_destroy(), because all it would do is
1277 : * pfree a small amount of control storage. It's cheaper to leak the
1278 : * storage until the parsing context is destroyed. The amount of space
1279 : * involved is usually negligible compared to the output parse tree
1280 : * anyway.
1281 : *
1282 : * We do bother to pfree the scanbuf and literal buffer, but only if they
1283 : * represent a nontrivial amount of space. The 8K cutoff is arbitrary.
1284 : */
1285 505042 : if (yyextra->scanbuflen >= 8192)
1286 60 : pfree(yyextra->scanbuf);
1287 505042 : if (yyextra->literalalloc >= 8192)
1288 43 : pfree(yyextra->literalbuf);
1289 505042 : }
1290 :
1291 :
1292 : static void
1293 585939 : addlit(char *ytext, int yleng, core_yyscan_t yyscanner)
1294 : {
1295 : /* enlarge buffer if needed */
1296 585939 : if ((yyextra->literallen + yleng) >= yyextra->literalalloc)
1297 : {
1298 168 : yyextra->literalalloc = pg_nextpower2_32(yyextra->literallen + yleng + 1);
1299 168 : yyextra->literalbuf = (char *) repalloc(yyextra->literalbuf,
1300 168 : yyextra->literalalloc);
1301 : }
1302 : /* append new data */
1303 585939 : memcpy(yyextra->literalbuf + yyextra->literallen, ytext, yleng);
1304 585939 : yyextra->literallen += yleng;
1305 585939 : }
1306 :
1307 :
1308 : static void
1309 11947 : addlitchar(unsigned char ychar, core_yyscan_t yyscanner)
1310 : {
1311 : /* enlarge buffer if needed */
1312 11947 : if ((yyextra->literallen + 1) >= yyextra->literalalloc)
1313 : {
1314 0 : yyextra->literalalloc *= 2;
1315 0 : yyextra->literalbuf = (char *) repalloc(yyextra->literalbuf,
1316 0 : yyextra->literalalloc);
1317 : }
1318 : /* append new data */
1319 11947 : yyextra->literalbuf[yyextra->literallen] = ychar;
1320 11947 : yyextra->literallen += 1;
1321 11947 : }
1322 :
1323 :
1324 : /*
1325 : * Create a palloc'd copy of literalbuf, adding a trailing null.
1326 : */
1327 : static char *
1328 591002 : litbufdup(core_yyscan_t yyscanner)
1329 : {
1330 591002 : int llen = yyextra->literallen;
1331 : char *new;
1332 :
1333 591002 : new = palloc(llen + 1);
1334 591002 : memcpy(new, yyextra->literalbuf, llen);
1335 591002 : new[llen] = '\0';
1336 591002 : return new;
1337 : }
1338 :
1339 : /*
1340 : * Process {decinteger}, {hexinteger}, etc. Note this will also do the right
1341 : * thing with {numeric}, ie digits and a decimal point.
1342 : */
1343 : static int
1344 294166 : process_integer_literal(const char *token, YYSTYPE *lval, int base)
1345 : {
1346 294166 : ErrorSaveContext escontext = {T_ErrorSaveContext};
1347 : int32 val;
1348 :
1349 294166 : val = pg_strtoint32_safe(token, (Node *) &escontext);
1350 294166 : if (escontext.error_occurred)
1351 : {
1352 : /* integer too large (or contains decimal pt), treat it as a float */
1353 1068 : lval->str = pstrdup(token);
1354 1068 : return FCONST;
1355 : }
1356 293098 : lval->ival = val;
1357 293098 : return ICONST;
1358 : }
1359 :
1360 : static void
1361 96 : addunicode(char32_t c, core_yyscan_t yyscanner)
1362 : {
1363 : ScannerCallbackState scbstate;
1364 : char buf[MAX_UNICODE_EQUIVALENT_STRING + 1];
1365 :
1366 96 : if (!is_valid_unicode_codepoint(c))
1367 4 : yyerror("invalid Unicode escape value");
1368 :
1369 : /*
1370 : * We expect that pg_unicode_to_server() will complain about any
1371 : * unconvertible code point, so we don't have to set saw_non_ascii.
1372 : */
1373 92 : setup_scanner_errposition_callback(&scbstate, yyscanner, *(yylloc));
1374 92 : pg_unicode_to_server(c, (unsigned char *) buf);
1375 92 : cancel_scanner_errposition_callback(&scbstate);
1376 92 : addlit(buf, strlen(buf), yyscanner);
1377 92 : }
1378 :
1379 : static unsigned char
1380 1885 : unescape_single_char(unsigned char c, core_yyscan_t yyscanner)
1381 : {
1382 1885 : switch (c)
1383 : {
1384 18 : case 'b':
1385 18 : return '\b';
1386 6 : case 'f':
1387 6 : return '\f';
1388 899 : case 'n':
1389 899 : return '\n';
1390 46 : case 'r':
1391 46 : return '\r';
1392 21 : case 't':
1393 21 : return '\t';
1394 0 : case 'v':
1395 0 : return '\v';
1396 895 : default:
1397 : /* check for backslash followed by non-7-bit-ASCII */
1398 895 : if (c == '\0' || IS_HIGHBIT_SET(c))
1399 0 : yyextra->saw_non_ascii = true;
1400 :
1401 895 : return c;
1402 : }
1403 : }
1404 :
1405 : /*
1406 : * Interface functions to make flex use palloc() instead of malloc().
1407 : * It'd be better to make these static, but flex insists otherwise.
1408 : */
1409 :
1410 : void *
1411 1517868 : core_yyalloc(yy_size_t bytes, core_yyscan_t yyscanner)
1412 : {
1413 1517868 : return palloc(bytes);
1414 : }
1415 :
1416 : void *
1417 0 : core_yyrealloc(void *ptr, yy_size_t bytes, core_yyscan_t yyscanner)
1418 : {
1419 0 : if (ptr)
1420 0 : return repalloc(ptr, bytes);
1421 : else
1422 0 : return palloc(bytes);
1423 : }
1424 :
1425 : void
1426 0 : core_yyfree(void *ptr, core_yyscan_t yyscanner)
1427 : {
1428 0 : if (ptr)
1429 0 : pfree(ptr);
1430 0 : }
|