Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * tsvector_parser.c
4 : * Parser for tsvector
5 : *
6 : * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
7 : *
8 : *
9 : * IDENTIFICATION
10 : * src/backend/utils/adt/tsvector_parser.c
11 : *
12 : *-------------------------------------------------------------------------
13 : */
14 :
15 : #include "postgres.h"
16 :
17 : #include "tsearch/ts_locale.h"
18 : #include "tsearch/ts_utils.h"
19 :
20 :
21 : /*
22 : * Private state of tsvector parser. Note that tsquery also uses this code to
23 : * parse its input, hence the boolean flags. The oprisdelim and is_tsquery
24 : * flags are both true or both false in current usage, but we keep them
25 : * separate for clarity.
26 : *
27 : * If oprisdelim is set, the following characters are treated as delimiters
28 : * (in addition to whitespace): ! | & ( )
29 : *
30 : * is_tsquery affects *only* the content of error messages.
31 : *
32 : * is_web can be true to further modify tsquery parsing.
33 : *
34 : * If escontext is an ErrorSaveContext node, then soft errors can be
35 : * captured there rather than being thrown.
36 : */
37 : struct TSVectorParseStateData
38 : {
39 : char *prsbuf; /* next input character */
40 : char *bufstart; /* whole string (used only for errors) */
41 : char *word; /* buffer to hold the current word */
42 : int len; /* size in bytes allocated for 'word' */
43 : int eml; /* max bytes per character */
44 : bool oprisdelim; /* treat ! | * ( ) as delimiters? */
45 : bool is_tsquery; /* say "tsquery" not "tsvector" in errors? */
46 : bool is_web; /* we're in websearch_to_tsquery() */
47 : Node *escontext; /* for soft error reporting */
48 : };
49 :
50 :
51 : /*
52 : * Initializes a parser state object for the given input string.
53 : * A bitmask of flags (see ts_utils.h) and an error context object
54 : * can be provided as well.
55 : */
56 : TSVectorParseState
57 5269 : init_tsvector_parser(char *input, int flags, Node *escontext)
58 : {
59 : TSVectorParseState state;
60 :
61 5269 : state = palloc_object(struct TSVectorParseStateData);
62 5269 : state->prsbuf = input;
63 5269 : state->bufstart = input;
64 5269 : state->len = 32;
65 5269 : state->word = (char *) palloc(state->len);
66 5269 : state->eml = pg_database_encoding_max_length();
67 5269 : state->oprisdelim = (flags & P_TSV_OPR_IS_DELIM) != 0;
68 5269 : state->is_tsquery = (flags & P_TSV_IS_TSQUERY) != 0;
69 5269 : state->is_web = (flags & P_TSV_IS_WEB) != 0;
70 5269 : state->escontext = escontext;
71 :
72 5269 : return state;
73 : }
74 :
75 : /*
76 : * Reinitializes parser to parse 'input', instead of previous input.
77 : *
78 : * Note that bufstart (the string reported in errors) is not changed.
79 : */
80 : void
81 5833 : reset_tsvector_parser(TSVectorParseState state, char *input)
82 : {
83 5833 : state->prsbuf = input;
84 5833 : }
85 :
86 : /*
87 : * Shuts down a tsvector parser.
88 : */
89 : void
90 5265 : close_tsvector_parser(TSVectorParseState state)
91 : {
92 5265 : pfree(state->word);
93 5265 : pfree(state);
94 5265 : }
95 :
/*
 * Ensure 'word' can take one more character (at most state->eml bytes) with
 * a byte to spare for the terminator, doubling the buffer when it cannot.
 *
 * Expects 'state' and 'curpos' in the caller's scope; curpos is re-based
 * onto the new buffer if it gets reallocated.
 */
#define RESIZEPRSBUF \
do { \
	int			used_ = curpos - state->word; \
	\
	if (state->len <= used_ + state->eml) \
	{ \
		state->len *= 2; \
		state->word = (char *) repalloc(state->word, state->len); \
		curpos = state->word + used_; \
	} \
} while (0)
107 :
/*
 * Hands the finished token back to gettoken_tsvector's caller and returns
 * true from the enclosing function.
 *
 * The position array is passed on (together with its length) when the caller
 * supplied pos_ptr; otherwise it is freed here.  The remaining output
 * parameters are each filled in only if non-NULL.
 */
#define RETURN_TOKEN \
do { \
	if (pos_ptr == NULL) \
	{ \
		if (pos != NULL) \
			pfree(pos); \
	} \
	else \
	{ \
		*pos_ptr = pos; \
		*poslen = npos; \
	} \
	\
	if (strval != NULL) \
		*strval = state->word; \
	if (lenval != NULL) \
		*lenval = curpos - state->word; \
	if (endptr != NULL) \
		*endptr = state->prsbuf; \
	return true; \
} while (0)
127 :
128 :
/* State codes used in gettoken_tsvector */
enum
{
	WAITWORD = 1,				/* skipping whitespace before a token */
	WAITENDWORD = 2,			/* inside an unquoted word */
	WAITNEXTCHAR = 3,			/* after a backslash: copy next char as is */
	WAITENDCMPLX = 4,			/* inside a quoted word */
	WAITPOSINFO = 5,			/* quoted word closed: ':' or end of token */
	INPOSINFO = 6,				/* expecting a position number */
	WAITPOSDELIM = 7,			/* after a position: ',', weight, or end */
	WAITCHARCMPLX = 8			/* saw ' in quoted word: doubled or closing? */
};
138 :
139 : #define PRSSYNTAXERROR return prssyntaxerror(state)
140 :
141 : static bool
142 12 : prssyntaxerror(TSVectorParseState state)
143 : {
144 12 : errsave(state->escontext,
145 : (errcode(ERRCODE_SYNTAX_ERROR),
146 : state->is_tsquery ?
147 : errmsg("syntax error in tsquery: \"%s\"", state->bufstart) :
148 : errmsg("syntax error in tsvector: \"%s\"", state->bufstart)));
149 : /* In soft error situation, return false as convenience for caller */
150 8 : return false;
151 : }
152 :
153 :
154 : /*
155 : * Get next token from string being parsed. Returns true if successful,
156 : * false if end of input string is reached or soft error.
157 : *
158 : * On success, these output parameters are filled in:
159 : *
160 : * *strval pointer to token
161 : * *lenval length of *strval
162 : * *pos_ptr pointer to a palloc'd array of positions and weights
163 : * associated with the token. If the caller is not interested
164 : * in the information, NULL can be supplied. Otherwise
165 : * the caller is responsible for pfreeing the array.
166 : * *poslen number of elements in *pos_ptr
167 : * *endptr scan resumption point
168 : *
169 : * Pass NULL for any unwanted output parameters.
170 : *
171 : * If state->escontext is an ErrorSaveContext, then caller must check
172 : * SOFT_ERROR_OCCURRED() to determine whether a "false" result means
173 : * error or normal end-of-string.
174 : */
175 : bool
176 128899 : gettoken_tsvector(TSVectorParseState state,
177 : char **strval, int *lenval,
178 : WordEntryPos **pos_ptr, int *poslen,
179 : char **endptr)
180 : {
181 128899 : int oldstate = 0;
182 128899 : char *curpos = state->word;
183 128899 : int statecode = WAITWORD;
184 :
185 : /*
186 : * pos is for collecting the comma delimited list of positions followed by
187 : * the actual token.
188 : */
189 128899 : WordEntryPos *pos = NULL;
190 128899 : int npos = 0; /* elements of pos used */
191 128899 : int posalen = 0; /* allocated size of pos */
192 :
193 : while (1)
194 : {
195 525253 : if (statecode == WAITWORD)
196 : {
197 247153 : if (*(state->prsbuf) == '\0')
198 2520 : return false;
199 244633 : else if (!state->is_web && t_iseq(state->prsbuf, '\''))
200 115 : statecode = WAITENDCMPLX;
201 244518 : else if (!state->is_web && t_iseq(state->prsbuf, '\\'))
202 : {
203 4 : statecode = WAITNEXTCHAR;
204 4 : oldstate = WAITENDWORD;
205 : }
206 244514 : else if ((state->oprisdelim && ISOPERATOR(state->prsbuf)) ||
207 244514 : (state->is_web && t_iseq(state->prsbuf, '"')))
208 0 : PRSSYNTAXERROR;
209 244514 : else if (!isspace((unsigned char) *state->prsbuf))
210 : {
211 126260 : curpos += ts_copychar_cstr(curpos, state->prsbuf);
212 126260 : statecode = WAITENDWORD;
213 : }
214 : }
215 278100 : else if (statecode == WAITNEXTCHAR)
216 : {
217 113 : if (*(state->prsbuf) == '\0')
218 0 : ereturn(state->escontext, false,
219 : (errcode(ERRCODE_SYNTAX_ERROR),
220 : errmsg("there is no escaped character: \"%s\"",
221 : state->bufstart)));
222 : else
223 : {
224 113 : RESIZEPRSBUF;
225 113 : curpos += ts_copychar_cstr(curpos, state->prsbuf);
226 : Assert(oldstate != 0);
227 113 : statecode = oldstate;
228 : }
229 : }
230 277987 : else if (statecode == WAITENDWORD)
231 : {
232 256838 : if (!state->is_web && t_iseq(state->prsbuf, '\\'))
233 : {
234 48 : statecode = WAITNEXTCHAR;
235 48 : oldstate = WAITENDWORD;
236 : }
237 256790 : else if (isspace((unsigned char) *state->prsbuf) || *(state->prsbuf) == '\0' ||
238 138909 : (state->oprisdelim && ISOPERATOR(state->prsbuf)) ||
239 137633 : (state->is_web && t_iseq(state->prsbuf, '"')))
240 : {
241 119167 : RESIZEPRSBUF;
242 119167 : if (curpos == state->word)
243 0 : PRSSYNTAXERROR;
244 119167 : *(curpos) = '\0';
245 119167 : RETURN_TOKEN;
246 : }
247 137623 : else if (t_iseq(state->prsbuf, ':'))
248 : {
249 7097 : if (curpos == state->word)
250 0 : PRSSYNTAXERROR;
251 7097 : *(curpos) = '\0';
252 7097 : if (state->oprisdelim)
253 485 : RETURN_TOKEN;
254 : else
255 6612 : statecode = INPOSINFO;
256 : }
257 : else
258 : {
259 130526 : RESIZEPRSBUF;
260 130526 : curpos += ts_copychar_cstr(curpos, state->prsbuf);
261 : }
262 : }
263 21149 : else if (statecode == WAITENDCMPLX)
264 : {
265 669 : if (!state->is_web && t_iseq(state->prsbuf, '\''))
266 : {
267 115 : statecode = WAITCHARCMPLX;
268 : }
269 554 : else if (!state->is_web && t_iseq(state->prsbuf, '\\'))
270 : {
271 61 : statecode = WAITNEXTCHAR;
272 61 : oldstate = WAITENDCMPLX;
273 : }
274 493 : else if (*(state->prsbuf) == '\0')
275 0 : PRSSYNTAXERROR;
276 : else
277 : {
278 493 : RESIZEPRSBUF;
279 493 : curpos += ts_copychar_cstr(curpos, state->prsbuf);
280 : }
281 : }
282 20480 : else if (statecode == WAITCHARCMPLX)
283 : {
284 115 : if (!state->is_web && t_iseq(state->prsbuf, '\''))
285 : {
286 0 : RESIZEPRSBUF;
287 0 : curpos += ts_copychar_cstr(curpos, state->prsbuf);
288 0 : statecode = WAITENDCMPLX;
289 : }
290 : else
291 : {
292 115 : RESIZEPRSBUF;
293 115 : *(curpos) = '\0';
294 115 : if (curpos == state->word)
295 12 : PRSSYNTAXERROR;
296 103 : if (state->oprisdelim)
297 : {
298 : /* state->prsbuf+=pg_mblen_cstr(state->prsbuf); */
299 46 : RETURN_TOKEN;
300 : }
301 : else
302 57 : statecode = WAITPOSINFO;
303 57 : continue; /* recheck current character */
304 : }
305 : }
306 20365 : else if (statecode == WAITPOSINFO)
307 : {
308 57 : if (t_iseq(state->prsbuf, ':'))
309 0 : statecode = INPOSINFO;
310 : else
311 57 : RETURN_TOKEN;
312 : }
313 20308 : else if (statecode == INPOSINFO)
314 : {
315 7016 : if (isdigit((unsigned char) *state->prsbuf))
316 : {
317 7016 : if (posalen == 0)
318 : {
319 6612 : posalen = 4;
320 6612 : pos = palloc_array(WordEntryPos, posalen);
321 6612 : npos = 0;
322 : }
323 404 : else if (npos + 1 >= posalen)
324 : {
325 76 : posalen *= 2;
326 76 : pos = repalloc_array(pos, WordEntryPos, posalen);
327 : }
328 7016 : npos++;
329 7016 : WEP_SETPOS(pos[npos - 1], LIMITPOS(atoi(state->prsbuf)));
330 : /* we cannot get here in tsquery, so no need for 2 errmsgs */
331 7016 : if (WEP_GETPOS(pos[npos - 1]) == 0)
332 0 : ereturn(state->escontext, false,
333 : (errcode(ERRCODE_SYNTAX_ERROR),
334 : errmsg("wrong position info in tsvector: \"%s\"",
335 : state->bufstart)));
336 7016 : WEP_SETWEIGHT(pos[npos - 1], 0);
337 7016 : statecode = WAITPOSDELIM;
338 : }
339 : else
340 0 : PRSSYNTAXERROR;
341 : }
342 13292 : else if (statecode == WAITPOSDELIM)
343 : {
344 13292 : if (t_iseq(state->prsbuf, ','))
345 404 : statecode = INPOSINFO;
346 12888 : else if (t_iseq(state->prsbuf, 'a') || t_iseq(state->prsbuf, 'A') || t_iseq(state->prsbuf, '*'))
347 : {
348 280 : if (WEP_GETWEIGHT(pos[npos - 1]))
349 0 : PRSSYNTAXERROR;
350 280 : WEP_SETWEIGHT(pos[npos - 1], 3);
351 : }
352 12608 : else if (t_iseq(state->prsbuf, 'b') || t_iseq(state->prsbuf, 'B'))
353 : {
354 144 : if (WEP_GETWEIGHT(pos[npos - 1]))
355 0 : PRSSYNTAXERROR;
356 144 : WEP_SETWEIGHT(pos[npos - 1], 2);
357 : }
358 12464 : else if (t_iseq(state->prsbuf, 'c') || t_iseq(state->prsbuf, 'C'))
359 : {
360 184 : if (WEP_GETWEIGHT(pos[npos - 1]))
361 0 : PRSSYNTAXERROR;
362 184 : WEP_SETWEIGHT(pos[npos - 1], 1);
363 : }
364 12280 : else if (t_iseq(state->prsbuf, 'd') || t_iseq(state->prsbuf, 'D'))
365 : {
366 88 : if (WEP_GETWEIGHT(pos[npos - 1]))
367 0 : PRSSYNTAXERROR;
368 88 : WEP_SETWEIGHT(pos[npos - 1], 0);
369 : }
370 12192 : else if (isspace((unsigned char) *state->prsbuf) ||
371 5868 : *(state->prsbuf) == '\0')
372 6612 : RETURN_TOKEN;
373 5580 : else if (!isdigit((unsigned char) *state->prsbuf))
374 0 : PRSSYNTAXERROR;
375 : }
376 : else /* internal error */
377 0 : elog(ERROR, "unrecognized state in gettoken_tsvector: %d",
378 : statecode);
379 :
380 : /* get next char */
381 396297 : state->prsbuf += pg_mblen_cstr(state->prsbuf);
382 : }
383 : }
|