Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * tsvector_parser.c
4 : * Parser for tsvector
5 : *
6 : * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
7 : *
8 : *
9 : * IDENTIFICATION
10 : * src/backend/utils/adt/tsvector_parser.c
11 : *
12 : *-------------------------------------------------------------------------
13 : */
14 :
15 : #include "postgres.h"
16 :
17 : #include "tsearch/ts_locale.h"
18 : #include "tsearch/ts_utils.h"
19 :
20 :
21 : /*
22 : * Private state of tsvector parser. Note that tsquery also uses this code to
23 : * parse its input, hence the boolean flags. The oprisdelim and is_tsquery
24 : * flags are both true or both false in current usage, but we keep them
25 : * separate for clarity.
26 : *
27 : * If oprisdelim is set, the following characters are treated as delimiters
28 : * (in addition to whitespace): ! | & ( )
29 : *
30 : * is_tsquery affects *only* the content of error messages.
31 : *
32 : * is_web can be true to further modify tsquery parsing.
33 : *
34 : * If escontext is an ErrorSaveContext node, then soft errors can be
35 : * captured there rather than being thrown.
36 : */
37 : struct TSVectorParseStateData
38 : {
39 : char *prsbuf; /* scan position: next input character to examine */
40 : char *bufstart; /* start of the whole input string (used only in error messages) */
41 : char *word; /* palloc'd buffer holding the token currently being assembled */
42 : int len; /* size in bytes allocated for 'word' */
43 : int eml; /* max bytes per character in the database encoding */
44 : bool oprisdelim; /* treat ! | & ( ) as delimiters? */
45 : bool is_tsquery; /* say "tsquery" not "tsvector" in errors? */
46 : bool is_web; /* we're in websearch_to_tsquery() */
47 : Node *escontext; /* for soft error reporting */
48 : };
49 :
50 :
51 : /*
52 : * Initializes a parser state object for the given input string.
53 : * A bitmask of flags (see ts_utils.h) and an error context object
54 : * can be provided as well.
55 : */
56 : TSVectorParseState
57 7638 : init_tsvector_parser(char *input, int flags, Node *escontext)
58 : {
59 : TSVectorParseState state;
60 :
61 7638 : state = palloc_object(struct TSVectorParseStateData);
62 7638 : state->prsbuf = input;
63 7638 : state->bufstart = input;
64 7638 : state->len = 32;
65 7638 : state->word = (char *) palloc(state->len);
66 7638 : state->eml = pg_database_encoding_max_length();
67 7638 : state->oprisdelim = (flags & P_TSV_OPR_IS_DELIM) != 0;
68 7638 : state->is_tsquery = (flags & P_TSV_IS_TSQUERY) != 0;
69 7638 : state->is_web = (flags & P_TSV_IS_WEB) != 0;
70 7638 : state->escontext = escontext;
71 :
72 7638 : return state;
73 : }
74 :
75 : /*
76 : * Reinitializes parser to parse 'input', instead of previous input.
77 : *
78 : * Note that bufstart (the string reported in errors) is not changed.
79 : */
80 : void
81 8136 : reset_tsvector_parser(TSVectorParseState state, char *input)
82 : {
83 8136 : state->prsbuf = input;
84 8136 : }
85 :
86 : /*
87 : * Shuts down a tsvector parser.
88 : */
89 : void
90 7632 : close_tsvector_parser(TSVectorParseState state)
91 : {
92 7632 : pfree(state->word);
93 7632 : pfree(state);
94 7632 : }
95 :
/*
 * Make sure 'word' has room for one more (possibly multibyte) character.
 *
 * Expects 'state' and 'curpos' to be in scope at the point of use.  If
 * fewer than eml + 1 bytes remain after curpos, the buffer is doubled and
 * curpos is re-based onto the reallocated buffer.
 */
#define RESIZEPRSBUF \
do { \
	int			used = curpos - state->word; \
	if (state->len <= used + state->eml) \
	{ \
		state->len *= 2; \
		state->word = (char *) repalloc(state->word, state->len); \
		curpos = state->word + used; \
	} \
} while (0)
107 :
/*
 * Stores the finished token into gettoken_tsvector's output parameters and
 * returns true from the enclosing function.
 *
 * The position array is handed over through pos_ptr/poslen when the caller
 * asked for it; otherwise any array that was collected is freed here.
 */
#define RETURN_TOKEN \
do { \
	if (pos_ptr == NULL) \
	{ \
		if (pos != NULL) \
			pfree(pos); \
	} \
	else \
	{ \
		*pos_ptr = pos; \
		*poslen = npos; \
	} \
	\
	if (strval != NULL) \
		*strval = state->word; \
	if (lenval != NULL) \
		*lenval = curpos - state->word; \
	if (endptr != NULL) \
		*endptr = state->prsbuf; \
	return true; \
} while(0)
127 :
128 :
/* State codes of the gettoken_tsvector state machine */
enum
{
	WAITWORD = 1,
	WAITENDWORD = 2,
	WAITNEXTCHAR = 3,
	WAITENDCMPLX = 4,
	WAITPOSINFO = 5,
	INPOSINFO = 6,
	WAITPOSDELIM = 7,
	WAITCHARCMPLX = 8
};
138 :
139 : #define PRSSYNTAXERROR return prssyntaxerror(state)
140 :
141 : static bool
142 18 : prssyntaxerror(TSVectorParseState state)
143 : {
144 18 : errsave(state->escontext,
145 : (errcode(ERRCODE_SYNTAX_ERROR),
146 : state->is_tsquery ?
147 : errmsg("syntax error in tsquery: \"%s\"", state->bufstart) :
148 : errmsg("syntax error in tsvector: \"%s\"", state->bufstart)));
149 : /* In soft error situation, return false as convenience for caller */
150 12 : return false;
151 : }
152 :
153 :
154 : /*
155 : * Get next token from string being parsed. Returns true if successful,
156 : * false if end of input string is reached or soft error.
157 : *
158 : * On success, these output parameters are filled in:
159 : *
160 : * *strval pointer to token
161 : * *lenval length of *strval
162 : * *pos_ptr pointer to a palloc'd array of positions and weights
163 : * associated with the token. If the caller is not interested
164 : * in the information, NULL can be supplied. Otherwise
165 : * the caller is responsible for pfreeing the array.
166 : * *poslen number of elements in *pos_ptr
167 : * *endptr scan resumption point
168 : *
169 : * Pass NULL for any unwanted output parameters.
170 : *
171 : * If state->escontext is an ErrorSaveContext, then caller must check
172 : * SOFT_ERROR_OCCURRED() to determine whether a "false" result means
173 : * error or normal end-of-string.
174 : */
175 : bool
176 192736 : gettoken_tsvector(TSVectorParseState state,
177 : char **strval, int *lenval,
178 : WordEntryPos **pos_ptr, int *poslen,
179 : char **endptr)
180 : {
181 192736 : int oldstate = 0;
182 192736 : char *curpos = state->word;
183 192736 : int statecode = WAITWORD;
184 :
185 : /*
186 : * pos is for collecting the comma delimited list of positions followed by
187 : * the actual token.
188 : */
189 192736 : WordEntryPos *pos = NULL;
190 192736 : int npos = 0; /* elements of pos used */
191 192736 : int posalen = 0; /* allocated size of pos */
192 :
193 : while (1)
194 : {
195 785238 : if (statecode == WAITWORD)
196 : {
197 370116 : if (*(state->prsbuf) == '\0')
198 3778 : return false;
199 366338 : else if (!state->is_web && t_iseq(state->prsbuf, '\''))
200 162 : statecode = WAITENDCMPLX;
201 366176 : else if (!state->is_web && t_iseq(state->prsbuf, '\\'))
202 : {
203 6 : statecode = WAITNEXTCHAR;
204 6 : oldstate = WAITENDWORD;
205 : }
206 366170 : else if ((state->oprisdelim && ISOPERATOR(state->prsbuf)) ||
207 366170 : (state->is_web && t_iseq(state->prsbuf, '"')))
208 0 : PRSSYNTAXERROR;
209 366170 : else if (!isspace((unsigned char) *state->prsbuf))
210 : {
211 188790 : curpos += ts_copychar_cstr(curpos, state->prsbuf);
212 188790 : statecode = WAITENDWORD;
213 : }
214 : }
215 415122 : else if (statecode == WAITNEXTCHAR)
216 : {
217 162 : if (*(state->prsbuf) == '\0')
218 0 : ereturn(state->escontext, false,
219 : (errcode(ERRCODE_SYNTAX_ERROR),
220 : errmsg("there is no escaped character: \"%s\"",
221 : state->bufstart)));
222 : else
223 : {
224 162 : RESIZEPRSBUF;
225 162 : curpos += ts_copychar_cstr(curpos, state->prsbuf);
226 : Assert(oldstate != 0);
227 162 : statecode = oldstate;
228 : }
229 : }
230 414960 : else if (statecode == WAITENDWORD)
231 : {
232 383334 : if (!state->is_web && t_iseq(state->prsbuf, '\\'))
233 : {
234 72 : statecode = WAITNEXTCHAR;
235 72 : oldstate = WAITENDWORD;
236 : }
237 383262 : else if (isspace((unsigned char) *state->prsbuf) || *(state->prsbuf) == '\0' ||
238 206880 : (state->oprisdelim && ISOPERATOR(state->prsbuf)) ||
239 205092 : (state->is_web && t_iseq(state->prsbuf, '"')))
240 : {
241 178182 : RESIZEPRSBUF;
242 178182 : if (curpos == state->word)
243 0 : PRSSYNTAXERROR;
244 178182 : *(curpos) = '\0';
245 178182 : RETURN_TOKEN;
246 : }
247 205080 : else if (t_iseq(state->prsbuf, ':'))
248 : {
249 10614 : if (curpos == state->word)
250 0 : PRSSYNTAXERROR;
251 10614 : *(curpos) = '\0';
252 10614 : if (state->oprisdelim)
253 696 : RETURN_TOKEN;
254 : else
255 9918 : statecode = INPOSINFO;
256 : }
257 : else
258 : {
259 194466 : RESIZEPRSBUF;
260 194466 : curpos += ts_copychar_cstr(curpos, state->prsbuf);
261 : }
262 : }
263 31626 : else if (statecode == WAITENDCMPLX)
264 : {
265 924 : if (!state->is_web && t_iseq(state->prsbuf, '\''))
266 : {
267 162 : statecode = WAITCHARCMPLX;
268 : }
269 762 : else if (!state->is_web && t_iseq(state->prsbuf, '\\'))
270 : {
271 84 : statecode = WAITNEXTCHAR;
272 84 : oldstate = WAITENDCMPLX;
273 : }
274 678 : else if (*(state->prsbuf) == '\0')
275 0 : PRSSYNTAXERROR;
276 : else
277 : {
278 678 : RESIZEPRSBUF;
279 678 : curpos += ts_copychar_cstr(curpos, state->prsbuf);
280 : }
281 : }
282 30702 : else if (statecode == WAITCHARCMPLX)
283 : {
284 162 : if (!state->is_web && t_iseq(state->prsbuf, '\''))
285 : {
286 0 : RESIZEPRSBUF;
287 0 : curpos += ts_copychar_cstr(curpos, state->prsbuf);
288 0 : statecode = WAITENDCMPLX;
289 : }
290 : else
291 : {
292 162 : RESIZEPRSBUF;
293 162 : *(curpos) = '\0';
294 162 : if (curpos == state->word)
295 18 : PRSSYNTAXERROR;
296 144 : if (state->oprisdelim)
297 : {
298 : /* state->prsbuf+=pg_mblen_cstr(state->prsbuf); */
299 66 : RETURN_TOKEN;
300 : }
301 : else
302 78 : statecode = WAITPOSINFO;
303 78 : continue; /* recheck current character */
304 : }
305 : }
306 30540 : else if (statecode == WAITPOSINFO)
307 : {
308 78 : if (t_iseq(state->prsbuf, ':'))
309 0 : statecode = INPOSINFO;
310 : else
311 78 : RETURN_TOKEN;
312 : }
313 30462 : else if (statecode == INPOSINFO)
314 : {
315 10524 : if (isdigit((unsigned char) *state->prsbuf))
316 : {
317 10524 : if (posalen == 0)
318 : {
319 9918 : posalen = 4;
320 9918 : pos = palloc_array(WordEntryPos, posalen);
321 9918 : npos = 0;
322 : }
323 606 : else if (npos + 1 >= posalen)
324 : {
325 114 : posalen *= 2;
326 114 : pos = repalloc_array(pos, WordEntryPos, posalen);
327 : }
328 10524 : npos++;
329 10524 : WEP_SETPOS(pos[npos - 1], LIMITPOS(atoi(state->prsbuf)));
330 : /* we cannot get here in tsquery, so no need for 2 errmsgs */
331 10524 : if (WEP_GETPOS(pos[npos - 1]) == 0)
332 0 : ereturn(state->escontext, false,
333 : (errcode(ERRCODE_SYNTAX_ERROR),
334 : errmsg("wrong position info in tsvector: \"%s\"",
335 : state->bufstart)));
336 10524 : WEP_SETWEIGHT(pos[npos - 1], 0);
337 10524 : statecode = WAITPOSDELIM;
338 : }
339 : else
340 0 : PRSSYNTAXERROR;
341 : }
342 19938 : else if (statecode == WAITPOSDELIM)
343 : {
344 19938 : if (t_iseq(state->prsbuf, ','))
345 606 : statecode = INPOSINFO;
346 19332 : else if (t_iseq(state->prsbuf, 'a') || t_iseq(state->prsbuf, 'A') || t_iseq(state->prsbuf, '*'))
347 : {
348 420 : if (WEP_GETWEIGHT(pos[npos - 1]))
349 0 : PRSSYNTAXERROR;
350 420 : WEP_SETWEIGHT(pos[npos - 1], 3);
351 : }
352 18912 : else if (t_iseq(state->prsbuf, 'b') || t_iseq(state->prsbuf, 'B'))
353 : {
354 216 : if (WEP_GETWEIGHT(pos[npos - 1]))
355 0 : PRSSYNTAXERROR;
356 216 : WEP_SETWEIGHT(pos[npos - 1], 2);
357 : }
358 18696 : else if (t_iseq(state->prsbuf, 'c') || t_iseq(state->prsbuf, 'C'))
359 : {
360 276 : if (WEP_GETWEIGHT(pos[npos - 1]))
361 0 : PRSSYNTAXERROR;
362 276 : WEP_SETWEIGHT(pos[npos - 1], 1);
363 : }
364 18420 : else if (t_iseq(state->prsbuf, 'd') || t_iseq(state->prsbuf, 'D'))
365 : {
366 132 : if (WEP_GETWEIGHT(pos[npos - 1]))
367 0 : PRSSYNTAXERROR;
368 132 : WEP_SETWEIGHT(pos[npos - 1], 0);
369 : }
370 18288 : else if (isspace((unsigned char) *state->prsbuf) ||
371 8802 : *(state->prsbuf) == '\0')
372 9918 : RETURN_TOKEN;
373 8370 : else if (!isdigit((unsigned char) *state->prsbuf))
374 0 : PRSSYNTAXERROR;
375 : }
376 : else /* internal error */
377 0 : elog(ERROR, "unrecognized state in gettoken_tsvector: %d",
378 : statecode);
379 :
380 : /* get next char */
381 592424 : state->prsbuf += pg_mblen_cstr(state->prsbuf);
382 : }
383 : }
|