Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * tsvector_parser.c
4 : * Parser for tsvector
5 : *
6 : * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
7 : *
8 : *
9 : * IDENTIFICATION
10 : * src/backend/utils/adt/tsvector_parser.c
11 : *
12 : *-------------------------------------------------------------------------
13 : */
14 :
15 : #include "postgres.h"
16 :
17 : #include "tsearch/ts_locale.h"
18 : #include "tsearch/ts_utils.h"
19 :
20 :
21 : /*
22 : * Private state of tsvector parser. Note that tsquery also uses this code to
23 : * parse its input, hence the boolean flags. The oprisdelim and is_tsquery
24 : * flags are both true or both false in current usage, but we keep them
25 : * separate for clarity.
26 : *
27 : * If oprisdelim is set, the following characters are treated as delimiters
28 : * (in addition to whitespace): ! | & ( )
29 : *
30 : * is_tsquery affects *only* the content of error messages.
31 : *
32 : * is_web can be true to further modify tsquery parsing.
33 : *
34 : * If escontext is an ErrorSaveContext node, then soft errors can be
35 : * captured there rather than being thrown.
36 : */
37 : struct TSVectorParseStateData
38 : {
39 : char *prsbuf; /* next input character */
40 : char *bufstart; /* whole string (used only for errors) */
41 : char *word; /* buffer to hold the current word */
42 : int len; /* size in bytes allocated for 'word' */
43 : int eml; /* max bytes per character */
44 : bool oprisdelim; /* treat ! | * ( ) as delimiters? */
45 : bool is_tsquery; /* say "tsquery" not "tsvector" in errors? */
46 : bool is_web; /* we're in websearch_to_tsquery() */
47 : Node *escontext; /* for soft error reporting */
48 : };
49 :
50 :
51 : /*
52 : * Initializes a parser state object for the given input string.
53 : * A bitmask of flags (see ts_utils.h) and an error context object
54 : * can be provided as well.
55 : */
56 : TSVectorParseState
57 7638 : init_tsvector_parser(char *input, int flags, Node *escontext)
58 : {
59 : TSVectorParseState state;
60 :
61 7638 : state = (TSVectorParseState) palloc(sizeof(struct TSVectorParseStateData));
62 7638 : state->prsbuf = input;
63 7638 : state->bufstart = input;
64 7638 : state->len = 32;
65 7638 : state->word = (char *) palloc(state->len);
66 7638 : state->eml = pg_database_encoding_max_length();
67 7638 : state->oprisdelim = (flags & P_TSV_OPR_IS_DELIM) != 0;
68 7638 : state->is_tsquery = (flags & P_TSV_IS_TSQUERY) != 0;
69 7638 : state->is_web = (flags & P_TSV_IS_WEB) != 0;
70 7638 : state->escontext = escontext;
71 :
72 7638 : return state;
73 : }
74 :
75 : /*
76 : * Reinitializes parser to parse 'input', instead of previous input.
77 : *
78 : * Note that bufstart (the string reported in errors) is not changed.
79 : */
80 : void
81 8136 : reset_tsvector_parser(TSVectorParseState state, char *input)
82 : {
83 8136 : state->prsbuf = input;
84 8136 : }
85 :
86 : /*
87 : * Shuts down a tsvector parser.
88 : */
89 : void
90 7632 : close_tsvector_parser(TSVectorParseState state)
91 : {
92 7632 : pfree(state->word);
93 7632 : pfree(state);
94 7632 : }
95 :
/*
 * Make sure 'word' can take one more character.
 *
 * When the unused tail of the buffer is no larger than state->eml bytes,
 * the buffer size is doubled and curpos is re-pointed at the same offset
 * inside the reallocated buffer.  Expects 'state' and 'curpos' to be in
 * scope where the macro is used.
 */
#define RESIZEPRSBUF \
do { \
	int			used = curpos - state->word; \
	if (state->len <= used + state->eml) \
	{ \
		state->len *= 2; \
		state->word = (char *) repalloc(state->word, state->len); \
		curpos = state->word + used; \
	} \
} while (0)
107 :
/*
 * Hand the finished token back from gettoken_tsvector and return true.
 *
 * The position array goes to the caller when pos_ptr is supplied (poslen is
 * then dereferenced without a NULL check); otherwise any collected array is
 * freed here.  strval, lenval and endptr are each filled only when non-NULL.
 * Expects the locals and parameters of gettoken_tsvector to be in scope.
 */
#define RETURN_TOKEN \
do { \
	if (pos_ptr == NULL) \
	{ \
		if (pos != NULL) \
			pfree(pos); \
	} \
	else \
	{ \
		*pos_ptr = pos; \
		*poslen = npos; \
	} \
	\
	if (strval != NULL) \
		*strval = state->word; \
	if (lenval != NULL) \
		*lenval = curpos - state->word; \
	if (endptr != NULL) \
		*endptr = state->prsbuf; \
	return true; \
} while(0)
127 :
128 :
/*
 * State codes used in gettoken_tsvector.  The numeric values are the same
 * as those of the individual #defines this enum replaces.
 */
enum
{
	WAITWORD = 1,				/* skipping whitespace before a token */
	WAITENDWORD = 2,			/* collecting an unquoted word */
	WAITNEXTCHAR = 3,			/* after a backslash: copy next char as-is */
	WAITENDCMPLX = 4,			/* collecting a single-quoted word */
	WAITPOSINFO = 5,			/* after a quoted word: is there a ':'? */
	INPOSINFO = 6,				/* expecting the digits of a position */
	WAITPOSDELIM = 7,			/* after a position: ',' weight or end */
	WAITCHARCMPLX = 8			/* after a quote inside a quoted word */
};
138 :
139 : #define PRSSYNTAXERROR return prssyntaxerror(state)
140 :
141 : static bool
142 18 : prssyntaxerror(TSVectorParseState state)
143 : {
144 18 : errsave(state->escontext,
145 : (errcode(ERRCODE_SYNTAX_ERROR),
146 : state->is_tsquery ?
147 : errmsg("syntax error in tsquery: \"%s\"", state->bufstart) :
148 : errmsg("syntax error in tsvector: \"%s\"", state->bufstart)));
149 : /* In soft error situation, return false as convenience for caller */
150 12 : return false;
151 : }
152 :
153 :
154 : /*
155 : * Get next token from string being parsed. Returns true if successful,
156 : * false if end of input string is reached or soft error.
157 : *
158 : * On success, these output parameters are filled in:
159 : *
160 : * *strval pointer to token
161 : * *lenval length of *strval
162 : * *pos_ptr pointer to a palloc'd array of positions and weights
163 : * associated with the token. If the caller is not interested
164 : * in the information, NULL can be supplied. Otherwise
165 : * the caller is responsible for pfreeing the array.
166 : * *poslen number of elements in *pos_ptr
167 : * *endptr scan resumption point
168 : *
169 : * Pass NULL for any unwanted output parameters.
170 : *
171 : * If state->escontext is an ErrorSaveContext, then caller must check
172 : * SOFT_ERROR_OCCURRED() to determine whether a "false" result means
173 : * error or normal end-of-string.
174 : */
175 : bool
176 192736 : gettoken_tsvector(TSVectorParseState state,
177 : char **strval, int *lenval,
178 : WordEntryPos **pos_ptr, int *poslen,
179 : char **endptr)
180 : {
181 192736 : int oldstate = 0;
182 192736 : char *curpos = state->word;
183 192736 : int statecode = WAITWORD;
184 :
185 : /*
186 : * pos is for collecting the comma delimited list of positions followed by
187 : * the actual token.
188 : */
189 192736 : WordEntryPos *pos = NULL;
190 192736 : int npos = 0; /* elements of pos used */
191 192736 : int posalen = 0; /* allocated size of pos */
192 :
193 : while (1)
194 : {
195 785238 : if (statecode == WAITWORD)
196 : {
197 370116 : if (*(state->prsbuf) == '\0')
198 3778 : return false;
199 366338 : else if (!state->is_web && t_iseq(state->prsbuf, '\''))
200 162 : statecode = WAITENDCMPLX;
201 366176 : else if (!state->is_web && t_iseq(state->prsbuf, '\\'))
202 : {
203 6 : statecode = WAITNEXTCHAR;
204 6 : oldstate = WAITENDWORD;
205 : }
206 366170 : else if ((state->oprisdelim && ISOPERATOR(state->prsbuf)) ||
207 366170 : (state->is_web && t_iseq(state->prsbuf, '"')))
208 0 : PRSSYNTAXERROR;
209 366170 : else if (!t_isspace(state->prsbuf))
210 : {
211 188790 : COPYCHAR(curpos, state->prsbuf);
212 188790 : curpos += pg_mblen(state->prsbuf);
213 188790 : statecode = WAITENDWORD;
214 : }
215 : }
216 415122 : else if (statecode == WAITNEXTCHAR)
217 : {
218 162 : if (*(state->prsbuf) == '\0')
219 0 : ereturn(state->escontext, false,
220 : (errcode(ERRCODE_SYNTAX_ERROR),
221 : errmsg("there is no escaped character: \"%s\"",
222 : state->bufstart)));
223 : else
224 : {
225 162 : RESIZEPRSBUF;
226 162 : COPYCHAR(curpos, state->prsbuf);
227 162 : curpos += pg_mblen(state->prsbuf);
228 : Assert(oldstate != 0);
229 162 : statecode = oldstate;
230 : }
231 : }
232 414960 : else if (statecode == WAITENDWORD)
233 : {
234 383334 : if (!state->is_web && t_iseq(state->prsbuf, '\\'))
235 : {
236 72 : statecode = WAITNEXTCHAR;
237 72 : oldstate = WAITENDWORD;
238 : }
239 383262 : else if (t_isspace(state->prsbuf) || *(state->prsbuf) == '\0' ||
240 206880 : (state->oprisdelim && ISOPERATOR(state->prsbuf)) ||
241 205092 : (state->is_web && t_iseq(state->prsbuf, '"')))
242 : {
243 178182 : RESIZEPRSBUF;
244 178182 : if (curpos == state->word)
245 0 : PRSSYNTAXERROR;
246 178182 : *(curpos) = '\0';
247 178182 : RETURN_TOKEN;
248 : }
249 205080 : else if (t_iseq(state->prsbuf, ':'))
250 : {
251 10614 : if (curpos == state->word)
252 0 : PRSSYNTAXERROR;
253 10614 : *(curpos) = '\0';
254 10614 : if (state->oprisdelim)
255 696 : RETURN_TOKEN;
256 : else
257 9918 : statecode = INPOSINFO;
258 : }
259 : else
260 : {
261 194466 : RESIZEPRSBUF;
262 194466 : COPYCHAR(curpos, state->prsbuf);
263 194466 : curpos += pg_mblen(state->prsbuf);
264 : }
265 : }
266 31626 : else if (statecode == WAITENDCMPLX)
267 : {
268 924 : if (!state->is_web && t_iseq(state->prsbuf, '\''))
269 : {
270 162 : statecode = WAITCHARCMPLX;
271 : }
272 762 : else if (!state->is_web && t_iseq(state->prsbuf, '\\'))
273 : {
274 84 : statecode = WAITNEXTCHAR;
275 84 : oldstate = WAITENDCMPLX;
276 : }
277 678 : else if (*(state->prsbuf) == '\0')
278 0 : PRSSYNTAXERROR;
279 : else
280 : {
281 678 : RESIZEPRSBUF;
282 678 : COPYCHAR(curpos, state->prsbuf);
283 678 : curpos += pg_mblen(state->prsbuf);
284 : }
285 : }
286 30702 : else if (statecode == WAITCHARCMPLX)
287 : {
288 162 : if (!state->is_web && t_iseq(state->prsbuf, '\''))
289 : {
290 0 : RESIZEPRSBUF;
291 0 : COPYCHAR(curpos, state->prsbuf);
292 0 : curpos += pg_mblen(state->prsbuf);
293 0 : statecode = WAITENDCMPLX;
294 : }
295 : else
296 : {
297 162 : RESIZEPRSBUF;
298 162 : *(curpos) = '\0';
299 162 : if (curpos == state->word)
300 18 : PRSSYNTAXERROR;
301 144 : if (state->oprisdelim)
302 : {
303 : /* state->prsbuf+=pg_mblen(state->prsbuf); */
304 66 : RETURN_TOKEN;
305 : }
306 : else
307 78 : statecode = WAITPOSINFO;
308 78 : continue; /* recheck current character */
309 : }
310 : }
311 30540 : else if (statecode == WAITPOSINFO)
312 : {
313 78 : if (t_iseq(state->prsbuf, ':'))
314 0 : statecode = INPOSINFO;
315 : else
316 78 : RETURN_TOKEN;
317 : }
318 30462 : else if (statecode == INPOSINFO)
319 : {
320 10524 : if (t_isdigit(state->prsbuf))
321 : {
322 10524 : if (posalen == 0)
323 : {
324 9918 : posalen = 4;
325 9918 : pos = (WordEntryPos *) palloc(sizeof(WordEntryPos) * posalen);
326 9918 : npos = 0;
327 : }
328 606 : else if (npos + 1 >= posalen)
329 : {
330 114 : posalen *= 2;
331 114 : pos = (WordEntryPos *) repalloc(pos, sizeof(WordEntryPos) * posalen);
332 : }
333 10524 : npos++;
334 10524 : WEP_SETPOS(pos[npos - 1], LIMITPOS(atoi(state->prsbuf)));
335 : /* we cannot get here in tsquery, so no need for 2 errmsgs */
336 10524 : if (WEP_GETPOS(pos[npos - 1]) == 0)
337 0 : ereturn(state->escontext, false,
338 : (errcode(ERRCODE_SYNTAX_ERROR),
339 : errmsg("wrong position info in tsvector: \"%s\"",
340 : state->bufstart)));
341 10524 : WEP_SETWEIGHT(pos[npos - 1], 0);
342 10524 : statecode = WAITPOSDELIM;
343 : }
344 : else
345 0 : PRSSYNTAXERROR;
346 : }
347 19938 : else if (statecode == WAITPOSDELIM)
348 : {
349 19938 : if (t_iseq(state->prsbuf, ','))
350 606 : statecode = INPOSINFO;
351 19332 : else if (t_iseq(state->prsbuf, 'a') || t_iseq(state->prsbuf, 'A') || t_iseq(state->prsbuf, '*'))
352 : {
353 420 : if (WEP_GETWEIGHT(pos[npos - 1]))
354 0 : PRSSYNTAXERROR;
355 420 : WEP_SETWEIGHT(pos[npos - 1], 3);
356 : }
357 18912 : else if (t_iseq(state->prsbuf, 'b') || t_iseq(state->prsbuf, 'B'))
358 : {
359 216 : if (WEP_GETWEIGHT(pos[npos - 1]))
360 0 : PRSSYNTAXERROR;
361 216 : WEP_SETWEIGHT(pos[npos - 1], 2);
362 : }
363 18696 : else if (t_iseq(state->prsbuf, 'c') || t_iseq(state->prsbuf, 'C'))
364 : {
365 276 : if (WEP_GETWEIGHT(pos[npos - 1]))
366 0 : PRSSYNTAXERROR;
367 276 : WEP_SETWEIGHT(pos[npos - 1], 1);
368 : }
369 18420 : else if (t_iseq(state->prsbuf, 'd') || t_iseq(state->prsbuf, 'D'))
370 : {
371 132 : if (WEP_GETWEIGHT(pos[npos - 1]))
372 0 : PRSSYNTAXERROR;
373 132 : WEP_SETWEIGHT(pos[npos - 1], 0);
374 : }
375 18288 : else if (t_isspace(state->prsbuf) ||
376 8802 : *(state->prsbuf) == '\0')
377 9918 : RETURN_TOKEN;
378 8370 : else if (!t_isdigit(state->prsbuf))
379 0 : PRSSYNTAXERROR;
380 : }
381 : else /* internal error */
382 0 : elog(ERROR, "unrecognized state in gettoken_tsvector: %d",
383 : statecode);
384 :
385 : /* get next char */
386 592424 : state->prsbuf += pg_mblen(state->prsbuf);
387 : }
388 : }
|