Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * tsquery_cleanup.c
4 : * Cleanup query from NOT values and/or stopword
5 : * Utility functions to correct work.
6 : *
7 : * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
8 : *
9 : *
10 : * IDENTIFICATION
11 : * src/backend/utils/adt/tsquery_cleanup.c
12 : *
13 : *-------------------------------------------------------------------------
14 : */
15 :
16 : #include "postgres.h"
17 :
18 : #include "miscadmin.h"
19 : #include "tsearch/ts_utils.h"
20 : #include "varatt.h"
21 :
22 : typedef struct NODE
23 : {
24 : struct NODE *left;
25 : struct NODE *right;
26 : QueryItem *valnode;
27 : } NODE;
28 :
29 : /*
30 : * make query tree from plain view of query
31 : */
32 : static NODE *
33 2928 : maketree(QueryItem *in)
34 : {
35 2928 : NODE *node = (NODE *) palloc(sizeof(NODE));
36 :
37 : /* since this function recurses, it could be driven to stack overflow. */
38 2928 : check_stack_depth();
39 :
40 2928 : node->valnode = in;
41 2928 : node->right = node->left = NULL;
42 2928 : if (in->type == QI_OPR)
43 : {
44 1290 : node->right = maketree(in + 1);
45 1290 : if (in->qoperator.oper != OP_NOT)
46 1206 : node->left = maketree(in + in->qoperator.left);
47 : }
48 2928 : return node;
49 : }
50 :
51 : /*
52 : * Internal state for plaintree and plainnode
53 : */
54 : typedef struct
55 : {
56 : QueryItem *ptr;
57 : int len; /* allocated size of ptr */
58 : int cur; /* number of elements in ptr */
59 : } PLAINTREE;
60 :
61 : static void
62 1602 : plainnode(PLAINTREE *state, NODE *node)
63 : {
64 : /* since this function recurses, it could be driven to stack overflow. */
65 1602 : check_stack_depth();
66 :
67 1602 : if (state->cur == state->len)
68 : {
69 0 : state->len *= 2;
70 0 : state->ptr = (QueryItem *) repalloc(state->ptr, state->len * sizeof(QueryItem));
71 : }
72 1602 : memcpy(&(state->ptr[state->cur]), node->valnode, sizeof(QueryItem));
73 1602 : if (node->valnode->type == QI_VAL)
74 972 : state->cur++;
75 630 : else if (node->valnode->qoperator.oper == OP_NOT)
76 : {
77 66 : state->ptr[state->cur].qoperator.left = 1;
78 66 : state->cur++;
79 66 : plainnode(state, node->right);
80 : }
81 : else
82 : {
83 564 : int cur = state->cur;
84 :
85 564 : state->cur++;
86 564 : plainnode(state, node->right);
87 564 : state->ptr[cur].qoperator.left = state->cur - cur;
88 564 : plainnode(state, node->left);
89 : }
90 1602 : pfree(node);
91 1602 : }
92 :
93 : /*
94 : * make plain view of tree from a NODE-tree representation
95 : */
96 : static QueryItem *
97 408 : plaintree(NODE *root, int *len)
98 : {
99 : PLAINTREE pl;
100 :
101 408 : pl.cur = 0;
102 408 : pl.len = 16;
103 408 : if (root && (root->valnode->type == QI_VAL || root->valnode->type == QI_OPR))
104 : {
105 408 : pl.ptr = (QueryItem *) palloc(pl.len * sizeof(QueryItem));
106 408 : plainnode(&pl, root);
107 : }
108 : else
109 0 : pl.ptr = NULL;
110 408 : *len = pl.cur;
111 408 : return pl.ptr;
112 : }
113 :
114 : static void
115 42 : freetree(NODE *node)
116 : {
117 : /* since this function recurses, it could be driven to stack overflow. */
118 42 : check_stack_depth();
119 :
120 42 : if (!node)
121 0 : return;
122 42 : if (node->left)
123 0 : freetree(node->left);
124 42 : if (node->right)
125 0 : freetree(node->right);
126 42 : pfree(node);
127 : }
128 :
129 : /*
130 : * clean tree for ! operator.
131 : * It's useful for debug, but in
132 : * other case, such view is used with search in index.
133 : * Operator ! always return TRUE
134 : */
135 : static NODE *
136 0 : clean_NOT_intree(NODE *node)
137 : {
138 : /* since this function recurses, it could be driven to stack overflow. */
139 0 : check_stack_depth();
140 :
141 0 : if (node->valnode->type == QI_VAL)
142 0 : return node;
143 :
144 0 : if (node->valnode->qoperator.oper == OP_NOT)
145 : {
146 0 : freetree(node);
147 0 : return NULL;
148 : }
149 :
150 : /* operator & or | */
151 0 : if (node->valnode->qoperator.oper == OP_OR)
152 : {
153 0 : if ((node->left = clean_NOT_intree(node->left)) == NULL ||
154 0 : (node->right = clean_NOT_intree(node->right)) == NULL)
155 : {
156 0 : freetree(node);
157 0 : return NULL;
158 : }
159 : }
160 : else
161 : {
162 0 : NODE *res = node;
163 :
164 : Assert(node->valnode->qoperator.oper == OP_AND ||
165 : node->valnode->qoperator.oper == OP_PHRASE);
166 :
167 0 : node->left = clean_NOT_intree(node->left);
168 0 : node->right = clean_NOT_intree(node->right);
169 0 : if (node->left == NULL && node->right == NULL)
170 : {
171 0 : pfree(node);
172 0 : res = NULL;
173 : }
174 0 : else if (node->left == NULL)
175 : {
176 0 : res = node->right;
177 0 : pfree(node);
178 : }
179 0 : else if (node->right == NULL)
180 : {
181 0 : res = node->left;
182 0 : pfree(node);
183 : }
184 0 : return res;
185 : }
186 0 : return node;
187 : }
188 :
189 : QueryItem *
190 0 : clean_NOT(QueryItem *ptr, int *len)
191 : {
192 0 : NODE *root = maketree(ptr);
193 :
194 0 : return plaintree(clean_NOT_intree(root), len);
195 : }
196 :
197 :
198 : /*
199 : * Remove QI_VALSTOP (stopword) nodes from query tree.
200 : *
201 : * Returns NULL if the query degenerates to nothing. Input must not be NULL.
202 : *
203 : * When we remove a phrase operator due to removing one or both of its
204 : * arguments, we might need to adjust the distance of a parent phrase
205 : * operator. For example, 'a' is a stopword, so:
206 : * (b <-> a) <-> c should become b <2> c
207 : * b <-> (a <-> c) should become b <2> c
208 : * (b <-> (a <-> a)) <-> c should become b <3> c
209 : * b <-> ((a <-> a) <-> c) should become b <3> c
210 : * To handle that, we define two output parameters:
211 : * ladd: amount to add to a phrase distance to the left of this node
212 : * radd: amount to add to a phrase distance to the right of this node
213 : * We need two outputs because we could need to bubble up adjustments to two
214 : * different parent phrase operators. Consider
215 : * w <-> (((a <-> x) <2> (y <3> a)) <-> z)
216 : * After we've removed the two a's and are considering the <2> node (which is
217 : * now just x <2> y), we have an ladd distance of 1 that needs to propagate
218 : * up to the topmost (leftmost) <->, and an radd distance of 3 that needs to
219 : * propagate to the rightmost <->, so that we'll end up with
220 : * w <2> ((x <2> y) <4> z)
221 : * Near the bottom of the tree, we may have subtrees consisting only of
222 : * stopwords. The distances of any phrase operators within such a subtree are
223 : * summed and propagated to both ladd and radd, since we don't know which side
224 : * of the lowest surviving phrase operator we are in. The rule is that any
225 : * subtree that degenerates to NULL must return equal values of ladd and radd,
226 : * and the parent node dealing with it should incorporate only one of those.
227 : *
228 : * Currently, we only implement this adjustment for adjacent phrase operators.
229 : * Thus for example 'x <-> ((a <-> y) | z)' will become 'x <-> (y | z)', which
230 : * isn't ideal, but there is no way to represent the really desired semantics
231 : * without some redesign of the tsquery structure. Certainly it would not be
232 : * any better to convert that to 'x <2> (y | z)'. Since this is such a weird
233 : * corner case, let it go for now. But we can fix it in cases where the
234 : * intervening non-phrase operator also gets removed, for example
235 : * '((x <-> a) | a) <-> y' will become 'x <2> y'.
236 : */
237 : static NODE *
238 2928 : clean_stopword_intree(NODE *node, int *ladd, int *radd)
239 : {
240 : /* since this function recurses, it could be driven to stack overflow. */
241 2928 : check_stack_depth();
242 :
243 : /* default output parameters indicate no change in parent distance */
244 2928 : *ladd = *radd = 0;
245 :
246 2928 : if (node->valnode->type == QI_VAL)
247 972 : return node;
248 1956 : else if (node->valnode->type == QI_VALSTOP)
249 : {
250 666 : pfree(node);
251 666 : return NULL;
252 : }
253 :
254 : Assert(node->valnode->type == QI_OPR);
255 :
256 1290 : if (node->valnode->qoperator.oper == OP_NOT)
257 : {
258 : /* NOT doesn't change pattern width, so just report child distances */
259 84 : node->right = clean_stopword_intree(node->right, ladd, radd);
260 84 : if (!node->right)
261 : {
262 18 : freetree(node);
263 18 : return NULL;
264 : }
265 : }
266 : else
267 : {
268 1206 : NODE *res = node;
269 : bool isphrase;
270 : int ndistance,
271 : lladd,
272 : lradd,
273 : rladd,
274 : rradd;
275 :
276 : /* First, recurse */
277 1206 : node->left = clean_stopword_intree(node->left, &lladd, &lradd);
278 1206 : node->right = clean_stopword_intree(node->right, &rladd, &rradd);
279 :
280 : /* Check if current node is OP_PHRASE, get its distance */
281 1206 : isphrase = (node->valnode->qoperator.oper == OP_PHRASE);
282 1206 : ndistance = isphrase ? node->valnode->qoperator.distance : 0;
283 :
284 1206 : if (node->left == NULL && node->right == NULL)
285 : {
286 : /*
287 : * When we collapse out a phrase node entirely, propagate its own
288 : * distance into both *ladd and *radd; it is the responsibility of
289 : * the parent node to count it only once. Also, for a phrase
290 : * node, distances coming from children are summed and propagated
291 : * up to parent (we assume lladd == lradd and rladd == rradd, else
292 : * rule was broken at a lower level). But if this isn't a phrase
293 : * node, take the larger of the two child distances; that
294 : * corresponds to what TS_execute will do in non-stopword cases.
295 : */
296 24 : if (isphrase)
297 0 : *ladd = *radd = lladd + ndistance + rladd;
298 : else
299 24 : *ladd = *radd = Max(lladd, rladd);
300 24 : freetree(node);
301 24 : return NULL;
302 : }
303 1182 : else if (node->left == NULL)
304 : {
305 : /* Removing this operator and left subnode */
306 : /* lladd and lradd are equal/redundant, don't count both */
307 252 : if (isphrase)
308 : {
309 : /* operator's own distance must propagate to left */
310 162 : *ladd = lladd + ndistance + rladd;
311 162 : *radd = rradd;
312 : }
313 : else
314 : {
315 : /* at non-phrase op, just forget the left subnode entirely */
316 90 : *ladd = rladd;
317 90 : *radd = rradd;
318 : }
319 252 : res = node->right;
320 252 : pfree(node);
321 : }
322 930 : else if (node->right == NULL)
323 : {
324 : /* Removing this operator and right subnode */
325 : /* rladd and rradd are equal/redundant, don't count both */
326 366 : if (isphrase)
327 : {
328 : /* operator's own distance must propagate to right */
329 216 : *ladd = lladd;
330 216 : *radd = lradd + ndistance + rradd;
331 : }
332 : else
333 : {
334 : /* at non-phrase op, just forget the right subnode entirely */
335 150 : *ladd = lladd;
336 150 : *radd = lradd;
337 : }
338 366 : res = node->left;
339 366 : pfree(node);
340 : }
341 564 : else if (isphrase)
342 : {
343 : /* Absorb appropriate corrections at this level */
344 342 : node->valnode->qoperator.distance += lradd + rladd;
345 : /* Propagate up any unaccounted-for corrections */
346 342 : *ladd = lladd;
347 342 : *radd = rradd;
348 : }
349 : else
350 : {
351 : /* We're keeping a non-phrase operator, so ladd/radd remain 0 */
352 : }
353 :
354 1182 : return res;
355 : }
356 66 : return node;
357 : }
358 :
359 : /*
360 : * Number of elements in query tree
361 : */
362 : static int32
363 1602 : calcstrlen(NODE *node)
364 : {
365 1602 : int32 size = 0;
366 :
367 1602 : if (node->valnode->type == QI_VAL)
368 : {
369 972 : size = node->valnode->qoperand.length + 1;
370 : }
371 : else
372 : {
373 : Assert(node->valnode->type == QI_OPR);
374 :
375 630 : size = calcstrlen(node->right);
376 630 : if (node->valnode->qoperator.oper != OP_NOT)
377 564 : size += calcstrlen(node->left);
378 : }
379 :
380 1602 : return size;
381 : }
382 :
383 : /*
384 : * Remove QI_VALSTOP (stopword) nodes from TSQuery.
385 : */
386 : TSQuery
387 432 : cleanup_tsquery_stopwords(TSQuery in, bool noisy)
388 : {
389 : int32 len,
390 : lenstr,
391 : commonlen,
392 : i;
393 : NODE *root;
394 : int ladd,
395 : radd;
396 : TSQuery out;
397 : QueryItem *items;
398 : char *operands;
399 :
400 432 : if (in->size == 0)
401 0 : return in;
402 :
403 : /* eliminate stop words */
404 432 : root = clean_stopword_intree(maketree(GETQUERY(in)), &ladd, &radd);
405 432 : if (root == NULL)
406 : {
407 24 : if (noisy)
408 24 : ereport(NOTICE,
409 : (errmsg("text-search query contains only stop words or doesn't contain lexemes, ignored")));
410 24 : out = palloc(HDRSIZETQ);
411 24 : out->size = 0;
412 24 : SET_VARSIZE(out, HDRSIZETQ);
413 24 : return out;
414 : }
415 :
416 : /*
417 : * Build TSQuery from plain view
418 : */
419 :
420 408 : lenstr = calcstrlen(root);
421 408 : items = plaintree(root, &len);
422 408 : commonlen = COMPUTESIZE(len, lenstr);
423 :
424 408 : out = palloc(commonlen);
425 408 : SET_VARSIZE(out, commonlen);
426 408 : out->size = len;
427 :
428 408 : memcpy(GETQUERY(out), items, len * sizeof(QueryItem));
429 :
430 408 : items = GETQUERY(out);
431 408 : operands = GETOPERAND(out);
432 2010 : for (i = 0; i < out->size; i++)
433 : {
434 1602 : QueryOperand *op = (QueryOperand *) &items[i];
435 :
436 1602 : if (op->type != QI_VAL)
437 630 : continue;
438 :
439 972 : memcpy(operands, GETOPERAND(in) + op->distance, op->length);
440 972 : operands[op->length] = '\0';
441 972 : op->distance = operands - GETOPERAND(out);
442 972 : operands += op->length + 1;
443 : }
444 :
445 408 : return out;
446 : }
|