Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * ts_selfuncs.c
4 : * Selectivity estimation functions for text search operators.
5 : *
6 : * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
7 : *
8 : *
9 : * IDENTIFICATION
10 : * src/backend/tsearch/ts_selfuncs.c
11 : *
12 : *-------------------------------------------------------------------------
13 : */
14 : #include "postgres.h"
15 :
16 : #include "access/htup_details.h"
17 : #include "catalog/pg_statistic.h"
18 : #include "catalog/pg_type.h"
19 : #include "miscadmin.h"
20 : #include "nodes/nodes.h"
21 : #include "tsearch/ts_type.h"
22 : #include "utils/fmgrprotos.h"
23 : #include "utils/lsyscache.h"
24 : #include "utils/selfuncs.h"
25 :
26 :
27 : /*
28 : * The default text search selectivity is chosen to be small enough to
29 : * encourage indexscans for typical table densities. See selfuncs.h and
30 : * DEFAULT_EQ_SEL for details.
31 : */
32 : #define DEFAULT_TS_MATCH_SEL 0.005
33 :
34 : /* lookup table type for binary searching through MCELEMs */
35 : typedef struct
36 : {
37 : text *element;
38 : float4 frequency;
39 : } TextFreq;
40 :
41 : /* type of keys for bsearch'ing through an array of TextFreqs */
42 : typedef struct
43 : {
44 : char *lexeme;
45 : int length;
46 : } LexemeKey;
47 :
48 : static Selectivity tsquerysel(VariableStatData *vardata, Datum constval);
49 : static Selectivity mcelem_tsquery_selec(TSQuery query,
50 : Datum *mcelem, int nmcelem,
51 : float4 *numbers, int nnumbers);
52 : static Selectivity tsquery_opr_selec(QueryItem *item, char *operand,
53 : TextFreq *lookup, int length, float4 minfreq);
54 : static int compare_lexeme_textfreq(const void *e1, const void *e2);
55 :
56 : #define tsquery_opr_selec_no_stats(query) \
57 : tsquery_opr_selec(GETQUERY(query), GETOPERAND(query), NULL, 0, 0)
58 :
59 :
60 : /*
61 : * tsmatchsel -- Selectivity of "@@"
62 : *
63 : * restriction selectivity function for tsvector @@ tsquery and
64 : * tsquery @@ tsvector
65 : */
66 : Datum
67 1026 : tsmatchsel(PG_FUNCTION_ARGS)
68 : {
69 1026 : PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
70 :
71 : #ifdef NOT_USED
72 : Oid operator = PG_GETARG_OID(1);
73 : #endif
74 1026 : List *args = (List *) PG_GETARG_POINTER(2);
75 1026 : int varRelid = PG_GETARG_INT32(3);
76 : VariableStatData vardata;
77 : Node *other;
78 : bool varonleft;
79 : Selectivity selec;
80 :
81 : /*
82 : * If expression is not variable = something or something = variable, then
83 : * punt and return a default estimate.
84 : */
85 1026 : if (!get_restriction_variable(root, args, varRelid,
86 : &vardata, &other, &varonleft))
87 0 : PG_RETURN_FLOAT8(DEFAULT_TS_MATCH_SEL);
88 :
89 : /*
90 : * Can't do anything useful if the something is not a constant, either.
91 : */
92 1026 : if (!IsA(other, Const))
93 : {
94 0 : ReleaseVariableStats(vardata);
95 0 : PG_RETURN_FLOAT8(DEFAULT_TS_MATCH_SEL);
96 : }
97 :
98 : /*
99 : * The "@@" operator is strict, so we can cope with NULL right away
100 : */
101 1026 : if (((Const *) other)->constisnull)
102 : {
103 0 : ReleaseVariableStats(vardata);
104 0 : PG_RETURN_FLOAT8(0.0);
105 : }
106 :
107 : /*
108 : * OK, there's a Var and a Const we're dealing with here. We need the
109 : * Const to be a TSQuery, else we can't do anything useful. We have to
110 : * check this because the Var might be the TSQuery not the TSVector.
111 : */
112 1026 : if (((Const *) other)->consttype == TSQUERYOID)
113 : {
114 : /* tsvector @@ tsquery or the other way around */
115 : Assert(vardata.vartype == TSVECTOROID);
116 :
117 1026 : selec = tsquerysel(&vardata, ((Const *) other)->constvalue);
118 : }
119 : else
120 : {
121 : /* If we can't see the query structure, must punt */
122 0 : selec = DEFAULT_TS_MATCH_SEL;
123 : }
124 :
125 1026 : ReleaseVariableStats(vardata);
126 :
127 1026 : CLAMP_PROBABILITY(selec);
128 :
129 1026 : PG_RETURN_FLOAT8((float8) selec);
130 : }
131 :
132 :
133 : /*
134 : * tsmatchjoinsel -- join selectivity of "@@"
135 : *
136 : * join selectivity function for tsvector @@ tsquery and tsquery @@ tsvector
137 : */
138 : Datum
139 0 : tsmatchjoinsel(PG_FUNCTION_ARGS)
140 : {
141 : /* for the moment we just punt */
142 0 : PG_RETURN_FLOAT8(DEFAULT_TS_MATCH_SEL);
143 : }
144 :
145 :
146 : /*
147 : * @@ selectivity for tsvector var vs tsquery constant
148 : */
149 : static Selectivity
150 1026 : tsquerysel(VariableStatData *vardata, Datum constval)
151 : {
152 : Selectivity selec;
153 : TSQuery query;
154 :
155 : /* The caller made sure the const is a TSQuery, so get it now */
156 1026 : query = DatumGetTSQuery(constval);
157 :
158 : /* Empty query matches nothing */
159 1026 : if (query->size == 0)
160 0 : return (Selectivity) 0.0;
161 :
162 1026 : if (HeapTupleIsValid(vardata->statsTuple))
163 : {
164 : Form_pg_statistic stats;
165 : AttStatsSlot sslot;
166 :
167 990 : stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
168 :
169 : /* MCELEM will be an array of TEXT elements for a tsvector column */
170 990 : if (get_attstatsslot(&sslot, vardata->statsTuple,
171 : STATISTIC_KIND_MCELEM, InvalidOid,
172 : ATTSTATSSLOT_VALUES | ATTSTATSSLOT_NUMBERS))
173 : {
174 : /*
175 : * There is a most-common-elements slot for the tsvector Var, so
176 : * use that.
177 : */
178 990 : selec = mcelem_tsquery_selec(query, sslot.values, sslot.nvalues,
179 : sslot.numbers, sslot.nnumbers);
180 990 : free_attstatsslot(&sslot);
181 : }
182 : else
183 : {
184 : /* No most-common-elements info, so do without */
185 0 : selec = tsquery_opr_selec_no_stats(query);
186 : }
187 :
188 : /*
189 : * MCE stats count only non-null rows, so adjust for null rows.
190 : */
191 990 : selec *= (1.0 - stats->stanullfrac);
192 : }
193 : else
194 : {
195 : /* No stats at all, so do without */
196 36 : selec = tsquery_opr_selec_no_stats(query);
197 : /* we assume no nulls here, so no stanullfrac correction */
198 : }
199 :
200 1026 : return selec;
201 : }
202 :
203 : /*
204 : * Extract data from the pg_statistic arrays into useful format.
205 : */
206 : static Selectivity
207 990 : mcelem_tsquery_selec(TSQuery query, Datum *mcelem, int nmcelem,
208 : float4 *numbers, int nnumbers)
209 : {
210 : float4 minfreq;
211 : TextFreq *lookup;
212 : Selectivity selec;
213 : int i;
214 :
215 : /*
216 : * There should be two more Numbers than Values, because the last two
217 : * cells are taken for minimal and maximal frequency. Punt if not.
218 : *
219 : * (Note: the MCELEM statistics slot definition allows for a third extra
220 : * number containing the frequency of nulls, but we're not expecting that
221 : * to appear for a tsvector column.)
222 : */
223 990 : if (nnumbers != nmcelem + 2)
224 0 : return tsquery_opr_selec_no_stats(query);
225 :
226 : /*
227 : * Transpose the data into a single array so we can use bsearch().
228 : */
229 990 : lookup = (TextFreq *) palloc(sizeof(TextFreq) * nmcelem);
230 990990 : for (i = 0; i < nmcelem; i++)
231 : {
232 : /*
233 : * The text Datums came from an array, so it cannot be compressed or
234 : * stored out-of-line -- it's safe to use VARSIZE_ANY*.
235 : */
236 : Assert(!VARATT_IS_COMPRESSED(mcelem[i]) && !VARATT_IS_EXTERNAL(mcelem[i]));
237 990000 : lookup[i].element = (text *) DatumGetPointer(mcelem[i]);
238 990000 : lookup[i].frequency = numbers[i];
239 : }
240 :
241 : /*
242 : * Grab the lowest frequency. compute_tsvector_stats() stored it for us in
243 : * the one before the last cell of the Numbers array. See ts_typanalyze.c
244 : */
245 990 : minfreq = numbers[nnumbers - 2];
246 :
247 990 : selec = tsquery_opr_selec(GETQUERY(query), GETOPERAND(query), lookup,
248 : nmcelem, minfreq);
249 :
250 990 : pfree(lookup);
251 :
252 990 : return selec;
253 : }
254 :
255 : /*
256 : * Traverse the tsquery in preorder, calculating selectivity as:
257 : *
258 : * selec(left_oper) * selec(right_oper) in AND & PHRASE nodes,
259 : *
260 : * selec(left_oper) + selec(right_oper) -
261 : * selec(left_oper) * selec(right_oper) in OR nodes,
262 : *
263 : * 1 - select(oper) in NOT nodes
264 : *
265 : * histogram-based estimation in prefix VAL nodes
266 : *
267 : * freq[val] in exact VAL nodes, if the value is in MCELEM
268 : * min(freq[MCELEM]) / 2 in VAL nodes, if it is not
269 : *
270 : * The MCELEM array is already sorted (see ts_typanalyze.c), so we can use
271 : * binary search for determining freq[MCELEM].
272 : *
273 : * If we don't have stats for the tsvector, we still use this logic,
274 : * except we use default estimates for VAL nodes. This case is signaled
275 : * by lookup == NULL.
276 : */
277 : static Selectivity
278 3078 : tsquery_opr_selec(QueryItem *item, char *operand,
279 : TextFreq *lookup, int length, float4 minfreq)
280 : {
281 : Selectivity selec;
282 :
283 : /* since this function recurses, it could be driven to stack overflow */
284 3078 : check_stack_depth();
285 :
286 3078 : if (item->type == QI_VAL)
287 : {
288 1842 : QueryOperand *oper = (QueryOperand *) item;
289 : LexemeKey key;
290 :
291 : /*
292 : * Prepare the key for bsearch().
293 : */
294 1842 : key.lexeme = operand + oper->distance;
295 1842 : key.length = oper->length;
296 :
297 1842 : if (oper->prefix)
298 : {
299 : /* Prefix match, ie the query item is lexeme:* */
300 : Selectivity matched,
301 : allmces;
302 : int i,
303 : n_matched;
304 :
305 : /*
306 : * Our strategy is to scan through the MCELEM list and combine the
307 : * frequencies of the ones that match the prefix. We then
308 : * extrapolate the fraction of matching MCELEMs to the remaining
309 : * rows, assuming that the MCELEMs are representative of the whole
310 : * lexeme population in this respect. (Compare
311 : * histogram_selectivity().) Note that these are most common
312 : * elements not most common values, so they're not mutually
313 : * exclusive. We treat occurrences as independent events.
314 : *
315 : * This is only a good plan if we have a pretty fair number of
316 : * MCELEMs available; we set the threshold at 100. If no stats or
317 : * insufficient stats, arbitrarily use DEFAULT_TS_MATCH_SEL*4.
318 : */
319 102 : if (lookup == NULL || length < 100)
320 42 : return (Selectivity) (DEFAULT_TS_MATCH_SEL * 4);
321 :
322 72 : matched = allmces = 0;
323 72 : n_matched = 0;
324 72072 : for (i = 0; i < length; i++)
325 : {
326 72000 : TextFreq *t = lookup + i;
327 72000 : int tlen = VARSIZE_ANY_EXHDR(t->element);
328 :
329 72000 : if (tlen >= key.length &&
330 72000 : strncmp(key.lexeme, VARDATA_ANY(t->element),
331 72000 : key.length) == 0)
332 : {
333 2592 : matched += t->frequency - matched * t->frequency;
334 2592 : n_matched++;
335 : }
336 72000 : allmces += t->frequency - allmces * t->frequency;
337 : }
338 :
339 : /* Clamp to ensure sanity in the face of roundoff error */
340 72 : CLAMP_PROBABILITY(matched);
341 72 : CLAMP_PROBABILITY(allmces);
342 :
343 72 : selec = matched + (1.0 - allmces) * ((double) n_matched / length);
344 :
345 : /*
346 : * In any case, never believe that a prefix match has selectivity
347 : * less than we would assign for a non-MCELEM lexeme. This
348 : * preserves the property that "word:*" should be estimated to
349 : * match at least as many rows as "word" would be.
350 : */
351 72 : selec = Max(Min(DEFAULT_TS_MATCH_SEL, minfreq / 2), selec);
352 : }
353 : else
354 : {
355 : /* Regular exact lexeme match */
356 : TextFreq *searchres;
357 :
358 : /* If no stats for the variable, use DEFAULT_TS_MATCH_SEL */
359 1740 : if (lookup == NULL)
360 12 : return (Selectivity) DEFAULT_TS_MATCH_SEL;
361 :
362 1728 : searchres = (TextFreq *) bsearch(&key, lookup, length,
363 : sizeof(TextFreq),
364 : compare_lexeme_textfreq);
365 :
366 1728 : if (searchres)
367 : {
368 : /*
369 : * The element is in MCELEM. Return precise selectivity (or
370 : * at least as precise as ANALYZE could find out).
371 : */
372 1608 : selec = searchres->frequency;
373 : }
374 : else
375 : {
376 : /*
377 : * The element is not in MCELEM. Punt, but assume that the
378 : * selectivity cannot be more than minfreq / 2.
379 : */
380 120 : selec = Min(DEFAULT_TS_MATCH_SEL, minfreq / 2);
381 : }
382 : }
383 : }
384 : else
385 : {
386 : /* Current TSQuery node is an operator */
387 : Selectivity s1,
388 : s2;
389 :
390 1236 : switch (item->qoperator.oper)
391 : {
392 420 : case OP_NOT:
393 420 : selec = 1.0 - tsquery_opr_selec(item + 1, operand,
394 : lookup, length, minfreq);
395 420 : break;
396 :
397 570 : case OP_PHRASE:
398 : case OP_AND:
399 570 : s1 = tsquery_opr_selec(item + 1, operand,
400 : lookup, length, minfreq);
401 570 : s2 = tsquery_opr_selec(item + item->qoperator.left, operand,
402 : lookup, length, minfreq);
403 570 : selec = s1 * s2;
404 570 : break;
405 :
406 246 : case OP_OR:
407 246 : s1 = tsquery_opr_selec(item + 1, operand,
408 : lookup, length, minfreq);
409 246 : s2 = tsquery_opr_selec(item + item->qoperator.left, operand,
410 : lookup, length, minfreq);
411 246 : selec = s1 + s2 - s1 * s2;
412 246 : break;
413 :
414 0 : default:
415 0 : elog(ERROR, "unrecognized operator: %d", item->qoperator.oper);
416 : selec = 0; /* keep compiler quiet */
417 : break;
418 : }
419 : }
420 :
421 : /* Clamp intermediate results to stay sane despite roundoff error */
422 3036 : CLAMP_PROBABILITY(selec);
423 :
424 3036 : return selec;
425 : }
426 :
427 : /*
428 : * bsearch() comparator for a lexeme (non-NULL terminated string with length)
429 : * and a TextFreq. Use length, then byte-for-byte comparison, because that's
430 : * how ANALYZE code sorted data before storing it in a statistic tuple.
431 : * See ts_typanalyze.c for details.
432 : */
433 : static int
434 13344 : compare_lexeme_textfreq(const void *e1, const void *e2)
435 : {
436 13344 : const LexemeKey *key = (const LexemeKey *) e1;
437 13344 : const TextFreq *t = (const TextFreq *) e2;
438 : int len1,
439 : len2;
440 :
441 13344 : len1 = key->length;
442 13344 : len2 = VARSIZE_ANY_EXHDR(t->element);
443 :
444 : /* Compare lengths first, possibly avoiding a strncmp call */
445 13344 : if (len1 > len2)
446 1080 : return 1;
447 12264 : else if (len1 < len2)
448 0 : return -1;
449 :
450 : /* Fall back on byte-for-byte comparison */
451 12264 : return strncmp(key->lexeme, VARDATA_ANY(t->element), len1);
452 : }
|