Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * like_support.c
4 : * Planner support functions for LIKE, regex, and related operators.
5 : *
6 : * These routines handle special optimization of operators that can be
7 : * used with index scans even though they are not known to the executor's
8 : * indexscan machinery. The key idea is that these operators allow us
9 : * to derive approximate indexscan qual clauses, such that any tuples
10 : * that pass the operator clause itself must also satisfy the simpler
11 : * indexscan condition(s). Then we can use the indexscan machinery
12 : * to avoid scanning as much of the table as we'd otherwise have to,
13 : * while applying the original operator as a qpqual condition to ensure
14 : * we deliver only the tuples we want. (In essence, we're using a regular
15 : * index as if it were a lossy index.)
16 : *
17 : * An example of what we're doing is
18 : * textfield LIKE 'abc%def'
19 : * from which we can generate the indexscanable conditions
20 : * textfield >= 'abc' AND textfield < 'abd'
21 : * which allow efficient scanning of an index on textfield.
22 : * (In reality, character set and collation issues make the transformation
23 : * from LIKE to indexscan limits rather harder than one might think ...
24 : * but that's the basic idea.)
25 : *
26 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
27 : * Portions Copyright (c) 1994, Regents of the University of California
28 : *
29 : *
30 : * IDENTIFICATION
31 : * src/backend/utils/adt/like_support.c
32 : *
33 : *-------------------------------------------------------------------------
34 : */
35 : #include "postgres.h"
36 :
37 : #include <math.h>
38 :
39 : #include "access/htup_details.h"
40 : #include "catalog/pg_collation.h"
41 : #include "catalog/pg_operator.h"
42 : #include "catalog/pg_opfamily.h"
43 : #include "catalog/pg_statistic.h"
44 : #include "catalog/pg_type.h"
45 : #include "mb/pg_wchar.h"
46 : #include "miscadmin.h"
47 : #include "nodes/makefuncs.h"
48 : #include "nodes/nodeFuncs.h"
49 : #include "nodes/supportnodes.h"
50 : #include "utils/builtins.h"
51 : #include "utils/datum.h"
52 : #include "utils/lsyscache.h"
53 : #include "utils/pg_locale.h"
54 : #include "utils/selfuncs.h"
55 : #include "utils/varlena.h"
56 :
57 :
58 : typedef enum
59 : {
60 : Pattern_Type_Like,
61 : Pattern_Type_Like_IC,
62 : Pattern_Type_Regex,
63 : Pattern_Type_Regex_IC,
64 : Pattern_Type_Prefix,
65 : } Pattern_Type;
66 :
67 : typedef enum
68 : {
69 : Pattern_Prefix_None, Pattern_Prefix_Partial, Pattern_Prefix_Exact,
70 : } Pattern_Prefix_Status;
71 :
72 : static Node *like_regex_support(Node *rawreq, Pattern_Type ptype);
73 : static List *match_pattern_prefix(Node *leftop,
74 : Node *rightop,
75 : Pattern_Type ptype,
76 : Oid expr_coll,
77 : Oid opfamily,
78 : Oid indexcollation);
79 : static double patternsel_common(PlannerInfo *root,
80 : Oid oprid,
81 : Oid opfuncid,
82 : List *args,
83 : int varRelid,
84 : Oid collation,
85 : Pattern_Type ptype,
86 : bool negate);
87 : static Pattern_Prefix_Status pattern_fixed_prefix(Const *patt,
88 : Pattern_Type ptype,
89 : Oid collation,
90 : Const **prefix,
91 : Selectivity *rest_selec);
92 : static Selectivity prefix_selectivity(PlannerInfo *root,
93 : VariableStatData *vardata,
94 : Oid eqopr, Oid ltopr, Oid geopr,
95 : Oid collation,
96 : Const *prefixcon);
97 : static Selectivity like_selectivity(const char *patt, int pattlen,
98 : bool case_insensitive);
99 : static Selectivity regex_selectivity(const char *patt, int pattlen,
100 : bool case_insensitive,
101 : int fixed_prefix_len);
102 : static Const *make_greater_string(const Const *str_const, FmgrInfo *ltproc,
103 : Oid collation);
104 : static Datum string_to_datum(const char *str, Oid datatype);
105 : static Const *string_to_const(const char *str, Oid datatype);
106 : static Const *string_to_bytea_const(const char *str, size_t str_len);
107 :
108 :
109 : /*
110 : * Planner support functions for LIKE, regex, and related operators
111 : */
112 : Datum
113 5112 : textlike_support(PG_FUNCTION_ARGS)
114 : {
115 5112 : Node *rawreq = (Node *) PG_GETARG_POINTER(0);
116 :
117 5112 : PG_RETURN_POINTER(like_regex_support(rawreq, Pattern_Type_Like));
118 : }
119 :
120 : Datum
121 358 : texticlike_support(PG_FUNCTION_ARGS)
122 : {
123 358 : Node *rawreq = (Node *) PG_GETARG_POINTER(0);
124 :
125 358 : PG_RETURN_POINTER(like_regex_support(rawreq, Pattern_Type_Like_IC));
126 : }
127 :
128 : Datum
129 23016 : textregexeq_support(PG_FUNCTION_ARGS)
130 : {
131 23016 : Node *rawreq = (Node *) PG_GETARG_POINTER(0);
132 :
133 23016 : PG_RETURN_POINTER(like_regex_support(rawreq, Pattern_Type_Regex));
134 : }
135 :
136 : Datum
137 118 : texticregexeq_support(PG_FUNCTION_ARGS)
138 : {
139 118 : Node *rawreq = (Node *) PG_GETARG_POINTER(0);
140 :
141 118 : PG_RETURN_POINTER(like_regex_support(rawreq, Pattern_Type_Regex_IC));
142 : }
143 :
144 : Datum
145 156 : text_starts_with_support(PG_FUNCTION_ARGS)
146 : {
147 156 : Node *rawreq = (Node *) PG_GETARG_POINTER(0);
148 :
149 156 : PG_RETURN_POINTER(like_regex_support(rawreq, Pattern_Type_Prefix));
150 : }
151 :
152 : /* Common code for the above */
153 : static Node *
154 28760 : like_regex_support(Node *rawreq, Pattern_Type ptype)
155 : {
156 28760 : Node *ret = NULL;
157 :
158 28760 : if (IsA(rawreq, SupportRequestSelectivity))
159 : {
160 : /*
161 : * Make a selectivity estimate for a function call, just as we'd do if
162 : * the call was via the corresponding operator.
163 : */
164 24 : SupportRequestSelectivity *req = (SupportRequestSelectivity *) rawreq;
165 : Selectivity s1;
166 :
167 24 : if (req->is_join)
168 : {
169 : /*
170 : * For the moment we just punt. If patternjoinsel is ever
171 : * improved to do better, this should be made to call it.
172 : */
173 0 : s1 = DEFAULT_MATCH_SEL;
174 : }
175 : else
176 : {
177 : /* Share code with operator restriction selectivity functions */
178 24 : s1 = patternsel_common(req->root,
179 : InvalidOid,
180 : req->funcid,
181 : req->args,
182 : req->varRelid,
183 : req->inputcollid,
184 : ptype,
185 : false);
186 : }
187 24 : req->selectivity = s1;
188 24 : ret = (Node *) req;
189 : }
190 28736 : else if (IsA(rawreq, SupportRequestIndexCondition))
191 : {
192 : /* Try to convert operator/function call to index conditions */
193 8142 : SupportRequestIndexCondition *req = (SupportRequestIndexCondition *) rawreq;
194 :
195 : /*
196 : * Currently we have no "reverse" match operators with the pattern on
197 : * the left, so we only need consider cases with the indexkey on the
198 : * left.
199 : */
200 8142 : if (req->indexarg != 0)
201 0 : return NULL;
202 :
203 8142 : if (is_opclause(req->node))
204 : {
205 8118 : OpExpr *clause = (OpExpr *) req->node;
206 :
207 : Assert(list_length(clause->args) == 2);
208 : ret = (Node *)
209 8118 : match_pattern_prefix((Node *) linitial(clause->args),
210 8118 : (Node *) lsecond(clause->args),
211 : ptype,
212 : clause->inputcollid,
213 : req->opfamily,
214 : req->indexcollation);
215 : }
216 24 : else if (is_funcclause(req->node)) /* be paranoid */
217 : {
218 24 : FuncExpr *clause = (FuncExpr *) req->node;
219 :
220 : Assert(list_length(clause->args) == 2);
221 : ret = (Node *)
222 24 : match_pattern_prefix((Node *) linitial(clause->args),
223 24 : (Node *) lsecond(clause->args),
224 : ptype,
225 : clause->inputcollid,
226 : req->opfamily,
227 : req->indexcollation);
228 : }
229 : }
230 :
231 28760 : return ret;
232 : }
233 :
234 : /*
235 : * match_pattern_prefix
236 : * Try to generate an indexqual for a LIKE or regex operator.
237 : */
238 : static List *
239 8142 : match_pattern_prefix(Node *leftop,
240 : Node *rightop,
241 : Pattern_Type ptype,
242 : Oid expr_coll,
243 : Oid opfamily,
244 : Oid indexcollation)
245 : {
246 : List *result;
247 : Const *patt;
248 : Const *prefix;
249 : Pattern_Prefix_Status pstatus;
250 : Oid ldatatype;
251 : Oid rdatatype;
252 : Oid eqopr;
253 : Oid ltopr;
254 : Oid geopr;
255 8142 : Oid preopr = InvalidOid;
256 : bool collation_aware;
257 : Expr *expr;
258 : FmgrInfo ltproc;
259 : Const *greaterstr;
260 :
261 : /*
262 : * Can't do anything with a non-constant or NULL pattern argument.
263 : *
264 : * Note that since we restrict ourselves to cases with a hard constant on
265 : * the RHS, it's a-fortiori a pseudoconstant, and we don't need to worry
266 : * about verifying that.
267 : */
268 8142 : if (!IsA(rightop, Const) ||
269 8094 : ((Const *) rightop)->constisnull)
270 48 : return NIL;
271 8094 : patt = (Const *) rightop;
272 :
273 : /*
274 : * Try to extract a fixed prefix from the pattern.
275 : */
276 8094 : pstatus = pattern_fixed_prefix(patt, ptype, expr_coll,
277 : &prefix, NULL);
278 :
279 : /* fail if no fixed prefix */
280 8094 : if (pstatus == Pattern_Prefix_None)
281 300 : return NIL;
282 :
283 : /*
284 : * Identify the operators we want to use, based on the type of the
285 : * left-hand argument. Usually these are just the type's regular
286 : * comparison operators, but if we are considering one of the semi-legacy
287 : * "pattern" opclasses, use the "pattern" operators instead. Those are
288 : * not collation-sensitive but always use C collation, as we want. The
289 : * selected operators also determine the needed type of the prefix
290 : * constant.
291 : */
292 7794 : ldatatype = exprType(leftop);
293 7794 : switch (ldatatype)
294 : {
295 80 : case TEXTOID:
296 80 : if (opfamily == TEXT_PATTERN_BTREE_FAM_OID)
297 : {
298 0 : eqopr = TextEqualOperator;
299 0 : ltopr = TextPatternLessOperator;
300 0 : geopr = TextPatternGreaterEqualOperator;
301 0 : collation_aware = false;
302 : }
303 80 : else if (opfamily == TEXT_SPGIST_FAM_OID)
304 : {
305 24 : eqopr = TextEqualOperator;
306 24 : ltopr = TextPatternLessOperator;
307 24 : geopr = TextPatternGreaterEqualOperator;
308 : /* This opfamily has direct support for prefixing */
309 24 : preopr = TextPrefixOperator;
310 24 : collation_aware = false;
311 : }
312 : else
313 : {
314 56 : eqopr = TextEqualOperator;
315 56 : ltopr = TextLessOperator;
316 56 : geopr = TextGreaterEqualOperator;
317 56 : collation_aware = true;
318 : }
319 80 : rdatatype = TEXTOID;
320 80 : break;
321 7690 : case NAMEOID:
322 :
323 : /*
324 : * Note that here, we need the RHS type to be text, so that the
325 : * comparison value isn't improperly truncated to NAMEDATALEN.
326 : */
327 7690 : eqopr = NameEqualTextOperator;
328 7690 : ltopr = NameLessTextOperator;
329 7690 : geopr = NameGreaterEqualTextOperator;
330 7690 : collation_aware = true;
331 7690 : rdatatype = TEXTOID;
332 7690 : break;
333 24 : case BPCHAROID:
334 24 : if (opfamily == BPCHAR_PATTERN_BTREE_FAM_OID)
335 : {
336 0 : eqopr = BpcharEqualOperator;
337 0 : ltopr = BpcharPatternLessOperator;
338 0 : geopr = BpcharPatternGreaterEqualOperator;
339 0 : collation_aware = false;
340 : }
341 : else
342 : {
343 24 : eqopr = BpcharEqualOperator;
344 24 : ltopr = BpcharLessOperator;
345 24 : geopr = BpcharGreaterEqualOperator;
346 24 : collation_aware = true;
347 : }
348 24 : rdatatype = BPCHAROID;
349 24 : break;
350 0 : case BYTEAOID:
351 0 : eqopr = ByteaEqualOperator;
352 0 : ltopr = ByteaLessOperator;
353 0 : geopr = ByteaGreaterEqualOperator;
354 0 : collation_aware = false;
355 0 : rdatatype = BYTEAOID;
356 0 : break;
357 0 : default:
358 : /* Can't get here unless we're attached to the wrong operator */
359 0 : return NIL;
360 : }
361 :
362 : /*
363 : * If necessary, coerce the prefix constant to the right type. The given
364 : * prefix constant is either text or bytea type, therefore the only case
365 : * where we need to do anything is when converting text to bpchar. Those
366 : * two types are binary-compatible, so relabeling the Const node is
367 : * sufficient.
368 : */
369 7794 : if (prefix->consttype != rdatatype)
370 : {
371 : Assert(prefix->consttype == TEXTOID &&
372 : rdatatype == BPCHAROID);
373 24 : prefix->consttype = rdatatype;
374 : }
375 :
376 : /*
377 : * If we found an exact-match pattern, generate an "=" indexqual.
378 : *
379 : * Here and below, check to see whether the desired operator is actually
380 : * supported by the index opclass, and fail quietly if not. This allows
381 : * us to not be concerned with specific opclasses (except for the legacy
382 : * "pattern" cases); any index that correctly implements the operators
383 : * will work.
384 : */
385 7794 : if (pstatus == Pattern_Prefix_Exact)
386 : {
387 6458 : if (!op_in_opfamily(eqopr, opfamily))
388 12 : return NIL;
389 6446 : if (indexcollation != expr_coll)
390 6392 : return NIL;
391 54 : expr = make_opclause(eqopr, BOOLOID, false,
392 : (Expr *) leftop, (Expr *) prefix,
393 : InvalidOid, indexcollation);
394 54 : result = list_make1(expr);
395 54 : return result;
396 : }
397 :
398 : /*
399 : * Anything other than Pattern_Prefix_Exact is not supported if the
400 : * expression collation is nondeterministic. The optimized equality or
401 : * prefix tests use bytewise comparisons, which is not consistent with
402 : * nondeterministic collations.
403 : *
404 : * expr_coll is not set for a non-collation-aware data type such as bytea.
405 : */
406 1336 : if (expr_coll && !get_collation_isdeterministic(expr_coll))
407 6 : return NIL;
408 :
409 : /*
410 : * Otherwise, we have a nonempty required prefix of the values. Some
411 : * opclasses support prefix checks directly, otherwise we'll try to
412 : * generate a range constraint.
413 : */
414 1330 : if (OidIsValid(preopr) && op_in_opfamily(preopr, opfamily))
415 : {
416 24 : expr = make_opclause(preopr, BOOLOID, false,
417 : (Expr *) leftop, (Expr *) prefix,
418 : InvalidOid, indexcollation);
419 24 : result = list_make1(expr);
420 24 : return result;
421 : }
422 :
423 : /*
424 : * Since we need a range constraint, it's only going to work reliably if
425 : * the index is collation-insensitive or has "C" collation. Note that
426 : * here we are looking at the index's collation, not the expression's
427 : * collation -- this test is *not* dependent on the LIKE/regex operator's
428 : * collation.
429 : */
430 1306 : if (collation_aware &&
431 1306 : !pg_newlocale_from_collation(indexcollation)->collate_is_c)
432 14 : return NIL;
433 :
434 : /*
435 : * We can always say "x >= prefix".
436 : */
437 1292 : if (!op_in_opfamily(geopr, opfamily))
438 12 : return NIL;
439 1280 : expr = make_opclause(geopr, BOOLOID, false,
440 : (Expr *) leftop, (Expr *) prefix,
441 : InvalidOid, indexcollation);
442 1280 : result = list_make1(expr);
443 :
444 : /*-------
445 : * If we can create a string larger than the prefix, we can say
446 : * "x < greaterstr". NB: we rely on make_greater_string() to generate
447 : * a guaranteed-greater string, not just a probably-greater string.
448 : * In general this is only guaranteed in C locale, so we'd better be
449 : * using a C-locale index collation.
450 : *-------
451 : */
452 1280 : if (!op_in_opfamily(ltopr, opfamily))
453 0 : return result;
454 1280 : fmgr_info(get_opcode(ltopr), <proc);
455 1280 : greaterstr = make_greater_string(prefix, <proc, indexcollation);
456 1280 : if (greaterstr)
457 : {
458 1280 : expr = make_opclause(ltopr, BOOLOID, false,
459 : (Expr *) leftop, (Expr *) greaterstr,
460 : InvalidOid, indexcollation);
461 1280 : result = lappend(result, expr);
462 : }
463 :
464 1280 : return result;
465 : }
466 :
467 :
468 : /*
469 : * patternsel_common - generic code for pattern-match restriction selectivity.
470 : *
471 : * To support using this from either the operator or function paths, caller
472 : * may pass either operator OID or underlying function OID; we look up the
473 : * latter from the former if needed. (We could just have patternsel() call
474 : * get_opcode(), but the work would be wasted if we don't have a need to
475 : * compare a fixed prefix to the pg_statistic data.)
476 : *
477 : * Note that oprid and/or opfuncid should be for the positive-match operator
478 : * even when negate is true.
479 : */
480 : static double
481 12368 : patternsel_common(PlannerInfo *root,
482 : Oid oprid,
483 : Oid opfuncid,
484 : List *args,
485 : int varRelid,
486 : Oid collation,
487 : Pattern_Type ptype,
488 : bool negate)
489 : {
490 : VariableStatData vardata;
491 : Node *other;
492 : bool varonleft;
493 : Datum constval;
494 : Oid consttype;
495 : Oid vartype;
496 : Oid rdatatype;
497 : Oid eqopr;
498 : Oid ltopr;
499 : Oid geopr;
500 : Pattern_Prefix_Status pstatus;
501 : Const *patt;
502 12368 : Const *prefix = NULL;
503 12368 : Selectivity rest_selec = 0;
504 12368 : double nullfrac = 0.0;
505 : double result;
506 :
507 : /*
508 : * Initialize result to the appropriate default estimate depending on
509 : * whether it's a match or not-match operator.
510 : */
511 12368 : if (negate)
512 2846 : result = 1.0 - DEFAULT_MATCH_SEL;
513 : else
514 9522 : result = DEFAULT_MATCH_SEL;
515 :
516 : /*
517 : * If expression is not variable op constant, then punt and return the
518 : * default estimate.
519 : */
520 12368 : if (!get_restriction_variable(root, args, varRelid,
521 : &vardata, &other, &varonleft))
522 444 : return result;
523 11924 : if (!varonleft || !IsA(other, Const))
524 : {
525 50 : ReleaseVariableStats(vardata);
526 50 : return result;
527 : }
528 :
529 : /*
530 : * If the constant is NULL, assume operator is strict and return zero, ie,
531 : * operator will never return TRUE. (It's zero even for a negator op.)
532 : */
533 11874 : if (((Const *) other)->constisnull)
534 : {
535 0 : ReleaseVariableStats(vardata);
536 0 : return 0.0;
537 : }
538 11874 : constval = ((Const *) other)->constvalue;
539 11874 : consttype = ((Const *) other)->consttype;
540 :
541 : /*
542 : * The right-hand const is type text or bytea for all supported operators.
543 : * We do not expect to see binary-compatible types here, since
544 : * const-folding should have relabeled the const to exactly match the
545 : * operator's declared type.
546 : */
547 11874 : if (consttype != TEXTOID && consttype != BYTEAOID)
548 : {
549 24 : ReleaseVariableStats(vardata);
550 24 : return result;
551 : }
552 :
553 : /*
554 : * Similarly, the exposed type of the left-hand side should be one of
555 : * those we know. (Do not look at vardata.atttype, which might be
556 : * something binary-compatible but different.) We can use it to identify
557 : * the comparison operators and the required type of the comparison
558 : * constant, much as in match_pattern_prefix().
559 : */
560 11850 : vartype = vardata.vartype;
561 :
562 11850 : switch (vartype)
563 : {
564 1586 : case TEXTOID:
565 1586 : eqopr = TextEqualOperator;
566 1586 : ltopr = TextLessOperator;
567 1586 : geopr = TextGreaterEqualOperator;
568 1586 : rdatatype = TEXTOID;
569 1586 : break;
570 10170 : case NAMEOID:
571 :
572 : /*
573 : * Note that here, we need the RHS type to be text, so that the
574 : * comparison value isn't improperly truncated to NAMEDATALEN.
575 : */
576 10170 : eqopr = NameEqualTextOperator;
577 10170 : ltopr = NameLessTextOperator;
578 10170 : geopr = NameGreaterEqualTextOperator;
579 10170 : rdatatype = TEXTOID;
580 10170 : break;
581 84 : case BPCHAROID:
582 84 : eqopr = BpcharEqualOperator;
583 84 : ltopr = BpcharLessOperator;
584 84 : geopr = BpcharGreaterEqualOperator;
585 84 : rdatatype = BPCHAROID;
586 84 : break;
587 6 : case BYTEAOID:
588 6 : eqopr = ByteaEqualOperator;
589 6 : ltopr = ByteaLessOperator;
590 6 : geopr = ByteaGreaterEqualOperator;
591 6 : rdatatype = BYTEAOID;
592 6 : break;
593 4 : default:
594 : /* Can't get here unless we're attached to the wrong operator */
595 4 : ReleaseVariableStats(vardata);
596 4 : return result;
597 : }
598 :
599 : /*
600 : * Grab the nullfrac for use below.
601 : */
602 11846 : if (HeapTupleIsValid(vardata.statsTuple))
603 : {
604 : Form_pg_statistic stats;
605 :
606 9756 : stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
607 9756 : nullfrac = stats->stanullfrac;
608 : }
609 :
610 : /*
611 : * Pull out any fixed prefix implied by the pattern, and estimate the
612 : * fractional selectivity of the remainder of the pattern. Unlike many
613 : * other selectivity estimators, we use the pattern operator's actual
614 : * collation for this step. This is not because we expect the collation
615 : * to make a big difference in the selectivity estimate (it seldom would),
616 : * but because we want to be sure we cache compiled regexps under the
617 : * right cache key, so that they can be re-used at runtime.
618 : */
619 11846 : patt = (Const *) other;
620 11846 : pstatus = pattern_fixed_prefix(patt, ptype, collation,
621 : &prefix, &rest_selec);
622 :
623 : /*
624 : * If necessary, coerce the prefix constant to the right type. The only
625 : * case where we need to do anything is when converting text to bpchar.
626 : * Those two types are binary-compatible, so relabeling the Const node is
627 : * sufficient.
628 : */
629 11822 : if (prefix && prefix->consttype != rdatatype)
630 : {
631 : Assert(prefix->consttype == TEXTOID &&
632 : rdatatype == BPCHAROID);
633 36 : prefix->consttype = rdatatype;
634 : }
635 :
636 11822 : if (pstatus == Pattern_Prefix_Exact)
637 : {
638 : /*
639 : * Pattern specifies an exact match, so estimate as for '='
640 : */
641 6666 : result = var_eq_const(&vardata, eqopr, collation, prefix->constvalue,
642 : false, true, false);
643 : }
644 : else
645 : {
646 : /*
647 : * Not exact-match pattern. If we have a sufficiently large
648 : * histogram, estimate selectivity for the histogram part of the
649 : * population by counting matches in the histogram. If not, estimate
650 : * selectivity of the fixed prefix and remainder of pattern
651 : * separately, then combine the two to get an estimate of the
652 : * selectivity for the part of the column population represented by
653 : * the histogram. (For small histograms, we combine these
654 : * approaches.)
655 : *
656 : * We then add up data for any most-common-values values; these are
657 : * not in the histogram population, and we can get exact answers for
658 : * them by applying the pattern operator, so there's no reason to
659 : * approximate. (If the MCVs cover a significant part of the total
660 : * population, this gives us a big leg up in accuracy.)
661 : */
662 : Selectivity selec;
663 : int hist_size;
664 : FmgrInfo opproc;
665 : double mcv_selec,
666 : sumcommon;
667 :
668 : /* Try to use the histogram entries to get selectivity */
669 5156 : if (!OidIsValid(opfuncid))
670 5132 : opfuncid = get_opcode(oprid);
671 5156 : fmgr_info(opfuncid, &opproc);
672 :
673 5156 : selec = histogram_selectivity(&vardata, &opproc, collation,
674 : constval, true,
675 : 10, 1, &hist_size);
676 :
677 : /* If not at least 100 entries, use the heuristic method */
678 5156 : if (hist_size < 100)
679 : {
680 : Selectivity heursel;
681 : Selectivity prefixsel;
682 :
683 3778 : if (pstatus == Pattern_Prefix_Partial)
684 3006 : prefixsel = prefix_selectivity(root, &vardata,
685 : eqopr, ltopr, geopr,
686 : collation,
687 : prefix);
688 : else
689 772 : prefixsel = 1.0;
690 3778 : heursel = prefixsel * rest_selec;
691 :
692 3778 : if (selec < 0) /* fewer than 10 histogram entries? */
693 3360 : selec = heursel;
694 : else
695 : {
696 : /*
697 : * For histogram sizes from 10 to 100, we combine the
698 : * histogram and heuristic selectivities, putting increasingly
699 : * more trust in the histogram for larger sizes.
700 : */
701 418 : double hist_weight = hist_size / 100.0;
702 :
703 418 : selec = selec * hist_weight + heursel * (1.0 - hist_weight);
704 : }
705 : }
706 :
707 : /* In any case, don't believe extremely small or large estimates. */
708 5156 : if (selec < 0.0001)
709 1470 : selec = 0.0001;
710 3686 : else if (selec > 0.9999)
711 132 : selec = 0.9999;
712 :
713 : /*
714 : * If we have most-common-values info, add up the fractions of the MCV
715 : * entries that satisfy MCV OP PATTERN. These fractions contribute
716 : * directly to the result selectivity. Also add up the total fraction
717 : * represented by MCV entries.
718 : */
719 5156 : mcv_selec = mcv_selectivity(&vardata, &opproc, collation,
720 : constval, true,
721 : &sumcommon);
722 :
723 : /*
724 : * Now merge the results from the MCV and histogram calculations,
725 : * realizing that the histogram covers only the non-null values that
726 : * are not listed in MCV.
727 : */
728 5156 : selec *= 1.0 - nullfrac - sumcommon;
729 5156 : selec += mcv_selec;
730 5156 : result = selec;
731 : }
732 :
733 : /* now adjust if we wanted not-match rather than match */
734 11822 : if (negate)
735 2390 : result = 1.0 - result - nullfrac;
736 :
737 : /* result should be in range, but make sure... */
738 11822 : CLAMP_PROBABILITY(result);
739 :
740 11822 : if (prefix)
741 : {
742 11196 : pfree(DatumGetPointer(prefix->constvalue));
743 11196 : pfree(prefix);
744 : }
745 :
746 11822 : ReleaseVariableStats(vardata);
747 :
748 11822 : return result;
749 : }
750 :
751 : /*
752 : * Fix impedance mismatch between SQL-callable functions and patternsel_common
753 : */
754 : static double
755 12344 : patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype, bool negate)
756 : {
757 12344 : PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
758 12344 : Oid operator = PG_GETARG_OID(1);
759 12344 : List *args = (List *) PG_GETARG_POINTER(2);
760 12344 : int varRelid = PG_GETARG_INT32(3);
761 12344 : Oid collation = PG_GET_COLLATION();
762 :
763 : /*
764 : * If this is for a NOT LIKE or similar operator, get the corresponding
765 : * positive-match operator and work with that.
766 : */
767 12344 : if (negate)
768 : {
769 2846 : operator = get_negator(operator);
770 2846 : if (!OidIsValid(operator))
771 0 : elog(ERROR, "patternsel called for operator without a negator");
772 : }
773 :
774 12344 : return patternsel_common(root,
775 : operator,
776 : InvalidOid,
777 : args,
778 : varRelid,
779 : collation,
780 : ptype,
781 : negate);
782 : }
783 :
784 : /*
785 : * regexeqsel - Selectivity of regular-expression pattern match.
786 : */
787 : Datum
788 7680 : regexeqsel(PG_FUNCTION_ARGS)
789 : {
790 7680 : PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Regex, false));
791 : }
792 :
793 : /*
794 : * icregexeqsel - Selectivity of case-insensitive regex match.
795 : */
796 : Datum
797 64 : icregexeqsel(PG_FUNCTION_ARGS)
798 : {
799 64 : PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Regex_IC, false));
800 : }
801 :
802 : /*
803 : * likesel - Selectivity of LIKE pattern match.
804 : */
805 : Datum
806 1572 : likesel(PG_FUNCTION_ARGS)
807 : {
808 1572 : PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Like, false));
809 : }
810 :
811 : /*
812 : * prefixsel - selectivity of prefix operator
813 : */
814 : Datum
815 54 : prefixsel(PG_FUNCTION_ARGS)
816 : {
817 54 : PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Prefix, false));
818 : }
819 :
820 : /*
821 : *
822 : * iclikesel - Selectivity of ILIKE pattern match.
823 : */
824 : Datum
825 128 : iclikesel(PG_FUNCTION_ARGS)
826 : {
827 128 : PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Like_IC, false));
828 : }
829 :
830 : /*
831 : * regexnesel - Selectivity of regular-expression pattern non-match.
832 : */
833 : Datum
834 2686 : regexnesel(PG_FUNCTION_ARGS)
835 : {
836 2686 : PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Regex, true));
837 : }
838 :
839 : /*
840 : * icregexnesel - Selectivity of case-insensitive regex non-match.
841 : */
842 : Datum
843 16 : icregexnesel(PG_FUNCTION_ARGS)
844 : {
845 16 : PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Regex_IC, true));
846 : }
847 :
848 : /*
849 : * nlikesel - Selectivity of LIKE pattern non-match.
850 : */
851 : Datum
852 136 : nlikesel(PG_FUNCTION_ARGS)
853 : {
854 136 : PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Like, true));
855 : }
856 :
857 : /*
858 : * icnlikesel - Selectivity of ILIKE pattern non-match.
859 : */
860 : Datum
861 8 : icnlikesel(PG_FUNCTION_ARGS)
862 : {
863 8 : PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Like_IC, true));
864 : }
865 :
866 : /*
867 : * patternjoinsel - Generic code for pattern-match join selectivity.
868 : */
869 : static double
870 236 : patternjoinsel(PG_FUNCTION_ARGS, Pattern_Type ptype, bool negate)
871 : {
872 : /* For the moment we just punt. */
873 236 : return negate ? (1.0 - DEFAULT_MATCH_SEL) : DEFAULT_MATCH_SEL;
874 : }
875 :
876 : /*
877 : * regexeqjoinsel - Join selectivity of regular-expression pattern match.
878 : */
879 : Datum
880 236 : regexeqjoinsel(PG_FUNCTION_ARGS)
881 : {
882 236 : PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Regex, false));
883 : }
884 :
885 : /*
886 : * icregexeqjoinsel - Join selectivity of case-insensitive regex match.
887 : */
888 : Datum
889 0 : icregexeqjoinsel(PG_FUNCTION_ARGS)
890 : {
891 0 : PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Regex_IC, false));
892 : }
893 :
894 : /*
895 : * likejoinsel - Join selectivity of LIKE pattern match.
896 : */
897 : Datum
898 0 : likejoinsel(PG_FUNCTION_ARGS)
899 : {
900 0 : PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Like, false));
901 : }
902 :
903 : /*
904 : * prefixjoinsel - Join selectivity of prefix operator
905 : */
906 : Datum
907 0 : prefixjoinsel(PG_FUNCTION_ARGS)
908 : {
909 0 : PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Prefix, false));
910 : }
911 :
912 : /*
913 : * iclikejoinsel - Join selectivity of ILIKE pattern match.
914 : */
915 : Datum
916 0 : iclikejoinsel(PG_FUNCTION_ARGS)
917 : {
918 0 : PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Like_IC, false));
919 : }
920 :
921 : /*
922 : * regexnejoinsel - Join selectivity of regex non-match.
923 : */
924 : Datum
925 0 : regexnejoinsel(PG_FUNCTION_ARGS)
926 : {
927 0 : PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Regex, true));
928 : }
929 :
930 : /*
931 : * icregexnejoinsel - Join selectivity of case-insensitive regex non-match.
932 : */
933 : Datum
934 0 : icregexnejoinsel(PG_FUNCTION_ARGS)
935 : {
936 0 : PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Regex_IC, true));
937 : }
938 :
939 : /*
940 : * nlikejoinsel - Join selectivity of LIKE pattern non-match.
941 : */
942 : Datum
943 0 : nlikejoinsel(PG_FUNCTION_ARGS)
944 : {
945 0 : PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Like, true));
946 : }
947 :
948 : /*
949 : * icnlikejoinsel - Join selectivity of ILIKE pattern non-match.
950 : */
951 : Datum
952 0 : icnlikejoinsel(PG_FUNCTION_ARGS)
953 : {
954 0 : PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Like_IC, true));
955 : }
956 :
957 :
958 : /*-------------------------------------------------------------------------
959 : *
960 : * Pattern analysis functions
961 : *
962 : * These routines support analysis of LIKE and regular-expression patterns
963 : * by the planner/optimizer. It's important that they agree with the
964 : * regular-expression code in backend/regex/ and the LIKE code in
965 : * backend/utils/adt/like.c. Also, the computation of the fixed prefix
966 : * must be conservative: if we report a string longer than the true fixed
967 : * prefix, the query may produce actually wrong answers, rather than just
968 : * getting a bad selectivity estimate!
969 : *
970 : *-------------------------------------------------------------------------
971 : */
972 :
973 : /*
974 : * Extract the fixed prefix, if any, for a pattern.
975 : *
976 : * *prefix is set to a palloc'd prefix string (in the form of a Const node),
977 : * or to NULL if no fixed prefix exists for the pattern.
978 : * If rest_selec is not NULL, *rest_selec is set to an estimate of the
979 : * selectivity of the remainder of the pattern (without any fixed prefix).
980 : * The prefix Const has the same type (TEXT or BYTEA) as the input pattern.
981 : *
982 : * The return value distinguishes no fixed prefix, a partial prefix,
983 : * or an exact-match-only pattern.
984 : */
985 :
986 : static Pattern_Prefix_Status
987 2736 : like_fixed_prefix(Const *patt_const, Const **prefix_const,
988 : Selectivity *rest_selec)
989 : {
990 : char *match;
991 : char *patt;
992 : int pattlen;
993 2736 : Oid typeid = patt_const->consttype;
994 : int pos,
995 : match_pos;
996 :
997 : /* the right-hand const is type text or bytea */
998 : Assert(typeid == BYTEAOID || typeid == TEXTOID);
999 :
1000 2736 : if (typeid != BYTEAOID)
1001 : {
1002 2724 : patt = TextDatumGetCString(patt_const->constvalue);
1003 2724 : pattlen = strlen(patt);
1004 : }
1005 : else
1006 : {
1007 12 : bytea *bstr = DatumGetByteaPP(patt_const->constvalue);
1008 :
1009 12 : pattlen = VARSIZE_ANY_EXHDR(bstr);
1010 12 : patt = (char *) palloc(pattlen);
1011 12 : memcpy(patt, VARDATA_ANY(bstr), pattlen);
1012 : Assert(bstr == DatumGetPointer(patt_const->constvalue));
1013 : }
1014 :
1015 2736 : match = palloc(pattlen + 1);
1016 2736 : match_pos = 0;
1017 15628 : for (pos = 0; pos < pattlen; pos++)
1018 : {
1019 : /* % and _ are wildcard characters in LIKE */
1020 15522 : if (patt[pos] == '%' ||
1021 13978 : patt[pos] == '_')
1022 : break;
1023 :
1024 : /* Backslash escapes the next character */
1025 12892 : if (patt[pos] == '\\')
1026 : {
1027 274 : pos++;
1028 274 : if (pos >= pattlen)
1029 0 : break;
1030 : }
1031 :
1032 12892 : match[match_pos++] = patt[pos];
1033 : }
1034 :
1035 2736 : match[match_pos] = '\0';
1036 :
1037 2736 : if (typeid != BYTEAOID)
1038 2724 : *prefix_const = string_to_const(match, typeid);
1039 : else
1040 12 : *prefix_const = string_to_bytea_const(match, match_pos);
1041 :
1042 2736 : if (rest_selec != NULL)
1043 1708 : *rest_selec = like_selectivity(&patt[pos], pattlen - pos, false);
1044 :
1045 2736 : pfree(patt);
1046 2736 : pfree(match);
1047 :
1048 : /* in LIKE, an empty pattern is an exact match! */
1049 2736 : if (pos == pattlen)
1050 106 : return Pattern_Prefix_Exact; /* reached end of pattern, so exact */
1051 :
1052 2630 : if (match_pos > 0)
1053 2300 : return Pattern_Prefix_Partial;
1054 :
1055 330 : return Pattern_Prefix_None;
1056 : }
1057 :
1058 : /*
1059 : * Case-insensitive variant of like_fixed_prefix(). Multibyte and
1060 : * locale-aware for detecting cased characters.
1061 : */
1062 : static Pattern_Prefix_Status
1063 238 : like_fixed_prefix_ci(Const *patt_const, Oid collation, Const **prefix_const,
1064 : Selectivity *rest_selec)
1065 : {
1066 238 : text *val = DatumGetTextPP(patt_const->constvalue);
1067 238 : Oid typeid = patt_const->consttype;
1068 238 : int nbytes = VARSIZE_ANY_EXHDR(val);
1069 : int wpos;
1070 : pg_wchar *wpatt;
1071 : int wpattlen;
1072 : pg_wchar *wmatch;
1073 238 : int wmatch_pos = 0;
1074 : char *match;
1075 : int match_mblen;
1076 238 : pg_locale_t locale = 0;
1077 :
1078 : /* the right-hand const is type text or bytea */
1079 : Assert(typeid == BYTEAOID || typeid == TEXTOID);
1080 :
1081 238 : if (typeid == BYTEAOID)
1082 0 : ereport(ERROR,
1083 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1084 : errmsg("case insensitive matching not supported on type bytea")));
1085 :
1086 238 : if (!OidIsValid(collation))
1087 : {
1088 : /*
1089 : * This typically means that the parser could not resolve a conflict
1090 : * of implicit collations, so report it that way.
1091 : */
1092 0 : ereport(ERROR,
1093 : (errcode(ERRCODE_INDETERMINATE_COLLATION),
1094 : errmsg("could not determine which collation to use for ILIKE"),
1095 : errhint("Use the COLLATE clause to set the collation explicitly.")));
1096 : }
1097 :
1098 238 : locale = pg_newlocale_from_collation(collation);
1099 :
1100 238 : wpatt = palloc((nbytes + 1) * sizeof(pg_wchar));
1101 238 : wpattlen = pg_mb2wchar_with_len(VARDATA_ANY(val), wpatt, nbytes);
1102 :
1103 238 : wmatch = palloc((nbytes + 1) * sizeof(pg_wchar));
1104 334 : for (wpos = 0; wpos < wpattlen; wpos++)
1105 : {
1106 : /* % and _ are wildcard characters in LIKE */
1107 334 : if (wpatt[wpos] == '%' ||
1108 270 : wpatt[wpos] == '_')
1109 : break;
1110 :
1111 : /* Backslash escapes the next character */
1112 270 : if (wpatt[wpos] == '\\')
1113 : {
1114 0 : wpos++;
1115 0 : if (wpos >= wpattlen)
1116 0 : break;
1117 : }
1118 :
1119 : /*
1120 : * For ILIKE, stop if it's a case-varying character (it's sort of a
1121 : * wildcard).
1122 : */
1123 270 : if (pg_iswcased(wpatt[wpos], locale))
1124 174 : break;
1125 :
1126 96 : wmatch[wmatch_pos++] = wpatt[wpos];
1127 : }
1128 :
1129 238 : wmatch[wmatch_pos] = '\0';
1130 :
1131 238 : match = palloc(pg_database_encoding_max_length() * wmatch_pos + 1);
1132 238 : match_mblen = pg_wchar2mb_with_len(wmatch, match, wmatch_pos);
1133 238 : match[match_mblen] = '\0';
1134 238 : pfree(wmatch);
1135 :
1136 238 : *prefix_const = string_to_const(match, TEXTOID);
1137 238 : pfree(match);
1138 :
1139 238 : if (rest_selec != NULL)
1140 : {
1141 120 : int wrestlen = wpattlen - wmatch_pos;
1142 : char *rest;
1143 : int rest_mblen;
1144 :
1145 120 : rest = palloc(pg_database_encoding_max_length() * wrestlen + 1);
1146 120 : rest_mblen = pg_wchar2mb_with_len(&wpatt[wmatch_pos], rest, wrestlen);
1147 :
1148 120 : *rest_selec = like_selectivity(rest, rest_mblen, true);
1149 120 : pfree(rest);
1150 : }
1151 :
1152 238 : pfree(wpatt);
1153 :
1154 : /* in LIKE, an empty pattern is an exact match! */
1155 238 : if (wpos == wpattlen)
1156 0 : return Pattern_Prefix_Exact; /* reached end of pattern, so exact */
1157 :
1158 238 : if (wmatch_pos > 0)
1159 48 : return Pattern_Prefix_Partial;
1160 :
1161 190 : return Pattern_Prefix_None;
1162 : }
1163 :
1164 : static Pattern_Prefix_Status
1165 16864 : regex_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation,
1166 : Const **prefix_const, Selectivity *rest_selec)
1167 : {
1168 16864 : Oid typeid = patt_const->consttype;
1169 : char *prefix;
1170 : bool exact;
1171 :
1172 : /*
1173 : * Should be unnecessary, there are no bytea regex operators defined. As
1174 : * such, it should be noted that the rest of this function has *not* been
1175 : * made safe for binary (possibly NULL containing) strings.
1176 : */
1177 16864 : if (typeid == BYTEAOID)
1178 0 : ereport(ERROR,
1179 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1180 : errmsg("regular-expression matching not supported on type bytea")));
1181 :
1182 : /* Use the regexp machinery to extract the prefix, if any */
1183 16864 : prefix = regexp_fixed_prefix(DatumGetTextPP(patt_const->constvalue),
1184 : case_insensitive, collation,
1185 : &exact);
1186 :
1187 16840 : if (prefix == NULL)
1188 : {
1189 762 : *prefix_const = NULL;
1190 :
1191 762 : if (rest_selec != NULL)
1192 : {
1193 626 : char *patt = TextDatumGetCString(patt_const->constvalue);
1194 :
1195 626 : *rest_selec = regex_selectivity(patt, strlen(patt),
1196 : case_insensitive,
1197 : 0);
1198 626 : pfree(patt);
1199 : }
1200 :
1201 762 : return Pattern_Prefix_None;
1202 : }
1203 :
1204 16078 : *prefix_const = string_to_const(prefix, typeid);
1205 :
1206 16078 : if (rest_selec != NULL)
1207 : {
1208 9290 : if (exact)
1209 : {
1210 : /* Exact match, so there's no additional selectivity */
1211 6602 : *rest_selec = 1.0;
1212 : }
1213 : else
1214 : {
1215 2688 : char *patt = TextDatumGetCString(patt_const->constvalue);
1216 :
1217 5376 : *rest_selec = regex_selectivity(patt, strlen(patt),
1218 : case_insensitive,
1219 2688 : strlen(prefix));
1220 2688 : pfree(patt);
1221 : }
1222 : }
1223 :
1224 16078 : pfree(prefix);
1225 :
1226 16078 : if (exact)
1227 13018 : return Pattern_Prefix_Exact; /* pattern specifies exact match */
1228 : else
1229 3060 : return Pattern_Prefix_Partial;
1230 : }
1231 :
1232 : static Pattern_Prefix_Status
1233 19940 : pattern_fixed_prefix(Const *patt, Pattern_Type ptype, Oid collation,
1234 : Const **prefix, Selectivity *rest_selec)
1235 : {
1236 : Pattern_Prefix_Status result;
1237 :
1238 19940 : switch (ptype)
1239 : {
1240 2736 : case Pattern_Type_Like:
1241 2736 : result = like_fixed_prefix(patt, prefix, rest_selec);
1242 2736 : break;
1243 238 : case Pattern_Type_Like_IC:
1244 238 : result = like_fixed_prefix_ci(patt, collation, prefix,
1245 : rest_selec);
1246 238 : break;
1247 16802 : case Pattern_Type_Regex:
1248 16802 : result = regex_fixed_prefix(patt, false, collation,
1249 : prefix, rest_selec);
1250 16778 : break;
1251 62 : case Pattern_Type_Regex_IC:
1252 62 : result = regex_fixed_prefix(patt, true, collation,
1253 : prefix, rest_selec);
1254 62 : break;
1255 102 : case Pattern_Type_Prefix:
1256 : /* Prefix type work is trivial. */
1257 102 : result = Pattern_Prefix_Partial;
1258 102 : *prefix = makeConst(patt->consttype,
1259 : patt->consttypmod,
1260 : patt->constcollid,
1261 : patt->constlen,
1262 : datumCopy(patt->constvalue,
1263 102 : patt->constbyval,
1264 : patt->constlen),
1265 102 : patt->constisnull,
1266 102 : patt->constbyval);
1267 102 : if (rest_selec != NULL)
1268 78 : *rest_selec = 1.0; /* all */
1269 102 : break;
1270 0 : default:
1271 0 : elog(ERROR, "unrecognized ptype: %d", (int) ptype);
1272 : result = Pattern_Prefix_None; /* keep compiler quiet */
1273 : break;
1274 : }
1275 19916 : return result;
1276 : }
1277 :
1278 : /*
1279 : * Estimate the selectivity of a fixed prefix for a pattern match.
1280 : *
1281 : * A fixed prefix "foo" is estimated as the selectivity of the expression
1282 : * "variable >= 'foo' AND variable < 'fop'".
1283 : *
1284 : * The selectivity estimate is with respect to the portion of the column
1285 : * population represented by the histogram --- the caller must fold this
1286 : * together with info about MCVs and NULLs.
1287 : *
1288 : * We use the given comparison operators and collation to do the estimation.
1289 : * The given variable and Const must be of the associated datatype(s).
1290 : *
1291 : * XXX Note: we make use of the upper bound to estimate operator selectivity
1292 : * even if the locale is such that we cannot rely on the upper-bound string.
1293 : * The selectivity only needs to be approximately right anyway, so it seems
1294 : * more useful to use the upper-bound code than not.
1295 : */
1296 : static Selectivity
1297 3006 : prefix_selectivity(PlannerInfo *root, VariableStatData *vardata,
1298 : Oid eqopr, Oid ltopr, Oid geopr,
1299 : Oid collation,
1300 : Const *prefixcon)
1301 : {
1302 : Selectivity prefixsel;
1303 : FmgrInfo opproc;
1304 : Const *greaterstrcon;
1305 : Selectivity eq_sel;
1306 :
1307 : /* Estimate the selectivity of "x >= prefix" */
1308 3006 : fmgr_info(get_opcode(geopr), &opproc);
1309 :
1310 3006 : prefixsel = ineq_histogram_selectivity(root, vardata,
1311 : geopr, &opproc, true, true,
1312 : collation,
1313 : prefixcon->constvalue,
1314 : prefixcon->consttype);
1315 :
1316 3006 : if (prefixsel < 0.0)
1317 : {
1318 : /* No histogram is present ... return a suitable default estimate */
1319 702 : return DEFAULT_MATCH_SEL;
1320 : }
1321 :
1322 : /*
1323 : * If we can create a string larger than the prefix, say "x < greaterstr".
1324 : */
1325 2304 : fmgr_info(get_opcode(ltopr), &opproc);
1326 2304 : greaterstrcon = make_greater_string(prefixcon, &opproc, collation);
1327 2304 : if (greaterstrcon)
1328 : {
1329 : Selectivity topsel;
1330 :
1331 2304 : topsel = ineq_histogram_selectivity(root, vardata,
1332 : ltopr, &opproc, false, false,
1333 : collation,
1334 : greaterstrcon->constvalue,
1335 : greaterstrcon->consttype);
1336 :
1337 : /* ineq_histogram_selectivity worked before, it shouldn't fail now */
1338 : Assert(topsel >= 0.0);
1339 :
1340 : /*
1341 : * Merge the two selectivities in the same way as for a range query
1342 : * (see clauselist_selectivity()). Note that we don't need to worry
1343 : * about double-exclusion of nulls, since ineq_histogram_selectivity
1344 : * doesn't count those anyway.
1345 : */
1346 2304 : prefixsel = topsel + prefixsel - 1.0;
1347 : }
1348 :
1349 : /*
1350 : * If the prefix is long then the two bounding values might be too close
1351 : * together for the histogram to distinguish them usefully, resulting in a
1352 : * zero estimate (plus or minus roundoff error). To avoid returning a
1353 : * ridiculously small estimate, compute the estimated selectivity for
1354 : * "variable = 'foo'", and clamp to that. (Obviously, the resultant
1355 : * estimate should be at least that.)
1356 : *
1357 : * We apply this even if we couldn't make a greater string. That case
1358 : * suggests that the prefix is near the maximum possible, and thus
1359 : * probably off the end of the histogram, and thus we probably got a very
1360 : * small estimate from the >= condition; so we still need to clamp.
1361 : */
1362 2304 : eq_sel = var_eq_const(vardata, eqopr, collation, prefixcon->constvalue,
1363 : false, true, false);
1364 :
1365 2304 : prefixsel = Max(prefixsel, eq_sel);
1366 :
1367 2304 : return prefixsel;
1368 : }
1369 :
1370 :
1371 : /*
1372 : * Estimate the selectivity of a pattern of the specified type.
1373 : * Note that any fixed prefix of the pattern will have been removed already,
1374 : * so actually we may be looking at just a fragment of the pattern.
1375 : *
1376 : * For now, we use a very simplistic approach: fixed characters reduce the
1377 : * selectivity a good deal, character ranges reduce it a little,
1378 : * wildcards (such as % for LIKE or .* for regex) increase it.
1379 : */
1380 :
1381 : #define FIXED_CHAR_SEL 0.20 /* about 1/5 */
1382 : #define CHAR_RANGE_SEL 0.25
1383 : #define ANY_CHAR_SEL 0.9 /* not 1, since it won't match end-of-string */
1384 : #define FULL_WILDCARD_SEL 5.0
1385 : #define PARTIAL_WILDCARD_SEL 2.0
1386 :
1387 : static Selectivity
1388 1828 : like_selectivity(const char *patt, int pattlen, bool case_insensitive)
1389 : {
1390 1828 : Selectivity sel = 1.0;
1391 : int pos;
1392 :
1393 : /* Skip any leading wildcard; it's already factored into initial sel */
1394 3530 : for (pos = 0; pos < pattlen; pos++)
1395 : {
1396 2588 : if (patt[pos] != '%' && patt[pos] != '_')
1397 886 : break;
1398 : }
1399 :
1400 7294 : for (; pos < pattlen; pos++)
1401 : {
1402 : /* % and _ are wildcard characters in LIKE */
1403 5466 : if (patt[pos] == '%')
1404 782 : sel *= FULL_WILDCARD_SEL;
1405 4684 : else if (patt[pos] == '_')
1406 192 : sel *= ANY_CHAR_SEL;
1407 4492 : else if (patt[pos] == '\\')
1408 : {
1409 : /* Backslash quotes the next character */
1410 40 : pos++;
1411 40 : if (pos >= pattlen)
1412 0 : break;
1413 40 : sel *= FIXED_CHAR_SEL;
1414 : }
1415 : else
1416 4452 : sel *= FIXED_CHAR_SEL;
1417 : }
1418 : /* Could get sel > 1 if multiple wildcards */
1419 1828 : if (sel > 1.0)
1420 0 : sel = 1.0;
1421 1828 : return sel;
1422 : }
1423 :
1424 : static Selectivity
1425 3746 : regex_selectivity_sub(const char *patt, int pattlen, bool case_insensitive)
1426 : {
1427 3746 : Selectivity sel = 1.0;
1428 3746 : int paren_depth = 0;
1429 3746 : int paren_pos = 0; /* dummy init to keep compiler quiet */
1430 : int pos;
1431 :
1432 : /* since this function recurses, it could be driven to stack overflow */
1433 3746 : check_stack_depth();
1434 :
1435 40706 : for (pos = 0; pos < pattlen; pos++)
1436 : {
1437 36984 : if (patt[pos] == '(')
1438 : {
1439 438 : if (paren_depth == 0)
1440 414 : paren_pos = pos; /* remember start of parenthesized item */
1441 438 : paren_depth++;
1442 : }
1443 36546 : else if (patt[pos] == ')' && paren_depth > 0)
1444 : {
1445 432 : paren_depth--;
1446 432 : if (paren_depth == 0)
1447 408 : sel *= regex_selectivity_sub(patt + (paren_pos + 1),
1448 408 : pos - (paren_pos + 1),
1449 : case_insensitive);
1450 : }
1451 36114 : else if (patt[pos] == '|' && paren_depth == 0)
1452 : {
1453 : /*
1454 : * If unquoted | is present at paren level 0 in pattern, we have
1455 : * multiple alternatives; sum their probabilities.
1456 : */
1457 48 : sel += regex_selectivity_sub(patt + (pos + 1),
1458 24 : pattlen - (pos + 1),
1459 : case_insensitive);
1460 24 : break; /* rest of pattern is now processed */
1461 : }
1462 36090 : else if (patt[pos] == '[')
1463 : {
1464 252 : bool negclass = false;
1465 :
1466 252 : if (patt[++pos] == '^')
1467 : {
1468 60 : negclass = true;
1469 60 : pos++;
1470 : }
1471 252 : if (patt[pos] == ']') /* ']' at start of class is not special */
1472 24 : pos++;
1473 1316 : while (pos < pattlen && patt[pos] != ']')
1474 1064 : pos++;
1475 252 : if (paren_depth == 0)
1476 162 : sel *= (negclass ? (1.0 - CHAR_RANGE_SEL) : CHAR_RANGE_SEL);
1477 : }
1478 35838 : else if (patt[pos] == '.')
1479 : {
1480 926 : if (paren_depth == 0)
1481 526 : sel *= ANY_CHAR_SEL;
1482 : }
1483 34912 : else if (patt[pos] == '*' ||
1484 34100 : patt[pos] == '?' ||
1485 33934 : patt[pos] == '+')
1486 : {
1487 : /* Ought to be smarter about quantifiers... */
1488 992 : if (paren_depth == 0)
1489 534 : sel *= PARTIAL_WILDCARD_SEL;
1490 : }
1491 33920 : else if (patt[pos] == '{')
1492 : {
1493 264 : while (pos < pattlen && patt[pos] != '}')
1494 188 : pos++;
1495 76 : if (paren_depth == 0)
1496 64 : sel *= PARTIAL_WILDCARD_SEL;
1497 : }
1498 33844 : else if (patt[pos] == '\\')
1499 : {
1500 : /* backslash quotes the next character */
1501 296 : pos++;
1502 296 : if (pos >= pattlen)
1503 0 : break;
1504 296 : if (paren_depth == 0)
1505 152 : sel *= FIXED_CHAR_SEL;
1506 : }
1507 : else
1508 : {
1509 33548 : if (paren_depth == 0)
1510 30772 : sel *= FIXED_CHAR_SEL;
1511 : }
1512 : }
1513 : /* Could get sel > 1 if multiple wildcards */
1514 3746 : if (sel > 1.0)
1515 26 : sel = 1.0;
1516 3746 : return sel;
1517 : }
1518 :
1519 : static Selectivity
1520 3314 : regex_selectivity(const char *patt, int pattlen, bool case_insensitive,
1521 : int fixed_prefix_len)
1522 : {
1523 : Selectivity sel;
1524 :
1525 : /* If patt doesn't end with $, consider it to have a trailing wildcard */
1526 3314 : if (pattlen > 0 && patt[pattlen - 1] == '$' &&
1527 414 : (pattlen == 1 || patt[pattlen - 2] != '\\'))
1528 : {
1529 : /* has trailing $ */
1530 414 : sel = regex_selectivity_sub(patt, pattlen - 1, case_insensitive);
1531 : }
1532 : else
1533 : {
1534 : /* no trailing $ */
1535 2900 : sel = regex_selectivity_sub(patt, pattlen, case_insensitive);
1536 2900 : sel *= FULL_WILDCARD_SEL;
1537 : }
1538 :
1539 : /*
1540 : * If there's a fixed prefix, discount its selectivity. We have to be
1541 : * careful here since a very long prefix could result in pow's result
1542 : * underflowing to zero (in which case "sel" probably has as well).
1543 : */
1544 3314 : if (fixed_prefix_len > 0)
1545 : {
1546 2688 : double prefixsel = pow(FIXED_CHAR_SEL, fixed_prefix_len);
1547 :
1548 2688 : if (prefixsel > 0.0)
1549 2688 : sel /= prefixsel;
1550 : }
1551 :
1552 : /* Make sure result stays in range */
1553 3314 : CLAMP_PROBABILITY(sel);
1554 3314 : return sel;
1555 : }
1556 :
1557 :
1558 : /*
1559 : * For bytea, the increment function need only increment the current byte
1560 : * (there are no multibyte characters to worry about).
1561 : */
1562 : static bool
1563 0 : byte_increment(unsigned char *ptr, int len)
1564 : {
1565 0 : if (*ptr >= 255)
1566 0 : return false;
1567 0 : (*ptr)++;
1568 0 : return true;
1569 : }
1570 :
1571 : /*
1572 : * Try to generate a string greater than the given string or any
1573 : * string it is a prefix of. If successful, return a palloc'd string
1574 : * in the form of a Const node; else return NULL.
1575 : *
1576 : * The caller must provide the appropriate "less than" comparison function
1577 : * for testing the strings, along with the collation to use.
1578 : *
1579 : * The key requirement here is that given a prefix string, say "foo",
1580 : * we must be able to generate another string "fop" that is greater than
1581 : * all strings "foobar" starting with "foo". We can test that we have
1582 : * generated a string greater than the prefix string, but in non-C collations
1583 : * that is not a bulletproof guarantee that an extension of the string might
1584 : * not sort after it; an example is that "foo " is less than "foo!", but it
1585 : * is not clear that a "dictionary" sort ordering will consider "foo!" less
1586 : * than "foo bar". CAUTION: Therefore, this function should be used only for
1587 : * estimation purposes when working in a non-C collation.
1588 : *
1589 : * To try to catch most cases where an extended string might otherwise sort
1590 : * before the result value, we determine which of the strings "Z", "z", "y",
1591 : * and "9" is seen as largest by the collation, and append that to the given
1592 : * prefix before trying to find a string that compares as larger.
1593 : *
1594 : * To search for a greater string, we repeatedly "increment" the rightmost
1595 : * character, using an encoding-specific character incrementer function.
1596 : * When it's no longer possible to increment the last character, we truncate
1597 : * off that character and start incrementing the next-to-rightmost.
1598 : * For example, if "z" were the last character in the sort order, then we
1599 : * could produce "foo" as a string greater than "fonz".
1600 : *
1601 : * This could be rather slow in the worst case, but in most cases we
1602 : * won't have to try more than one or two strings before succeeding.
1603 : *
1604 : * Note that it's important for the character incrementer not to be too anal
1605 : * about producing every possible character code, since in some cases the only
1606 : * way to get a larger string is to increment a previous character position.
1607 : * So we don't want to spend too much time trying every possible character
1608 : * code at the last position. A good rule of thumb is to be sure that we
1609 : * don't try more than 256*K values for a K-byte character (and definitely
1610 : * not 256^K, which is what an exhaustive search would approach).
1611 : */
1612 : static Const *
1613 3584 : make_greater_string(const Const *str_const, FmgrInfo *ltproc, Oid collation)
1614 : {
1615 3584 : Oid datatype = str_const->consttype;
1616 : char *workstr;
1617 : int len;
1618 : Datum cmpstr;
1619 3584 : char *cmptxt = NULL;
1620 : mbcharacter_incrementer charinc;
1621 :
1622 : /*
1623 : * Get a modifiable copy of the prefix string in C-string format, and set
1624 : * up the string we will compare to as a Datum. In C locale this can just
1625 : * be the given prefix string, otherwise we need to add a suffix. Type
1626 : * BYTEA sorts bytewise so it never needs a suffix either.
1627 : */
1628 3584 : if (datatype == BYTEAOID)
1629 : {
1630 0 : bytea *bstr = DatumGetByteaPP(str_const->constvalue);
1631 :
1632 0 : len = VARSIZE_ANY_EXHDR(bstr);
1633 0 : workstr = (char *) palloc(len);
1634 0 : memcpy(workstr, VARDATA_ANY(bstr), len);
1635 : Assert(bstr == DatumGetPointer(str_const->constvalue));
1636 0 : cmpstr = str_const->constvalue;
1637 : }
1638 : else
1639 : {
1640 3584 : if (datatype == NAMEOID)
1641 0 : workstr = DatumGetCString(DirectFunctionCall1(nameout,
1642 : str_const->constvalue));
1643 : else
1644 3584 : workstr = TextDatumGetCString(str_const->constvalue);
1645 3584 : len = strlen(workstr);
1646 3584 : if (len == 0 || pg_newlocale_from_collation(collation)->collate_is_c)
1647 3558 : cmpstr = str_const->constvalue;
1648 : else
1649 : {
1650 : /* If first time through, determine the suffix to use */
1651 : static char suffixchar = 0;
1652 : static Oid suffixcollation = 0;
1653 :
1654 26 : if (!suffixchar || suffixcollation != collation)
1655 : {
1656 : char *best;
1657 :
1658 6 : best = "Z";
1659 6 : if (varstr_cmp(best, 1, "z", 1, collation) < 0)
1660 0 : best = "z";
1661 6 : if (varstr_cmp(best, 1, "y", 1, collation) < 0)
1662 0 : best = "y";
1663 6 : if (varstr_cmp(best, 1, "9", 1, collation) < 0)
1664 0 : best = "9";
1665 6 : suffixchar = *best;
1666 6 : suffixcollation = collation;
1667 : }
1668 :
1669 : /* And build the string to compare to */
1670 26 : if (datatype == NAMEOID)
1671 : {
1672 0 : cmptxt = palloc(len + 2);
1673 0 : memcpy(cmptxt, workstr, len);
1674 0 : cmptxt[len] = suffixchar;
1675 0 : cmptxt[len + 1] = '\0';
1676 0 : cmpstr = PointerGetDatum(cmptxt);
1677 : }
1678 : else
1679 : {
1680 26 : cmptxt = palloc(VARHDRSZ + len + 1);
1681 26 : SET_VARSIZE(cmptxt, VARHDRSZ + len + 1);
1682 26 : memcpy(VARDATA(cmptxt), workstr, len);
1683 26 : *(VARDATA(cmptxt) + len) = suffixchar;
1684 26 : cmpstr = PointerGetDatum(cmptxt);
1685 : }
1686 : }
1687 : }
1688 :
1689 : /* Select appropriate character-incrementer function */
1690 3584 : if (datatype == BYTEAOID)
1691 0 : charinc = byte_increment;
1692 : else
1693 3584 : charinc = pg_database_encoding_character_incrementer();
1694 :
1695 : /* And search ... */
1696 3584 : while (len > 0)
1697 : {
1698 : int charlen;
1699 : unsigned char *lastchar;
1700 :
1701 : /* Identify the last character --- for bytea, just the last byte */
1702 3584 : if (datatype == BYTEAOID)
1703 0 : charlen = 1;
1704 : else
1705 3584 : charlen = len - pg_mbcliplen(workstr, len, len - 1);
1706 3584 : lastchar = (unsigned char *) (workstr + len - charlen);
1707 :
1708 : /*
1709 : * Try to generate a larger string by incrementing the last character
1710 : * (for BYTEA, we treat each byte as a character).
1711 : *
1712 : * Note: the incrementer function is expected to return true if it's
1713 : * generated a valid-per-the-encoding new character, otherwise false.
1714 : * The contents of the character on false return are unspecified.
1715 : */
1716 3584 : while (charinc(lastchar, charlen))
1717 : {
1718 : Const *workstr_const;
1719 :
1720 3584 : if (datatype == BYTEAOID)
1721 0 : workstr_const = string_to_bytea_const(workstr, len);
1722 : else
1723 3584 : workstr_const = string_to_const(workstr, datatype);
1724 :
1725 3584 : if (DatumGetBool(FunctionCall2Coll(ltproc,
1726 : collation,
1727 : cmpstr,
1728 : workstr_const->constvalue)))
1729 : {
1730 : /* Successfully made a string larger than cmpstr */
1731 3584 : if (cmptxt)
1732 26 : pfree(cmptxt);
1733 3584 : pfree(workstr);
1734 3584 : return workstr_const;
1735 : }
1736 :
1737 : /* No good, release unusable value and try again */
1738 0 : pfree(DatumGetPointer(workstr_const->constvalue));
1739 0 : pfree(workstr_const);
1740 : }
1741 :
1742 : /*
1743 : * No luck here, so truncate off the last character and try to
1744 : * increment the next one.
1745 : */
1746 0 : len -= charlen;
1747 0 : workstr[len] = '\0';
1748 : }
1749 :
1750 : /* Failed... */
1751 0 : if (cmptxt)
1752 0 : pfree(cmptxt);
1753 0 : pfree(workstr);
1754 :
1755 0 : return NULL;
1756 : }
1757 :
1758 : /*
1759 : * Generate a Datum of the appropriate type from a C string.
1760 : * Note that all of the supported types are pass-by-ref, so the
1761 : * returned value should be pfree'd if no longer needed.
1762 : */
1763 : static Datum
1764 22624 : string_to_datum(const char *str, Oid datatype)
1765 : {
1766 : Assert(str != NULL);
1767 :
1768 : /*
1769 : * We cheat a little by assuming that CStringGetTextDatum() will do for
1770 : * bpchar and varchar constants too...
1771 : */
1772 22624 : if (datatype == NAMEOID)
1773 0 : return DirectFunctionCall1(namein, CStringGetDatum(str));
1774 22624 : else if (datatype == BYTEAOID)
1775 0 : return DirectFunctionCall1(byteain, CStringGetDatum(str));
1776 : else
1777 22624 : return CStringGetTextDatum(str);
1778 : }
1779 :
1780 : /*
1781 : * Generate a Const node of the appropriate type from a C string.
1782 : */
1783 : static Const *
1784 22624 : string_to_const(const char *str, Oid datatype)
1785 : {
1786 22624 : Datum conval = string_to_datum(str, datatype);
1787 : Oid collation;
1788 : int constlen;
1789 :
1790 : /*
1791 : * We only need to support a few datatypes here, so hard-wire properties
1792 : * instead of incurring the expense of catalog lookups.
1793 : */
1794 22624 : switch (datatype)
1795 : {
1796 22624 : case TEXTOID:
1797 : case VARCHAROID:
1798 : case BPCHAROID:
1799 22624 : collation = DEFAULT_COLLATION_OID;
1800 22624 : constlen = -1;
1801 22624 : break;
1802 :
1803 0 : case NAMEOID:
1804 0 : collation = C_COLLATION_OID;
1805 0 : constlen = NAMEDATALEN;
1806 0 : break;
1807 :
1808 0 : case BYTEAOID:
1809 0 : collation = InvalidOid;
1810 0 : constlen = -1;
1811 0 : break;
1812 :
1813 0 : default:
1814 0 : elog(ERROR, "unexpected datatype in string_to_const: %u",
1815 : datatype);
1816 : return NULL;
1817 : }
1818 :
1819 22624 : return makeConst(datatype, -1, collation, constlen,
1820 : conval, false, false);
1821 : }
1822 :
1823 : /*
1824 : * Generate a Const node of bytea type from a binary C string and a length.
1825 : */
1826 : static Const *
1827 12 : string_to_bytea_const(const char *str, size_t str_len)
1828 : {
1829 12 : bytea *bstr = palloc(VARHDRSZ + str_len);
1830 : Datum conval;
1831 :
1832 12 : memcpy(VARDATA(bstr), str, str_len);
1833 12 : SET_VARSIZE(bstr, VARHDRSZ + str_len);
1834 12 : conval = PointerGetDatum(bstr);
1835 :
1836 12 : return makeConst(BYTEAOID, -1, InvalidOid, -1, conval, false, false);
1837 : }
|