Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * like_support.c
4 : * Planner support functions for LIKE, regex, and related operators.
5 : *
6 : * These routines handle special optimization of operators that can be
7 : * used with index scans even though they are not known to the executor's
8 : * indexscan machinery. The key idea is that these operators allow us
9 : * to derive approximate indexscan qual clauses, such that any tuples
10 : * that pass the operator clause itself must also satisfy the simpler
11 : * indexscan condition(s). Then we can use the indexscan machinery
12 : * to avoid scanning as much of the table as we'd otherwise have to,
13 : * while applying the original operator as a qpqual condition to ensure
14 : * we deliver only the tuples we want. (In essence, we're using a regular
15 : * index as if it were a lossy index.)
16 : *
17 : * An example of what we're doing is
18 : * textfield LIKE 'abc%def'
19 : * from which we can generate the indexscanable conditions
20 : * textfield >= 'abc' AND textfield < 'abd'
21 : * which allow efficient scanning of an index on textfield.
22 : * (In reality, character set and collation issues make the transformation
23 : * from LIKE to indexscan limits rather harder than one might think ...
24 : * but that's the basic idea.)
25 : *
26 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
27 : * Portions Copyright (c) 1994, Regents of the University of California
28 : *
29 : *
30 : * IDENTIFICATION
31 : * src/backend/utils/adt/like_support.c
32 : *
33 : *-------------------------------------------------------------------------
34 : */
35 : #include "postgres.h"
36 :
37 : #include <math.h>
38 :
39 : #include "access/htup_details.h"
40 : #include "catalog/pg_collation.h"
41 : #include "catalog/pg_operator.h"
42 : #include "catalog/pg_opfamily.h"
43 : #include "catalog/pg_statistic.h"
44 : #include "catalog/pg_type.h"
45 : #include "mb/pg_wchar.h"
46 : #include "miscadmin.h"
47 : #include "nodes/makefuncs.h"
48 : #include "nodes/nodeFuncs.h"
49 : #include "nodes/supportnodes.h"
50 : #include "utils/builtins.h"
51 : #include "utils/datum.h"
52 : #include "utils/lsyscache.h"
53 : #include "utils/pg_locale.h"
54 : #include "utils/selfuncs.h"
55 : #include "utils/varlena.h"
56 :
57 :
58 : typedef enum
59 : {
60 : Pattern_Type_Like,
61 : Pattern_Type_Like_IC,
62 : Pattern_Type_Regex,
63 : Pattern_Type_Regex_IC,
64 : Pattern_Type_Prefix,
65 : } Pattern_Type;
66 :
67 : typedef enum
68 : {
69 : Pattern_Prefix_None, Pattern_Prefix_Partial, Pattern_Prefix_Exact,
70 : } Pattern_Prefix_Status;
71 :
72 : static Node *like_regex_support(Node *rawreq, Pattern_Type ptype);
73 : static List *match_pattern_prefix(Node *leftop,
74 : Node *rightop,
75 : Pattern_Type ptype,
76 : Oid expr_coll,
77 : Oid opfamily,
78 : Oid indexcollation);
79 : static double patternsel_common(PlannerInfo *root,
80 : Oid oprid,
81 : Oid opfuncid,
82 : List *args,
83 : int varRelid,
84 : Oid collation,
85 : Pattern_Type ptype,
86 : bool negate);
87 : static Pattern_Prefix_Status pattern_fixed_prefix(Const *patt,
88 : Pattern_Type ptype,
89 : Oid collation,
90 : Const **prefix,
91 : Selectivity *rest_selec);
92 : static Selectivity prefix_selectivity(PlannerInfo *root,
93 : VariableStatData *vardata,
94 : Oid eqopr, Oid ltopr, Oid geopr,
95 : Oid collation,
96 : Const *prefixcon);
97 : static Selectivity like_selectivity(const char *patt, int pattlen,
98 : bool case_insensitive);
99 : static Selectivity regex_selectivity(const char *patt, int pattlen,
100 : bool case_insensitive,
101 : int fixed_prefix_len);
102 : static int pattern_char_isalpha(char c, bool is_multibyte,
103 : pg_locale_t locale);
104 : static Const *make_greater_string(const Const *str_const, FmgrInfo *ltproc,
105 : Oid collation);
106 : static Datum string_to_datum(const char *str, Oid datatype);
107 : static Const *string_to_const(const char *str, Oid datatype);
108 : static Const *string_to_bytea_const(const char *str, size_t str_len);
109 :
110 :
111 : /*
112 : * Planner support functions for LIKE, regex, and related operators
113 : */
114 : Datum
115 5066 : textlike_support(PG_FUNCTION_ARGS)
116 : {
117 5066 : Node *rawreq = (Node *) PG_GETARG_POINTER(0);
118 :
119 5066 : PG_RETURN_POINTER(like_regex_support(rawreq, Pattern_Type_Like));
120 : }
121 :
122 : Datum
123 358 : texticlike_support(PG_FUNCTION_ARGS)
124 : {
125 358 : Node *rawreq = (Node *) PG_GETARG_POINTER(0);
126 :
127 358 : PG_RETURN_POINTER(like_regex_support(rawreq, Pattern_Type_Like_IC));
128 : }
129 :
130 : Datum
131 21670 : textregexeq_support(PG_FUNCTION_ARGS)
132 : {
133 21670 : Node *rawreq = (Node *) PG_GETARG_POINTER(0);
134 :
135 21670 : PG_RETURN_POINTER(like_regex_support(rawreq, Pattern_Type_Regex));
136 : }
137 :
138 : Datum
139 118 : texticregexeq_support(PG_FUNCTION_ARGS)
140 : {
141 118 : Node *rawreq = (Node *) PG_GETARG_POINTER(0);
142 :
143 118 : PG_RETURN_POINTER(like_regex_support(rawreq, Pattern_Type_Regex_IC));
144 : }
145 :
146 : Datum
147 156 : text_starts_with_support(PG_FUNCTION_ARGS)
148 : {
149 156 : Node *rawreq = (Node *) PG_GETARG_POINTER(0);
150 :
151 156 : PG_RETURN_POINTER(like_regex_support(rawreq, Pattern_Type_Prefix));
152 : }
153 :
154 : /* Common code for the above */
155 : static Node *
156 27368 : like_regex_support(Node *rawreq, Pattern_Type ptype)
157 : {
158 27368 : Node *ret = NULL;
159 :
160 27368 : if (IsA(rawreq, SupportRequestSelectivity))
161 : {
162 : /*
163 : * Make a selectivity estimate for a function call, just as we'd do if
164 : * the call was via the corresponding operator.
165 : */
166 24 : SupportRequestSelectivity *req = (SupportRequestSelectivity *) rawreq;
167 : Selectivity s1;
168 :
169 24 : if (req->is_join)
170 : {
171 : /*
172 : * For the moment we just punt. If patternjoinsel is ever
173 : * improved to do better, this should be made to call it.
174 : */
175 0 : s1 = DEFAULT_MATCH_SEL;
176 : }
177 : else
178 : {
179 : /* Share code with operator restriction selectivity functions */
180 24 : s1 = patternsel_common(req->root,
181 : InvalidOid,
182 : req->funcid,
183 : req->args,
184 : req->varRelid,
185 : req->inputcollid,
186 : ptype,
187 : false);
188 : }
189 24 : req->selectivity = s1;
190 24 : ret = (Node *) req;
191 : }
192 27344 : else if (IsA(rawreq, SupportRequestIndexCondition))
193 : {
194 : /* Try to convert operator/function call to index conditions */
195 7720 : SupportRequestIndexCondition *req = (SupportRequestIndexCondition *) rawreq;
196 :
197 : /*
198 : * Currently we have no "reverse" match operators with the pattern on
199 : * the left, so we only need consider cases with the indexkey on the
200 : * left.
201 : */
202 7720 : if (req->indexarg != 0)
203 0 : return NULL;
204 :
205 7720 : if (is_opclause(req->node))
206 : {
207 7696 : OpExpr *clause = (OpExpr *) req->node;
208 :
209 : Assert(list_length(clause->args) == 2);
210 : ret = (Node *)
211 7696 : match_pattern_prefix((Node *) linitial(clause->args),
212 7696 : (Node *) lsecond(clause->args),
213 : ptype,
214 : clause->inputcollid,
215 : req->opfamily,
216 : req->indexcollation);
217 : }
218 24 : else if (is_funcclause(req->node)) /* be paranoid */
219 : {
220 24 : FuncExpr *clause = (FuncExpr *) req->node;
221 :
222 : Assert(list_length(clause->args) == 2);
223 : ret = (Node *)
224 24 : match_pattern_prefix((Node *) linitial(clause->args),
225 24 : (Node *) lsecond(clause->args),
226 : ptype,
227 : clause->inputcollid,
228 : req->opfamily,
229 : req->indexcollation);
230 : }
231 : }
232 :
233 27368 : return ret;
234 : }
235 :
236 : /*
237 : * match_pattern_prefix
238 : * Try to generate an indexqual for a LIKE or regex operator.
239 : */
240 : static List *
241 7720 : match_pattern_prefix(Node *leftop,
242 : Node *rightop,
243 : Pattern_Type ptype,
244 : Oid expr_coll,
245 : Oid opfamily,
246 : Oid indexcollation)
247 : {
248 : List *result;
249 : Const *patt;
250 : Const *prefix;
251 : Pattern_Prefix_Status pstatus;
252 : Oid ldatatype;
253 : Oid rdatatype;
254 : Oid eqopr;
255 : Oid ltopr;
256 : Oid geopr;
257 7720 : Oid preopr = InvalidOid;
258 : bool collation_aware;
259 : Expr *expr;
260 : FmgrInfo ltproc;
261 : Const *greaterstr;
262 :
263 : /*
264 : * Can't do anything with a non-constant or NULL pattern argument.
265 : *
266 : * Note that since we restrict ourselves to cases with a hard constant on
267 : * the RHS, it's a-fortiori a pseudoconstant, and we don't need to worry
268 : * about verifying that.
269 : */
270 7720 : if (!IsA(rightop, Const) ||
271 7600 : ((Const *) rightop)->constisnull)
272 120 : return NIL;
273 7600 : patt = (Const *) rightop;
274 :
275 : /*
276 : * Try to extract a fixed prefix from the pattern.
277 : */
278 7600 : pstatus = pattern_fixed_prefix(patt, ptype, expr_coll,
279 : &prefix, NULL);
280 :
281 : /* fail if no fixed prefix */
282 7600 : if (pstatus == Pattern_Prefix_None)
283 294 : return NIL;
284 :
285 : /*
286 : * Identify the operators we want to use, based on the type of the
287 : * left-hand argument. Usually these are just the type's regular
288 : * comparison operators, but if we are considering one of the semi-legacy
289 : * "pattern" opclasses, use the "pattern" operators instead. Those are
290 : * not collation-sensitive but always use C collation, as we want. The
291 : * selected operators also determine the needed type of the prefix
292 : * constant.
293 : */
294 7306 : ldatatype = exprType(leftop);
295 7306 : switch (ldatatype)
296 : {
297 80 : case TEXTOID:
298 80 : if (opfamily == TEXT_PATTERN_BTREE_FAM_OID)
299 : {
300 0 : eqopr = TextEqualOperator;
301 0 : ltopr = TextPatternLessOperator;
302 0 : geopr = TextPatternGreaterEqualOperator;
303 0 : collation_aware = false;
304 : }
305 80 : else if (opfamily == TEXT_SPGIST_FAM_OID)
306 : {
307 24 : eqopr = TextEqualOperator;
308 24 : ltopr = TextPatternLessOperator;
309 24 : geopr = TextPatternGreaterEqualOperator;
310 : /* This opfamily has direct support for prefixing */
311 24 : preopr = TextPrefixOperator;
312 24 : collation_aware = false;
313 : }
314 : else
315 : {
316 56 : eqopr = TextEqualOperator;
317 56 : ltopr = TextLessOperator;
318 56 : geopr = TextGreaterEqualOperator;
319 56 : collation_aware = true;
320 : }
321 80 : rdatatype = TEXTOID;
322 80 : break;
323 7202 : case NAMEOID:
324 :
325 : /*
326 : * Note that here, we need the RHS type to be text, so that the
327 : * comparison value isn't improperly truncated to NAMEDATALEN.
328 : */
329 7202 : eqopr = NameEqualTextOperator;
330 7202 : ltopr = NameLessTextOperator;
331 7202 : geopr = NameGreaterEqualTextOperator;
332 7202 : collation_aware = true;
333 7202 : rdatatype = TEXTOID;
334 7202 : break;
335 24 : case BPCHAROID:
336 24 : if (opfamily == BPCHAR_PATTERN_BTREE_FAM_OID)
337 : {
338 0 : eqopr = BpcharEqualOperator;
339 0 : ltopr = BpcharPatternLessOperator;
340 0 : geopr = BpcharPatternGreaterEqualOperator;
341 0 : collation_aware = false;
342 : }
343 : else
344 : {
345 24 : eqopr = BpcharEqualOperator;
346 24 : ltopr = BpcharLessOperator;
347 24 : geopr = BpcharGreaterEqualOperator;
348 24 : collation_aware = true;
349 : }
350 24 : rdatatype = BPCHAROID;
351 24 : break;
352 0 : case BYTEAOID:
353 0 : eqopr = ByteaEqualOperator;
354 0 : ltopr = ByteaLessOperator;
355 0 : geopr = ByteaGreaterEqualOperator;
356 0 : collation_aware = false;
357 0 : rdatatype = BYTEAOID;
358 0 : break;
359 0 : default:
360 : /* Can't get here unless we're attached to the wrong operator */
361 0 : return NIL;
362 : }
363 :
364 : /*
365 : * If necessary, coerce the prefix constant to the right type. The given
366 : * prefix constant is either text or bytea type, therefore the only case
367 : * where we need to do anything is when converting text to bpchar. Those
368 : * two types are binary-compatible, so relabeling the Const node is
369 : * sufficient.
370 : */
371 7306 : if (prefix->consttype != rdatatype)
372 : {
373 : Assert(prefix->consttype == TEXTOID &&
374 : rdatatype == BPCHAROID);
375 24 : prefix->consttype = rdatatype;
376 : }
377 :
378 : /*
379 : * If we found an exact-match pattern, generate an "=" indexqual.
380 : *
381 : * Here and below, check to see whether the desired operator is actually
382 : * supported by the index opclass, and fail quietly if not. This allows
383 : * us to not be concerned with specific opclasses (except for the legacy
384 : * "pattern" cases); any index that correctly implements the operators
385 : * will work.
386 : */
387 7306 : if (pstatus == Pattern_Prefix_Exact)
388 : {
389 6014 : if (!op_in_opfamily(eqopr, opfamily))
390 12 : return NIL;
391 6002 : if (indexcollation != expr_coll)
392 5948 : return NIL;
393 54 : expr = make_opclause(eqopr, BOOLOID, false,
394 : (Expr *) leftop, (Expr *) prefix,
395 : InvalidOid, indexcollation);
396 54 : result = list_make1(expr);
397 54 : return result;
398 : }
399 :
400 : /*
401 : * Anything other than Pattern_Prefix_Exact is not supported if the
402 : * expression collation is nondeterministic. The optimized equality or
403 : * prefix tests use bytewise comparisons, which is not consistent with
404 : * nondeterministic collations.
405 : *
406 : * expr_coll is not set for a non-collation-aware data type such as bytea.
407 : */
408 1292 : if (expr_coll && !get_collation_isdeterministic(expr_coll))
409 6 : return NIL;
410 :
411 : /*
412 : * Otherwise, we have a nonempty required prefix of the values. Some
413 : * opclasses support prefix checks directly, otherwise we'll try to
414 : * generate a range constraint.
415 : */
416 1286 : if (OidIsValid(preopr) && op_in_opfamily(preopr, opfamily))
417 : {
418 24 : expr = make_opclause(preopr, BOOLOID, false,
419 : (Expr *) leftop, (Expr *) prefix,
420 : InvalidOid, indexcollation);
421 24 : result = list_make1(expr);
422 24 : return result;
423 : }
424 :
425 : /*
426 : * Since we need a range constraint, it's only going to work reliably if
427 : * the index is collation-insensitive or has "C" collation. Note that
428 : * here we are looking at the index's collation, not the expression's
429 : * collation -- this test is *not* dependent on the LIKE/regex operator's
430 : * collation.
431 : */
432 1262 : if (collation_aware &&
433 1262 : !pg_newlocale_from_collation(indexcollation)->collate_is_c)
434 14 : return NIL;
435 :
436 : /*
437 : * We can always say "x >= prefix".
438 : */
439 1248 : if (!op_in_opfamily(geopr, opfamily))
440 12 : return NIL;
441 1236 : expr = make_opclause(geopr, BOOLOID, false,
442 : (Expr *) leftop, (Expr *) prefix,
443 : InvalidOid, indexcollation);
444 1236 : result = list_make1(expr);
445 :
446 : /*-------
447 : * If we can create a string larger than the prefix, we can say
448 : * "x < greaterstr". NB: we rely on make_greater_string() to generate
449 : * a guaranteed-greater string, not just a probably-greater string.
450 : * In general this is only guaranteed in C locale, so we'd better be
451 : * using a C-locale index collation.
452 : *-------
453 : */
454 1236 : if (!op_in_opfamily(ltopr, opfamily))
455 0 : return result;
456 1236 : fmgr_info(get_opcode(ltopr), <proc);
457 1236 : greaterstr = make_greater_string(prefix, <proc, indexcollation);
458 1236 : if (greaterstr)
459 : {
460 1236 : expr = make_opclause(ltopr, BOOLOID, false,
461 : (Expr *) leftop, (Expr *) greaterstr,
462 : InvalidOid, indexcollation);
463 1236 : result = lappend(result, expr);
464 : }
465 :
466 1236 : return result;
467 : }
468 :
469 :
470 : /*
471 : * patternsel_common - generic code for pattern-match restriction selectivity.
472 : *
473 : * To support using this from either the operator or function paths, caller
474 : * may pass either operator OID or underlying function OID; we look up the
475 : * latter from the former if needed. (We could just have patternsel() call
476 : * get_opcode(), but the work would be wasted if we don't have a need to
477 : * compare a fixed prefix to the pg_statistic data.)
478 : *
479 : * Note that oprid and/or opfuncid should be for the positive-match operator
480 : * even when negate is true.
481 : */
482 : static double
483 10564 : patternsel_common(PlannerInfo *root,
484 : Oid oprid,
485 : Oid opfuncid,
486 : List *args,
487 : int varRelid,
488 : Oid collation,
489 : Pattern_Type ptype,
490 : bool negate)
491 : {
492 : VariableStatData vardata;
493 : Node *other;
494 : bool varonleft;
495 : Datum constval;
496 : Oid consttype;
497 : Oid vartype;
498 : Oid rdatatype;
499 : Oid eqopr;
500 : Oid ltopr;
501 : Oid geopr;
502 : Pattern_Prefix_Status pstatus;
503 : Const *patt;
504 10564 : Const *prefix = NULL;
505 10564 : Selectivity rest_selec = 0;
506 10564 : double nullfrac = 0.0;
507 : double result;
508 :
509 : /*
510 : * Initialize result to the appropriate default estimate depending on
511 : * whether it's a match or not-match operator.
512 : */
513 10564 : if (negate)
514 1522 : result = 1.0 - DEFAULT_MATCH_SEL;
515 : else
516 9042 : result = DEFAULT_MATCH_SEL;
517 :
518 : /*
519 : * If expression is not variable op constant, then punt and return the
520 : * default estimate.
521 : */
522 10564 : if (!get_restriction_variable(root, args, varRelid,
523 : &vardata, &other, &varonleft))
524 196 : return result;
525 10368 : if (!varonleft || !IsA(other, Const))
526 : {
527 50 : ReleaseVariableStats(vardata);
528 50 : return result;
529 : }
530 :
531 : /*
532 : * If the constant is NULL, assume operator is strict and return zero, ie,
533 : * operator will never return TRUE. (It's zero even for a negator op.)
534 : */
535 10318 : if (((Const *) other)->constisnull)
536 : {
537 0 : ReleaseVariableStats(vardata);
538 0 : return 0.0;
539 : }
540 10318 : constval = ((Const *) other)->constvalue;
541 10318 : consttype = ((Const *) other)->consttype;
542 :
543 : /*
544 : * The right-hand const is type text or bytea for all supported operators.
545 : * We do not expect to see binary-compatible types here, since
546 : * const-folding should have relabeled the const to exactly match the
547 : * operator's declared type.
548 : */
549 10318 : if (consttype != TEXTOID && consttype != BYTEAOID)
550 : {
551 24 : ReleaseVariableStats(vardata);
552 24 : return result;
553 : }
554 :
555 : /*
556 : * Similarly, the exposed type of the left-hand side should be one of
557 : * those we know. (Do not look at vardata.atttype, which might be
558 : * something binary-compatible but different.) We can use it to identify
559 : * the comparison operators and the required type of the comparison
560 : * constant, much as in match_pattern_prefix().
561 : */
562 10294 : vartype = vardata.vartype;
563 :
564 10294 : switch (vartype)
565 : {
566 1498 : case TEXTOID:
567 1498 : eqopr = TextEqualOperator;
568 1498 : ltopr = TextLessOperator;
569 1498 : geopr = TextGreaterEqualOperator;
570 1498 : rdatatype = TEXTOID;
571 1498 : break;
572 8702 : case NAMEOID:
573 :
574 : /*
575 : * Note that here, we need the RHS type to be text, so that the
576 : * comparison value isn't improperly truncated to NAMEDATALEN.
577 : */
578 8702 : eqopr = NameEqualTextOperator;
579 8702 : ltopr = NameLessTextOperator;
580 8702 : geopr = NameGreaterEqualTextOperator;
581 8702 : rdatatype = TEXTOID;
582 8702 : break;
583 84 : case BPCHAROID:
584 84 : eqopr = BpcharEqualOperator;
585 84 : ltopr = BpcharLessOperator;
586 84 : geopr = BpcharGreaterEqualOperator;
587 84 : rdatatype = BPCHAROID;
588 84 : break;
589 6 : case BYTEAOID:
590 6 : eqopr = ByteaEqualOperator;
591 6 : ltopr = ByteaLessOperator;
592 6 : geopr = ByteaGreaterEqualOperator;
593 6 : rdatatype = BYTEAOID;
594 6 : break;
595 4 : default:
596 : /* Can't get here unless we're attached to the wrong operator */
597 4 : ReleaseVariableStats(vardata);
598 4 : return result;
599 : }
600 :
601 : /*
602 : * Grab the nullfrac for use below.
603 : */
604 10290 : if (HeapTupleIsValid(vardata.statsTuple))
605 : {
606 : Form_pg_statistic stats;
607 :
608 8302 : stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
609 8302 : nullfrac = stats->stanullfrac;
610 : }
611 :
612 : /*
613 : * Pull out any fixed prefix implied by the pattern, and estimate the
614 : * fractional selectivity of the remainder of the pattern. Unlike many
615 : * other selectivity estimators, we use the pattern operator's actual
616 : * collation for this step. This is not because we expect the collation
617 : * to make a big difference in the selectivity estimate (it seldom would),
618 : * but because we want to be sure we cache compiled regexps under the
619 : * right cache key, so that they can be re-used at runtime.
620 : */
621 10290 : patt = (Const *) other;
622 10290 : pstatus = pattern_fixed_prefix(patt, ptype, collation,
623 : &prefix, &rest_selec);
624 :
625 : /*
626 : * If necessary, coerce the prefix constant to the right type. The only
627 : * case where we need to do anything is when converting text to bpchar.
628 : * Those two types are binary-compatible, so relabeling the Const node is
629 : * sufficient.
630 : */
631 10266 : if (prefix && prefix->consttype != rdatatype)
632 : {
633 : Assert(prefix->consttype == TEXTOID &&
634 : rdatatype == BPCHAROID);
635 36 : prefix->consttype = rdatatype;
636 : }
637 :
638 10266 : if (pstatus == Pattern_Prefix_Exact)
639 : {
640 : /*
641 : * Pattern specifies an exact match, so estimate as for '='
642 : */
643 6306 : result = var_eq_const(&vardata, eqopr, collation, prefix->constvalue,
644 : false, true, false);
645 : }
646 : else
647 : {
648 : /*
649 : * Not exact-match pattern. If we have a sufficiently large
650 : * histogram, estimate selectivity for the histogram part of the
651 : * population by counting matches in the histogram. If not, estimate
652 : * selectivity of the fixed prefix and remainder of pattern
653 : * separately, then combine the two to get an estimate of the
654 : * selectivity for the part of the column population represented by
655 : * the histogram. (For small histograms, we combine these
656 : * approaches.)
657 : *
658 : * We then add up data for any most-common-values values; these are
659 : * not in the histogram population, and we can get exact answers for
660 : * them by applying the pattern operator, so there's no reason to
661 : * approximate. (If the MCVs cover a significant part of the total
662 : * population, this gives us a big leg up in accuracy.)
663 : */
664 : Selectivity selec;
665 : int hist_size;
666 : FmgrInfo opproc;
667 : double mcv_selec,
668 : sumcommon;
669 :
670 : /* Try to use the histogram entries to get selectivity */
671 3960 : if (!OidIsValid(opfuncid))
672 3936 : opfuncid = get_opcode(oprid);
673 3960 : fmgr_info(opfuncid, &opproc);
674 :
675 3960 : selec = histogram_selectivity(&vardata, &opproc, collation,
676 : constval, true,
677 : 10, 1, &hist_size);
678 :
679 : /* If not at least 100 entries, use the heuristic method */
680 3960 : if (hist_size < 100)
681 : {
682 : Selectivity heursel;
683 : Selectivity prefixsel;
684 :
685 2588 : if (pstatus == Pattern_Prefix_Partial)
686 1846 : prefixsel = prefix_selectivity(root, &vardata,
687 : eqopr, ltopr, geopr,
688 : collation,
689 : prefix);
690 : else
691 742 : prefixsel = 1.0;
692 2588 : heursel = prefixsel * rest_selec;
693 :
694 2588 : if (selec < 0) /* fewer than 10 histogram entries? */
695 2326 : selec = heursel;
696 : else
697 : {
698 : /*
699 : * For histogram sizes from 10 to 100, we combine the
700 : * histogram and heuristic selectivities, putting increasingly
701 : * more trust in the histogram for larger sizes.
702 : */
703 262 : double hist_weight = hist_size / 100.0;
704 :
705 262 : selec = selec * hist_weight + heursel * (1.0 - hist_weight);
706 : }
707 : }
708 :
709 : /* In any case, don't believe extremely small or large estimates. */
710 3960 : if (selec < 0.0001)
711 1436 : selec = 0.0001;
712 2524 : else if (selec > 0.9999)
713 124 : selec = 0.9999;
714 :
715 : /*
716 : * If we have most-common-values info, add up the fractions of the MCV
717 : * entries that satisfy MCV OP PATTERN. These fractions contribute
718 : * directly to the result selectivity. Also add up the total fraction
719 : * represented by MCV entries.
720 : */
721 3960 : mcv_selec = mcv_selectivity(&vardata, &opproc, collation,
722 : constval, true,
723 : &sumcommon);
724 :
725 : /*
726 : * Now merge the results from the MCV and histogram calculations,
727 : * realizing that the histogram covers only the non-null values that
728 : * are not listed in MCV.
729 : */
730 3960 : selec *= 1.0 - nullfrac - sumcommon;
731 3960 : selec += mcv_selec;
732 3960 : result = selec;
733 : }
734 :
735 : /* now adjust if we wanted not-match rather than match */
736 10266 : if (negate)
737 1314 : result = 1.0 - result - nullfrac;
738 :
739 : /* result should be in range, but make sure... */
740 10266 : CLAMP_PROBABILITY(result);
741 :
742 10266 : if (prefix)
743 : {
744 9670 : pfree(DatumGetPointer(prefix->constvalue));
745 9670 : pfree(prefix);
746 : }
747 :
748 10266 : ReleaseVariableStats(vardata);
749 :
750 10266 : return result;
751 : }
752 :
753 : /*
754 : * Fix impedance mismatch between SQL-callable functions and patternsel_common
755 : */
756 : static double
757 10540 : patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype, bool negate)
758 : {
759 10540 : PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
760 10540 : Oid operator = PG_GETARG_OID(1);
761 10540 : List *args = (List *) PG_GETARG_POINTER(2);
762 10540 : int varRelid = PG_GETARG_INT32(3);
763 10540 : Oid collation = PG_GET_COLLATION();
764 :
765 : /*
766 : * If this is for a NOT LIKE or similar operator, get the corresponding
767 : * positive-match operator and work with that.
768 : */
769 10540 : if (negate)
770 : {
771 1522 : operator = get_negator(operator);
772 1522 : if (!OidIsValid(operator))
773 0 : elog(ERROR, "patternsel called for operator without a negator");
774 : }
775 :
776 10540 : return patternsel_common(root,
777 : operator,
778 : InvalidOid,
779 : args,
780 : varRelid,
781 : collation,
782 : ptype,
783 : negate);
784 : }
785 :
786 : /*
787 : * regexeqsel - Selectivity of regular-expression pattern match.
788 : */
789 : Datum
790 7244 : regexeqsel(PG_FUNCTION_ARGS)
791 : {
792 7244 : PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Regex, false));
793 : }
794 :
795 : /*
796 : * icregexeqsel - Selectivity of case-insensitive regex match.
797 : */
798 : Datum
799 64 : icregexeqsel(PG_FUNCTION_ARGS)
800 : {
801 64 : PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Regex_IC, false));
802 : }
803 :
804 : /*
805 : * likesel - Selectivity of LIKE pattern match.
806 : */
807 : Datum
808 1528 : likesel(PG_FUNCTION_ARGS)
809 : {
810 1528 : PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Like, false));
811 : }
812 :
813 : /*
814 : * prefixsel - selectivity of prefix operator
815 : */
816 : Datum
817 54 : prefixsel(PG_FUNCTION_ARGS)
818 : {
819 54 : PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Prefix, false));
820 : }
821 :
822 : /*
823 : *
824 : * iclikesel - Selectivity of ILIKE pattern match.
825 : */
826 : Datum
827 128 : iclikesel(PG_FUNCTION_ARGS)
828 : {
829 128 : PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Like_IC, false));
830 : }
831 :
832 : /*
833 : * regexnesel - Selectivity of regular-expression pattern non-match.
834 : */
835 : Datum
836 1362 : regexnesel(PG_FUNCTION_ARGS)
837 : {
838 1362 : PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Regex, true));
839 : }
840 :
841 : /*
842 : * icregexnesel - Selectivity of case-insensitive regex non-match.
843 : */
844 : Datum
845 16 : icregexnesel(PG_FUNCTION_ARGS)
846 : {
847 16 : PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Regex_IC, true));
848 : }
849 :
850 : /*
851 : * nlikesel - Selectivity of LIKE pattern non-match.
852 : */
853 : Datum
854 136 : nlikesel(PG_FUNCTION_ARGS)
855 : {
856 136 : PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Like, true));
857 : }
858 :
859 : /*
860 : * icnlikesel - Selectivity of ILIKE pattern non-match.
861 : */
862 : Datum
863 8 : icnlikesel(PG_FUNCTION_ARGS)
864 : {
865 8 : PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Like_IC, true));
866 : }
867 :
868 : /*
869 : * patternjoinsel - Generic code for pattern-match join selectivity.
870 : */
871 : static double
872 236 : patternjoinsel(PG_FUNCTION_ARGS, Pattern_Type ptype, bool negate)
873 : {
874 : /* For the moment we just punt. */
875 236 : return negate ? (1.0 - DEFAULT_MATCH_SEL) : DEFAULT_MATCH_SEL;
876 : }
877 :
878 : /*
879 : * regexeqjoinsel - Join selectivity of regular-expression pattern match.
880 : */
881 : Datum
882 236 : regexeqjoinsel(PG_FUNCTION_ARGS)
883 : {
884 236 : PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Regex, false));
885 : }
886 :
887 : /*
888 : * icregexeqjoinsel - Join selectivity of case-insensitive regex match.
889 : */
890 : Datum
891 0 : icregexeqjoinsel(PG_FUNCTION_ARGS)
892 : {
893 0 : PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Regex_IC, false));
894 : }
895 :
896 : /*
897 : * likejoinsel - Join selectivity of LIKE pattern match.
898 : */
899 : Datum
900 0 : likejoinsel(PG_FUNCTION_ARGS)
901 : {
902 0 : PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Like, false));
903 : }
904 :
905 : /*
906 : * prefixjoinsel - Join selectivity of prefix operator
907 : */
908 : Datum
909 0 : prefixjoinsel(PG_FUNCTION_ARGS)
910 : {
911 0 : PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Prefix, false));
912 : }
913 :
914 : /*
915 : * iclikejoinsel - Join selectivity of ILIKE pattern match.
916 : */
917 : Datum
918 0 : iclikejoinsel(PG_FUNCTION_ARGS)
919 : {
920 0 : PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Like_IC, false));
921 : }
922 :
923 : /*
924 : * regexnejoinsel - Join selectivity of regex non-match.
925 : */
926 : Datum
927 0 : regexnejoinsel(PG_FUNCTION_ARGS)
928 : {
929 0 : PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Regex, true));
930 : }
931 :
932 : /*
933 : * icregexnejoinsel - Join selectivity of case-insensitive regex non-match.
934 : */
935 : Datum
936 0 : icregexnejoinsel(PG_FUNCTION_ARGS)
937 : {
938 0 : PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Regex_IC, true));
939 : }
940 :
941 : /*
942 : * nlikejoinsel - Join selectivity of LIKE pattern non-match.
943 : */
944 : Datum
945 0 : nlikejoinsel(PG_FUNCTION_ARGS)
946 : {
947 0 : PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Like, true));
948 : }
949 :
950 : /*
951 : * icnlikejoinsel - Join selectivity of ILIKE pattern non-match.
952 : */
953 : Datum
954 0 : icnlikejoinsel(PG_FUNCTION_ARGS)
955 : {
956 0 : PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Like_IC, true));
957 : }
958 :
959 :
960 : /*-------------------------------------------------------------------------
961 : *
962 : * Pattern analysis functions
963 : *
964 : * These routines support analysis of LIKE and regular-expression patterns
965 : * by the planner/optimizer. It's important that they agree with the
966 : * regular-expression code in backend/regex/ and the LIKE code in
967 : * backend/utils/adt/like.c. Also, the computation of the fixed prefix
968 : * must be conservative: if we report a string longer than the true fixed
969 : * prefix, the query may produce actually wrong answers, rather than just
970 : * getting a bad selectivity estimate!
971 : *
972 : *-------------------------------------------------------------------------
973 : */
974 :
975 : /*
976 : * Extract the fixed prefix, if any, for a pattern.
977 : *
978 : * *prefix is set to a palloc'd prefix string (in the form of a Const node),
979 : * or to NULL if no fixed prefix exists for the pattern.
980 : * If rest_selec is not NULL, *rest_selec is set to an estimate of the
981 : * selectivity of the remainder of the pattern (without any fixed prefix).
982 : * The prefix Const has the same type (TEXT or BYTEA) as the input pattern.
983 : *
984 : * The return value distinguishes no fixed prefix, a partial prefix,
985 : * or an exact-match-only pattern.
986 : */
987 :
988 : static Pattern_Prefix_Status
989 2926 : like_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation,
990 : Const **prefix_const, Selectivity *rest_selec)
991 : {
992 : char *match;
993 : char *patt;
994 : int pattlen;
995 2926 : Oid typeid = patt_const->consttype;
996 : int pos,
997 : match_pos;
998 2926 : bool is_multibyte = (pg_database_encoding_max_length() > 1);
999 2926 : pg_locale_t locale = 0;
1000 :
1001 : /* the right-hand const is type text or bytea */
1002 : Assert(typeid == BYTEAOID || typeid == TEXTOID);
1003 :
1004 2926 : if (case_insensitive)
1005 : {
1006 238 : if (typeid == BYTEAOID)
1007 0 : ereport(ERROR,
1008 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1009 : errmsg("case insensitive matching not supported on type bytea")));
1010 :
1011 238 : if (!OidIsValid(collation))
1012 : {
1013 : /*
1014 : * This typically means that the parser could not resolve a
1015 : * conflict of implicit collations, so report it that way.
1016 : */
1017 0 : ereport(ERROR,
1018 : (errcode(ERRCODE_INDETERMINATE_COLLATION),
1019 : errmsg("could not determine which collation to use for ILIKE"),
1020 : errhint("Use the COLLATE clause to set the collation explicitly.")));
1021 : }
1022 :
1023 238 : locale = pg_newlocale_from_collation(collation);
1024 : }
1025 :
1026 2926 : if (typeid != BYTEAOID)
1027 : {
1028 2914 : patt = TextDatumGetCString(patt_const->constvalue);
1029 2914 : pattlen = strlen(patt);
1030 : }
1031 : else
1032 : {
1033 12 : bytea *bstr = DatumGetByteaPP(patt_const->constvalue);
1034 :
1035 12 : pattlen = VARSIZE_ANY_EXHDR(bstr);
1036 12 : patt = (char *) palloc(pattlen);
1037 12 : memcpy(patt, VARDATA_ANY(bstr), pattlen);
1038 : Assert((Pointer) bstr == DatumGetPointer(patt_const->constvalue));
1039 : }
1040 :
1041 2926 : match = palloc(pattlen + 1);
1042 2926 : match_pos = 0;
1043 15654 : for (pos = 0; pos < pattlen; pos++)
1044 : {
1045 : /* % and _ are wildcard characters in LIKE */
1046 15548 : if (patt[pos] == '%' ||
1047 13952 : patt[pos] == '_')
1048 : break;
1049 :
1050 : /* Backslash escapes the next character */
1051 12902 : if (patt[pos] == '\\')
1052 : {
1053 274 : pos++;
1054 274 : if (pos >= pattlen)
1055 0 : break;
1056 : }
1057 :
1058 : /* Stop if case-varying character (it's sort of a wildcard) */
1059 13172 : if (case_insensitive &&
1060 270 : pattern_char_isalpha(patt[pos], is_multibyte, locale))
1061 174 : break;
1062 :
1063 12728 : match[match_pos++] = patt[pos];
1064 : }
1065 :
1066 2926 : match[match_pos] = '\0';
1067 :
1068 2926 : if (typeid != BYTEAOID)
1069 2914 : *prefix_const = string_to_const(match, typeid);
1070 : else
1071 12 : *prefix_const = string_to_bytea_const(match, match_pos);
1072 :
1073 2926 : if (rest_selec != NULL)
1074 1784 : *rest_selec = like_selectivity(&patt[pos], pattlen - pos,
1075 : case_insensitive);
1076 :
1077 2926 : pfree(patt);
1078 2926 : pfree(match);
1079 :
1080 : /* in LIKE, an empty pattern is an exact match! */
1081 2926 : if (pos == pattlen)
1082 106 : return Pattern_Prefix_Exact; /* reached end of pattern, so exact */
1083 :
1084 2820 : if (match_pos > 0)
1085 2300 : return Pattern_Prefix_Partial;
1086 :
1087 520 : return Pattern_Prefix_None;
1088 : }
1089 :
1090 : static Pattern_Prefix_Status
1091 14862 : regex_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation,
1092 : Const **prefix_const, Selectivity *rest_selec)
1093 : {
1094 14862 : Oid typeid = patt_const->consttype;
1095 : char *prefix;
1096 : bool exact;
1097 :
1098 : /*
1099 : * Should be unnecessary, there are no bytea regex operators defined. As
1100 : * such, it should be noted that the rest of this function has *not* been
1101 : * made safe for binary (possibly NULL containing) strings.
1102 : */
1103 14862 : if (typeid == BYTEAOID)
1104 0 : ereport(ERROR,
1105 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1106 : errmsg("regular-expression matching not supported on type bytea")));
1107 :
1108 : /* Use the regexp machinery to extract the prefix, if any */
1109 14862 : prefix = regexp_fixed_prefix(DatumGetTextPP(patt_const->constvalue),
1110 : case_insensitive, collation,
1111 : &exact);
1112 :
1113 14838 : if (prefix == NULL)
1114 : {
1115 726 : *prefix_const = NULL;
1116 :
1117 726 : if (rest_selec != NULL)
1118 : {
1119 596 : char *patt = TextDatumGetCString(patt_const->constvalue);
1120 :
1121 596 : *rest_selec = regex_selectivity(patt, strlen(patt),
1122 : case_insensitive,
1123 : 0);
1124 596 : pfree(patt);
1125 : }
1126 :
1127 726 : return Pattern_Prefix_None;
1128 : }
1129 :
1130 14112 : *prefix_const = string_to_const(prefix, typeid);
1131 :
1132 14112 : if (rest_selec != NULL)
1133 : {
1134 7808 : if (exact)
1135 : {
1136 : /* Exact match, so there's no additional selectivity */
1137 6242 : *rest_selec = 1.0;
1138 : }
1139 : else
1140 : {
1141 1566 : char *patt = TextDatumGetCString(patt_const->constvalue);
1142 :
1143 3132 : *rest_selec = regex_selectivity(patt, strlen(patt),
1144 : case_insensitive,
1145 1566 : strlen(prefix));
1146 1566 : pfree(patt);
1147 : }
1148 : }
1149 :
1150 14112 : pfree(prefix);
1151 :
1152 14112 : if (exact)
1153 12214 : return Pattern_Prefix_Exact; /* pattern specifies exact match */
1154 : else
1155 1898 : return Pattern_Prefix_Partial;
1156 : }
1157 :
1158 : static Pattern_Prefix_Status
1159 17890 : pattern_fixed_prefix(Const *patt, Pattern_Type ptype, Oid collation,
1160 : Const **prefix, Selectivity *rest_selec)
1161 : {
1162 : Pattern_Prefix_Status result;
1163 :
1164 17890 : switch (ptype)
1165 : {
1166 2688 : case Pattern_Type_Like:
1167 2688 : result = like_fixed_prefix(patt, false, collation,
1168 : prefix, rest_selec);
1169 2688 : break;
1170 238 : case Pattern_Type_Like_IC:
1171 238 : result = like_fixed_prefix(patt, true, collation,
1172 : prefix, rest_selec);
1173 238 : break;
1174 14800 : case Pattern_Type_Regex:
1175 14800 : result = regex_fixed_prefix(patt, false, collation,
1176 : prefix, rest_selec);
1177 14776 : break;
1178 62 : case Pattern_Type_Regex_IC:
1179 62 : result = regex_fixed_prefix(patt, true, collation,
1180 : prefix, rest_selec);
1181 62 : break;
1182 102 : case Pattern_Type_Prefix:
1183 : /* Prefix type work is trivial. */
1184 102 : result = Pattern_Prefix_Partial;
1185 102 : *prefix = makeConst(patt->consttype,
1186 : patt->consttypmod,
1187 : patt->constcollid,
1188 : patt->constlen,
1189 : datumCopy(patt->constvalue,
1190 102 : patt->constbyval,
1191 : patt->constlen),
1192 102 : patt->constisnull,
1193 102 : patt->constbyval);
1194 102 : if (rest_selec != NULL)
1195 78 : *rest_selec = 1.0; /* all */
1196 102 : break;
1197 0 : default:
1198 0 : elog(ERROR, "unrecognized ptype: %d", (int) ptype);
1199 : result = Pattern_Prefix_None; /* keep compiler quiet */
1200 : break;
1201 : }
1202 17866 : return result;
1203 : }
1204 :
1205 : /*
1206 : * Estimate the selectivity of a fixed prefix for a pattern match.
1207 : *
1208 : * A fixed prefix "foo" is estimated as the selectivity of the expression
1209 : * "variable >= 'foo' AND variable < 'fop'".
1210 : *
1211 : * The selectivity estimate is with respect to the portion of the column
1212 : * population represented by the histogram --- the caller must fold this
1213 : * together with info about MCVs and NULLs.
1214 : *
1215 : * We use the given comparison operators and collation to do the estimation.
1216 : * The given variable and Const must be of the associated datatype(s).
1217 : *
1218 : * XXX Note: we make use of the upper bound to estimate operator selectivity
1219 : * even if the locale is such that we cannot rely on the upper-bound string.
1220 : * The selectivity only needs to be approximately right anyway, so it seems
1221 : * more useful to use the upper-bound code than not.
1222 : */
1223 : static Selectivity
1224 1846 : prefix_selectivity(PlannerInfo *root, VariableStatData *vardata,
1225 : Oid eqopr, Oid ltopr, Oid geopr,
1226 : Oid collation,
1227 : Const *prefixcon)
1228 : {
1229 : Selectivity prefixsel;
1230 : FmgrInfo opproc;
1231 : Const *greaterstrcon;
1232 : Selectivity eq_sel;
1233 :
1234 : /* Estimate the selectivity of "x >= prefix" */
1235 1846 : fmgr_info(get_opcode(geopr), &opproc);
1236 :
1237 1846 : prefixsel = ineq_histogram_selectivity(root, vardata,
1238 : geopr, &opproc, true, true,
1239 : collation,
1240 : prefixcon->constvalue,
1241 : prefixcon->consttype);
1242 :
1243 1846 : if (prefixsel < 0.0)
1244 : {
1245 : /* No histogram is present ... return a suitable default estimate */
1246 634 : return DEFAULT_MATCH_SEL;
1247 : }
1248 :
1249 : /*
1250 : * If we can create a string larger than the prefix, say "x < greaterstr".
1251 : */
1252 1212 : fmgr_info(get_opcode(ltopr), &opproc);
1253 1212 : greaterstrcon = make_greater_string(prefixcon, &opproc, collation);
1254 1212 : if (greaterstrcon)
1255 : {
1256 : Selectivity topsel;
1257 :
1258 1212 : topsel = ineq_histogram_selectivity(root, vardata,
1259 : ltopr, &opproc, false, false,
1260 : collation,
1261 : greaterstrcon->constvalue,
1262 : greaterstrcon->consttype);
1263 :
1264 : /* ineq_histogram_selectivity worked before, it shouldn't fail now */
1265 : Assert(topsel >= 0.0);
1266 :
1267 : /*
1268 : * Merge the two selectivities in the same way as for a range query
1269 : * (see clauselist_selectivity()). Note that we don't need to worry
1270 : * about double-exclusion of nulls, since ineq_histogram_selectivity
1271 : * doesn't count those anyway.
1272 : */
1273 1212 : prefixsel = topsel + prefixsel - 1.0;
1274 : }
1275 :
1276 : /*
1277 : * If the prefix is long then the two bounding values might be too close
1278 : * together for the histogram to distinguish them usefully, resulting in a
1279 : * zero estimate (plus or minus roundoff error). To avoid returning a
1280 : * ridiculously small estimate, compute the estimated selectivity for
1281 : * "variable = 'foo'", and clamp to that. (Obviously, the resultant
1282 : * estimate should be at least that.)
1283 : *
1284 : * We apply this even if we couldn't make a greater string. That case
1285 : * suggests that the prefix is near the maximum possible, and thus
1286 : * probably off the end of the histogram, and thus we probably got a very
1287 : * small estimate from the >= condition; so we still need to clamp.
1288 : */
1289 1212 : eq_sel = var_eq_const(vardata, eqopr, collation, prefixcon->constvalue,
1290 : false, true, false);
1291 :
1292 1212 : prefixsel = Max(prefixsel, eq_sel);
1293 :
1294 1212 : return prefixsel;
1295 : }
1296 :
1297 :
1298 : /*
1299 : * Estimate the selectivity of a pattern of the specified type.
1300 : * Note that any fixed prefix of the pattern will have been removed already,
1301 : * so actually we may be looking at just a fragment of the pattern.
1302 : *
1303 : * For now, we use a very simplistic approach: fixed characters reduce the
1304 : * selectivity a good deal, character ranges reduce it a little,
1305 : * wildcards (such as % for LIKE or .* for regex) increase it.
1306 : */
1307 :
1308 : #define FIXED_CHAR_SEL 0.20 /* about 1/5 */
1309 : #define CHAR_RANGE_SEL 0.25
1310 : #define ANY_CHAR_SEL 0.9 /* not 1, since it won't match end-of-string */
1311 : #define FULL_WILDCARD_SEL 5.0
1312 : #define PARTIAL_WILDCARD_SEL 2.0
1313 :
1314 : static Selectivity
1315 1784 : like_selectivity(const char *patt, int pattlen, bool case_insensitive)
1316 : {
1317 1784 : Selectivity sel = 1.0;
1318 : int pos;
1319 :
1320 : /* Skip any leading wildcard; it's already factored into initial sel */
1321 3442 : for (pos = 0; pos < pattlen; pos++)
1322 : {
1323 2512 : if (patt[pos] != '%' && patt[pos] != '_')
1324 854 : break;
1325 : }
1326 :
1327 6858 : for (; pos < pattlen; pos++)
1328 : {
1329 : /* % and _ are wildcard characters in LIKE */
1330 5074 : if (patt[pos] == '%')
1331 750 : sel *= FULL_WILDCARD_SEL;
1332 4324 : else if (patt[pos] == '_')
1333 172 : sel *= ANY_CHAR_SEL;
1334 4152 : else if (patt[pos] == '\\')
1335 : {
1336 : /* Backslash quotes the next character */
1337 40 : pos++;
1338 40 : if (pos >= pattlen)
1339 0 : break;
1340 40 : sel *= FIXED_CHAR_SEL;
1341 : }
1342 : else
1343 4112 : sel *= FIXED_CHAR_SEL;
1344 : }
1345 : /* Could get sel > 1 if multiple wildcards */
1346 1784 : if (sel > 1.0)
1347 0 : sel = 1.0;
1348 1784 : return sel;
1349 : }
1350 :
1351 : static Selectivity
1352 2528 : regex_selectivity_sub(const char *patt, int pattlen, bool case_insensitive)
1353 : {
1354 2528 : Selectivity sel = 1.0;
1355 2528 : int paren_depth = 0;
1356 2528 : int paren_pos = 0; /* dummy init to keep compiler quiet */
1357 : int pos;
1358 :
1359 : /* since this function recurses, it could be driven to stack overflow */
1360 2528 : check_stack_depth();
1361 :
1362 26122 : for (pos = 0; pos < pattlen; pos++)
1363 : {
1364 23618 : if (patt[pos] == '(')
1365 : {
1366 348 : if (paren_depth == 0)
1367 342 : paren_pos = pos; /* remember start of parenthesized item */
1368 348 : paren_depth++;
1369 : }
1370 23270 : else if (patt[pos] == ')' && paren_depth > 0)
1371 : {
1372 348 : paren_depth--;
1373 348 : if (paren_depth == 0)
1374 342 : sel *= regex_selectivity_sub(patt + (paren_pos + 1),
1375 342 : pos - (paren_pos + 1),
1376 : case_insensitive);
1377 : }
1378 22922 : else if (patt[pos] == '|' && paren_depth == 0)
1379 : {
1380 : /*
1381 : * If unquoted | is present at paren level 0 in pattern, we have
1382 : * multiple alternatives; sum their probabilities.
1383 : */
1384 48 : sel += regex_selectivity_sub(patt + (pos + 1),
1385 24 : pattlen - (pos + 1),
1386 : case_insensitive);
1387 24 : break; /* rest of pattern is now processed */
1388 : }
1389 22898 : else if (patt[pos] == '[')
1390 : {
1391 78 : bool negclass = false;
1392 :
1393 78 : if (patt[++pos] == '^')
1394 : {
1395 0 : negclass = true;
1396 0 : pos++;
1397 : }
1398 78 : if (patt[pos] == ']') /* ']' at start of class is not special */
1399 0 : pos++;
1400 380 : while (pos < pattlen && patt[pos] != ']')
1401 302 : pos++;
1402 78 : if (paren_depth == 0)
1403 78 : sel *= (negclass ? (1.0 - CHAR_RANGE_SEL) : CHAR_RANGE_SEL);
1404 : }
1405 22820 : else if (patt[pos] == '.')
1406 : {
1407 806 : if (paren_depth == 0)
1408 466 : sel *= ANY_CHAR_SEL;
1409 : }
1410 22014 : else if (patt[pos] == '*' ||
1411 21286 : patt[pos] == '?' ||
1412 21234 : patt[pos] == '+')
1413 : {
1414 : /* Ought to be smarter about quantifiers... */
1415 794 : if (paren_depth == 0)
1416 444 : sel *= PARTIAL_WILDCARD_SEL;
1417 : }
1418 21220 : else if (patt[pos] == '{')
1419 : {
1420 264 : while (pos < pattlen && patt[pos] != '}')
1421 188 : pos++;
1422 76 : if (paren_depth == 0)
1423 64 : sel *= PARTIAL_WILDCARD_SEL;
1424 : }
1425 21144 : else if (patt[pos] == '\\')
1426 : {
1427 : /* backslash quotes the next character */
1428 200 : pos++;
1429 200 : if (pos >= pattlen)
1430 0 : break;
1431 200 : if (paren_depth == 0)
1432 104 : sel *= FIXED_CHAR_SEL;
1433 : }
1434 : else
1435 : {
1436 20944 : if (paren_depth == 0)
1437 18444 : sel *= FIXED_CHAR_SEL;
1438 : }
1439 : }
1440 : /* Could get sel > 1 if multiple wildcards */
1441 2528 : if (sel > 1.0)
1442 26 : sel = 1.0;
1443 2528 : return sel;
1444 : }
1445 :
1446 : static Selectivity
1447 2162 : regex_selectivity(const char *patt, int pattlen, bool case_insensitive,
1448 : int fixed_prefix_len)
1449 : {
1450 : Selectivity sel;
1451 :
1452 : /* If patt doesn't end with $, consider it to have a trailing wildcard */
1453 2162 : if (pattlen > 0 && patt[pattlen - 1] == '$' &&
1454 342 : (pattlen == 1 || patt[pattlen - 2] != '\\'))
1455 : {
1456 : /* has trailing $ */
1457 342 : sel = regex_selectivity_sub(patt, pattlen - 1, case_insensitive);
1458 : }
1459 : else
1460 : {
1461 : /* no trailing $ */
1462 1820 : sel = regex_selectivity_sub(patt, pattlen, case_insensitive);
1463 1820 : sel *= FULL_WILDCARD_SEL;
1464 : }
1465 :
1466 : /*
1467 : * If there's a fixed prefix, discount its selectivity. We have to be
1468 : * careful here since a very long prefix could result in pow's result
1469 : * underflowing to zero (in which case "sel" probably has as well).
1470 : */
1471 2162 : if (fixed_prefix_len > 0)
1472 : {
1473 1566 : double prefixsel = pow(FIXED_CHAR_SEL, fixed_prefix_len);
1474 :
1475 1566 : if (prefixsel > 0.0)
1476 1566 : sel /= prefixsel;
1477 : }
1478 :
1479 : /* Make sure result stays in range */
1480 2162 : CLAMP_PROBABILITY(sel);
1481 2162 : return sel;
1482 : }
1483 :
1484 : /*
1485 : * Check whether char is a letter (and, hence, subject to case-folding)
1486 : *
1487 : * In multibyte character sets or with ICU, we can't use isalpha, and it does
1488 : * not seem worth trying to convert to wchar_t to use iswalpha or u_isalpha.
1489 : * Instead, just assume any non-ASCII char is potentially case-varying, and
1490 : * hard-wire knowledge of which ASCII chars are letters.
1491 : */
1492 : static int
1493 270 : pattern_char_isalpha(char c, bool is_multibyte,
1494 : pg_locale_t locale)
1495 : {
1496 270 : if (locale->ctype_is_c)
1497 144 : return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
1498 126 : else if (is_multibyte && IS_HIGHBIT_SET(c))
1499 0 : return true;
1500 126 : else if (locale->provider != COLLPROVIDER_LIBC)
1501 126 : return IS_HIGHBIT_SET(c) ||
1502 252 : (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
1503 : else
1504 0 : return isalpha_l((unsigned char) c, locale->info.lt);
1505 : }
1506 :
1507 :
1508 : /*
1509 : * For bytea, the increment function need only increment the current byte
1510 : * (there are no multibyte characters to worry about).
1511 : */
1512 : static bool
1513 0 : byte_increment(unsigned char *ptr, int len)
1514 : {
1515 0 : if (*ptr >= 255)
1516 0 : return false;
1517 0 : (*ptr)++;
1518 0 : return true;
1519 : }
1520 :
1521 : /*
1522 : * Try to generate a string greater than the given string or any
1523 : * string it is a prefix of. If successful, return a palloc'd string
1524 : * in the form of a Const node; else return NULL.
1525 : *
1526 : * The caller must provide the appropriate "less than" comparison function
1527 : * for testing the strings, along with the collation to use.
1528 : *
1529 : * The key requirement here is that given a prefix string, say "foo",
1530 : * we must be able to generate another string "fop" that is greater than
1531 : * all strings "foobar" starting with "foo". We can test that we have
1532 : * generated a string greater than the prefix string, but in non-C collations
1533 : * that is not a bulletproof guarantee that an extension of the string might
1534 : * not sort after it; an example is that "foo " is less than "foo!", but it
1535 : * is not clear that a "dictionary" sort ordering will consider "foo!" less
1536 : * than "foo bar". CAUTION: Therefore, this function should be used only for
1537 : * estimation purposes when working in a non-C collation.
1538 : *
1539 : * To try to catch most cases where an extended string might otherwise sort
1540 : * before the result value, we determine which of the strings "Z", "z", "y",
1541 : * and "9" is seen as largest by the collation, and append that to the given
1542 : * prefix before trying to find a string that compares as larger.
1543 : *
1544 : * To search for a greater string, we repeatedly "increment" the rightmost
1545 : * character, using an encoding-specific character incrementer function.
1546 : * When it's no longer possible to increment the last character, we truncate
1547 : * off that character and start incrementing the next-to-rightmost.
1548 : * For example, if "z" were the last character in the sort order, then we
1549 : * could produce "foo" as a string greater than "fonz".
1550 : *
1551 : * This could be rather slow in the worst case, but in most cases we
1552 : * won't have to try more than one or two strings before succeeding.
1553 : *
1554 : * Note that it's important for the character incrementer not to be too anal
1555 : * about producing every possible character code, since in some cases the only
1556 : * way to get a larger string is to increment a previous character position.
1557 : * So we don't want to spend too much time trying every possible character
1558 : * code at the last position. A good rule of thumb is to be sure that we
1559 : * don't try more than 256*K values for a K-byte character (and definitely
1560 : * not 256^K, which is what an exhaustive search would approach).
1561 : */
1562 : static Const *
1563 2448 : make_greater_string(const Const *str_const, FmgrInfo *ltproc, Oid collation)
1564 : {
1565 2448 : Oid datatype = str_const->consttype;
1566 : char *workstr;
1567 : int len;
1568 : Datum cmpstr;
1569 2448 : char *cmptxt = NULL;
1570 : mbcharacter_incrementer charinc;
1571 :
1572 : /*
1573 : * Get a modifiable copy of the prefix string in C-string format, and set
1574 : * up the string we will compare to as a Datum. In C locale this can just
1575 : * be the given prefix string, otherwise we need to add a suffix. Type
1576 : * BYTEA sorts bytewise so it never needs a suffix either.
1577 : */
1578 2448 : if (datatype == BYTEAOID)
1579 : {
1580 0 : bytea *bstr = DatumGetByteaPP(str_const->constvalue);
1581 :
1582 0 : len = VARSIZE_ANY_EXHDR(bstr);
1583 0 : workstr = (char *) palloc(len);
1584 0 : memcpy(workstr, VARDATA_ANY(bstr), len);
1585 : Assert((Pointer) bstr == DatumGetPointer(str_const->constvalue));
1586 0 : cmpstr = str_const->constvalue;
1587 : }
1588 : else
1589 : {
1590 2448 : if (datatype == NAMEOID)
1591 0 : workstr = DatumGetCString(DirectFunctionCall1(nameout,
1592 : str_const->constvalue));
1593 : else
1594 2448 : workstr = TextDatumGetCString(str_const->constvalue);
1595 2448 : len = strlen(workstr);
1596 2448 : if (len == 0 || pg_newlocale_from_collation(collation)->collate_is_c)
1597 2422 : cmpstr = str_const->constvalue;
1598 : else
1599 : {
1600 : /* If first time through, determine the suffix to use */
1601 : static char suffixchar = 0;
1602 : static Oid suffixcollation = 0;
1603 :
1604 26 : if (!suffixchar || suffixcollation != collation)
1605 : {
1606 : char *best;
1607 :
1608 6 : best = "Z";
1609 6 : if (varstr_cmp(best, 1, "z", 1, collation) < 0)
1610 0 : best = "z";
1611 6 : if (varstr_cmp(best, 1, "y", 1, collation) < 0)
1612 0 : best = "y";
1613 6 : if (varstr_cmp(best, 1, "9", 1, collation) < 0)
1614 0 : best = "9";
1615 6 : suffixchar = *best;
1616 6 : suffixcollation = collation;
1617 : }
1618 :
1619 : /* And build the string to compare to */
1620 26 : if (datatype == NAMEOID)
1621 : {
1622 0 : cmptxt = palloc(len + 2);
1623 0 : memcpy(cmptxt, workstr, len);
1624 0 : cmptxt[len] = suffixchar;
1625 0 : cmptxt[len + 1] = '\0';
1626 0 : cmpstr = PointerGetDatum(cmptxt);
1627 : }
1628 : else
1629 : {
1630 26 : cmptxt = palloc(VARHDRSZ + len + 1);
1631 26 : SET_VARSIZE(cmptxt, VARHDRSZ + len + 1);
1632 26 : memcpy(VARDATA(cmptxt), workstr, len);
1633 26 : *(VARDATA(cmptxt) + len) = suffixchar;
1634 26 : cmpstr = PointerGetDatum(cmptxt);
1635 : }
1636 : }
1637 : }
1638 :
1639 : /* Select appropriate character-incrementer function */
1640 2448 : if (datatype == BYTEAOID)
1641 0 : charinc = byte_increment;
1642 : else
1643 2448 : charinc = pg_database_encoding_character_incrementer();
1644 :
1645 : /* And search ... */
1646 2448 : while (len > 0)
1647 : {
1648 : int charlen;
1649 : unsigned char *lastchar;
1650 :
1651 : /* Identify the last character --- for bytea, just the last byte */
1652 2448 : if (datatype == BYTEAOID)
1653 0 : charlen = 1;
1654 : else
1655 2448 : charlen = len - pg_mbcliplen(workstr, len, len - 1);
1656 2448 : lastchar = (unsigned char *) (workstr + len - charlen);
1657 :
1658 : /*
1659 : * Try to generate a larger string by incrementing the last character
1660 : * (for BYTEA, we treat each byte as a character).
1661 : *
1662 : * Note: the incrementer function is expected to return true if it's
1663 : * generated a valid-per-the-encoding new character, otherwise false.
1664 : * The contents of the character on false return are unspecified.
1665 : */
1666 2448 : while (charinc(lastchar, charlen))
1667 : {
1668 : Const *workstr_const;
1669 :
1670 2448 : if (datatype == BYTEAOID)
1671 0 : workstr_const = string_to_bytea_const(workstr, len);
1672 : else
1673 2448 : workstr_const = string_to_const(workstr, datatype);
1674 :
1675 2448 : if (DatumGetBool(FunctionCall2Coll(ltproc,
1676 : collation,
1677 : cmpstr,
1678 : workstr_const->constvalue)))
1679 : {
1680 : /* Successfully made a string larger than cmpstr */
1681 2448 : if (cmptxt)
1682 26 : pfree(cmptxt);
1683 2448 : pfree(workstr);
1684 2448 : return workstr_const;
1685 : }
1686 :
1687 : /* No good, release unusable value and try again */
1688 0 : pfree(DatumGetPointer(workstr_const->constvalue));
1689 0 : pfree(workstr_const);
1690 : }
1691 :
1692 : /*
1693 : * No luck here, so truncate off the last character and try to
1694 : * increment the next one.
1695 : */
1696 0 : len -= charlen;
1697 0 : workstr[len] = '\0';
1698 : }
1699 :
1700 : /* Failed... */
1701 0 : if (cmptxt)
1702 0 : pfree(cmptxt);
1703 0 : pfree(workstr);
1704 :
1705 0 : return NULL;
1706 : }
1707 :
1708 : /*
1709 : * Generate a Datum of the appropriate type from a C string.
1710 : * Note that all of the supported types are pass-by-ref, so the
1711 : * returned value should be pfree'd if no longer needed.
1712 : */
1713 : static Datum
1714 19474 : string_to_datum(const char *str, Oid datatype)
1715 : {
1716 : Assert(str != NULL);
1717 :
1718 : /*
1719 : * We cheat a little by assuming that CStringGetTextDatum() will do for
1720 : * bpchar and varchar constants too...
1721 : */
1722 19474 : if (datatype == NAMEOID)
1723 0 : return DirectFunctionCall1(namein, CStringGetDatum(str));
1724 19474 : else if (datatype == BYTEAOID)
1725 0 : return DirectFunctionCall1(byteain, CStringGetDatum(str));
1726 : else
1727 19474 : return CStringGetTextDatum(str);
1728 : }
1729 :
1730 : /*
1731 : * Generate a Const node of the appropriate type from a C string.
1732 : */
1733 : static Const *
1734 19474 : string_to_const(const char *str, Oid datatype)
1735 : {
1736 19474 : Datum conval = string_to_datum(str, datatype);
1737 : Oid collation;
1738 : int constlen;
1739 :
1740 : /*
1741 : * We only need to support a few datatypes here, so hard-wire properties
1742 : * instead of incurring the expense of catalog lookups.
1743 : */
1744 19474 : switch (datatype)
1745 : {
1746 19474 : case TEXTOID:
1747 : case VARCHAROID:
1748 : case BPCHAROID:
1749 19474 : collation = DEFAULT_COLLATION_OID;
1750 19474 : constlen = -1;
1751 19474 : break;
1752 :
1753 0 : case NAMEOID:
1754 0 : collation = C_COLLATION_OID;
1755 0 : constlen = NAMEDATALEN;
1756 0 : break;
1757 :
1758 0 : case BYTEAOID:
1759 0 : collation = InvalidOid;
1760 0 : constlen = -1;
1761 0 : break;
1762 :
1763 0 : default:
1764 0 : elog(ERROR, "unexpected datatype in string_to_const: %u",
1765 : datatype);
1766 : return NULL;
1767 : }
1768 :
1769 19474 : return makeConst(datatype, -1, collation, constlen,
1770 : conval, false, false);
1771 : }
1772 :
1773 : /*
1774 : * Generate a Const node of bytea type from a binary C string and a length.
1775 : */
1776 : static Const *
1777 12 : string_to_bytea_const(const char *str, size_t str_len)
1778 : {
1779 12 : bytea *bstr = palloc(VARHDRSZ + str_len);
1780 : Datum conval;
1781 :
1782 12 : memcpy(VARDATA(bstr), str, str_len);
1783 12 : SET_VARSIZE(bstr, VARHDRSZ + str_len);
1784 12 : conval = PointerGetDatum(bstr);
1785 :
1786 12 : return makeConst(BYTEAOID, -1, InvalidOid, -1, conval, false, false);
1787 : }
|