Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * like_support.c
4 : * Planner support functions for LIKE, regex, and related operators.
5 : *
6 : * These routines handle special optimization of operators that can be
7 : * used with index scans even though they are not known to the executor's
8 : * indexscan machinery. The key idea is that these operators allow us
9 : * to derive approximate indexscan qual clauses, such that any tuples
10 : * that pass the operator clause itself must also satisfy the simpler
11 : * indexscan condition(s). Then we can use the indexscan machinery
12 : * to avoid scanning as much of the table as we'd otherwise have to,
13 : * while applying the original operator as a qpqual condition to ensure
14 : * we deliver only the tuples we want. (In essence, we're using a regular
15 : * index as if it were a lossy index.)
16 : *
17 : * An example of what we're doing is
18 : * textfield LIKE 'abc%def'
19 : * from which we can generate the indexscanable conditions
20 : * textfield >= 'abc' AND textfield < 'abd'
21 : * which allow efficient scanning of an index on textfield.
22 : * (In reality, character set and collation issues make the transformation
23 : * from LIKE to indexscan limits rather harder than one might think ...
24 : * but that's the basic idea.)
25 : *
26 : * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
27 : * Portions Copyright (c) 1994, Regents of the University of California
28 : *
29 : *
30 : * IDENTIFICATION
31 : * src/backend/utils/adt/like_support.c
32 : *
33 : *-------------------------------------------------------------------------
34 : */
35 : #include "postgres.h"
36 :
37 : #include <math.h>
38 :
39 : #include "access/htup_details.h"
40 : #include "access/stratnum.h"
41 : #include "catalog/pg_collation.h"
42 : #include "catalog/pg_operator.h"
43 : #include "catalog/pg_opfamily.h"
44 : #include "catalog/pg_statistic.h"
45 : #include "catalog/pg_type.h"
46 : #include "mb/pg_wchar.h"
47 : #include "nodes/makefuncs.h"
48 : #include "nodes/nodeFuncs.h"
49 : #include "nodes/supportnodes.h"
50 : #include "utils/builtins.h"
51 : #include "utils/datum.h"
52 : #include "utils/lsyscache.h"
53 : #include "utils/pg_locale.h"
54 : #include "utils/selfuncs.h"
55 : #include "utils/varlena.h"
56 :
57 :
58 : typedef enum
59 : {
60 : Pattern_Type_Like,
61 : Pattern_Type_Like_IC,
62 : Pattern_Type_Regex,
63 : Pattern_Type_Regex_IC,
64 : Pattern_Type_Prefix
65 : } Pattern_Type;
66 :
67 : typedef enum
68 : {
69 : Pattern_Prefix_None, Pattern_Prefix_Partial, Pattern_Prefix_Exact
70 : } Pattern_Prefix_Status;
71 :
72 : static Node *like_regex_support(Node *rawreq, Pattern_Type ptype);
73 : static List *match_pattern_prefix(Node *leftop,
74 : Node *rightop,
75 : Pattern_Type ptype,
76 : Oid expr_coll,
77 : Oid opfamily,
78 : Oid indexcollation);
79 : static double patternsel_common(PlannerInfo *root,
80 : Oid oprid,
81 : Oid opfuncid,
82 : List *args,
83 : int varRelid,
84 : Oid collation,
85 : Pattern_Type ptype,
86 : bool negate);
87 : static Pattern_Prefix_Status pattern_fixed_prefix(Const *patt,
88 : Pattern_Type ptype,
89 : Oid collation,
90 : Const **prefix,
91 : Selectivity *rest_selec);
92 : static Selectivity prefix_selectivity(PlannerInfo *root,
93 : VariableStatData *vardata,
94 : Oid eqopr, Oid ltopr, Oid geopr,
95 : Oid collation,
96 : Const *prefixcon);
97 : static Selectivity like_selectivity(const char *patt, int pattlen,
98 : bool case_insensitive);
99 : static Selectivity regex_selectivity(const char *patt, int pattlen,
100 : bool case_insensitive,
101 : int fixed_prefix_len);
102 : static int pattern_char_isalpha(char c, bool is_multibyte,
103 : pg_locale_t locale, bool locale_is_c);
104 : static Const *make_greater_string(const Const *str_const, FmgrInfo *ltproc,
105 : Oid collation);
106 : static Datum string_to_datum(const char *str, Oid datatype);
107 : static Const *string_to_const(const char *str, Oid datatype);
108 : static Const *string_to_bytea_const(const char *str, size_t str_len);
109 :
110 :
111 : /*
112 : * Planner support functions for LIKE, regex, and related operators
113 : */
114 : Datum
115 5500 : textlike_support(PG_FUNCTION_ARGS)
116 : {
117 5500 : Node *rawreq = (Node *) PG_GETARG_POINTER(0);
118 :
119 5500 : PG_RETURN_POINTER(like_regex_support(rawreq, Pattern_Type_Like));
120 : }
121 :
122 : Datum
123 132 : texticlike_support(PG_FUNCTION_ARGS)
124 : {
125 132 : Node *rawreq = (Node *) PG_GETARG_POINTER(0);
126 :
127 132 : PG_RETURN_POINTER(like_regex_support(rawreq, Pattern_Type_Like_IC));
128 : }
129 :
130 : Datum
131 17968 : textregexeq_support(PG_FUNCTION_ARGS)
132 : {
133 17968 : Node *rawreq = (Node *) PG_GETARG_POINTER(0);
134 :
135 17968 : PG_RETURN_POINTER(like_regex_support(rawreq, Pattern_Type_Regex));
136 : }
137 :
138 : Datum
139 64 : texticregexeq_support(PG_FUNCTION_ARGS)
140 : {
141 64 : Node *rawreq = (Node *) PG_GETARG_POINTER(0);
142 :
143 64 : PG_RETURN_POINTER(like_regex_support(rawreq, Pattern_Type_Regex_IC));
144 : }
145 :
146 : Datum
147 156 : text_starts_with_support(PG_FUNCTION_ARGS)
148 : {
149 156 : Node *rawreq = (Node *) PG_GETARG_POINTER(0);
150 :
151 156 : PG_RETURN_POINTER(like_regex_support(rawreq, Pattern_Type_Prefix));
152 : }
153 :
154 : /* Common code for the above */
155 : static Node *
156 23820 : like_regex_support(Node *rawreq, Pattern_Type ptype)
157 : {
158 23820 : Node *ret = NULL;
159 :
160 23820 : if (IsA(rawreq, SupportRequestSelectivity))
161 : {
162 : /*
163 : * Make a selectivity estimate for a function call, just as we'd do if
164 : * the call was via the corresponding operator.
165 : */
166 24 : SupportRequestSelectivity *req = (SupportRequestSelectivity *) rawreq;
167 : Selectivity s1;
168 :
169 24 : if (req->is_join)
170 : {
171 : /*
172 : * For the moment we just punt. If patternjoinsel is ever
173 : * improved to do better, this should be made to call it.
174 : */
175 0 : s1 = DEFAULT_MATCH_SEL;
176 : }
177 : else
178 : {
179 : /* Share code with operator restriction selectivity functions */
180 24 : s1 = patternsel_common(req->root,
181 : InvalidOid,
182 : req->funcid,
183 : req->args,
184 : req->varRelid,
185 : req->inputcollid,
186 : ptype,
187 : false);
188 : }
189 24 : req->selectivity = s1;
190 24 : ret = (Node *) req;
191 : }
192 23796 : else if (IsA(rawreq, SupportRequestIndexCondition))
193 : {
194 : /* Try to convert operator/function call to index conditions */
195 6526 : SupportRequestIndexCondition *req = (SupportRequestIndexCondition *) rawreq;
196 :
197 : /*
198 : * Currently we have no "reverse" match operators with the pattern on
199 : * the left, so we only need consider cases with the indexkey on the
200 : * left.
201 : */
202 6526 : if (req->indexarg != 0)
203 0 : return NULL;
204 :
205 6526 : if (is_opclause(req->node))
206 : {
207 6502 : OpExpr *clause = (OpExpr *) req->node;
208 :
209 : Assert(list_length(clause->args) == 2);
210 : ret = (Node *)
211 6502 : match_pattern_prefix((Node *) linitial(clause->args),
212 6502 : (Node *) lsecond(clause->args),
213 : ptype,
214 : clause->inputcollid,
215 : req->opfamily,
216 : req->indexcollation);
217 : }
218 24 : else if (is_funcclause(req->node)) /* be paranoid */
219 : {
220 24 : FuncExpr *clause = (FuncExpr *) req->node;
221 :
222 : Assert(list_length(clause->args) == 2);
223 : ret = (Node *)
224 24 : match_pattern_prefix((Node *) linitial(clause->args),
225 24 : (Node *) lsecond(clause->args),
226 : ptype,
227 : clause->inputcollid,
228 : req->opfamily,
229 : req->indexcollation);
230 : }
231 : }
232 :
233 23820 : return ret;
234 : }
235 :
236 : /*
237 : * match_pattern_prefix
238 : * Try to generate an indexqual for a LIKE or regex operator.
239 : */
240 : static List *
241 6526 : match_pattern_prefix(Node *leftop,
242 : Node *rightop,
243 : Pattern_Type ptype,
244 : Oid expr_coll,
245 : Oid opfamily,
246 : Oid indexcollation)
247 : {
248 : List *result;
249 : Const *patt;
250 : Const *prefix;
251 : Pattern_Prefix_Status pstatus;
252 : Oid ldatatype;
253 : Oid rdatatype;
254 : Oid eqopr;
255 : Oid ltopr;
256 : Oid geopr;
257 6526 : Oid preopr = InvalidOid;
258 : bool collation_aware;
259 : Expr *expr;
260 : FmgrInfo ltproc;
261 : Const *greaterstr;
262 :
263 : /*
264 : * Can't do anything with a non-constant or NULL pattern argument.
265 : *
266 : * Note that since we restrict ourselves to cases with a hard constant on
267 : * the RHS, it's a-fortiori a pseudoconstant, and we don't need to worry
268 : * about verifying that.
269 : */
270 6526 : if (!IsA(rightop, Const) ||
271 6426 : ((Const *) rightop)->constisnull)
272 100 : return NIL;
273 6426 : patt = (Const *) rightop;
274 :
275 : /*
276 : * Not supported if the expression collation is nondeterministic. The
277 : * optimized equality or prefix tests use bytewise comparisons, which is
278 : * not consistent with nondeterministic collations. The actual
279 : * pattern-matching implementation functions will later error out that
280 : * pattern-matching is not supported with nondeterministic collations. (We
281 : * could also error out here, but by doing it later we get more precise
282 : * error messages.) (It should be possible to support at least
283 : * Pattern_Prefix_Exact, but no point as long as the actual
284 : * pattern-matching implementations don't support it.)
285 : *
286 : * expr_coll is not set for a non-collation-aware data type such as bytea.
287 : */
288 6426 : if (expr_coll && !get_collation_isdeterministic(expr_coll))
289 0 : return NIL;
290 :
291 : /*
292 : * Try to extract a fixed prefix from the pattern.
293 : */
294 6426 : pstatus = pattern_fixed_prefix(patt, ptype, expr_coll,
295 : &prefix, NULL);
296 :
297 : /* fail if no fixed prefix */
298 6426 : if (pstatus == Pattern_Prefix_None)
299 210 : return NIL;
300 :
301 : /*
302 : * Identify the operators we want to use, based on the type of the
303 : * left-hand argument. Usually these are just the type's regular
304 : * comparison operators, but if we are considering one of the semi-legacy
305 : * "pattern" opclasses, use the "pattern" operators instead. Those are
306 : * not collation-sensitive but always use C collation, as we want. The
307 : * selected operators also determine the needed type of the prefix
308 : * constant.
309 : */
310 6216 : ldatatype = exprType(leftop);
311 6216 : switch (ldatatype)
312 : {
313 56 : case TEXTOID:
314 56 : if (opfamily == TEXT_PATTERN_BTREE_FAM_OID)
315 : {
316 0 : eqopr = TextEqualOperator;
317 0 : ltopr = TextPatternLessOperator;
318 0 : geopr = TextPatternGreaterEqualOperator;
319 0 : collation_aware = false;
320 : }
321 56 : else if (opfamily == TEXT_SPGIST_FAM_OID)
322 : {
323 24 : eqopr = TextEqualOperator;
324 24 : ltopr = TextPatternLessOperator;
325 24 : geopr = TextPatternGreaterEqualOperator;
326 : /* This opfamily has direct support for prefixing */
327 24 : preopr = TextPrefixOperator;
328 24 : collation_aware = false;
329 : }
330 : else
331 : {
332 32 : eqopr = TextEqualOperator;
333 32 : ltopr = TextLessOperator;
334 32 : geopr = TextGreaterEqualOperator;
335 32 : collation_aware = true;
336 : }
337 56 : rdatatype = TEXTOID;
338 56 : break;
339 6136 : case NAMEOID:
340 :
341 : /*
342 : * Note that here, we need the RHS type to be text, so that the
343 : * comparison value isn't improperly truncated to NAMEDATALEN.
344 : */
345 6136 : eqopr = NameEqualTextOperator;
346 6136 : ltopr = NameLessTextOperator;
347 6136 : geopr = NameGreaterEqualTextOperator;
348 6136 : collation_aware = true;
349 6136 : rdatatype = TEXTOID;
350 6136 : break;
351 24 : case BPCHAROID:
352 24 : if (opfamily == BPCHAR_PATTERN_BTREE_FAM_OID)
353 : {
354 0 : eqopr = BpcharEqualOperator;
355 0 : ltopr = BpcharPatternLessOperator;
356 0 : geopr = BpcharPatternGreaterEqualOperator;
357 0 : collation_aware = false;
358 : }
359 : else
360 : {
361 24 : eqopr = BpcharEqualOperator;
362 24 : ltopr = BpcharLessOperator;
363 24 : geopr = BpcharGreaterEqualOperator;
364 24 : collation_aware = true;
365 : }
366 24 : rdatatype = BPCHAROID;
367 24 : break;
368 0 : case BYTEAOID:
369 0 : eqopr = ByteaEqualOperator;
370 0 : ltopr = ByteaLessOperator;
371 0 : geopr = ByteaGreaterEqualOperator;
372 0 : collation_aware = false;
373 0 : rdatatype = BYTEAOID;
374 0 : break;
375 0 : default:
376 : /* Can't get here unless we're attached to the wrong operator */
377 0 : return NIL;
378 : }
379 :
380 : /*
381 : * If necessary, coerce the prefix constant to the right type. The given
382 : * prefix constant is either text or bytea type, therefore the only case
383 : * where we need to do anything is when converting text to bpchar. Those
384 : * two types are binary-compatible, so relabeling the Const node is
385 : * sufficient.
386 : */
387 6216 : if (prefix->consttype != rdatatype)
388 : {
389 : Assert(prefix->consttype == TEXTOID &&
390 : rdatatype == BPCHAROID);
391 24 : prefix->consttype = rdatatype;
392 : }
393 :
394 : /*
395 : * If we found an exact-match pattern, generate an "=" indexqual.
396 : *
397 : * Here and below, check to see whether the desired operator is actually
398 : * supported by the index opclass, and fail quietly if not. This allows
399 : * us to not be concerned with specific opclasses (except for the legacy
400 : * "pattern" cases); any index that correctly implements the operators
401 : * will work.
402 : */
403 6216 : if (pstatus == Pattern_Prefix_Exact)
404 : {
405 5082 : if (!op_in_opfamily(eqopr, opfamily))
406 12 : return NIL;
407 5070 : expr = make_opclause(eqopr, BOOLOID, false,
408 : (Expr *) leftop, (Expr *) prefix,
409 : InvalidOid, indexcollation);
410 5070 : result = list_make1(expr);
411 5070 : return result;
412 : }
413 :
414 : /*
415 : * Otherwise, we have a nonempty required prefix of the values. Some
416 : * opclasses support prefix checks directly, otherwise we'll try to
417 : * generate a range constraint.
418 : */
419 1134 : if (OidIsValid(preopr) && op_in_opfamily(preopr, opfamily))
420 : {
421 24 : expr = make_opclause(preopr, BOOLOID, false,
422 : (Expr *) leftop, (Expr *) prefix,
423 : InvalidOid, indexcollation);
424 24 : result = list_make1(expr);
425 24 : return result;
426 : }
427 :
428 : /*
429 : * Since we need a range constraint, it's only going to work reliably if
430 : * the index is collation-insensitive or has "C" collation. Note that
431 : * here we are looking at the index's collation, not the expression's
432 : * collation -- this test is *not* dependent on the LIKE/regex operator's
433 : * collation.
434 : */
435 1110 : if (collation_aware &&
436 1110 : !lc_collate_is_c(indexcollation))
437 8 : return NIL;
438 :
439 : /*
440 : * We can always say "x >= prefix".
441 : */
442 1102 : if (!op_in_opfamily(geopr, opfamily))
443 12 : return NIL;
444 1090 : expr = make_opclause(geopr, BOOLOID, false,
445 : (Expr *) leftop, (Expr *) prefix,
446 : InvalidOid, indexcollation);
447 1090 : result = list_make1(expr);
448 :
449 : /*-------
450 : * If we can create a string larger than the prefix, we can say
451 : * "x < greaterstr". NB: we rely on make_greater_string() to generate
452 : * a guaranteed-greater string, not just a probably-greater string.
453 : * In general this is only guaranteed in C locale, so we'd better be
454 : * using a C-locale index collation.
455 : *-------
456 : */
457 1090 : if (!op_in_opfamily(ltopr, opfamily))
458 0 : return result;
459 1090 : fmgr_info(get_opcode(ltopr), <proc);
460 1090 : greaterstr = make_greater_string(prefix, <proc, indexcollation);
461 1090 : if (greaterstr)
462 : {
463 1090 : expr = make_opclause(ltopr, BOOLOID, false,
464 : (Expr *) leftop, (Expr *) greaterstr,
465 : InvalidOid, indexcollation);
466 1090 : result = lappend(result, expr);
467 : }
468 :
469 1090 : return result;
470 : }
471 :
472 :
473 : /*
474 : * patternsel_common - generic code for pattern-match restriction selectivity.
475 : *
476 : * To support using this from either the operator or function paths, caller
477 : * may pass either operator OID or underlying function OID; we look up the
478 : * latter from the former if needed. (We could just have patternsel() call
479 : * get_opcode(), but the work would be wasted if we don't have a need to
480 : * compare a fixed prefix to the pg_statistic data.)
481 : *
482 : * Note that oprid and/or opfuncid should be for the positive-match operator
483 : * even when negate is true.
484 : */
485 : static double
486 8760 : patternsel_common(PlannerInfo *root,
487 : Oid oprid,
488 : Oid opfuncid,
489 : List *args,
490 : int varRelid,
491 : Oid collation,
492 : Pattern_Type ptype,
493 : bool negate)
494 : {
495 : VariableStatData vardata;
496 : Node *other;
497 : bool varonleft;
498 : Datum constval;
499 : Oid consttype;
500 : Oid vartype;
501 : Oid rdatatype;
502 : Oid eqopr;
503 : Oid ltopr;
504 : Oid geopr;
505 : Pattern_Prefix_Status pstatus;
506 : Const *patt;
507 8760 : Const *prefix = NULL;
508 8760 : Selectivity rest_selec = 0;
509 8760 : double nullfrac = 0.0;
510 : double result;
511 :
512 : /*
513 : * Initialize result to the appropriate default estimate depending on
514 : * whether it's a match or not-match operator.
515 : */
516 8760 : if (negate)
517 796 : result = 1.0 - DEFAULT_MATCH_SEL;
518 : else
519 7964 : result = DEFAULT_MATCH_SEL;
520 :
521 : /*
522 : * If expression is not variable op constant, then punt and return the
523 : * default estimate.
524 : */
525 8760 : if (!get_restriction_variable(root, args, varRelid,
526 : &vardata, &other, &varonleft))
527 68 : return result;
528 8692 : if (!varonleft || !IsA(other, Const))
529 : {
530 40 : ReleaseVariableStats(vardata);
531 40 : return result;
532 : }
533 :
534 : /*
535 : * If the constant is NULL, assume operator is strict and return zero, ie,
536 : * operator will never return TRUE. (It's zero even for a negator op.)
537 : */
538 8652 : if (((Const *) other)->constisnull)
539 : {
540 0 : ReleaseVariableStats(vardata);
541 0 : return 0.0;
542 : }
543 8652 : constval = ((Const *) other)->constvalue;
544 8652 : consttype = ((Const *) other)->consttype;
545 :
546 : /*
547 : * The right-hand const is type text or bytea for all supported operators.
548 : * We do not expect to see binary-compatible types here, since
549 : * const-folding should have relabeled the const to exactly match the
550 : * operator's declared type.
551 : */
552 8652 : if (consttype != TEXTOID && consttype != BYTEAOID)
553 : {
554 24 : ReleaseVariableStats(vardata);
555 24 : return result;
556 : }
557 :
558 : /*
559 : * Similarly, the exposed type of the left-hand side should be one of
560 : * those we know. (Do not look at vardata.atttype, which might be
561 : * something binary-compatible but different.) We can use it to identify
562 : * the comparison operators and the required type of the comparison
563 : * constant, much as in match_pattern_prefix().
564 : */
565 8628 : vartype = vardata.vartype;
566 :
567 8628 : switch (vartype)
568 : {
569 1626 : case TEXTOID:
570 1626 : eqopr = TextEqualOperator;
571 1626 : ltopr = TextLessOperator;
572 1626 : geopr = TextGreaterEqualOperator;
573 1626 : rdatatype = TEXTOID;
574 1626 : break;
575 6932 : case NAMEOID:
576 :
577 : /*
578 : * Note that here, we need the RHS type to be text, so that the
579 : * comparison value isn't improperly truncated to NAMEDATALEN.
580 : */
581 6932 : eqopr = NameEqualTextOperator;
582 6932 : ltopr = NameLessTextOperator;
583 6932 : geopr = NameGreaterEqualTextOperator;
584 6932 : rdatatype = TEXTOID;
585 6932 : break;
586 60 : case BPCHAROID:
587 60 : eqopr = BpcharEqualOperator;
588 60 : ltopr = BpcharLessOperator;
589 60 : geopr = BpcharGreaterEqualOperator;
590 60 : rdatatype = BPCHAROID;
591 60 : break;
592 6 : case BYTEAOID:
593 6 : eqopr = ByteaEqualOperator;
594 6 : ltopr = ByteaLessOperator;
595 6 : geopr = ByteaGreaterEqualOperator;
596 6 : rdatatype = BYTEAOID;
597 6 : break;
598 4 : default:
599 : /* Can't get here unless we're attached to the wrong operator */
600 4 : ReleaseVariableStats(vardata);
601 4 : return result;
602 : }
603 :
604 : /*
605 : * Grab the nullfrac for use below.
606 : */
607 8624 : if (HeapTupleIsValid(vardata.statsTuple))
608 : {
609 : Form_pg_statistic stats;
610 :
611 6620 : stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
612 6620 : nullfrac = stats->stanullfrac;
613 : }
614 :
615 : /*
616 : * Pull out any fixed prefix implied by the pattern, and estimate the
617 : * fractional selectivity of the remainder of the pattern. Unlike many
618 : * other selectivity estimators, we use the pattern operator's actual
619 : * collation for this step. This is not because we expect the collation
620 : * to make a big difference in the selectivity estimate (it seldom would),
621 : * but because we want to be sure we cache compiled regexps under the
622 : * right cache key, so that they can be re-used at runtime.
623 : */
624 8624 : patt = (Const *) other;
625 8624 : pstatus = pattern_fixed_prefix(patt, ptype, collation,
626 : &prefix, &rest_selec);
627 :
628 : /*
629 : * If necessary, coerce the prefix constant to the right type. The only
630 : * case where we need to do anything is when converting text to bpchar.
631 : * Those two types are binary-compatible, so relabeling the Const node is
632 : * sufficient.
633 : */
634 8624 : if (prefix && prefix->consttype != rdatatype)
635 : {
636 : Assert(prefix->consttype == TEXTOID &&
637 : rdatatype == BPCHAROID);
638 24 : prefix->consttype = rdatatype;
639 : }
640 :
641 8624 : if (pstatus == Pattern_Prefix_Exact)
642 : {
643 : /*
644 : * Pattern specifies an exact match, so estimate as for '='
645 : */
646 5260 : result = var_eq_const(&vardata, eqopr, collation, prefix->constvalue,
647 : false, true, false);
648 : }
649 : else
650 : {
651 : /*
652 : * Not exact-match pattern. If we have a sufficiently large
653 : * histogram, estimate selectivity for the histogram part of the
654 : * population by counting matches in the histogram. If not, estimate
655 : * selectivity of the fixed prefix and remainder of pattern
656 : * separately, then combine the two to get an estimate of the
657 : * selectivity for the part of the column population represented by
658 : * the histogram. (For small histograms, we combine these
659 : * approaches.)
660 : *
661 : * We then add up data for any most-common-values values; these are
662 : * not in the histogram population, and we can get exact answers for
663 : * them by applying the pattern operator, so there's no reason to
664 : * approximate. (If the MCVs cover a significant part of the total
665 : * population, this gives us a big leg up in accuracy.)
666 : */
667 : Selectivity selec;
668 : int hist_size;
669 : FmgrInfo opproc;
670 : double mcv_selec,
671 : sumcommon;
672 :
673 : /* Try to use the histogram entries to get selectivity */
674 3364 : if (!OidIsValid(opfuncid))
675 3340 : opfuncid = get_opcode(oprid);
676 3364 : fmgr_info(opfuncid, &opproc);
677 :
678 3364 : selec = histogram_selectivity(&vardata, &opproc, collation,
679 : constval, true,
680 : 10, 1, &hist_size);
681 :
682 : /* If not at least 100 entries, use the heuristic method */
683 3364 : if (hist_size < 100)
684 : {
685 : Selectivity heursel;
686 : Selectivity prefixsel;
687 :
688 2200 : if (pstatus == Pattern_Prefix_Partial)
689 1596 : prefixsel = prefix_selectivity(root, &vardata,
690 : eqopr, ltopr, geopr,
691 : collation,
692 : prefix);
693 : else
694 604 : prefixsel = 1.0;
695 2200 : heursel = prefixsel * rest_selec;
696 :
697 2200 : if (selec < 0) /* fewer than 10 histogram entries? */
698 2000 : selec = heursel;
699 : else
700 : {
701 : /*
702 : * For histogram sizes from 10 to 100, we combine the
703 : * histogram and heuristic selectivities, putting increasingly
704 : * more trust in the histogram for larger sizes.
705 : */
706 200 : double hist_weight = hist_size / 100.0;
707 :
708 200 : selec = selec * hist_weight + heursel * (1.0 - hist_weight);
709 : }
710 : }
711 :
712 : /* In any case, don't believe extremely small or large estimates. */
713 3364 : if (selec < 0.0001)
714 1158 : selec = 0.0001;
715 2206 : else if (selec > 0.9999)
716 110 : selec = 0.9999;
717 :
718 : /*
719 : * If we have most-common-values info, add up the fractions of the MCV
720 : * entries that satisfy MCV OP PATTERN. These fractions contribute
721 : * directly to the result selectivity. Also add up the total fraction
722 : * represented by MCV entries.
723 : */
724 3364 : mcv_selec = mcv_selectivity(&vardata, &opproc, collation,
725 : constval, true,
726 : &sumcommon);
727 :
728 : /*
729 : * Now merge the results from the MCV and histogram calculations,
730 : * realizing that the histogram covers only the non-null values that
731 : * are not listed in MCV.
732 : */
733 3364 : selec *= 1.0 - nullfrac - sumcommon;
734 3364 : selec += mcv_selec;
735 3364 : result = selec;
736 : }
737 :
738 : /* now adjust if we wanted not-match rather than match */
739 8624 : if (negate)
740 716 : result = 1.0 - result - nullfrac;
741 :
742 : /* result should be in range, but make sure... */
743 8624 : CLAMP_PROBABILITY(result);
744 :
745 8624 : if (prefix)
746 : {
747 8092 : pfree(DatumGetPointer(prefix->constvalue));
748 8092 : pfree(prefix);
749 : }
750 :
751 8624 : ReleaseVariableStats(vardata);
752 :
753 8624 : return result;
754 : }
755 :
756 : /*
757 : * Fix impedance mismatch between SQL-callable functions and patternsel_common
758 : */
759 : static double
760 8736 : patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype, bool negate)
761 : {
762 8736 : PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
763 8736 : Oid operator = PG_GETARG_OID(1);
764 8736 : List *args = (List *) PG_GETARG_POINTER(2);
765 8736 : int varRelid = PG_GETARG_INT32(3);
766 8736 : Oid collation = PG_GET_COLLATION();
767 :
768 : /*
769 : * If this is for a NOT LIKE or similar operator, get the corresponding
770 : * positive-match operator and work with that.
771 : */
772 8736 : if (negate)
773 : {
774 796 : operator = get_negator(operator);
775 796 : if (!OidIsValid(operator))
776 0 : elog(ERROR, "patternsel called for operator without a negator");
777 : }
778 :
779 8736 : return patternsel_common(root,
780 : operator,
781 : InvalidOid,
782 : args,
783 : varRelid,
784 : collation,
785 : ptype,
786 : negate);
787 : }
788 :
789 : /*
790 : * regexeqsel - Selectivity of regular-expression pattern match.
791 : */
792 : Datum
793 6006 : regexeqsel(PG_FUNCTION_ARGS)
794 : {
795 6006 : PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Regex, false));
796 : }
797 :
798 : /*
799 : * icregexeqsel - Selectivity of case-insensitive regex match.
800 : */
801 : Datum
802 40 : icregexeqsel(PG_FUNCTION_ARGS)
803 : {
804 40 : PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Regex_IC, false));
805 : }
806 :
807 : /*
808 : * likesel - Selectivity of LIKE pattern match.
809 : */
810 : Datum
811 1784 : likesel(PG_FUNCTION_ARGS)
812 : {
813 1784 : PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Like, false));
814 : }
815 :
816 : /*
817 : * prefixsel - selectivity of prefix operator
818 : */
819 : Datum
820 54 : prefixsel(PG_FUNCTION_ARGS)
821 : {
822 54 : PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Prefix, false));
823 : }
824 :
825 : /*
826 : *
827 : * iclikesel - Selectivity of ILIKE pattern match.
828 : */
829 : Datum
830 56 : iclikesel(PG_FUNCTION_ARGS)
831 : {
832 56 : PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Like_IC, false));
833 : }
834 :
835 : /*
836 : * regexnesel - Selectivity of regular-expression pattern non-match.
837 : */
838 : Datum
839 642 : regexnesel(PG_FUNCTION_ARGS)
840 : {
841 642 : PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Regex, true));
842 : }
843 :
844 : /*
845 : * icregexnesel - Selectivity of case-insensitive regex non-match.
846 : */
847 : Datum
848 16 : icregexnesel(PG_FUNCTION_ARGS)
849 : {
850 16 : PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Regex_IC, true));
851 : }
852 :
853 : /*
854 : * nlikesel - Selectivity of LIKE pattern non-match.
855 : */
856 : Datum
857 130 : nlikesel(PG_FUNCTION_ARGS)
858 : {
859 130 : PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Like, true));
860 : }
861 :
862 : /*
863 : * icnlikesel - Selectivity of ILIKE pattern non-match.
864 : */
865 : Datum
866 8 : icnlikesel(PG_FUNCTION_ARGS)
867 : {
868 8 : PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Like_IC, true));
869 : }
870 :
871 : /*
872 : * patternjoinsel - Generic code for pattern-match join selectivity.
873 : */
874 : static double
875 194 : patternjoinsel(PG_FUNCTION_ARGS, Pattern_Type ptype, bool negate)
876 : {
877 : /* For the moment we just punt. */
878 194 : return negate ? (1.0 - DEFAULT_MATCH_SEL) : DEFAULT_MATCH_SEL;
879 : }
880 :
881 : /*
882 : * regexeqjoinsel - Join selectivity of regular-expression pattern match.
883 : */
884 : Datum
885 182 : regexeqjoinsel(PG_FUNCTION_ARGS)
886 : {
887 182 : PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Regex, false));
888 : }
889 :
890 : /*
891 : * icregexeqjoinsel - Join selectivity of case-insensitive regex match.
892 : */
893 : Datum
894 0 : icregexeqjoinsel(PG_FUNCTION_ARGS)
895 : {
896 0 : PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Regex_IC, false));
897 : }
898 :
899 : /*
900 : * likejoinsel - Join selectivity of LIKE pattern match.
901 : */
902 : Datum
903 12 : likejoinsel(PG_FUNCTION_ARGS)
904 : {
905 12 : PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Like, false));
906 : }
907 :
908 : /*
909 : * prefixjoinsel - Join selectivity of prefix operator
910 : */
911 : Datum
912 0 : prefixjoinsel(PG_FUNCTION_ARGS)
913 : {
914 0 : PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Prefix, false));
915 : }
916 :
917 : /*
918 : * iclikejoinsel - Join selectivity of ILIKE pattern match.
919 : */
920 : Datum
921 0 : iclikejoinsel(PG_FUNCTION_ARGS)
922 : {
923 0 : PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Like_IC, false));
924 : }
925 :
926 : /*
927 : * regexnejoinsel - Join selectivity of regex non-match.
928 : */
929 : Datum
930 0 : regexnejoinsel(PG_FUNCTION_ARGS)
931 : {
932 0 : PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Regex, true));
933 : }
934 :
935 : /*
936 : * icregexnejoinsel - Join selectivity of case-insensitive regex non-match.
937 : */
938 : Datum
939 0 : icregexnejoinsel(PG_FUNCTION_ARGS)
940 : {
941 0 : PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Regex_IC, true));
942 : }
943 :
944 : /*
945 : * nlikejoinsel - Join selectivity of LIKE pattern non-match.
946 : */
947 : Datum
948 0 : nlikejoinsel(PG_FUNCTION_ARGS)
949 : {
950 0 : PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Like, true));
951 : }
952 :
953 : /*
954 : * icnlikejoinsel - Join selectivity of ILIKE pattern non-match.
955 : */
956 : Datum
957 0 : icnlikejoinsel(PG_FUNCTION_ARGS)
958 : {
959 0 : PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Like_IC, true));
960 : }
961 :
962 :
963 : /*-------------------------------------------------------------------------
964 : *
965 : * Pattern analysis functions
966 : *
967 : * These routines support analysis of LIKE and regular-expression patterns
968 : * by the planner/optimizer. It's important that they agree with the
969 : * regular-expression code in backend/regex/ and the LIKE code in
970 : * backend/utils/adt/like.c. Also, the computation of the fixed prefix
971 : * must be conservative: if we report a string longer than the true fixed
972 : * prefix, the query may produce actually wrong answers, rather than just
973 : * getting a bad selectivity estimate!
974 : *
975 : *-------------------------------------------------------------------------
976 : */
977 :
978 : /*
979 : * Extract the fixed prefix, if any, for a pattern.
980 : *
981 : * *prefix is set to a palloc'd prefix string (in the form of a Const node),
982 : * or to NULL if no fixed prefix exists for the pattern.
983 : * If rest_selec is not NULL, *rest_selec is set to an estimate of the
984 : * selectivity of the remainder of the pattern (without any fixed prefix).
985 : * The prefix Const has the same type (TEXT or BYTEA) as the input pattern.
986 : *
987 : * The return value distinguishes no fixed prefix, a partial prefix,
988 : * or an exact-match-only pattern.
989 : */
990 :
991 : static Pattern_Prefix_Status
992 2922 : like_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation,
993 : Const **prefix_const, Selectivity *rest_selec)
994 : {
995 : char *match;
996 : char *patt;
997 : int pattlen;
998 2922 : Oid typeid = patt_const->consttype;
999 : int pos,
1000 : match_pos;
1001 2922 : bool is_multibyte = (pg_database_encoding_max_length() > 1);
1002 2922 : pg_locale_t locale = 0;
1003 2922 : bool locale_is_c = false;
1004 :
1005 : /* the right-hand const is type text or bytea */
1006 : Assert(typeid == BYTEAOID || typeid == TEXTOID);
1007 :
1008 2922 : if (case_insensitive)
1009 : {
1010 84 : if (typeid == BYTEAOID)
1011 0 : ereport(ERROR,
1012 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1013 : errmsg("case insensitive matching not supported on type bytea")));
1014 :
1015 84 : if (!OidIsValid(collation))
1016 : {
1017 : /*
1018 : * This typically means that the parser could not resolve a
1019 : * conflict of implicit collations, so report it that way.
1020 : */
1021 0 : ereport(ERROR,
1022 : (errcode(ERRCODE_INDETERMINATE_COLLATION),
1023 : errmsg("could not determine which collation to use for ILIKE"),
1024 : errhint("Use the COLLATE clause to set the collation explicitly.")));
1025 : }
1026 :
1027 : /* If case-insensitive, we need locale info */
1028 84 : if (lc_ctype_is_c(collation))
1029 84 : locale_is_c = true;
1030 : else
1031 0 : locale = pg_newlocale_from_collation(collation);
1032 : }
1033 :
1034 2922 : if (typeid != BYTEAOID)
1035 : {
1036 2910 : patt = TextDatumGetCString(patt_const->constvalue);
1037 2910 : pattlen = strlen(patt);
1038 : }
1039 : else
1040 : {
1041 12 : bytea *bstr = DatumGetByteaPP(patt_const->constvalue);
1042 :
1043 12 : pattlen = VARSIZE_ANY_EXHDR(bstr);
1044 12 : patt = (char *) palloc(pattlen);
1045 12 : memcpy(patt, VARDATA_ANY(bstr), pattlen);
1046 : Assert((Pointer) bstr == DatumGetPointer(patt_const->constvalue));
1047 : }
1048 :
1049 2922 : match = palloc(pattlen + 1);
1050 2922 : match_pos = 0;
1051 18956 : for (pos = 0; pos < pattlen; pos++)
1052 : {
1053 : /* % and _ are wildcard characters in LIKE */
1054 18892 : if (patt[pos] == '%' ||
1055 17008 : patt[pos] == '_')
1056 : break;
1057 :
1058 : /* Backslash escapes the next character */
1059 16062 : if (patt[pos] == '\\')
1060 : {
1061 262 : pos++;
1062 262 : if (pos >= pattlen)
1063 0 : break;
1064 : }
1065 :
1066 : /* Stop if case-varying character (it's sort of a wildcard) */
1067 16186 : if (case_insensitive &&
1068 124 : pattern_char_isalpha(patt[pos], is_multibyte, locale, locale_is_c))
1069 28 : break;
1070 :
1071 16034 : match[match_pos++] = patt[pos];
1072 : }
1073 :
1074 2922 : match[match_pos] = '\0';
1075 :
1076 2922 : if (typeid != BYTEAOID)
1077 2910 : *prefix_const = string_to_const(match, typeid);
1078 : else
1079 12 : *prefix_const = string_to_bytea_const(match, match_pos);
1080 :
1081 2922 : if (rest_selec != NULL)
1082 1962 : *rest_selec = like_selectivity(&patt[pos], pattlen - pos,
1083 : case_insensitive);
1084 :
1085 2922 : pfree(patt);
1086 2922 : pfree(match);
1087 :
1088 : /* in LIKE, an empty pattern is an exact match! */
1089 2922 : if (pos == pattlen)
1090 64 : return Pattern_Prefix_Exact; /* reached end of pattern, so exact */
1091 :
1092 2858 : if (match_pos > 0)
1093 2498 : return Pattern_Prefix_Partial;
1094 :
1095 360 : return Pattern_Prefix_None;
1096 : }
1097 :
1098 : static Pattern_Prefix_Status
1099 12026 : regex_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation,
1100 : Const **prefix_const, Selectivity *rest_selec)
1101 : {
1102 12026 : Oid typeid = patt_const->consttype;
1103 : char *prefix;
1104 : bool exact;
1105 :
1106 : /*
1107 : * Should be unnecessary, there are no bytea regex operators defined. As
1108 : * such, it should be noted that the rest of this function has *not* been
1109 : * made safe for binary (possibly NULL containing) strings.
1110 : */
1111 12026 : if (typeid == BYTEAOID)
1112 0 : ereport(ERROR,
1113 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1114 : errmsg("regular-expression matching not supported on type bytea")));
1115 :
1116 : /* Use the regexp machinery to extract the prefix, if any */
1117 12026 : prefix = regexp_fixed_prefix(DatumGetTextPP(patt_const->constvalue),
1118 : case_insensitive, collation,
1119 : &exact);
1120 :
1121 12026 : if (prefix == NULL)
1122 : {
1123 648 : *prefix_const = NULL;
1124 :
1125 648 : if (rest_selec != NULL)
1126 : {
1127 532 : char *patt = TextDatumGetCString(patt_const->constvalue);
1128 :
1129 532 : *rest_selec = regex_selectivity(patt, strlen(patt),
1130 : case_insensitive,
1131 : 0);
1132 532 : pfree(patt);
1133 : }
1134 :
1135 648 : return Pattern_Prefix_None;
1136 : }
1137 :
1138 11378 : *prefix_const = string_to_const(prefix, typeid);
1139 :
1140 11378 : if (rest_selec != NULL)
1141 : {
1142 6052 : if (exact)
1143 : {
1144 : /* Exact match, so there's no additional selectivity */
1145 5226 : *rest_selec = 1.0;
1146 : }
1147 : else
1148 : {
1149 826 : char *patt = TextDatumGetCString(patt_const->constvalue);
1150 :
1151 1652 : *rest_selec = regex_selectivity(patt, strlen(patt),
1152 : case_insensitive,
1153 826 : strlen(prefix));
1154 826 : pfree(patt);
1155 : }
1156 : }
1157 :
1158 11378 : pfree(prefix);
1159 :
1160 11378 : if (exact)
1161 10278 : return Pattern_Prefix_Exact; /* pattern specifies exact match */
1162 : else
1163 1100 : return Pattern_Prefix_Partial;
1164 : }
1165 :
1166 : static Pattern_Prefix_Status
1167 15050 : pattern_fixed_prefix(Const *patt, Pattern_Type ptype, Oid collation,
1168 : Const **prefix, Selectivity *rest_selec)
1169 : {
1170 : Pattern_Prefix_Status result;
1171 :
1172 15050 : switch (ptype)
1173 : {
1174 2838 : case Pattern_Type_Like:
1175 2838 : result = like_fixed_prefix(patt, false, collation,
1176 : prefix, rest_selec);
1177 2838 : break;
1178 84 : case Pattern_Type_Like_IC:
1179 84 : result = like_fixed_prefix(patt, true, collation,
1180 : prefix, rest_selec);
1181 84 : break;
1182 11994 : case Pattern_Type_Regex:
1183 11994 : result = regex_fixed_prefix(patt, false, collation,
1184 : prefix, rest_selec);
1185 11994 : break;
1186 32 : case Pattern_Type_Regex_IC:
1187 32 : result = regex_fixed_prefix(patt, true, collation,
1188 : prefix, rest_selec);
1189 32 : break;
1190 102 : case Pattern_Type_Prefix:
1191 : /* Prefix type work is trivial. */
1192 102 : result = Pattern_Prefix_Partial;
1193 102 : *prefix = makeConst(patt->consttype,
1194 : patt->consttypmod,
1195 : patt->constcollid,
1196 : patt->constlen,
1197 : datumCopy(patt->constvalue,
1198 102 : patt->constbyval,
1199 : patt->constlen),
1200 102 : patt->constisnull,
1201 102 : patt->constbyval);
1202 102 : if (rest_selec != NULL)
1203 78 : *rest_selec = 1.0; /* all */
1204 102 : break;
1205 0 : default:
1206 0 : elog(ERROR, "unrecognized ptype: %d", (int) ptype);
1207 : result = Pattern_Prefix_None; /* keep compiler quiet */
1208 : break;
1209 : }
1210 15050 : return result;
1211 : }
1212 :
1213 : /*
1214 : * Estimate the selectivity of a fixed prefix for a pattern match.
1215 : *
1216 : * A fixed prefix "foo" is estimated as the selectivity of the expression
1217 : * "variable >= 'foo' AND variable < 'fop'".
1218 : *
1219 : * The selectivity estimate is with respect to the portion of the column
1220 : * population represented by the histogram --- the caller must fold this
1221 : * together with info about MCVs and NULLs.
1222 : *
1223 : * We use the given comparison operators and collation to do the estimation.
1224 : * The given variable and Const must be of the associated datatype(s).
1225 : *
1226 : * XXX Note: we make use of the upper bound to estimate operator selectivity
1227 : * even if the locale is such that we cannot rely on the upper-bound string.
1228 : * The selectivity only needs to be approximately right anyway, so it seems
1229 : * more useful to use the upper-bound code than not.
1230 : */
1231 : static Selectivity
1232 1596 : prefix_selectivity(PlannerInfo *root, VariableStatData *vardata,
1233 : Oid eqopr, Oid ltopr, Oid geopr,
1234 : Oid collation,
1235 : Const *prefixcon)
1236 : {
1237 : Selectivity prefixsel;
1238 : FmgrInfo opproc;
1239 : Const *greaterstrcon;
1240 : Selectivity eq_sel;
1241 :
1242 : /* Estimate the selectivity of "x >= prefix" */
1243 1596 : fmgr_info(get_opcode(geopr), &opproc);
1244 :
1245 1596 : prefixsel = ineq_histogram_selectivity(root, vardata,
1246 : geopr, &opproc, true, true,
1247 : collation,
1248 : prefixcon->constvalue,
1249 : prefixcon->consttype);
1250 :
1251 1596 : if (prefixsel < 0.0)
1252 : {
1253 : /* No histogram is present ... return a suitable default estimate */
1254 928 : return DEFAULT_MATCH_SEL;
1255 : }
1256 :
1257 : /*
1258 : * If we can create a string larger than the prefix, say "x < greaterstr".
1259 : */
1260 668 : fmgr_info(get_opcode(ltopr), &opproc);
1261 668 : greaterstrcon = make_greater_string(prefixcon, &opproc, collation);
1262 668 : if (greaterstrcon)
1263 : {
1264 : Selectivity topsel;
1265 :
1266 668 : topsel = ineq_histogram_selectivity(root, vardata,
1267 : ltopr, &opproc, false, false,
1268 : collation,
1269 : greaterstrcon->constvalue,
1270 : greaterstrcon->consttype);
1271 :
1272 : /* ineq_histogram_selectivity worked before, it shouldn't fail now */
1273 : Assert(topsel >= 0.0);
1274 :
1275 : /*
1276 : * Merge the two selectivities in the same way as for a range query
1277 : * (see clauselist_selectivity()). Note that we don't need to worry
1278 : * about double-exclusion of nulls, since ineq_histogram_selectivity
1279 : * doesn't count those anyway.
1280 : */
1281 668 : prefixsel = topsel + prefixsel - 1.0;
1282 : }
1283 :
1284 : /*
1285 : * If the prefix is long then the two bounding values might be too close
1286 : * together for the histogram to distinguish them usefully, resulting in a
1287 : * zero estimate (plus or minus roundoff error). To avoid returning a
1288 : * ridiculously small estimate, compute the estimated selectivity for
1289 : * "variable = 'foo'", and clamp to that. (Obviously, the resultant
1290 : * estimate should be at least that.)
1291 : *
1292 : * We apply this even if we couldn't make a greater string. That case
1293 : * suggests that the prefix is near the maximum possible, and thus
1294 : * probably off the end of the histogram, and thus we probably got a very
1295 : * small estimate from the >= condition; so we still need to clamp.
1296 : */
1297 668 : eq_sel = var_eq_const(vardata, eqopr, collation, prefixcon->constvalue,
1298 : false, true, false);
1299 :
1300 668 : prefixsel = Max(prefixsel, eq_sel);
1301 :
1302 668 : return prefixsel;
1303 : }
1304 :
1305 :
1306 : /*
1307 : * Estimate the selectivity of a pattern of the specified type.
1308 : * Note that any fixed prefix of the pattern will have been removed already,
1309 : * so actually we may be looking at just a fragment of the pattern.
1310 : *
1311 : * For now, we use a very simplistic approach: fixed characters reduce the
1312 : * selectivity a good deal, character ranges reduce it a little,
1313 : * wildcards (such as % for LIKE or .* for regex) increase it.
1314 : */
1315 :
1316 : #define FIXED_CHAR_SEL 0.20 /* about 1/5 */
1317 : #define CHAR_RANGE_SEL 0.25
1318 : #define ANY_CHAR_SEL 0.9 /* not 1, since it won't match end-of-string */
1319 : #define FULL_WILDCARD_SEL 5.0
1320 : #define PARTIAL_WILDCARD_SEL 2.0
1321 :
1322 : static Selectivity
1323 1962 : like_selectivity(const char *patt, int pattlen, bool case_insensitive)
1324 : {
1325 1962 : Selectivity sel = 1.0;
1326 : int pos;
1327 :
1328 : /* Skip any leading wildcard; it's already factored into initial sel */
1329 3892 : for (pos = 0; pos < pattlen; pos++)
1330 : {
1331 2630 : if (patt[pos] != '%' && patt[pos] != '_')
1332 700 : break;
1333 : }
1334 :
1335 5990 : for (; pos < pattlen; pos++)
1336 : {
1337 : /* % and _ are wildcard characters in LIKE */
1338 4028 : if (patt[pos] == '%')
1339 620 : sel *= FULL_WILDCARD_SEL;
1340 3408 : else if (patt[pos] == '_')
1341 132 : sel *= ANY_CHAR_SEL;
1342 3276 : else if (patt[pos] == '\\')
1343 : {
1344 : /* Backslash quotes the next character */
1345 40 : pos++;
1346 40 : if (pos >= pattlen)
1347 0 : break;
1348 40 : sel *= FIXED_CHAR_SEL;
1349 : }
1350 : else
1351 3236 : sel *= FIXED_CHAR_SEL;
1352 : }
1353 : /* Could get sel > 1 if multiple wildcards */
1354 1962 : if (sel > 1.0)
1355 0 : sel = 1.0;
1356 1962 : return sel;
1357 : }
1358 :
1359 : static Selectivity
1360 1626 : regex_selectivity_sub(const char *patt, int pattlen, bool case_insensitive)
1361 : {
1362 1626 : Selectivity sel = 1.0;
1363 1626 : int paren_depth = 0;
1364 1626 : int paren_pos = 0; /* dummy init to keep compiler quiet */
1365 : int pos;
1366 :
1367 15446 : for (pos = 0; pos < pattlen; pos++)
1368 : {
1369 13838 : if (patt[pos] == '(')
1370 : {
1371 256 : if (paren_depth == 0)
1372 250 : paren_pos = pos; /* remember start of parenthesized item */
1373 256 : paren_depth++;
1374 : }
1375 13582 : else if (patt[pos] == ')' && paren_depth > 0)
1376 : {
1377 256 : paren_depth--;
1378 256 : if (paren_depth == 0)
1379 250 : sel *= regex_selectivity_sub(patt + (paren_pos + 1),
1380 250 : pos - (paren_pos + 1),
1381 : case_insensitive);
1382 : }
1383 13326 : else if (patt[pos] == '|' && paren_depth == 0)
1384 : {
1385 : /*
1386 : * If unquoted | is present at paren level 0 in pattern, we have
1387 : * multiple alternatives; sum their probabilities.
1388 : */
1389 36 : sel += regex_selectivity_sub(patt + (pos + 1),
1390 18 : pattlen - (pos + 1),
1391 : case_insensitive);
1392 18 : break; /* rest of pattern is now processed */
1393 : }
1394 13308 : else if (patt[pos] == '[')
1395 : {
1396 76 : bool negclass = false;
1397 :
1398 76 : if (patt[++pos] == '^')
1399 : {
1400 0 : negclass = true;
1401 0 : pos++;
1402 : }
1403 76 : if (patt[pos] == ']') /* ']' at start of class is not special */
1404 0 : pos++;
1405 372 : while (pos < pattlen && patt[pos] != ']')
1406 296 : pos++;
1407 76 : if (paren_depth == 0)
1408 76 : sel *= (negclass ? (1.0 - CHAR_RANGE_SEL) : CHAR_RANGE_SEL);
1409 : }
1410 13232 : else if (patt[pos] == '.')
1411 : {
1412 596 : if (paren_depth == 0)
1413 360 : sel *= ANY_CHAR_SEL;
1414 : }
1415 12636 : else if (patt[pos] == '*' ||
1416 12144 : patt[pos] == '?' ||
1417 12104 : patt[pos] == '+')
1418 : {
1419 : /* Ought to be smarter about quantifiers... */
1420 546 : if (paren_depth == 0)
1421 318 : sel *= PARTIAL_WILDCARD_SEL;
1422 : }
1423 12090 : else if (patt[pos] == '{')
1424 : {
1425 264 : while (pos < pattlen && patt[pos] != '}')
1426 188 : pos++;
1427 76 : if (paren_depth == 0)
1428 64 : sel *= PARTIAL_WILDCARD_SEL;
1429 : }
1430 12014 : else if (patt[pos] == '\\')
1431 : {
1432 : /* backslash quotes the next character */
1433 200 : pos++;
1434 200 : if (pos >= pattlen)
1435 0 : break;
1436 200 : if (paren_depth == 0)
1437 104 : sel *= FIXED_CHAR_SEL;
1438 : }
1439 : else
1440 : {
1441 11814 : if (paren_depth == 0)
1442 10132 : sel *= FIXED_CHAR_SEL;
1443 : }
1444 : }
1445 : /* Could get sel > 1 if multiple wildcards */
1446 1626 : if (sel > 1.0)
1447 26 : sel = 1.0;
1448 1626 : return sel;
1449 : }
1450 :
1451 : static Selectivity
1452 1358 : regex_selectivity(const char *patt, int pattlen, bool case_insensitive,
1453 : int fixed_prefix_len)
1454 : {
1455 : Selectivity sel;
1456 :
1457 : /* If patt doesn't end with $, consider it to have a trailing wildcard */
1458 1358 : if (pattlen > 0 && patt[pattlen - 1] == '$' &&
1459 244 : (pattlen == 1 || patt[pattlen - 2] != '\\'))
1460 : {
1461 : /* has trailing $ */
1462 244 : sel = regex_selectivity_sub(patt, pattlen - 1, case_insensitive);
1463 : }
1464 : else
1465 : {
1466 : /* no trailing $ */
1467 1114 : sel = regex_selectivity_sub(patt, pattlen, case_insensitive);
1468 1114 : sel *= FULL_WILDCARD_SEL;
1469 : }
1470 :
1471 : /*
1472 : * If there's a fixed prefix, discount its selectivity. We have to be
1473 : * careful here since a very long prefix could result in pow's result
1474 : * underflowing to zero (in which case "sel" probably has as well).
1475 : */
1476 1358 : if (fixed_prefix_len > 0)
1477 : {
1478 826 : double prefixsel = pow(FIXED_CHAR_SEL, fixed_prefix_len);
1479 :
1480 826 : if (prefixsel > 0.0)
1481 826 : sel /= prefixsel;
1482 : }
1483 :
1484 : /* Make sure result stays in range */
1485 1358 : CLAMP_PROBABILITY(sel);
1486 1358 : return sel;
1487 : }
1488 :
1489 : /*
1490 : * Check whether char is a letter (and, hence, subject to case-folding)
1491 : *
1492 : * In multibyte character sets or with ICU, we can't use isalpha, and it does
1493 : * not seem worth trying to convert to wchar_t to use iswalpha or u_isalpha.
1494 : * Instead, just assume any non-ASCII char is potentially case-varying, and
1495 : * hard-wire knowledge of which ASCII chars are letters.
1496 : */
1497 : static int
1498 124 : pattern_char_isalpha(char c, bool is_multibyte,
1499 : pg_locale_t locale, bool locale_is_c)
1500 : {
1501 124 : if (locale_is_c)
1502 124 : return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
1503 0 : else if (is_multibyte && IS_HIGHBIT_SET(c))
1504 0 : return true;
1505 0 : else if (locale && locale->provider == COLLPROVIDER_ICU)
1506 0 : return IS_HIGHBIT_SET(c) ||
1507 0 : (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
1508 : #ifdef HAVE_LOCALE_T
1509 0 : else if (locale && locale->provider == COLLPROVIDER_LIBC)
1510 0 : return isalpha_l((unsigned char) c, locale->info.lt);
1511 : #endif
1512 : else
1513 0 : return isalpha((unsigned char) c);
1514 : }
1515 :
1516 :
1517 : /*
1518 : * For bytea, the increment function need only increment the current byte
1519 : * (there are no multibyte characters to worry about).
1520 : */
1521 : static bool
1522 0 : byte_increment(unsigned char *ptr, int len)
1523 : {
1524 0 : if (*ptr >= 255)
1525 0 : return false;
1526 0 : (*ptr)++;
1527 0 : return true;
1528 : }
1529 :
1530 : /*
1531 : * Try to generate a string greater than the given string or any
1532 : * string it is a prefix of. If successful, return a palloc'd string
1533 : * in the form of a Const node; else return NULL.
1534 : *
1535 : * The caller must provide the appropriate "less than" comparison function
1536 : * for testing the strings, along with the collation to use.
1537 : *
1538 : * The key requirement here is that given a prefix string, say "foo",
1539 : * we must be able to generate another string "fop" that is greater than
1540 : * all strings "foobar" starting with "foo". We can test that we have
1541 : * generated a string greater than the prefix string, but in non-C collations
1542 : * that is not a bulletproof guarantee that an extension of the string might
1543 : * not sort after it; an example is that "foo " is less than "foo!", but it
1544 : * is not clear that a "dictionary" sort ordering will consider "foo!" less
1545 : * than "foo bar". CAUTION: Therefore, this function should be used only for
1546 : * estimation purposes when working in a non-C collation.
1547 : *
1548 : * To try to catch most cases where an extended string might otherwise sort
1549 : * before the result value, we determine which of the strings "Z", "z", "y",
1550 : * and "9" is seen as largest by the collation, and append that to the given
1551 : * prefix before trying to find a string that compares as larger.
1552 : *
1553 : * To search for a greater string, we repeatedly "increment" the rightmost
1554 : * character, using an encoding-specific character incrementer function.
1555 : * When it's no longer possible to increment the last character, we truncate
1556 : * off that character and start incrementing the next-to-rightmost.
1557 : * For example, if "z" were the last character in the sort order, then we
1558 : * could produce "foo" as a string greater than "fonz".
1559 : *
1560 : * This could be rather slow in the worst case, but in most cases we
1561 : * won't have to try more than one or two strings before succeeding.
1562 : *
1563 : * Note that it's important for the character incrementer not to be too anal
1564 : * about producing every possible character code, since in some cases the only
1565 : * way to get a larger string is to increment a previous character position.
1566 : * So we don't want to spend too much time trying every possible character
1567 : * code at the last position. A good rule of thumb is to be sure that we
1568 : * don't try more than 256*K values for a K-byte character (and definitely
1569 : * not 256^K, which is what an exhaustive search would approach).
1570 : */
1571 : static Const *
1572 1758 : make_greater_string(const Const *str_const, FmgrInfo *ltproc, Oid collation)
1573 : {
1574 1758 : Oid datatype = str_const->consttype;
1575 : char *workstr;
1576 : int len;
1577 : Datum cmpstr;
1578 1758 : char *cmptxt = NULL;
1579 : mbcharacter_incrementer charinc;
1580 :
1581 : /*
1582 : * Get a modifiable copy of the prefix string in C-string format, and set
1583 : * up the string we will compare to as a Datum. In C locale this can just
1584 : * be the given prefix string, otherwise we need to add a suffix. Type
1585 : * BYTEA sorts bytewise so it never needs a suffix either.
1586 : */
1587 1758 : if (datatype == BYTEAOID)
1588 : {
1589 0 : bytea *bstr = DatumGetByteaPP(str_const->constvalue);
1590 :
1591 0 : len = VARSIZE_ANY_EXHDR(bstr);
1592 0 : workstr = (char *) palloc(len);
1593 0 : memcpy(workstr, VARDATA_ANY(bstr), len);
1594 : Assert((Pointer) bstr == DatumGetPointer(str_const->constvalue));
1595 0 : cmpstr = str_const->constvalue;
1596 : }
1597 : else
1598 : {
1599 1758 : if (datatype == NAMEOID)
1600 0 : workstr = DatumGetCString(DirectFunctionCall1(nameout,
1601 : str_const->constvalue));
1602 : else
1603 1758 : workstr = TextDatumGetCString(str_const->constvalue);
1604 1758 : len = strlen(workstr);
1605 1758 : if (lc_collate_is_c(collation) || len == 0)
1606 1732 : cmpstr = str_const->constvalue;
1607 : else
1608 : {
1609 : /* If first time through, determine the suffix to use */
1610 : static char suffixchar = 0;
1611 : static Oid suffixcollation = 0;
1612 :
1613 26 : if (!suffixchar || suffixcollation != collation)
1614 : {
1615 : char *best;
1616 :
1617 8 : best = "Z";
1618 8 : if (varstr_cmp(best, 1, "z", 1, collation) < 0)
1619 0 : best = "z";
1620 8 : if (varstr_cmp(best, 1, "y", 1, collation) < 0)
1621 0 : best = "y";
1622 8 : if (varstr_cmp(best, 1, "9", 1, collation) < 0)
1623 0 : best = "9";
1624 8 : suffixchar = *best;
1625 8 : suffixcollation = collation;
1626 : }
1627 :
1628 : /* And build the string to compare to */
1629 26 : if (datatype == NAMEOID)
1630 : {
1631 0 : cmptxt = palloc(len + 2);
1632 0 : memcpy(cmptxt, workstr, len);
1633 0 : cmptxt[len] = suffixchar;
1634 0 : cmptxt[len + 1] = '\0';
1635 0 : cmpstr = PointerGetDatum(cmptxt);
1636 : }
1637 : else
1638 : {
1639 26 : cmptxt = palloc(VARHDRSZ + len + 1);
1640 26 : SET_VARSIZE(cmptxt, VARHDRSZ + len + 1);
1641 26 : memcpy(VARDATA(cmptxt), workstr, len);
1642 26 : *(VARDATA(cmptxt) + len) = suffixchar;
1643 26 : cmpstr = PointerGetDatum(cmptxt);
1644 : }
1645 : }
1646 : }
1647 :
1648 : /* Select appropriate character-incrementer function */
1649 1758 : if (datatype == BYTEAOID)
1650 0 : charinc = byte_increment;
1651 : else
1652 1758 : charinc = pg_database_encoding_character_incrementer();
1653 :
1654 : /* And search ... */
1655 1758 : while (len > 0)
1656 : {
1657 : int charlen;
1658 : unsigned char *lastchar;
1659 :
1660 : /* Identify the last character --- for bytea, just the last byte */
1661 1758 : if (datatype == BYTEAOID)
1662 0 : charlen = 1;
1663 : else
1664 1758 : charlen = len - pg_mbcliplen(workstr, len, len - 1);
1665 1758 : lastchar = (unsigned char *) (workstr + len - charlen);
1666 :
1667 : /*
1668 : * Try to generate a larger string by incrementing the last character
1669 : * (for BYTEA, we treat each byte as a character).
1670 : *
1671 : * Note: the incrementer function is expected to return true if it's
1672 : * generated a valid-per-the-encoding new character, otherwise false.
1673 : * The contents of the character on false return are unspecified.
1674 : */
1675 1758 : while (charinc(lastchar, charlen))
1676 : {
1677 : Const *workstr_const;
1678 :
1679 1758 : if (datatype == BYTEAOID)
1680 0 : workstr_const = string_to_bytea_const(workstr, len);
1681 : else
1682 1758 : workstr_const = string_to_const(workstr, datatype);
1683 :
1684 1758 : if (DatumGetBool(FunctionCall2Coll(ltproc,
1685 : collation,
1686 : cmpstr,
1687 : workstr_const->constvalue)))
1688 : {
1689 : /* Successfully made a string larger than cmpstr */
1690 1758 : if (cmptxt)
1691 26 : pfree(cmptxt);
1692 1758 : pfree(workstr);
1693 1758 : return workstr_const;
1694 : }
1695 :
1696 : /* No good, release unusable value and try again */
1697 0 : pfree(DatumGetPointer(workstr_const->constvalue));
1698 0 : pfree(workstr_const);
1699 : }
1700 :
1701 : /*
1702 : * No luck here, so truncate off the last character and try to
1703 : * increment the next one.
1704 : */
1705 0 : len -= charlen;
1706 0 : workstr[len] = '\0';
1707 : }
1708 :
1709 : /* Failed... */
1710 0 : if (cmptxt)
1711 0 : pfree(cmptxt);
1712 0 : pfree(workstr);
1713 :
1714 0 : return NULL;
1715 : }
1716 :
1717 : /*
1718 : * Generate a Datum of the appropriate type from a C string.
1719 : * Note that all of the supported types are pass-by-ref, so the
1720 : * returned value should be pfree'd if no longer needed.
1721 : */
1722 : static Datum
1723 16046 : string_to_datum(const char *str, Oid datatype)
1724 : {
1725 : Assert(str != NULL);
1726 :
1727 : /*
1728 : * We cheat a little by assuming that CStringGetTextDatum() will do for
1729 : * bpchar and varchar constants too...
1730 : */
1731 16046 : if (datatype == NAMEOID)
1732 0 : return DirectFunctionCall1(namein, CStringGetDatum(str));
1733 16046 : else if (datatype == BYTEAOID)
1734 0 : return DirectFunctionCall1(byteain, CStringGetDatum(str));
1735 : else
1736 16046 : return CStringGetTextDatum(str);
1737 : }
1738 :
1739 : /*
1740 : * Generate a Const node of the appropriate type from a C string.
1741 : */
1742 : static Const *
1743 16046 : string_to_const(const char *str, Oid datatype)
1744 : {
1745 16046 : Datum conval = string_to_datum(str, datatype);
1746 : Oid collation;
1747 : int constlen;
1748 :
1749 : /*
1750 : * We only need to support a few datatypes here, so hard-wire properties
1751 : * instead of incurring the expense of catalog lookups.
1752 : */
1753 16046 : switch (datatype)
1754 : {
1755 16046 : case TEXTOID:
1756 : case VARCHAROID:
1757 : case BPCHAROID:
1758 16046 : collation = DEFAULT_COLLATION_OID;
1759 16046 : constlen = -1;
1760 16046 : break;
1761 :
1762 0 : case NAMEOID:
1763 0 : collation = C_COLLATION_OID;
1764 0 : constlen = NAMEDATALEN;
1765 0 : break;
1766 :
1767 0 : case BYTEAOID:
1768 0 : collation = InvalidOid;
1769 0 : constlen = -1;
1770 0 : break;
1771 :
1772 0 : default:
1773 0 : elog(ERROR, "unexpected datatype in string_to_const: %u",
1774 : datatype);
1775 : return NULL;
1776 : }
1777 :
1778 16046 : return makeConst(datatype, -1, collation, constlen,
1779 : conval, false, false);
1780 : }
1781 :
1782 : /*
1783 : * Generate a Const node of bytea type from a binary C string and a length.
1784 : */
1785 : static Const *
1786 12 : string_to_bytea_const(const char *str, size_t str_len)
1787 : {
1788 12 : bytea *bstr = palloc(VARHDRSZ + str_len);
1789 : Datum conval;
1790 :
1791 12 : memcpy(VARDATA(bstr), str, str_len);
1792 12 : SET_VARSIZE(bstr, VARHDRSZ + str_len);
1793 12 : conval = PointerGetDatum(bstr);
1794 :
1795 12 : return makeConst(BYTEAOID, -1, InvalidOid, -1, conval, false, false);
1796 : }
|