Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * multirangetypes_selfuncs.c
4 : * Functions for selectivity estimation of multirange operators
5 : *
6 : * Estimates are based on histograms of lower and upper bounds, and the
7 : * fraction of empty multiranges.
8 : *
9 : * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
10 : * Portions Copyright (c) 1994, Regents of the University of California
11 : *
12 : *
13 : * IDENTIFICATION
14 : * src/backend/utils/adt/multirangetypes_selfuncs.c
15 : *
16 : *-------------------------------------------------------------------------
17 : */
18 : #include "postgres.h"
19 :
20 : #include <math.h>
21 :
22 : #include "access/htup_details.h"
23 : #include "catalog/pg_operator.h"
24 : #include "catalog/pg_statistic.h"
25 : #include "catalog/pg_type.h"
26 : #include "utils/float.h"
27 : #include "utils/fmgrprotos.h"
28 : #include "utils/lsyscache.h"
29 : #include "utils/rangetypes.h"
30 : #include "utils/multirangetypes.h"
31 : #include "utils/selfuncs.h"
32 : #include "utils/typcache.h"
33 :
34 : static double calc_multirangesel(TypeCacheEntry *typcache,
35 : VariableStatData *vardata,
36 : const MultirangeType *constval, Oid operator);
37 : static double default_multirange_selectivity(Oid operator);
38 : static double calc_hist_selectivity(TypeCacheEntry *typcache,
39 : VariableStatData *vardata,
40 : const MultirangeType *constval,
41 : Oid operator);
42 : static double calc_hist_selectivity_scalar(TypeCacheEntry *typcache,
43 : const RangeBound *constbound,
44 : const RangeBound *hist,
45 : int hist_nvalues, bool equal);
46 : static int rbound_bsearch(TypeCacheEntry *typcache, const RangeBound *value,
47 : const RangeBound *hist, int hist_length, bool equal);
48 : static float8 get_position(TypeCacheEntry *typcache, const RangeBound *value,
49 : const RangeBound *hist1, const RangeBound *hist2);
50 : static float8 get_len_position(double value, double hist1, double hist2);
51 : static float8 get_distance(TypeCacheEntry *typcache, const RangeBound *bound1,
52 : const RangeBound *bound2);
53 : static int length_hist_bsearch(Datum *length_hist_values,
54 : int length_hist_nvalues, double value,
55 : bool equal);
56 : static double calc_length_hist_frac(Datum *length_hist_values,
57 : int length_hist_nvalues, double length1,
58 : double length2, bool equal);
59 : static double calc_hist_selectivity_contained(TypeCacheEntry *typcache,
60 : const RangeBound *lower,
61 : RangeBound *upper,
62 : const RangeBound *hist_lower,
63 : int hist_nvalues,
64 : Datum *length_hist_values,
65 : int length_hist_nvalues);
66 : static double calc_hist_selectivity_contains(TypeCacheEntry *typcache,
67 : const RangeBound *lower,
68 : const RangeBound *upper,
69 : const RangeBound *hist_lower,
70 : int hist_nvalues,
71 : Datum *length_hist_values,
72 : int length_hist_nvalues);
73 :
74 : /*
75 : * Returns a default selectivity estimate for given operator, when we don't
76 : * have statistics or cannot use them for some reason.
77 : */
78 : static double
79 192 : default_multirange_selectivity(Oid operator)
80 : {
81 192 : switch (operator)
82 : {
83 18 : case OID_MULTIRANGE_OVERLAPS_MULTIRANGE_OP:
84 : case OID_MULTIRANGE_OVERLAPS_RANGE_OP:
85 : case OID_RANGE_OVERLAPS_MULTIRANGE_OP:
86 18 : return 0.01;
87 :
88 54 : case OID_RANGE_CONTAINS_MULTIRANGE_OP:
89 : case OID_RANGE_MULTIRANGE_CONTAINED_OP:
90 : case OID_MULTIRANGE_CONTAINS_RANGE_OP:
91 : case OID_MULTIRANGE_CONTAINS_MULTIRANGE_OP:
92 : case OID_MULTIRANGE_RANGE_CONTAINED_OP:
93 : case OID_MULTIRANGE_MULTIRANGE_CONTAINED_OP:
94 54 : return 0.005;
95 :
96 0 : case OID_MULTIRANGE_CONTAINS_ELEM_OP:
97 : case OID_MULTIRANGE_ELEM_CONTAINED_OP:
98 :
99 : /*
100 : * "multirange @> elem" is more or less identical to a scalar
101 : * inequality "A >= b AND A <= c".
102 : */
103 0 : return DEFAULT_MULTIRANGE_INEQ_SEL;
104 :
105 120 : case OID_MULTIRANGE_LESS_OP:
106 : case OID_MULTIRANGE_LESS_EQUAL_OP:
107 : case OID_MULTIRANGE_GREATER_OP:
108 : case OID_MULTIRANGE_GREATER_EQUAL_OP:
109 : case OID_MULTIRANGE_LEFT_RANGE_OP:
110 : case OID_MULTIRANGE_LEFT_MULTIRANGE_OP:
111 : case OID_RANGE_LEFT_MULTIRANGE_OP:
112 : case OID_MULTIRANGE_RIGHT_RANGE_OP:
113 : case OID_MULTIRANGE_RIGHT_MULTIRANGE_OP:
114 : case OID_RANGE_RIGHT_MULTIRANGE_OP:
115 : case OID_MULTIRANGE_OVERLAPS_LEFT_RANGE_OP:
116 : case OID_RANGE_OVERLAPS_LEFT_MULTIRANGE_OP:
117 : case OID_MULTIRANGE_OVERLAPS_LEFT_MULTIRANGE_OP:
118 : case OID_MULTIRANGE_OVERLAPS_RIGHT_RANGE_OP:
119 : case OID_RANGE_OVERLAPS_RIGHT_MULTIRANGE_OP:
120 : case OID_MULTIRANGE_OVERLAPS_RIGHT_MULTIRANGE_OP:
121 : /* these are similar to regular scalar inequalities */
122 120 : return DEFAULT_INEQ_SEL;
123 :
124 0 : default:
125 :
126 : /*
127 : * all multirange operators should be handled above, but just in
128 : * case
129 : */
130 0 : return 0.01;
131 : }
132 : }
133 :
134 : /*
135 : * multirangesel -- restriction selectivity for multirange operators
136 : */
137 : Datum
138 666 : multirangesel(PG_FUNCTION_ARGS)
139 : {
140 666 : PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
141 666 : Oid operator = PG_GETARG_OID(1);
142 666 : List *args = (List *) PG_GETARG_POINTER(2);
143 666 : int varRelid = PG_GETARG_INT32(3);
144 : VariableStatData vardata;
145 : Node *other;
146 : bool varonleft;
147 : Selectivity selec;
148 666 : TypeCacheEntry *typcache = NULL;
149 666 : MultirangeType *constmultirange = NULL;
150 666 : RangeType *constrange = NULL;
151 :
152 : /*
153 : * If expression is not (variable op something) or (something op
154 : * variable), then punt and return a default estimate.
155 : */
156 666 : if (!get_restriction_variable(root, args, varRelid,
157 : &vardata, &other, &varonleft))
158 0 : PG_RETURN_FLOAT8(default_multirange_selectivity(operator));
159 :
160 : /*
161 : * Can't do anything useful if the something is not a constant, either.
162 : */
163 666 : if (!IsA(other, Const))
164 : {
165 0 : ReleaseVariableStats(vardata);
166 0 : PG_RETURN_FLOAT8(default_multirange_selectivity(operator));
167 : }
168 :
169 : /*
170 : * All the multirange operators are strict, so we can cope with a NULL
171 : * constant right away.
172 : */
173 666 : if (((Const *) other)->constisnull)
174 : {
175 0 : ReleaseVariableStats(vardata);
176 0 : PG_RETURN_FLOAT8(0.0);
177 : }
178 :
179 : /*
180 : * If var is on the right, commute the operator, so that we can assume the
181 : * var is on the left in what follows.
182 : */
183 666 : if (!varonleft)
184 : {
185 : /* we have other Op var, commute to make var Op other */
186 24 : operator = get_commutator(operator);
187 24 : if (!operator)
188 : {
189 : /* Use default selectivity (should we raise an error instead?) */
190 0 : ReleaseVariableStats(vardata);
191 0 : PG_RETURN_FLOAT8(default_multirange_selectivity(operator));
192 : }
193 : }
194 :
195 : /*
196 : * OK, there's a Var and a Const we're dealing with here. We need the
197 : * Const to be of same multirange type as the column, else we can't do
198 : * anything useful. (Such cases will likely fail at runtime, but here we'd
199 : * rather just return a default estimate.)
200 : *
201 : * If the operator is "multirange @> element", the constant should be of
202 : * the element type of the multirange column. Convert it to a multirange
203 : * that includes only that single point, so that we don't need special
204 : * handling for that in what follows.
205 : */
206 666 : if (operator == OID_MULTIRANGE_CONTAINS_ELEM_OP)
207 : {
208 30 : typcache = multirange_get_typcache(fcinfo, vardata.vartype);
209 :
210 30 : if (((Const *) other)->consttype == typcache->rngtype->rngelemtype->type_id)
211 : {
212 : RangeBound lower,
213 : upper;
214 :
215 30 : lower.inclusive = true;
216 30 : lower.val = ((Const *) other)->constvalue;
217 30 : lower.infinite = false;
218 30 : lower.lower = true;
219 30 : upper.inclusive = true;
220 30 : upper.val = ((Const *) other)->constvalue;
221 30 : upper.infinite = false;
222 30 : upper.lower = false;
223 30 : constrange = range_serialize(typcache->rngtype, &lower, &upper,
224 : false, NULL);
225 30 : constmultirange = make_multirange(typcache->type_id, typcache->rngtype,
226 : 1, &constrange);
227 : }
228 : }
229 636 : else if (operator == OID_RANGE_MULTIRANGE_CONTAINED_OP ||
230 570 : operator == OID_MULTIRANGE_CONTAINS_RANGE_OP ||
231 534 : operator == OID_MULTIRANGE_OVERLAPS_RANGE_OP ||
232 510 : operator == OID_MULTIRANGE_OVERLAPS_LEFT_RANGE_OP ||
233 486 : operator == OID_MULTIRANGE_OVERLAPS_RIGHT_RANGE_OP ||
234 462 : operator == OID_MULTIRANGE_LEFT_RANGE_OP ||
235 : operator == OID_MULTIRANGE_RIGHT_RANGE_OP)
236 : {
237 : /*
238 : * Promote a range in "multirange OP range" just like we do an element
239 : * in "multirange OP element".
240 : */
241 198 : typcache = multirange_get_typcache(fcinfo, vardata.vartype);
242 198 : if (((Const *) other)->consttype == typcache->rngtype->type_id)
243 : {
244 198 : constrange = DatumGetRangeTypeP(((Const *) other)->constvalue);
245 198 : constmultirange = make_multirange(typcache->type_id, typcache->rngtype,
246 : 1, &constrange);
247 : }
248 : }
249 438 : else if (operator == OID_RANGE_OVERLAPS_MULTIRANGE_OP ||
250 402 : operator == OID_RANGE_OVERLAPS_LEFT_MULTIRANGE_OP ||
251 384 : operator == OID_RANGE_OVERLAPS_RIGHT_MULTIRANGE_OP ||
252 366 : operator == OID_RANGE_LEFT_MULTIRANGE_OP ||
253 348 : operator == OID_RANGE_RIGHT_MULTIRANGE_OP ||
254 312 : operator == OID_RANGE_CONTAINS_MULTIRANGE_OP ||
255 312 : operator == OID_MULTIRANGE_ELEM_CONTAINED_OP ||
256 : operator == OID_MULTIRANGE_RANGE_CONTAINED_OP)
257 : {
258 : /*
259 : * Here, the Var is the elem/range, not the multirange. For now we
260 : * just punt and return the default estimate. In future we could
261 : * disassemble the multirange constant to do something more
262 : * intelligent.
263 : */
264 : }
265 294 : else if (((Const *) other)->consttype == vardata.vartype)
266 : {
267 : /* Both sides are the same multirange type */
268 294 : typcache = multirange_get_typcache(fcinfo, vardata.vartype);
269 :
270 294 : constmultirange = DatumGetMultirangeTypeP(((Const *) other)->constvalue);
271 : }
272 :
273 : /*
274 : * If we got a valid constant on one side of the operator, proceed to
275 : * estimate using statistics. Otherwise punt and return a default constant
276 : * estimate. Note that calc_multirangesel need not handle
277 : * OID_MULTIRANGE_*_CONTAINED_OP.
278 : */
279 666 : if (constmultirange)
280 522 : selec = calc_multirangesel(typcache, &vardata, constmultirange, operator);
281 : else
282 144 : selec = default_multirange_selectivity(operator);
283 :
284 666 : ReleaseVariableStats(vardata);
285 :
286 666 : CLAMP_PROBABILITY(selec);
287 :
288 666 : PG_RETURN_FLOAT8((float8) selec);
289 : }
290 :
291 : static double
292 522 : calc_multirangesel(TypeCacheEntry *typcache, VariableStatData *vardata,
293 : const MultirangeType *constval, Oid operator)
294 : {
295 : double hist_selec;
296 : double selec;
297 : float4 empty_frac,
298 : null_frac;
299 :
300 : /*
301 : * First look up the fraction of NULLs and empty multiranges from
302 : * pg_statistic.
303 : */
304 522 : if (HeapTupleIsValid(vardata->statsTuple))
305 : {
306 : Form_pg_statistic stats;
307 : AttStatsSlot sslot;
308 :
309 450 : stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
310 450 : null_frac = stats->stanullfrac;
311 :
312 : /* Try to get fraction of empty multiranges */
313 450 : if (get_attstatsslot(&sslot, vardata->statsTuple,
314 : STATISTIC_KIND_RANGE_LENGTH_HISTOGRAM,
315 : InvalidOid,
316 : ATTSTATSSLOT_NUMBERS))
317 : {
318 450 : if (sslot.nnumbers != 1)
319 0 : elog(ERROR, "invalid empty fraction statistic"); /* shouldn't happen */
320 450 : empty_frac = sslot.numbers[0];
321 450 : free_attstatsslot(&sslot);
322 : }
323 : else
324 : {
325 : /* No empty fraction statistic. Assume no empty ranges. */
326 0 : empty_frac = 0.0;
327 : }
328 : }
329 : else
330 : {
331 : /*
332 : * No stats are available. Follow through the calculations below
333 : * anyway, assuming no NULLs and no empty multiranges. This still
334 : * allows us to give a better-than-nothing estimate based on whether
335 : * the constant is an empty multirange or not.
336 : */
337 72 : null_frac = 0.0;
338 72 : empty_frac = 0.0;
339 : }
340 :
341 522 : if (MultirangeIsEmpty(constval))
342 : {
343 : /*
344 : * An empty multirange matches all multiranges, all empty multiranges,
345 : * or nothing, depending on the operator
346 : */
347 222 : switch (operator)
348 : {
349 : /* these return false if either argument is empty */
350 126 : case OID_MULTIRANGE_OVERLAPS_RANGE_OP:
351 : case OID_MULTIRANGE_OVERLAPS_MULTIRANGE_OP:
352 : case OID_MULTIRANGE_OVERLAPS_LEFT_RANGE_OP:
353 : case OID_MULTIRANGE_OVERLAPS_LEFT_MULTIRANGE_OP:
354 : case OID_MULTIRANGE_OVERLAPS_RIGHT_RANGE_OP:
355 : case OID_MULTIRANGE_OVERLAPS_RIGHT_MULTIRANGE_OP:
356 : case OID_MULTIRANGE_LEFT_RANGE_OP:
357 : case OID_MULTIRANGE_LEFT_MULTIRANGE_OP:
358 : case OID_MULTIRANGE_RIGHT_RANGE_OP:
359 : case OID_MULTIRANGE_RIGHT_MULTIRANGE_OP:
360 : /* nothing is less than an empty multirange */
361 : case OID_MULTIRANGE_LESS_OP:
362 126 : selec = 0.0;
363 126 : break;
364 :
365 : /*
366 : * only empty multiranges can be contained by an empty
367 : * multirange
368 : */
369 30 : case OID_RANGE_MULTIRANGE_CONTAINED_OP:
370 : case OID_MULTIRANGE_MULTIRANGE_CONTAINED_OP:
371 : /* only empty ranges are <= an empty multirange */
372 : case OID_MULTIRANGE_LESS_EQUAL_OP:
373 30 : selec = empty_frac;
374 30 : break;
375 :
376 : /* everything contains an empty multirange */
377 60 : case OID_MULTIRANGE_CONTAINS_RANGE_OP:
378 : case OID_MULTIRANGE_CONTAINS_MULTIRANGE_OP:
379 : /* everything is >= an empty multirange */
380 : case OID_MULTIRANGE_GREATER_EQUAL_OP:
381 60 : selec = 1.0;
382 60 : break;
383 :
384 : /* all non-empty multiranges are > an empty multirange */
385 6 : case OID_MULTIRANGE_GREATER_OP:
386 6 : selec = 1.0 - empty_frac;
387 6 : break;
388 :
389 : /* an element cannot be empty */
390 0 : case OID_MULTIRANGE_CONTAINS_ELEM_OP:
391 :
392 : /* filtered out by multirangesel() */
393 : case OID_RANGE_OVERLAPS_MULTIRANGE_OP:
394 : case OID_RANGE_OVERLAPS_LEFT_MULTIRANGE_OP:
395 : case OID_RANGE_OVERLAPS_RIGHT_MULTIRANGE_OP:
396 : case OID_RANGE_LEFT_MULTIRANGE_OP:
397 : case OID_RANGE_RIGHT_MULTIRANGE_OP:
398 : case OID_RANGE_CONTAINS_MULTIRANGE_OP:
399 : case OID_MULTIRANGE_ELEM_CONTAINED_OP:
400 : case OID_MULTIRANGE_RANGE_CONTAINED_OP:
401 :
402 : default:
403 0 : elog(ERROR, "unexpected operator %u", operator);
404 : selec = 0.0; /* keep compiler quiet */
405 : break;
406 : }
407 : }
408 : else
409 : {
410 : /*
411 : * Calculate selectivity using bound histograms. If that fails for
412 : * some reason, e.g no histogram in pg_statistic, use the default
413 : * constant estimate for the fraction of non-empty values. This is
414 : * still somewhat better than just returning the default estimate,
415 : * because this still takes into account the fraction of empty and
416 : * NULL tuples, if we had statistics for them.
417 : */
418 300 : hist_selec = calc_hist_selectivity(typcache, vardata, constval,
419 : operator);
420 300 : if (hist_selec < 0.0)
421 48 : hist_selec = default_multirange_selectivity(operator);
422 :
423 : /*
424 : * Now merge the results for the empty multiranges and histogram
425 : * calculations, realizing that the histogram covers only the
426 : * non-null, non-empty values.
427 : */
428 300 : if (operator == OID_RANGE_MULTIRANGE_CONTAINED_OP ||
429 : operator == OID_MULTIRANGE_MULTIRANGE_CONTAINED_OP)
430 : {
431 : /* empty is contained by anything non-empty */
432 24 : selec = (1.0 - empty_frac) * hist_selec + empty_frac;
433 : }
434 : else
435 : {
436 : /* with any other operator, empty Op non-empty matches nothing */
437 276 : selec = (1.0 - empty_frac) * hist_selec;
438 : }
439 : }
440 :
441 : /* all multirange operators are strict */
442 522 : selec *= (1.0 - null_frac);
443 :
444 : /* result should be in range, but make sure... */
445 522 : CLAMP_PROBABILITY(selec);
446 :
447 522 : return selec;
448 : }
449 :
450 : /*
451 : * Calculate multirange operator selectivity using histograms of multirange bounds.
452 : *
453 : * This estimate is for the portion of values that are not empty and not
454 : * NULL.
455 : */
456 : static double
457 300 : calc_hist_selectivity(TypeCacheEntry *typcache, VariableStatData *vardata,
458 : const MultirangeType *constval, Oid operator)
459 : {
460 300 : TypeCacheEntry *rng_typcache = typcache->rngtype;
461 : AttStatsSlot hslot;
462 : AttStatsSlot lslot;
463 : int nhist;
464 : RangeBound *hist_lower;
465 : RangeBound *hist_upper;
466 : int i;
467 : RangeBound const_lower;
468 : RangeBound const_upper;
469 : RangeBound tmp;
470 : double hist_selec;
471 :
472 : /* Can't use the histogram with insecure multirange support functions */
473 300 : if (!statistic_proc_security_check(vardata,
474 : rng_typcache->rng_cmp_proc_finfo.fn_oid))
475 0 : return -1;
476 300 : if (OidIsValid(rng_typcache->rng_subdiff_finfo.fn_oid) &&
477 300 : !statistic_proc_security_check(vardata,
478 : rng_typcache->rng_subdiff_finfo.fn_oid))
479 0 : return -1;
480 :
481 : /* Try to get histogram of ranges */
482 552 : if (!(HeapTupleIsValid(vardata->statsTuple) &&
483 252 : get_attstatsslot(&hslot, vardata->statsTuple,
484 : STATISTIC_KIND_BOUNDS_HISTOGRAM, InvalidOid,
485 : ATTSTATSSLOT_VALUES)))
486 48 : return -1.0;
487 :
488 : /* check that it's a histogram, not just a dummy entry */
489 252 : if (hslot.nvalues < 2)
490 : {
491 0 : free_attstatsslot(&hslot);
492 0 : return -1.0;
493 : }
494 :
495 : /*
496 : * Convert histogram of ranges into histograms of its lower and upper
497 : * bounds.
498 : */
499 252 : nhist = hslot.nvalues;
500 252 : hist_lower = (RangeBound *) palloc(sizeof(RangeBound) * nhist);
501 252 : hist_upper = (RangeBound *) palloc(sizeof(RangeBound) * nhist);
502 18936 : for (i = 0; i < nhist; i++)
503 : {
504 : bool empty;
505 :
506 18684 : range_deserialize(rng_typcache, DatumGetRangeTypeP(hslot.values[i]),
507 18684 : &hist_lower[i], &hist_upper[i], &empty);
508 : /* The histogram should not contain any empty ranges */
509 18684 : if (empty)
510 0 : elog(ERROR, "bounds histogram contains an empty range");
511 : }
512 :
513 : /* @> and @< also need a histogram of range lengths */
514 252 : if (operator == OID_MULTIRANGE_CONTAINS_RANGE_OP ||
515 204 : operator == OID_MULTIRANGE_CONTAINS_MULTIRANGE_OP ||
516 204 : operator == OID_MULTIRANGE_RANGE_CONTAINED_OP ||
517 : operator == OID_MULTIRANGE_MULTIRANGE_CONTAINED_OP)
518 : {
519 120 : if (!(HeapTupleIsValid(vardata->statsTuple) &&
520 60 : get_attstatsslot(&lslot, vardata->statsTuple,
521 : STATISTIC_KIND_RANGE_LENGTH_HISTOGRAM,
522 : InvalidOid,
523 : ATTSTATSSLOT_VALUES)))
524 : {
525 0 : free_attstatsslot(&hslot);
526 0 : return -1.0;
527 : }
528 :
529 : /* check that it's a histogram, not just a dummy entry */
530 60 : if (lslot.nvalues < 2)
531 : {
532 0 : free_attstatsslot(&lslot);
533 0 : free_attstatsslot(&hslot);
534 0 : return -1.0;
535 : }
536 : }
537 : else
538 192 : memset(&lslot, 0, sizeof(lslot));
539 :
540 : /* Extract the bounds of the constant value. */
541 : Assert(constval->rangeCount > 0);
542 252 : multirange_get_bounds(rng_typcache, constval, 0,
543 : &const_lower, &tmp);
544 252 : multirange_get_bounds(rng_typcache, constval, constval->rangeCount - 1,
545 : &tmp, &const_upper);
546 :
547 : /*
548 : * Calculate selectivity comparing the lower or upper bound of the
549 : * constant with the histogram of lower or upper bounds.
550 : */
551 252 : switch (operator)
552 : {
553 0 : case OID_MULTIRANGE_LESS_OP:
554 :
555 : /*
556 : * The regular b-tree comparison operators (<, <=, >, >=) compare
557 : * the lower bounds first, and the upper bounds for values with
558 : * equal lower bounds. Estimate that by comparing the lower bounds
559 : * only. This gives a fairly accurate estimate assuming there
560 : * aren't many rows with a lower bound equal to the constant's
561 : * lower bound.
562 : */
563 : hist_selec =
564 0 : calc_hist_selectivity_scalar(rng_typcache, &const_lower,
565 : hist_lower, nhist, false);
566 0 : break;
567 :
568 0 : case OID_MULTIRANGE_LESS_EQUAL_OP:
569 : hist_selec =
570 0 : calc_hist_selectivity_scalar(rng_typcache, &const_lower,
571 : hist_lower, nhist, true);
572 0 : break;
573 :
574 0 : case OID_MULTIRANGE_GREATER_OP:
575 0 : hist_selec =
576 0 : 1 - calc_hist_selectivity_scalar(rng_typcache, &const_lower,
577 : hist_lower, nhist, false);
578 0 : break;
579 :
580 0 : case OID_MULTIRANGE_GREATER_EQUAL_OP:
581 0 : hist_selec =
582 0 : 1 - calc_hist_selectivity_scalar(rng_typcache, &const_lower,
583 : hist_lower, nhist, true);
584 0 : break;
585 :
586 24 : case OID_MULTIRANGE_LEFT_RANGE_OP:
587 : case OID_MULTIRANGE_LEFT_MULTIRANGE_OP:
588 : /* var << const when upper(var) < lower(const) */
589 : hist_selec =
590 24 : calc_hist_selectivity_scalar(rng_typcache, &const_lower,
591 : hist_upper, nhist, false);
592 24 : break;
593 :
594 24 : case OID_MULTIRANGE_RIGHT_RANGE_OP:
595 : case OID_MULTIRANGE_RIGHT_MULTIRANGE_OP:
596 : /* var >> const when lower(var) > upper(const) */
597 24 : hist_selec =
598 24 : 1 - calc_hist_selectivity_scalar(rng_typcache, &const_upper,
599 : hist_lower, nhist, true);
600 24 : break;
601 :
602 24 : case OID_MULTIRANGE_OVERLAPS_RIGHT_RANGE_OP:
603 : case OID_MULTIRANGE_OVERLAPS_RIGHT_MULTIRANGE_OP:
604 : /* compare lower bounds */
605 24 : hist_selec =
606 24 : 1 - calc_hist_selectivity_scalar(rng_typcache, &const_lower,
607 : hist_lower, nhist, false);
608 24 : break;
609 :
610 24 : case OID_MULTIRANGE_OVERLAPS_LEFT_RANGE_OP:
611 : case OID_MULTIRANGE_OVERLAPS_LEFT_MULTIRANGE_OP:
612 : /* compare upper bounds */
613 : hist_selec =
614 24 : calc_hist_selectivity_scalar(rng_typcache, &const_upper,
615 : hist_upper, nhist, true);
616 24 : break;
617 :
618 84 : case OID_MULTIRANGE_OVERLAPS_RANGE_OP:
619 : case OID_MULTIRANGE_OVERLAPS_MULTIRANGE_OP:
620 : case OID_MULTIRANGE_CONTAINS_ELEM_OP:
621 :
622 : /*
623 : * A && B <=> NOT (A << B OR A >> B).
624 : *
625 : * Since A << B and A >> B are mutually exclusive events we can
626 : * sum their probabilities to find probability of (A << B OR A >>
627 : * B).
628 : *
629 : * "multirange @> elem" is equivalent to "multirange &&
630 : * {[elem,elem]}". The caller already constructed the singular
631 : * range from the element constant, so just treat it the same as
632 : * &&.
633 : */
634 : hist_selec =
635 84 : calc_hist_selectivity_scalar(rng_typcache,
636 : &const_lower, hist_upper,
637 : nhist, false);
638 84 : hist_selec +=
639 84 : (1.0 - calc_hist_selectivity_scalar(rng_typcache,
640 : &const_upper, hist_lower,
641 : nhist, true));
642 84 : hist_selec = 1.0 - hist_selec;
643 84 : break;
644 :
645 48 : case OID_MULTIRANGE_CONTAINS_RANGE_OP:
646 : case OID_MULTIRANGE_CONTAINS_MULTIRANGE_OP:
647 : hist_selec =
648 48 : calc_hist_selectivity_contains(rng_typcache, &const_lower,
649 : &const_upper, hist_lower, nhist,
650 : lslot.values, lslot.nvalues);
651 48 : break;
652 :
653 24 : case OID_MULTIRANGE_MULTIRANGE_CONTAINED_OP:
654 : case OID_RANGE_MULTIRANGE_CONTAINED_OP:
655 24 : if (const_lower.infinite)
656 : {
657 : /*
658 : * Lower bound no longer matters. Just estimate the fraction
659 : * with an upper bound <= const upper bound
660 : */
661 : hist_selec =
662 0 : calc_hist_selectivity_scalar(rng_typcache, &const_upper,
663 : hist_upper, nhist, true);
664 : }
665 24 : else if (const_upper.infinite)
666 : {
667 0 : hist_selec =
668 0 : 1.0 - calc_hist_selectivity_scalar(rng_typcache, &const_lower,
669 : hist_lower, nhist, false);
670 : }
671 : else
672 : {
673 : hist_selec =
674 24 : calc_hist_selectivity_contained(rng_typcache, &const_lower,
675 : &const_upper, hist_lower, nhist,
676 : lslot.values, lslot.nvalues);
677 : }
678 24 : break;
679 :
680 : /* filtered out by multirangesel() */
681 0 : case OID_RANGE_OVERLAPS_MULTIRANGE_OP:
682 : case OID_RANGE_OVERLAPS_LEFT_MULTIRANGE_OP:
683 : case OID_RANGE_OVERLAPS_RIGHT_MULTIRANGE_OP:
684 : case OID_RANGE_LEFT_MULTIRANGE_OP:
685 : case OID_RANGE_RIGHT_MULTIRANGE_OP:
686 : case OID_RANGE_CONTAINS_MULTIRANGE_OP:
687 : case OID_MULTIRANGE_ELEM_CONTAINED_OP:
688 : case OID_MULTIRANGE_RANGE_CONTAINED_OP:
689 :
690 : default:
691 0 : elog(ERROR, "unknown multirange operator %u", operator);
692 : hist_selec = -1.0; /* keep compiler quiet */
693 : break;
694 : }
695 :
696 252 : free_attstatsslot(&lslot);
697 252 : free_attstatsslot(&hslot);
698 :
699 252 : return hist_selec;
700 : }
701 :
702 :
703 : /*
704 : * Look up the fraction of values less than (or equal, if 'equal' argument
705 : * is true) a given const in a histogram of range bounds.
706 : */
707 : static double
708 264 : calc_hist_selectivity_scalar(TypeCacheEntry *typcache, const RangeBound *constbound,
709 : const RangeBound *hist, int hist_nvalues, bool equal)
710 : {
711 : Selectivity selec;
712 : int index;
713 :
714 : /*
715 : * Find the histogram bin the given constant falls into. Estimate
716 : * selectivity as the number of preceding whole bins.
717 : */
718 264 : index = rbound_bsearch(typcache, constbound, hist, hist_nvalues, equal);
719 264 : selec = (Selectivity) (Max(index, 0)) / (Selectivity) (hist_nvalues - 1);
720 :
721 : /* Adjust using linear interpolation within the bin */
722 264 : if (index >= 0 && index < hist_nvalues - 1)
723 360 : selec += get_position(typcache, constbound, &hist[index],
724 180 : &hist[index + 1]) / (Selectivity) (hist_nvalues - 1);
725 :
726 264 : return selec;
727 : }
728 :
729 : /*
730 : * Binary search on an array of range bounds. Returns greatest index of range
731 : * bound in array which is less(less or equal) than given range bound. If all
732 : * range bounds in array are greater or equal(greater) than given range bound,
733 : * return -1. When "equal" flag is set conditions in brackets are used.
734 : *
735 : * This function is used in scalar operator selectivity estimation. Another
736 : * goal of this function is to find a histogram bin where to stop
737 : * interpolation of portion of bounds which are less than or equal to given bound.
738 : */
739 : static int
740 336 : rbound_bsearch(TypeCacheEntry *typcache, const RangeBound *value, const RangeBound *hist,
741 : int hist_length, bool equal)
742 : {
743 336 : int lower = -1,
744 336 : upper = hist_length - 1,
745 : cmp,
746 : middle;
747 :
748 2136 : while (lower < upper)
749 : {
750 1800 : middle = (lower + upper + 1) / 2;
751 1800 : cmp = range_cmp_bounds(typcache, &hist[middle], value);
752 :
753 1800 : if (cmp < 0 || (equal && cmp == 0))
754 744 : lower = middle;
755 : else
756 1056 : upper = middle - 1;
757 : }
758 336 : return lower;
759 : }
760 :
761 :
762 : /*
763 : * Binary search on length histogram. Returns greatest index of range length in
764 : * histogram which is less than (less than or equal) the given length value. If
765 : * all lengths in the histogram are greater than (greater than or equal) the
766 : * given length, returns -1.
767 : */
768 : static int
769 360 : length_hist_bsearch(Datum *length_hist_values, int length_hist_nvalues,
770 : double value, bool equal)
771 : {
772 360 : int lower = -1,
773 360 : upper = length_hist_nvalues - 1,
774 : middle;
775 :
776 1872 : while (lower < upper)
777 : {
778 : double middleval;
779 :
780 1512 : middle = (lower + upper + 1) / 2;
781 :
782 1512 : middleval = DatumGetFloat8(length_hist_values[middle]);
783 1512 : if (middleval < value || (equal && middleval <= value))
784 744 : lower = middle;
785 : else
786 768 : upper = middle - 1;
787 : }
788 360 : return lower;
789 : }
790 :
791 : /*
792 : * Get relative position of value in histogram bin in [0,1] range.
793 : */
794 : static float8
795 276 : get_position(TypeCacheEntry *typcache, const RangeBound *value, const RangeBound *hist1,
796 : const RangeBound *hist2)
797 : {
798 276 : bool has_subdiff = OidIsValid(typcache->rng_subdiff_finfo.fn_oid);
799 : float8 position;
800 :
801 276 : if (!hist1->infinite && !hist2->infinite)
802 : {
803 : float8 bin_width;
804 :
805 : /*
806 : * Both bounds are finite. Assuming the subtype's comparison function
807 : * works sanely, the value must be finite, too, because it lies
808 : * somewhere between the bounds. If it doesn't, arbitrarily return
809 : * 0.5.
810 : */
811 204 : if (value->infinite)
812 0 : return 0.5;
813 :
814 : /* Can't interpolate without subdiff function */
815 204 : if (!has_subdiff)
816 0 : return 0.5;
817 :
818 : /* Calculate relative position using subdiff function. */
819 204 : bin_width = DatumGetFloat8(FunctionCall2Coll(&typcache->rng_subdiff_finfo,
820 : typcache->rng_collation,
821 : hist2->val,
822 : hist1->val));
823 204 : if (isnan(bin_width) || bin_width <= 0.0)
824 0 : return 0.5; /* punt for NaN or zero-width bin */
825 :
826 204 : position = DatumGetFloat8(FunctionCall2Coll(&typcache->rng_subdiff_finfo,
827 : typcache->rng_collation,
828 : value->val,
829 : hist1->val))
830 : / bin_width;
831 :
832 204 : if (isnan(position))
833 0 : return 0.5; /* punt for NaN from subdiff, Inf/Inf, etc */
834 :
835 : /* Relative position must be in [0,1] range */
836 204 : position = Max(position, 0.0);
837 204 : position = Min(position, 1.0);
838 204 : return position;
839 : }
840 72 : else if (hist1->infinite && !hist2->infinite)
841 : {
842 : /*
843 : * Lower bin boundary is -infinite, upper is finite. If the value is
844 : * -infinite, return 0.0 to indicate it's equal to the lower bound.
845 : * Otherwise return 1.0 to indicate it's infinitely far from the lower
846 : * bound.
847 : */
848 60 : return ((value->infinite && value->lower) ? 0.0 : 1.0);
849 : }
850 12 : else if (!hist1->infinite && hist2->infinite)
851 : {
852 : /* same as above, but in reverse */
853 12 : return ((value->infinite && !value->lower) ? 1.0 : 0.0);
854 : }
855 : else
856 : {
857 : /*
858 : * If both bin boundaries are infinite, they should be equal to each
859 : * other, and the value should also be infinite and equal to both
860 : * bounds. (But don't Assert that, to avoid crashing if a user creates
861 : * a datatype with a broken comparison function).
862 : *
863 : * Assume the value to lie in the middle of the infinite bounds.
864 : */
865 0 : return 0.5;
866 : }
867 : }
868 :
869 :
870 : /*
871 : * Get relative position of value in a length histogram bin in [0,1] range.
872 : */
873 : static double
874 360 : get_len_position(double value, double hist1, double hist2)
875 : {
876 360 : if (!isinf(hist1) && !isinf(hist2))
877 : {
878 : /*
879 : * Both bounds are finite. The value should be finite too, because it
880 : * lies somewhere between the bounds. If it doesn't, just return
881 : * something.
882 : */
883 60 : if (isinf(value))
884 0 : return 0.5;
885 :
886 60 : return 1.0 - (hist2 - value) / (hist2 - hist1);
887 : }
888 300 : else if (isinf(hist1) && !isinf(hist2))
889 : {
890 : /*
891 : * Lower bin boundary is -infinite, upper is finite. Return 1.0 to
892 : * indicate the value is infinitely far from the lower bound.
893 : */
894 0 : return 1.0;
895 : }
896 300 : else if (isinf(hist1) && isinf(hist2))
897 : {
898 : /* same as above, but in reverse */
899 0 : return 0.0;
900 : }
901 : else
902 : {
903 : /*
904 : * If both bin boundaries are infinite, they should be equal to each
905 : * other, and the value should also be infinite and equal to both
906 : * bounds. (But don't Assert that, to avoid crashing unnecessarily if
907 : * the caller messes up)
908 : *
909 : * Assume the value to lie in the middle of the infinite bounds.
910 : */
911 300 : return 0.5;
912 : }
913 : }
914 :
915 : /*
916 : * Measure distance between two range bounds.
917 : */
918 : static float8
919 408 : get_distance(TypeCacheEntry *typcache, const RangeBound *bound1, const RangeBound *bound2)
920 : {
921 408 : bool has_subdiff = OidIsValid(typcache->rng_subdiff_finfo.fn_oid);
922 :
923 408 : if (!bound1->infinite && !bound2->infinite)
924 : {
925 : /*
926 : * Neither bound is infinite, use subdiff function or return default
927 : * value of 1.0 if no subdiff is available.
928 : */
929 240 : if (has_subdiff)
930 : {
931 : float8 res;
932 :
933 240 : res = DatumGetFloat8(FunctionCall2Coll(&typcache->rng_subdiff_finfo,
934 : typcache->rng_collation,
935 : bound2->val,
936 : bound1->val));
937 : /* Reject possible NaN result, also negative result */
938 240 : if (isnan(res) || res < 0.0)
939 0 : return 1.0;
940 : else
941 240 : return res;
942 : }
943 : else
944 0 : return 1.0;
945 : }
946 168 : else if (bound1->infinite && bound2->infinite)
947 : {
948 : /* Both bounds are infinite */
949 0 : if (bound1->lower == bound2->lower)
950 0 : return 0.0;
951 : else
952 0 : return get_float8_infinity();
953 : }
954 : else
955 : {
956 : /* One bound is infinite, the other is not */
957 168 : return get_float8_infinity();
958 : }
959 : }
960 :
961 : /*
962 : * Calculate the average of function P(x), in the interval [length1, length2],
963 : * where P(x) is the fraction of tuples with length < x (or length <= x if
964 : * 'equal' is true).
965 : */
966 : static double
967 360 : calc_length_hist_frac(Datum *length_hist_values, int length_hist_nvalues,
968 : double length1, double length2, bool equal)
969 : {
970 : double frac;
971 : double A,
972 : B,
973 : PA,
974 : PB;
975 : double pos;
976 : int i;
977 : double area;
978 :
979 : Assert(length2 >= length1);
980 :
981 360 : if (length2 < 0.0)
982 0 : return 0.0; /* shouldn't happen, but doesn't hurt to check */
983 :
984 : /* All lengths in the table are <= infinite. */
985 360 : if (isinf(length2) && equal)
986 0 : return 1.0;
987 :
988 : /*----------
989 : * The average of a function between A and B can be calculated by the
990 : * formula:
991 : *
992 : * B
993 : * 1 /
994 : * ------- | P(x)dx
995 : * B - A /
996 : * A
997 : *
998 : * The geometrical interpretation of the integral is the area under the
999 : * graph of P(x). P(x) is defined by the length histogram. We calculate
1000 : * the area in a piecewise fashion, iterating through the length histogram
1001 : * bins. Each bin is a trapezoid:
1002 : *
1003 : * P(x2)
1004 : * /|
1005 : * / |
1006 : * P(x1)/ |
1007 : * | |
1008 : * | |
1009 : * ---+---+--
1010 : * x1 x2
1011 : *
1012 : * where x1 and x2 are the boundaries of the current histogram, and P(x1)
1013 : * and P(x1) are the cumulative fraction of tuples at the boundaries.
1014 : *
1015 : * The area of each trapezoid is 1/2 * (P(x2) + P(x1)) * (x2 - x1)
1016 : *
1017 : * The first bin contains the lower bound passed by the caller, so we
1018 : * use linear interpolation between the previous and next histogram bin
1019 : * boundary to calculate P(x1). Likewise for the last bin: we use linear
1020 : * interpolation to calculate P(x2). For the bins in between, x1 and x2
1021 : * lie on histogram bin boundaries, so P(x1) and P(x2) are simply:
1022 : * P(x1) = (bin index) / (number of bins)
1023 : * P(x2) = (bin index + 1 / (number of bins)
1024 : */
1025 :
1026 : /* First bin, the one that contains lower bound */
1027 360 : i = length_hist_bsearch(length_hist_values, length_hist_nvalues, length1, equal);
1028 360 : if (i >= length_hist_nvalues - 1)
1029 48 : return 1.0;
1030 :
1031 312 : if (i < 0)
1032 : {
1033 84 : i = 0;
1034 84 : pos = 0.0;
1035 : }
1036 : else
1037 : {
1038 : /* interpolate length1's position in the bin */
1039 228 : pos = get_len_position(length1,
1040 228 : DatumGetFloat8(length_hist_values[i]),
1041 228 : DatumGetFloat8(length_hist_values[i + 1]));
1042 : }
1043 312 : PB = (((double) i) + pos) / (double) (length_hist_nvalues - 1);
1044 312 : B = length1;
1045 :
1046 : /*
1047 : * In the degenerate case that length1 == length2, simply return
1048 : * P(length1). This is not merely an optimization: if length1 == length2,
1049 : * we'd divide by zero later on.
1050 : */
1051 312 : if (length2 == length1)
1052 144 : return PB;
1053 :
1054 : /*
1055 : * Loop through all the bins, until we hit the last bin, the one that
1056 : * contains the upper bound. (if lower and upper bounds are in the same
1057 : * bin, this falls out immediately)
1058 : */
1059 168 : area = 0.0;
1060 3168 : for (; i < length_hist_nvalues - 1; i++)
1061 : {
1062 3168 : double bin_upper = DatumGetFloat8(length_hist_values[i + 1]);
1063 :
1064 : /* check if we've reached the last bin */
1065 3168 : if (!(bin_upper < length2 || (equal && bin_upper <= length2)))
1066 : break;
1067 :
1068 : /* the upper bound of previous bin is the lower bound of this bin */
1069 3000 : A = B;
1070 3000 : PA = PB;
1071 :
1072 3000 : B = bin_upper;
1073 3000 : PB = (double) i / (double) (length_hist_nvalues - 1);
1074 :
1075 : /*
1076 : * Add the area of this trapezoid to the total. The point of the
1077 : * if-check is to avoid NaN, in the corner case that PA == PB == 0,
1078 : * and B - A == Inf. The area of a zero-height trapezoid (PA == PB ==
1079 : * 0) is zero, regardless of the width (B - A).
1080 : */
1081 3000 : if (PA > 0 || PB > 0)
1082 2952 : area += 0.5 * (PB + PA) * (B - A);
1083 : }
1084 :
1085 : /* Last bin */
1086 168 : A = B;
1087 168 : PA = PB;
1088 :
1089 168 : B = length2; /* last bin ends at the query upper bound */
1090 168 : if (i >= length_hist_nvalues - 1)
1091 0 : pos = 0.0;
1092 : else
1093 : {
1094 168 : if (DatumGetFloat8(length_hist_values[i]) == DatumGetFloat8(length_hist_values[i + 1]))
1095 36 : pos = 0.0;
1096 : else
1097 132 : pos = get_len_position(length2,
1098 132 : DatumGetFloat8(length_hist_values[i]),
1099 132 : DatumGetFloat8(length_hist_values[i + 1]));
1100 : }
1101 168 : PB = (((double) i) + pos) / (double) (length_hist_nvalues - 1);
1102 :
1103 168 : if (PA > 0 || PB > 0)
1104 132 : area += 0.5 * (PB + PA) * (B - A);
1105 :
1106 : /*
1107 : * Ok, we have calculated the area, ie. the integral. Divide by width to
1108 : * get the requested average.
1109 : *
1110 : * Avoid NaN arising from infinite / infinite. This happens at least if
1111 : * length2 is infinite. It's not clear what the correct value would be in
1112 : * that case, so 0.5 seems as good as any value.
1113 : */
1114 168 : if (isinf(area) && isinf(length2))
1115 48 : frac = 0.5;
1116 : else
1117 120 : frac = area / (length2 - length1);
1118 :
1119 168 : return frac;
1120 : }
1121 :
1122 : /*
1123 : * Calculate selectivity of "var <@ const" operator, ie. estimate the fraction
1124 : * of multiranges that fall within the constant lower and upper bounds. This uses
1125 : * the histograms of range lower bounds and range lengths, on the assumption
1126 : * that the range lengths are independent of the lower bounds.
1127 : *
1128 : * The caller has already checked that constant lower and upper bounds are
1129 : * finite.
1130 : */
1131 : static double
1132 24 : calc_hist_selectivity_contained(TypeCacheEntry *typcache,
1133 : const RangeBound *lower, RangeBound *upper,
1134 : const RangeBound *hist_lower, int hist_nvalues,
1135 : Datum *length_hist_values, int length_hist_nvalues)
1136 : {
1137 : int i,
1138 : upper_index;
1139 : float8 prev_dist;
1140 : double bin_width;
1141 : double upper_bin_width;
1142 : double sum_frac;
1143 :
1144 : /*
1145 : * Begin by finding the bin containing the upper bound, in the lower bound
1146 : * histogram. Any range with a lower bound > constant upper bound can't
1147 : * match, ie. there are no matches in bins greater than upper_index.
1148 : */
1149 24 : upper->inclusive = !upper->inclusive;
1150 24 : upper->lower = true;
1151 24 : upper_index = rbound_bsearch(typcache, upper, hist_lower, hist_nvalues,
1152 : false);
1153 :
1154 : /*
1155 : * If the upper bound value is below the histogram's lower limit, there
1156 : * are no matches.
1157 : */
1158 24 : if (upper_index < 0)
1159 0 : return 0.0;
1160 :
1161 : /*
1162 : * If the upper bound value is at or beyond the histogram's upper limit,
1163 : * start our loop at the last actual bin, as though the upper bound were
1164 : * within that bin; get_position will clamp its result to 1.0 anyway.
1165 : * (This corresponds to assuming that the data population above the
1166 : * histogram's upper limit is empty, exactly like what we just assumed for
1167 : * the lower limit.)
1168 : */
1169 24 : upper_index = Min(upper_index, hist_nvalues - 2);
1170 :
1171 : /*
1172 : * Calculate upper_bin_width, ie. the fraction of the (upper_index,
1173 : * upper_index + 1) bin which is greater than upper bound of query range
1174 : * using linear interpolation of subdiff function.
1175 : */
1176 24 : upper_bin_width = get_position(typcache, upper,
1177 24 : &hist_lower[upper_index],
1178 24 : &hist_lower[upper_index + 1]);
1179 :
1180 : /*
1181 : * In the loop, dist and prev_dist are the distance of the "current" bin's
1182 : * lower and upper bounds from the constant upper bound.
1183 : *
1184 : * bin_width represents the width of the current bin. Normally it is 1.0,
1185 : * meaning a full width bin, but can be less in the corner cases: start
1186 : * and end of the loop. We start with bin_width = upper_bin_width, because
1187 : * we begin at the bin containing the upper bound.
1188 : */
1189 24 : prev_dist = 0.0;
1190 24 : bin_width = upper_bin_width;
1191 :
1192 24 : sum_frac = 0.0;
1193 120 : for (i = upper_index; i >= 0; i--)
1194 : {
1195 : double dist;
1196 : double length_hist_frac;
1197 120 : bool final_bin = false;
1198 :
1199 : /*
1200 : * dist -- distance from upper bound of query range to lower bound of
1201 : * the current bin in the lower bound histogram. Or to the lower bound
1202 : * of the constant range, if this is the final bin, containing the
1203 : * constant lower bound.
1204 : */
1205 120 : if (range_cmp_bounds(typcache, &hist_lower[i], lower) < 0)
1206 : {
1207 24 : dist = get_distance(typcache, lower, upper);
1208 :
1209 : /*
1210 : * Subtract from bin_width the portion of this bin that we want to
1211 : * ignore.
1212 : */
1213 48 : bin_width -= get_position(typcache, lower, &hist_lower[i],
1214 24 : &hist_lower[i + 1]);
1215 24 : if (bin_width < 0.0)
1216 0 : bin_width = 0.0;
1217 24 : final_bin = true;
1218 : }
1219 : else
1220 96 : dist = get_distance(typcache, &hist_lower[i], upper);
1221 :
1222 : /*
1223 : * Estimate the fraction of tuples in this bin that are narrow enough
1224 : * to not exceed the distance to the upper bound of the query range.
1225 : */
1226 120 : length_hist_frac = calc_length_hist_frac(length_hist_values,
1227 : length_hist_nvalues,
1228 : prev_dist, dist, true);
1229 :
1230 : /*
1231 : * Add the fraction of tuples in this bin, with a suitable length, to
1232 : * the total.
1233 : */
1234 120 : sum_frac += length_hist_frac * bin_width / (double) (hist_nvalues - 1);
1235 :
1236 120 : if (final_bin)
1237 24 : break;
1238 :
1239 96 : bin_width = 1.0;
1240 96 : prev_dist = dist;
1241 : }
1242 :
1243 24 : return sum_frac;
1244 : }
1245 :
1246 : /*
1247 : * Calculate selectivity of "var @> const" operator, ie. estimate the fraction
1248 : * of multiranges that contain the constant lower and upper bounds. This uses
1249 : * the histograms of range lower bounds and range lengths, on the assumption
1250 : * that the range lengths are independent of the lower bounds.
1251 : */
1252 : static double
1253 48 : calc_hist_selectivity_contains(TypeCacheEntry *typcache,
1254 : const RangeBound *lower, const RangeBound *upper,
1255 : const RangeBound *hist_lower, int hist_nvalues,
1256 : Datum *length_hist_values, int length_hist_nvalues)
1257 : {
1258 : int i,
1259 : lower_index;
1260 : double bin_width,
1261 : lower_bin_width;
1262 : double sum_frac;
1263 : float8 prev_dist;
1264 :
1265 : /* Find the bin containing the lower bound of query range. */
1266 48 : lower_index = rbound_bsearch(typcache, lower, hist_lower, hist_nvalues,
1267 : true);
1268 :
1269 : /*
1270 : * If the lower bound value is below the histogram's lower limit, there
1271 : * are no matches.
1272 : */
1273 48 : if (lower_index < 0)
1274 0 : return 0.0;
1275 :
1276 : /*
1277 : * If the lower bound value is at or beyond the histogram's upper limit,
1278 : * start our loop at the last actual bin, as though the upper bound were
1279 : * within that bin; get_position will clamp its result to 1.0 anyway.
1280 : * (This corresponds to assuming that the data population above the
1281 : * histogram's upper limit is empty, exactly like what we just assumed for
1282 : * the lower limit.)
1283 : */
1284 48 : lower_index = Min(lower_index, hist_nvalues - 2);
1285 :
1286 : /*
1287 : * Calculate lower_bin_width, ie. the fraction of the of (lower_index,
1288 : * lower_index + 1) bin which is greater than lower bound of query range
1289 : * using linear interpolation of subdiff function.
1290 : */
1291 48 : lower_bin_width = get_position(typcache, lower, &hist_lower[lower_index],
1292 48 : &hist_lower[lower_index + 1]);
1293 :
1294 : /*
1295 : * Loop through all the lower bound bins, smaller than the query lower
1296 : * bound. In the loop, dist and prev_dist are the distance of the
1297 : * "current" bin's lower and upper bounds from the constant upper bound.
1298 : * We begin from query lower bound, and walk backwards, so the first bin's
1299 : * upper bound is the query lower bound, and its distance to the query
1300 : * upper bound is the length of the query range.
1301 : *
1302 : * bin_width represents the width of the current bin. Normally it is 1.0,
1303 : * meaning a full width bin, except for the first bin, which is only
1304 : * counted up to the constant lower bound.
1305 : */
1306 48 : prev_dist = get_distance(typcache, lower, upper);
1307 48 : sum_frac = 0.0;
1308 48 : bin_width = lower_bin_width;
1309 288 : for (i = lower_index; i >= 0; i--)
1310 : {
1311 : float8 dist;
1312 : double length_hist_frac;
1313 :
1314 : /*
1315 : * dist -- distance from upper bound of query range to current value
1316 : * of lower bound histogram or lower bound of query range (if we've
1317 : * reach it).
1318 : */
1319 240 : dist = get_distance(typcache, &hist_lower[i], upper);
1320 :
1321 : /*
1322 : * Get average fraction of length histogram which covers intervals
1323 : * longer than (or equal to) distance to upper bound of query range.
1324 : */
1325 240 : length_hist_frac =
1326 240 : 1.0 - calc_length_hist_frac(length_hist_values,
1327 : length_hist_nvalues,
1328 : prev_dist, dist, false);
1329 :
1330 240 : sum_frac += length_hist_frac * bin_width / (double) (hist_nvalues - 1);
1331 :
1332 240 : bin_width = 1.0;
1333 240 : prev_dist = dist;
1334 : }
1335 :
1336 48 : return sum_frac;
1337 : }
|