Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * dependencies.c
4 : * POSTGRES functional dependencies
5 : *
6 : * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : * IDENTIFICATION
10 : * src/backend/statistics/dependencies.c
11 : *
12 : *-------------------------------------------------------------------------
13 : */
14 : #include "postgres.h"
15 :
16 : #include "access/htup_details.h"
17 : #include "catalog/pg_statistic_ext.h"
18 : #include "catalog/pg_statistic_ext_data.h"
19 : #include "nodes/nodeFuncs.h"
20 : #include "optimizer/clauses.h"
21 : #include "optimizer/optimizer.h"
22 : #include "parser/parsetree.h"
23 : #include "statistics/extended_stats_internal.h"
24 : #include "utils/fmgroids.h"
25 : #include "utils/lsyscache.h"
26 : #include "utils/memutils.h"
27 : #include "utils/selfuncs.h"
28 : #include "utils/syscache.h"
29 : #include "utils/typcache.h"
30 :
31 : /* size of the struct header fields (magic, type, ndeps) */
32 : #define SizeOfHeader (3 * sizeof(uint32))
33 :
34 : /* size of a serialized dependency (degree, natts, atts) */
35 : #define SizeOfItem(natts) \
36 : (sizeof(double) + sizeof(AttrNumber) * (1 + (natts)))
37 :
38 : /* minimal size of a dependency (with two attributes) */
39 : #define MinSizeOfItem SizeOfItem(2)
40 :
41 : /* minimal size of dependencies, when all deps are minimal */
42 : #define MinSizeOfItems(ndeps) \
43 : (SizeOfHeader + (ndeps) * MinSizeOfItem)
44 :
45 : /*
46 : * Internal state for DependencyGenerator of dependencies. Dependencies are similar to
47 : * k-permutations of n elements, except that the order does not matter for the
48 : * first (k-1) elements. That is, (a,b=>c) and (b,a=>c) are equivalent.
49 : */
50 : typedef struct DependencyGeneratorData
51 : {
52 : int k; /* size of the dependency */
53 : int n; /* number of possible attributes */
54 : int current; /* next dependency to return (index) */
55 : AttrNumber ndependencies; /* number of dependencies generated */
56 : AttrNumber *dependencies; /* array of pre-generated dependencies */
57 : } DependencyGeneratorData;
58 :
59 : typedef DependencyGeneratorData *DependencyGenerator;
60 :
61 : static void generate_dependencies_recurse(DependencyGenerator state,
62 : int index, AttrNumber start, AttrNumber *current);
63 : static void generate_dependencies(DependencyGenerator state);
64 : static DependencyGenerator DependencyGenerator_init(int n, int k);
65 : static void DependencyGenerator_free(DependencyGenerator state);
66 : static AttrNumber *DependencyGenerator_next(DependencyGenerator state);
67 : static double dependency_degree(StatsBuildData *data, int k, AttrNumber *dependency);
68 : static bool dependency_is_fully_matched(MVDependency *dependency,
69 : Bitmapset *attnums);
70 : static bool dependency_is_compatible_clause(Node *clause, Index relid,
71 : AttrNumber *attnum);
72 : static bool dependency_is_compatible_expression(Node *clause, Index relid,
73 : List *statlist, Node **expr);
74 : static MVDependency *find_strongest_dependency(MVDependencies **dependencies,
75 : int ndependencies, Bitmapset *attnums);
76 : static Selectivity clauselist_apply_dependencies(PlannerInfo *root, List *clauses,
77 : int varRelid, JoinType jointype,
78 : SpecialJoinInfo *sjinfo,
79 : MVDependency **dependencies,
80 : int ndependencies,
81 : AttrNumber *list_attnums,
82 : Bitmapset **estimatedclauses);
83 :
84 : static void
85 1308 : generate_dependencies_recurse(DependencyGenerator state, int index,
86 : AttrNumber start, AttrNumber *current)
87 : {
88 : /*
89 : * The generator handles the first (k-1) elements differently from the
90 : * last element.
91 : */
92 1308 : if (index < (state->k - 1))
93 : {
94 : AttrNumber i;
95 :
96 : /*
97 : * The first (k-1) values have to be in ascending order, which we
98 : * generate recursively.
99 : */
100 :
101 1620 : for (i = start; i < state->n; i++)
102 : {
103 1040 : current[index] = i;
104 1040 : generate_dependencies_recurse(state, (index + 1), (i + 1), current);
105 : }
106 : }
107 : else
108 : {
109 : int i;
110 :
111 : /*
112 : * the last element is the implied value, which does not respect the
113 : * ascending order. We just need to check that the value is not in the
114 : * first (k-1) elements.
115 : */
116 :
117 2808 : for (i = 0; i < state->n; i++)
118 : {
119 : int j;
120 2080 : bool match = false;
121 :
122 2080 : current[index] = i;
123 :
124 3864 : for (j = 0; j < index; j++)
125 : {
126 2824 : if (current[j] == i)
127 : {
128 1040 : match = true;
129 1040 : break;
130 : }
131 : }
132 :
133 : /*
134 : * If the value is not found in the first part of the dependency,
135 : * we're done.
136 : */
137 2080 : if (!match)
138 : {
139 2080 : state->dependencies = (AttrNumber *) repalloc(state->dependencies,
140 1040 : state->k * (state->ndependencies + 1) * sizeof(AttrNumber));
141 1040 : memcpy(&state->dependencies[(state->k * state->ndependencies)],
142 1040 : current, state->k * sizeof(AttrNumber));
143 1040 : state->ndependencies++;
144 : }
145 : }
146 : }
147 1308 : }
148 :
149 : /* generate all dependencies (k-permutations of n elements) */
150 : static void
151 268 : generate_dependencies(DependencyGenerator state)
152 : {
153 268 : AttrNumber *current = palloc0_array(AttrNumber, state->k);
154 :
155 268 : generate_dependencies_recurse(state, 0, 0, current);
156 :
157 268 : pfree(current);
158 268 : }
159 :
160 : /*
161 : * initialize the DependencyGenerator of variations, and prebuild the variations
162 : *
163 : * This pre-builds all the variations. We could also generate them in
164 : * DependencyGenerator_next(), but this seems simpler.
165 : */
166 : static DependencyGenerator
167 268 : DependencyGenerator_init(int n, int k)
168 : {
169 : DependencyGenerator state;
170 :
171 : Assert((n >= k) && (k > 0));
172 :
173 : /* allocate the DependencyGenerator state */
174 268 : state = palloc0_object(DependencyGeneratorData);
175 268 : state->dependencies = palloc_array(AttrNumber, k);
176 :
177 268 : state->ndependencies = 0;
178 268 : state->current = 0;
179 268 : state->k = k;
180 268 : state->n = n;
181 :
182 : /* now actually pre-generate all the variations */
183 268 : generate_dependencies(state);
184 :
185 268 : return state;
186 : }
187 :
188 : /* free the DependencyGenerator state */
189 : static void
190 268 : DependencyGenerator_free(DependencyGenerator state)
191 : {
192 268 : pfree(state->dependencies);
193 268 : pfree(state);
194 268 : }
195 :
196 : /* generate next combination */
197 : static AttrNumber *
198 1308 : DependencyGenerator_next(DependencyGenerator state)
199 : {
200 1308 : if (state->current == state->ndependencies)
201 268 : return NULL;
202 :
203 1040 : return &state->dependencies[state->k * state->current++];
204 : }
205 :
206 :
207 : /*
208 : * validates functional dependency on the data
209 : *
210 : * An actual work horse of detecting functional dependencies. Given a variation
211 : * of k attributes, it checks that the first (k-1) are sufficient to determine
212 : * the last one.
213 : */
214 : static double
215 1040 : dependency_degree(StatsBuildData *data, int k, AttrNumber *dependency)
216 : {
217 : int i,
218 : nitems;
219 : MultiSortSupport mss;
220 : SortItem *items;
221 : AttrNumber *attnums_dep;
222 :
223 : /* counters valid within a group */
224 1040 : int group_size = 0;
225 1040 : int n_violations = 0;
226 :
227 : /* total number of rows supporting (consistent with) the dependency */
228 1040 : int n_supporting_rows = 0;
229 :
230 : /* Make sure we have at least two input attributes. */
231 : Assert(k >= 2);
232 :
233 : /* sort info for all attributes columns */
234 1040 : mss = multi_sort_init(k);
235 :
236 : /*
237 : * Translate the array of indexes to regular attnums for the dependency
238 : * (we will need this to identify the columns in StatsBuildData).
239 : */
240 1040 : attnums_dep = palloc_array(AttrNumber, k);
241 3504 : for (i = 0; i < k; i++)
242 2464 : attnums_dep[i] = data->attnums[dependency[i]];
243 :
244 : /*
245 : * Verify the dependency (a,b,...)->z, using a rather simple algorithm:
246 : *
247 : * (a) sort the data lexicographically
248 : *
249 : * (b) split the data into groups by first (k-1) columns
250 : *
251 : * (c) for each group count different values in the last column
252 : *
253 : * We use the column data types' default sort operators and collations;
254 : * perhaps at some point it'd be worth using column-specific collations?
255 : */
256 :
257 : /* prepare the sort function for the dimensions */
258 3504 : for (i = 0; i < k; i++)
259 : {
260 2464 : VacAttrStats *colstat = data->stats[dependency[i]];
261 : TypeCacheEntry *type;
262 :
263 2464 : type = lookup_type_cache(colstat->attrtypid, TYPECACHE_LT_OPR);
264 2464 : if (type->lt_opr == InvalidOid) /* shouldn't happen */
265 0 : elog(ERROR, "cache lookup failed for ordering operator for type %u",
266 : colstat->attrtypid);
267 :
268 : /* prepare the sort function for this dimension */
269 2464 : multi_sort_add_dimension(mss, i, type->lt_opr, colstat->attrcollid);
270 : }
271 :
272 : /*
273 : * build an array of SortItem(s) sorted using the multi-sort support
274 : *
275 : * XXX This relies on all stats entries pointing to the same tuple
276 : * descriptor. For now that assumption holds, but it might change in the
277 : * future for example if we support statistics on multiple tables.
278 : */
279 1040 : items = build_sorted_items(data, &nitems, mss, k, attnums_dep);
280 :
281 : /*
282 : * Walk through the sorted array, split it into rows according to the
283 : * first (k-1) columns. If there's a single value in the last column, we
284 : * count the group as 'supporting' the functional dependency. Otherwise we
285 : * count it as contradicting.
286 : */
287 :
288 : /* start with the first row forming a group */
289 1040 : group_size = 1;
290 :
291 : /* loop 1 beyond the end of the array so that we count the final group */
292 1508738 : for (i = 1; i <= nitems; i++)
293 : {
294 : /*
295 : * Check if the group ended, which may be either because we processed
296 : * all the items (i==nitems), or because the i-th item is not equal to
297 : * the preceding one.
298 : */
299 3014356 : if (i == nitems ||
300 1506658 : multi_sort_compare_dims(0, k - 2, &items[i - 1], &items[i], mss) != 0)
301 : {
302 : /*
303 : * If no violations were found in the group then track the rows of
304 : * the group as supporting the functional dependency.
305 : */
306 34548 : if (n_violations == 0)
307 21766 : n_supporting_rows += group_size;
308 :
309 : /* Reset counters for the new group */
310 34548 : n_violations = 0;
311 34548 : group_size = 1;
312 34548 : continue;
313 : }
314 : /* first columns match, but the last one does not (so contradicting) */
315 1473150 : else if (multi_sort_compare_dim(k - 1, &items[i - 1], &items[i], mss) != 0)
316 60614 : n_violations++;
317 :
318 1473150 : group_size++;
319 : }
320 :
321 : /* Compute the 'degree of validity' as (supporting/total). */
322 1040 : return (n_supporting_rows * 1.0 / data->numrows);
323 : }
324 :
325 : /*
326 : * detects functional dependencies between groups of columns
327 : *
328 : * Generates all possible subsets of columns (variations) and computes
329 : * the degree of validity for each one. For example when creating statistics
330 : * on three columns (a,b,c) there are 9 possible dependencies
331 : *
332 : * two columns three columns
333 : * ----------- -------------
334 : * (a) -> b (a,b) -> c
335 : * (a) -> c (a,c) -> b
336 : * (b) -> a (b,c) -> a
337 : * (b) -> c
338 : * (c) -> a
339 : * (c) -> b
340 : */
341 : MVDependencies *
342 196 : statext_dependencies_build(StatsBuildData *data)
343 : {
344 : int i,
345 : k;
346 :
347 : /* result */
348 196 : MVDependencies *dependencies = NULL;
349 : MemoryContext cxt;
350 :
351 : Assert(data->nattnums >= 2);
352 :
353 : /* tracks memory allocated by dependency_degree calls */
354 196 : cxt = AllocSetContextCreate(CurrentMemoryContext,
355 : "dependency_degree cxt",
356 : ALLOCSET_DEFAULT_SIZES);
357 :
358 : /*
359 : * We'll try build functional dependencies starting from the smallest ones
360 : * covering just 2 columns, to the largest ones, covering all columns
361 : * included in the statistics object. We start from the smallest ones
362 : * because we want to be able to skip already implied ones.
363 : */
364 464 : for (k = 2; k <= data->nattnums; k++)
365 : {
366 : AttrNumber *dependency; /* array with k elements */
367 :
368 : /* prepare a DependencyGenerator of variation */
369 268 : DependencyGenerator DependencyGenerator = DependencyGenerator_init(data->nattnums, k);
370 :
371 : /* generate all possible variations of k values (out of n) */
372 1308 : while ((dependency = DependencyGenerator_next(DependencyGenerator)))
373 : {
374 : double degree;
375 : MVDependency *d;
376 : MemoryContext oldcxt;
377 :
378 : /* release memory used by dependency degree calculation */
379 1040 : oldcxt = MemoryContextSwitchTo(cxt);
380 :
381 : /* compute how valid the dependency seems */
382 1040 : degree = dependency_degree(data, k, dependency);
383 :
384 1040 : MemoryContextSwitchTo(oldcxt);
385 1040 : MemoryContextReset(cxt);
386 :
387 : /*
388 : * if the dependency seems entirely invalid, don't store it
389 : */
390 1040 : if (degree == 0.0)
391 262 : continue;
392 :
393 778 : d = (MVDependency *) palloc0(offsetof(MVDependency, attributes)
394 778 : + k * sizeof(AttrNumber));
395 :
396 : /* copy the dependency (and keep the indexes into stxkeys) */
397 778 : d->degree = degree;
398 778 : d->nattributes = k;
399 2646 : for (i = 0; i < k; i++)
400 1868 : d->attributes[i] = data->attnums[dependency[i]];
401 :
402 : /* initialize the list of dependencies */
403 778 : if (dependencies == NULL)
404 : {
405 178 : dependencies = palloc0_object(MVDependencies);
406 :
407 178 : dependencies->magic = STATS_DEPS_MAGIC;
408 178 : dependencies->type = STATS_DEPS_TYPE_BASIC;
409 178 : dependencies->ndeps = 0;
410 : }
411 :
412 778 : dependencies->ndeps++;
413 778 : dependencies = (MVDependencies *) repalloc(dependencies,
414 : offsetof(MVDependencies, deps)
415 778 : + dependencies->ndeps * sizeof(MVDependency *));
416 :
417 778 : dependencies->deps[dependencies->ndeps - 1] = d;
418 : }
419 :
420 : /*
421 : * we're done with variations of k elements, so free the
422 : * DependencyGenerator
423 : */
424 268 : DependencyGenerator_free(DependencyGenerator);
425 : }
426 :
427 196 : MemoryContextDelete(cxt);
428 :
429 196 : return dependencies;
430 : }
431 :
432 :
433 : /*
434 : * Serialize list of dependencies into a bytea value.
435 : */
436 : bytea *
437 258 : statext_dependencies_serialize(MVDependencies *dependencies)
438 : {
439 : int i;
440 : bytea *output;
441 : char *tmp;
442 : Size len;
443 :
444 : /* we need to store ndeps, with a number of attributes for each one */
445 258 : len = VARHDRSZ + SizeOfHeader;
446 :
447 : /* and also include space for the actual attribute numbers and degrees */
448 1244 : for (i = 0; i < dependencies->ndeps; i++)
449 986 : len += SizeOfItem(dependencies->deps[i]->nattributes);
450 :
451 258 : output = (bytea *) palloc0(len);
452 258 : SET_VARSIZE(output, len);
453 :
454 258 : tmp = VARDATA(output);
455 :
456 : /* Store the base struct values (magic, type, ndeps) */
457 258 : memcpy(tmp, &dependencies->magic, sizeof(uint32));
458 258 : tmp += sizeof(uint32);
459 258 : memcpy(tmp, &dependencies->type, sizeof(uint32));
460 258 : tmp += sizeof(uint32);
461 258 : memcpy(tmp, &dependencies->ndeps, sizeof(uint32));
462 258 : tmp += sizeof(uint32);
463 :
464 : /* store number of attributes and attribute numbers for each dependency */
465 1244 : for (i = 0; i < dependencies->ndeps; i++)
466 : {
467 986 : MVDependency *d = dependencies->deps[i];
468 :
469 986 : memcpy(tmp, &d->degree, sizeof(double));
470 986 : tmp += sizeof(double);
471 :
472 986 : memcpy(tmp, &d->nattributes, sizeof(AttrNumber));
473 986 : tmp += sizeof(AttrNumber);
474 :
475 986 : memcpy(tmp, d->attributes, sizeof(AttrNumber) * d->nattributes);
476 986 : tmp += sizeof(AttrNumber) * d->nattributes;
477 :
478 : /* protect against overflow */
479 : Assert(tmp <= ((char *) output + len));
480 : }
481 :
482 : /* make sure we've produced exactly the right amount of data */
483 : Assert(tmp == ((char *) output + len));
484 :
485 258 : return output;
486 : }
487 :
488 : /*
489 : * Reads serialized dependencies into MVDependencies structure.
490 : */
491 : MVDependencies *
492 1784 : statext_dependencies_deserialize(bytea *data)
493 : {
494 : int i;
495 : Size min_expected_size;
496 : MVDependencies *dependencies;
497 : char *tmp;
498 :
499 1784 : if (data == NULL)
500 0 : return NULL;
501 :
502 1784 : if (VARSIZE_ANY_EXHDR(data) < SizeOfHeader)
503 0 : elog(ERROR, "invalid MVDependencies size %zu (expected at least %zu)",
504 : VARSIZE_ANY_EXHDR(data), SizeOfHeader);
505 :
506 : /* read the MVDependencies header */
507 1784 : dependencies = palloc0_object(MVDependencies);
508 :
509 : /* initialize pointer to the data part (skip the varlena header) */
510 1784 : tmp = VARDATA_ANY(data);
511 :
512 : /* read the header fields and perform basic sanity checks */
513 1784 : memcpy(&dependencies->magic, tmp, sizeof(uint32));
514 1784 : tmp += sizeof(uint32);
515 1784 : memcpy(&dependencies->type, tmp, sizeof(uint32));
516 1784 : tmp += sizeof(uint32);
517 1784 : memcpy(&dependencies->ndeps, tmp, sizeof(uint32));
518 1784 : tmp += sizeof(uint32);
519 :
520 1784 : if (dependencies->magic != STATS_DEPS_MAGIC)
521 0 : elog(ERROR, "invalid dependency magic %d (expected %d)",
522 : dependencies->magic, STATS_DEPS_MAGIC);
523 :
524 1784 : if (dependencies->type != STATS_DEPS_TYPE_BASIC)
525 0 : elog(ERROR, "invalid dependency type %d (expected %d)",
526 : dependencies->type, STATS_DEPS_TYPE_BASIC);
527 :
528 1784 : if (dependencies->ndeps == 0)
529 0 : elog(ERROR, "invalid zero-length item array in MVDependencies");
530 :
531 : /* what minimum bytea size do we expect for those parameters */
532 1784 : min_expected_size = SizeOfItem(dependencies->ndeps);
533 :
534 1784 : if (VARSIZE_ANY_EXHDR(data) < min_expected_size)
535 0 : elog(ERROR, "invalid dependencies size %zu (expected at least %zu)",
536 : VARSIZE_ANY_EXHDR(data), min_expected_size);
537 :
538 : /* allocate space for the MCV items */
539 1784 : dependencies = repalloc(dependencies, offsetof(MVDependencies, deps)
540 1784 : + (dependencies->ndeps * sizeof(MVDependency *)));
541 :
542 10260 : for (i = 0; i < dependencies->ndeps; i++)
543 : {
544 : double degree;
545 : AttrNumber k;
546 : MVDependency *d;
547 :
548 : /* degree of validity */
549 8476 : memcpy(°ree, tmp, sizeof(double));
550 8476 : tmp += sizeof(double);
551 :
552 : /* number of attributes */
553 8476 : memcpy(&k, tmp, sizeof(AttrNumber));
554 8476 : tmp += sizeof(AttrNumber);
555 :
556 : /* is the number of attributes valid? */
557 : Assert((k >= 2) && (k <= STATS_MAX_DIMENSIONS));
558 :
559 : /* now that we know the number of attributes, allocate the dependency */
560 8476 : d = (MVDependency *) palloc0(offsetof(MVDependency, attributes)
561 8476 : + (k * sizeof(AttrNumber)));
562 :
563 8476 : d->degree = degree;
564 8476 : d->nattributes = k;
565 :
566 : /* copy attribute numbers */
567 8476 : memcpy(d->attributes, tmp, sizeof(AttrNumber) * d->nattributes);
568 8476 : tmp += sizeof(AttrNumber) * d->nattributes;
569 :
570 8476 : dependencies->deps[i] = d;
571 :
572 : /* still within the bytea */
573 : Assert(tmp <= ((char *) data + VARSIZE_ANY(data)));
574 : }
575 :
576 : /* we should have consumed the whole bytea exactly */
577 : Assert(tmp == ((char *) data + VARSIZE_ANY(data)));
578 :
579 1784 : return dependencies;
580 : }
581 :
582 : /*
583 : * Free allocations of a MVDependencies.
584 : */
585 : void
586 38 : statext_dependencies_free(MVDependencies *dependencies)
587 : {
588 306 : for (int i = 0; i < dependencies->ndeps; i++)
589 268 : pfree(dependencies->deps[i]);
590 38 : pfree(dependencies);
591 38 : }
592 :
593 : /*
594 : * Validate a set of MVDependencies against the extended statistics object
595 : * definition.
596 : *
597 : * Every MVDependencies must be checked to ensure that the attnums in the
598 : * attributes list correspond to attnums/expressions defined by the
599 : * extended statistics object.
600 : *
601 : * Positive attnums are attributes which must be found in the stxkeys, while
602 : * negative attnums correspond to an expression number, no attribute number
603 : * can be below (0 - numexprs).
604 : */
605 : bool
606 38 : statext_dependencies_validate(const MVDependencies *dependencies,
607 : const int2vector *stxkeys,
608 : int numexprs, int elevel)
609 : {
610 38 : int attnum_expr_lowbound = 0 - numexprs;
611 :
612 : /* Scan through each dependency entry */
613 282 : for (int i = 0; i < dependencies->ndeps; i++)
614 : {
615 256 : const MVDependency *dep = dependencies->deps[i];
616 :
617 : /*
618 : * Cross-check each attribute in a dependency entry with the extended
619 : * stats object definition.
620 : */
621 892 : for (int j = 0; j < dep->nattributes; j++)
622 : {
623 648 : AttrNumber attnum = dep->attributes[j];
624 648 : bool ok = false;
625 :
626 648 : if (attnum > 0)
627 : {
628 : /* attribute number in stxkeys */
629 504 : for (int k = 0; k < stxkeys->dim1; k++)
630 : {
631 492 : if (attnum == stxkeys->values[k])
632 : {
633 318 : ok = true;
634 318 : break;
635 : }
636 : }
637 : }
638 318 : else if ((attnum < 0) && (attnum >= attnum_expr_lowbound))
639 : {
640 : /* attribute number for an expression */
641 318 : ok = true;
642 : }
643 :
644 648 : if (!ok)
645 : {
646 12 : ereport(elevel,
647 : (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
648 : errmsg("could not validate \"%s\" object: invalid attribute number %d found",
649 : "pg_dependencies", attnum)));
650 12 : return false;
651 : }
652 : }
653 : }
654 :
655 26 : return true;
656 : }
657 :
658 : /*
659 : * dependency_is_fully_matched
660 : * checks that a functional dependency is fully matched given clauses on
661 : * attributes (assuming the clauses are suitable equality clauses)
662 : */
663 : static bool
664 6156 : dependency_is_fully_matched(MVDependency *dependency, Bitmapset *attnums)
665 : {
666 : int j;
667 :
668 : /*
669 : * Check that the dependency actually is fully covered by clauses. We have
670 : * to translate all attribute numbers, as those are referenced
671 : */
672 15642 : for (j = 0; j < dependency->nattributes; j++)
673 : {
674 12558 : int attnum = dependency->attributes[j];
675 :
676 12558 : if (!bms_is_member(attnum, attnums))
677 3072 : return false;
678 : }
679 :
680 3084 : return true;
681 : }
682 :
683 : /*
684 : * statext_dependencies_load
685 : * Load the functional dependencies for the indicated pg_statistic_ext tuple
686 : */
687 : MVDependencies *
688 1608 : statext_dependencies_load(Oid mvoid, bool inh)
689 : {
690 : MVDependencies *result;
691 : bool isnull;
692 : Datum deps;
693 : HeapTuple htup;
694 :
695 1608 : htup = SearchSysCache2(STATEXTDATASTXOID,
696 : ObjectIdGetDatum(mvoid),
697 : BoolGetDatum(inh));
698 1608 : if (!HeapTupleIsValid(htup))
699 0 : elog(ERROR, "cache lookup failed for statistics object %u", mvoid);
700 :
701 1608 : deps = SysCacheGetAttr(STATEXTDATASTXOID, htup,
702 : Anum_pg_statistic_ext_data_stxddependencies, &isnull);
703 1608 : if (isnull)
704 0 : elog(ERROR,
705 : "requested statistics kind \"%c\" is not yet built for statistics object %u",
706 : STATS_EXT_DEPENDENCIES, mvoid);
707 :
708 1608 : result = statext_dependencies_deserialize(DatumGetByteaPP(deps));
709 :
710 1608 : ReleaseSysCache(htup);
711 :
712 1608 : return result;
713 : }
714 :
715 : /*
716 : * dependency_is_compatible_clause
717 : * Determines if the clause is compatible with functional dependencies
718 : *
719 : * Only clauses that have the form of equality to a pseudoconstant, or can be
720 : * interpreted that way, are currently accepted. Furthermore the variable
721 : * part of the clause must be a simple Var belonging to the specified
722 : * relation, whose attribute number we return in *attnum on success.
723 : */
724 : static bool
725 3960 : dependency_is_compatible_clause(Node *clause, Index relid, AttrNumber *attnum)
726 : {
727 : Var *var;
728 : Node *clause_expr;
729 :
730 3960 : if (IsA(clause, RestrictInfo))
731 : {
732 3840 : RestrictInfo *rinfo = (RestrictInfo *) clause;
733 :
734 : /* Pseudoconstants are not interesting (they couldn't contain a Var) */
735 3840 : if (rinfo->pseudoconstant)
736 6 : return false;
737 :
738 : /* Clauses referencing multiple, or no, varnos are incompatible */
739 3834 : if (bms_membership(rinfo->clause_relids) != BMS_SINGLETON)
740 0 : return false;
741 :
742 3834 : clause = (Node *) rinfo->clause;
743 : }
744 :
745 3954 : if (is_opclause(clause))
746 : {
747 : /* If it's an opclause, check for Var = Const or Const = Var. */
748 1446 : OpExpr *expr = (OpExpr *) clause;
749 :
750 : /* Only expressions with two arguments are candidates. */
751 1446 : if (list_length(expr->args) != 2)
752 0 : return false;
753 :
754 : /* Make sure non-selected argument is a pseudoconstant. */
755 1446 : if (is_pseudo_constant_clause(lsecond(expr->args)))
756 1446 : clause_expr = linitial(expr->args);
757 0 : else if (is_pseudo_constant_clause(linitial(expr->args)))
758 0 : clause_expr = lsecond(expr->args);
759 : else
760 0 : return false;
761 :
762 : /*
763 : * If it's not an "=" operator, just ignore the clause, as it's not
764 : * compatible with functional dependencies.
765 : *
766 : * This uses the function for estimating selectivity, not the operator
767 : * directly (a bit awkward, but well ...).
768 : *
769 : * XXX this is pretty dubious; probably it'd be better to check btree
770 : * or hash opclass membership, so as not to be fooled by custom
771 : * selectivity functions, and to be more consistent with decisions
772 : * elsewhere in the planner.
773 : */
774 1446 : if (get_oprrest(expr->opno) != F_EQSEL)
775 36 : return false;
776 :
777 : /* OK to proceed with checking "var" */
778 : }
779 2508 : else if (IsA(clause, ScalarArrayOpExpr))
780 : {
781 : /* If it's a scalar array operator, check for Var IN Const. */
782 2436 : ScalarArrayOpExpr *expr = (ScalarArrayOpExpr *) clause;
783 :
784 : /*
785 : * Reject ALL() variant, we only care about ANY/IN.
786 : *
787 : * XXX Maybe we should check if all the values are the same, and allow
788 : * ALL in that case? Doesn't seem very practical, though.
789 : */
790 2436 : if (!expr->useOr)
791 36 : return false;
792 :
793 : /* Only expressions with two arguments are candidates. */
794 2400 : if (list_length(expr->args) != 2)
795 0 : return false;
796 :
797 : /*
798 : * We know it's always (Var IN Const), so we assume the var is the
799 : * first argument, and pseudoconstant is the second one.
800 : */
801 2400 : if (!is_pseudo_constant_clause(lsecond(expr->args)))
802 0 : return false;
803 :
804 2400 : clause_expr = linitial(expr->args);
805 :
806 : /*
807 : * If it's not an "=" operator, just ignore the clause, as it's not
808 : * compatible with functional dependencies. The operator is identified
809 : * simply by looking at which function it uses to estimate
810 : * selectivity. That's a bit strange, but it's what other similar
811 : * places do.
812 : */
813 2400 : if (get_oprrest(expr->opno) != F_EQSEL)
814 180 : return false;
815 :
816 : /* OK to proceed with checking "var" */
817 : }
818 72 : else if (is_orclause(clause))
819 : {
820 72 : BoolExpr *bool_expr = (BoolExpr *) clause;
821 : ListCell *lc;
822 :
823 : /* start with no attribute number */
824 72 : *attnum = InvalidAttrNumber;
825 :
826 150 : foreach(lc, bool_expr->args)
827 : {
828 : AttrNumber clause_attnum;
829 :
830 : /*
831 : * Had we found incompatible clause in the arguments, treat the
832 : * whole clause as incompatible.
833 : */
834 120 : if (!dependency_is_compatible_clause((Node *) lfirst(lc),
835 : relid, &clause_attnum))
836 42 : return false;
837 :
838 84 : if (*attnum == InvalidAttrNumber)
839 36 : *attnum = clause_attnum;
840 :
841 : /* ensure all the variables are the same (same attnum) */
842 84 : if (*attnum != clause_attnum)
843 6 : return false;
844 : }
845 :
846 : /* the Var is already checked by the recursive call */
847 30 : return true;
848 : }
849 0 : else if (is_notclause(clause))
850 : {
851 : /*
852 : * "NOT x" can be interpreted as "x = false", so get the argument and
853 : * proceed with seeing if it's a suitable Var.
854 : */
855 0 : clause_expr = (Node *) get_notclausearg(clause);
856 : }
857 : else
858 : {
859 : /*
860 : * A boolean expression "x" can be interpreted as "x = true", so
861 : * proceed with seeing if it's a suitable Var.
862 : */
863 0 : clause_expr = clause;
864 : }
865 :
866 : /*
867 : * We may ignore any RelabelType node above the operand. (There won't be
868 : * more than one, since eval_const_expressions has been applied already.)
869 : */
870 3630 : if (IsA(clause_expr, RelabelType))
871 0 : clause_expr = (Node *) ((RelabelType *) clause_expr)->arg;
872 :
873 : /* We only support plain Vars for now */
874 3630 : if (!IsA(clause_expr, Var))
875 288 : return false;
876 :
877 : /* OK, we know we have a Var */
878 3342 : var = (Var *) clause_expr;
879 :
880 : /* Ensure Var is from the correct relation */
881 3342 : if (var->varno != relid)
882 0 : return false;
883 :
884 : /* We also better ensure the Var is from the current level */
885 3342 : if (var->varlevelsup != 0)
886 0 : return false;
887 :
888 : /* Also ignore system attributes (we don't allow stats on those) */
889 3342 : if (!AttrNumberIsForUserDefinedAttr(var->varattno))
890 0 : return false;
891 :
892 3342 : *attnum = var->varattno;
893 3342 : return true;
894 : }
895 :
896 : /*
897 : * find_strongest_dependency
898 : * find the strongest dependency on the attributes
899 : *
900 : * When applying functional dependencies, we start with the strongest
901 : * dependencies. That is, we select the dependency that:
902 : *
903 : * (a) has all attributes covered by equality clauses
904 : *
905 : * (b) has the most attributes
906 : *
907 : * (c) has the highest degree of validity
908 : *
909 : * This guarantees that we eliminate the most redundant conditions first
910 : * (see the comment in dependencies_clauselist_selectivity).
911 : */
912 : static MVDependency *
913 3486 : find_strongest_dependency(MVDependencies **dependencies, int ndependencies,
914 : Bitmapset *attnums)
915 : {
916 : int i,
917 : j;
918 3486 : MVDependency *strongest = NULL;
919 :
920 : /* number of attnums in clauses */
921 3486 : int nattnums = bms_num_members(attnums);
922 :
923 : /*
924 : * Iterate over the MVDependency items and find the strongest one from the
925 : * fully-matched dependencies. We do the cheap checks first, before
926 : * matching it against the attnums.
927 : */
928 7008 : for (i = 0; i < ndependencies; i++)
929 : {
930 20280 : for (j = 0; j < dependencies[i]->ndeps; j++)
931 : {
932 16758 : MVDependency *dependency = dependencies[i]->deps[j];
933 :
934 : /*
935 : * Skip dependencies referencing more attributes than available
936 : * clauses, as those can't be fully matched.
937 : */
938 16758 : if (dependency->nattributes > nattnums)
939 10602 : continue;
940 :
941 6156 : if (strongest)
942 : {
943 : /* skip dependencies on fewer attributes than the strongest. */
944 3936 : if (dependency->nattributes < strongest->nattributes)
945 0 : continue;
946 :
947 : /* also skip weaker dependencies when attribute count matches */
948 3936 : if (strongest->nattributes == dependency->nattributes &&
949 3654 : strongest->degree > dependency->degree)
950 0 : continue;
951 : }
952 :
953 : /*
954 : * this dependency is stronger, but we must still check that it's
955 : * fully matched to these attnums. We perform this check last as
956 : * it's slightly more expensive than the previous checks.
957 : */
958 6156 : if (dependency_is_fully_matched(dependency, attnums))
959 3084 : strongest = dependency; /* save new best match */
960 : }
961 : }
962 :
963 3486 : return strongest;
964 : }
965 :
966 : /*
967 : * clauselist_apply_dependencies
968 : * Apply the specified functional dependencies to a list of clauses and
969 : * return the estimated selectivity of the clauses that are compatible
970 : * with any of the given dependencies.
971 : *
972 : * This will estimate all not-already-estimated clauses that are compatible
973 : * with functional dependencies, and which have an attribute mentioned by any
974 : * of the given dependencies (either as an implying or implied attribute).
975 : *
976 : * Given (lists of) clauses on attributes (a,b) and a functional dependency
977 : * (a=>b), the per-column selectivities P(a) and P(b) are notionally combined
978 : * using the formula
979 : *
980 : * P(a,b) = f * P(a) + (1-f) * P(a) * P(b)
981 : *
982 : * where 'f' is the degree of dependency. This reflects the fact that we
983 : * expect a fraction f of all rows to be consistent with the dependency
984 : * (a=>b), and so have a selectivity of P(a), while the remaining rows are
985 : * treated as independent.
986 : *
987 : * In practice, we use a slightly modified version of this formula, which uses
988 : * a selectivity of Min(P(a), P(b)) for the dependent rows, since the result
989 : * should obviously not exceed either column's individual selectivity. I.e.,
990 : * we actually combine selectivities using the formula
991 : *
992 : * P(a,b) = f * Min(P(a), P(b)) + (1-f) * P(a) * P(b)
993 : *
994 : * This can make quite a difference if the specific values matching the
995 : * clauses are not consistent with the functional dependency.
996 : */
997 : static Selectivity
998 1596 : clauselist_apply_dependencies(PlannerInfo *root, List *clauses,
999 : int varRelid, JoinType jointype,
1000 : SpecialJoinInfo *sjinfo,
1001 : MVDependency **dependencies, int ndependencies,
1002 : AttrNumber *list_attnums,
1003 : Bitmapset **estimatedclauses)
1004 : {
1005 : Bitmapset *attnums;
1006 : int i;
1007 : int j;
1008 : int nattrs;
1009 : Selectivity *attr_sel;
1010 : int attidx;
1011 : int listidx;
1012 : ListCell *l;
1013 : Selectivity s1;
1014 :
1015 : /*
1016 : * Extract the attnums of all implying and implied attributes from all the
1017 : * given dependencies. Each of these attributes is expected to have at
1018 : * least 1 not-already-estimated compatible clause that we will estimate
1019 : * here.
1020 : */
1021 1596 : attnums = NULL;
1022 3486 : for (i = 0; i < ndependencies; i++)
1023 : {
1024 5952 : for (j = 0; j < dependencies[i]->nattributes; j++)
1025 : {
1026 4062 : AttrNumber attnum = dependencies[i]->attributes[j];
1027 :
1028 4062 : attnums = bms_add_member(attnums, attnum);
1029 : }
1030 : }
1031 :
1032 : /*
1033 : * Compute per-column selectivity estimates for each of these attributes,
1034 : * and mark all the corresponding clauses as estimated.
1035 : */
1036 1596 : nattrs = bms_num_members(attnums);
1037 1596 : attr_sel = palloc_array(Selectivity, nattrs);
1038 :
1039 1596 : attidx = 0;
1040 1596 : i = -1;
1041 5094 : while ((i = bms_next_member(attnums, i)) >= 0)
1042 : {
1043 3498 : List *attr_clauses = NIL;
1044 : Selectivity simple_sel;
1045 :
1046 3498 : listidx = -1;
1047 11436 : foreach(l, clauses)
1048 : {
1049 7938 : Node *clause = (Node *) lfirst(l);
1050 :
1051 7938 : listidx++;
1052 7938 : if (list_attnums[listidx] == i)
1053 : {
1054 3498 : attr_clauses = lappend(attr_clauses, clause);
1055 3498 : *estimatedclauses = bms_add_member(*estimatedclauses, listidx);
1056 : }
1057 : }
1058 :
1059 3498 : simple_sel = clauselist_selectivity_ext(root, attr_clauses, varRelid,
1060 : jointype, sjinfo, false);
1061 3498 : attr_sel[attidx++] = simple_sel;
1062 : }
1063 :
1064 : /*
1065 : * Now combine these selectivities using the dependency information. For
1066 : * chains of dependencies such as a -> b -> c, the b -> c dependency will
1067 : * come before the a -> b dependency in the array, so we traverse the
1068 : * array backwards to ensure such chains are computed in the right order.
1069 : *
1070 : * As explained above, pairs of selectivities are combined using the
1071 : * formula
1072 : *
1073 : * P(a,b) = f * Min(P(a), P(b)) + (1-f) * P(a) * P(b)
1074 : *
1075 : * to ensure that the combined selectivity is never greater than either
1076 : * individual selectivity.
1077 : *
1078 : * Where multiple dependencies apply (e.g., a -> b -> c), we use
1079 : * conditional probabilities to compute the overall result as follows:
1080 : *
1081 : * P(a,b,c) = P(c|a,b) * P(a,b) = P(c|a,b) * P(b|a) * P(a)
1082 : *
1083 : * so we replace the selectivities of all implied attributes with
1084 : * conditional probabilities, that are conditional on all their implying
1085 : * attributes. The selectivities of all other non-implied attributes are
1086 : * left as they are.
1087 : */
1088 3486 : for (i = ndependencies - 1; i >= 0; i--)
1089 : {
1090 1890 : MVDependency *dependency = dependencies[i];
1091 : AttrNumber attnum;
1092 : Selectivity s2;
1093 : double f;
1094 :
1095 : /* Selectivity of all the implying attributes */
1096 1890 : s1 = 1.0;
1097 4062 : for (j = 0; j < dependency->nattributes - 1; j++)
1098 : {
1099 2172 : attnum = dependency->attributes[j];
1100 2172 : attidx = bms_member_index(attnums, attnum);
1101 2172 : s1 *= attr_sel[attidx];
1102 : }
1103 :
1104 : /* Original selectivity of the implied attribute */
1105 1890 : attnum = dependency->attributes[j];
1106 1890 : attidx = bms_member_index(attnums, attnum);
1107 1890 : s2 = attr_sel[attidx];
1108 :
1109 : /*
1110 : * Replace s2 with the conditional probability s2 given s1, computed
1111 : * using the formula P(b|a) = P(a,b) / P(a), which simplifies to
1112 : *
1113 : * P(b|a) = f * Min(P(a), P(b)) / P(a) + (1-f) * P(b)
1114 : *
1115 : * where P(a) = s1, the selectivity of the implying attributes, and
1116 : * P(b) = s2, the selectivity of the implied attribute.
1117 : */
1118 1890 : f = dependency->degree;
1119 :
1120 1890 : if (s1 <= s2)
1121 1782 : attr_sel[attidx] = f + (1 - f) * s2;
1122 : else
1123 108 : attr_sel[attidx] = f * s2 / s1 + (1 - f) * s2;
1124 : }
1125 :
1126 : /*
1127 : * The overall selectivity of all the clauses on all these attributes is
1128 : * then the product of all the original (non-implied) probabilities and
1129 : * the new conditional (implied) probabilities.
1130 : */
1131 1596 : s1 = 1.0;
1132 5094 : for (i = 0; i < nattrs; i++)
1133 3498 : s1 *= attr_sel[i];
1134 :
1135 1596 : CLAMP_PROBABILITY(s1);
1136 :
1137 1596 : pfree(attr_sel);
1138 1596 : bms_free(attnums);
1139 :
1140 1596 : return s1;
1141 : }
1142 :
1143 : /*
1144 : * dependency_is_compatible_expression
1145 : * Determines if the expression is compatible with functional dependencies
1146 : *
1147 : * Similar to dependency_is_compatible_clause, but doesn't enforce that the
1148 : * expression is a simple Var. On success, return the matching statistics
1149 : * expression into *expr.
1150 : */
1151 : static bool
1152 642 : dependency_is_compatible_expression(Node *clause, Index relid, List *statlist, Node **expr)
1153 : {
1154 : ListCell *lc,
1155 : *lc2;
1156 : Node *clause_expr;
1157 :
1158 642 : if (IsA(clause, RestrictInfo))
1159 : {
1160 552 : RestrictInfo *rinfo = (RestrictInfo *) clause;
1161 :
1162 : /* Pseudoconstants are not interesting (they couldn't contain a Var) */
1163 552 : if (rinfo->pseudoconstant)
1164 6 : return false;
1165 :
1166 : /* Clauses referencing multiple, or no, varnos are incompatible */
1167 546 : if (bms_membership(rinfo->clause_relids) != BMS_SINGLETON)
1168 0 : return false;
1169 :
1170 546 : clause = (Node *) rinfo->clause;
1171 : }
1172 :
1173 636 : if (is_opclause(clause))
1174 : {
1175 : /* If it's an opclause, check for Var = Const or Const = Var. */
1176 204 : OpExpr *expr = (OpExpr *) clause;
1177 :
1178 : /* Only expressions with two arguments are candidates. */
1179 204 : if (list_length(expr->args) != 2)
1180 0 : return false;
1181 :
1182 : /* Make sure non-selected argument is a pseudoconstant. */
1183 204 : if (is_pseudo_constant_clause(lsecond(expr->args)))
1184 204 : clause_expr = linitial(expr->args);
1185 0 : else if (is_pseudo_constant_clause(linitial(expr->args)))
1186 0 : clause_expr = lsecond(expr->args);
1187 : else
1188 0 : return false;
1189 :
1190 : /*
1191 : * If it's not an "=" operator, just ignore the clause, as it's not
1192 : * compatible with functional dependencies.
1193 : *
1194 : * This uses the function for estimating selectivity, not the operator
1195 : * directly (a bit awkward, but well ...).
1196 : *
1197 : * XXX this is pretty dubious; probably it'd be better to check btree
1198 : * or hash opclass membership, so as not to be fooled by custom
1199 : * selectivity functions, and to be more consistent with decisions
1200 : * elsewhere in the planner.
1201 : */
1202 204 : if (get_oprrest(expr->opno) != F_EQSEL)
1203 36 : return false;
1204 :
1205 : /* OK to proceed with checking "var" */
1206 : }
1207 432 : else if (IsA(clause, ScalarArrayOpExpr))
1208 : {
1209 : /* If it's a scalar array operator, check for Var IN Const. */
1210 390 : ScalarArrayOpExpr *expr = (ScalarArrayOpExpr *) clause;
1211 :
1212 : /*
1213 : * Reject ALL() variant, we only care about ANY/IN.
1214 : *
1215 : * FIXME Maybe we should check if all the values are the same, and
1216 : * allow ALL in that case? Doesn't seem very practical, though.
1217 : */
1218 390 : if (!expr->useOr)
1219 36 : return false;
1220 :
1221 : /* Only expressions with two arguments are candidates. */
1222 354 : if (list_length(expr->args) != 2)
1223 0 : return false;
1224 :
1225 : /*
1226 : * We know it's always (Var IN Const), so we assume the var is the
1227 : * first argument, and pseudoconstant is the second one.
1228 : */
1229 354 : if (!is_pseudo_constant_clause(lsecond(expr->args)))
1230 0 : return false;
1231 :
1232 354 : clause_expr = linitial(expr->args);
1233 :
1234 : /*
1235 : * If it's not an "=" operator, just ignore the clause, as it's not
1236 : * compatible with functional dependencies. The operator is identified
1237 : * simply by looking at which function it uses to estimate
1238 : * selectivity. That's a bit strange, but it's what other similar
1239 : * places do.
1240 : */
1241 354 : if (get_oprrest(expr->opno) != F_EQSEL)
1242 180 : return false;
1243 :
1244 : /* OK to proceed with checking "var" */
1245 : }
1246 42 : else if (is_orclause(clause))
1247 : {
1248 42 : BoolExpr *bool_expr = (BoolExpr *) clause;
1249 :
1250 : /* start with no expression (we'll use the first match) */
1251 42 : *expr = NULL;
1252 :
1253 120 : foreach(lc, bool_expr->args)
1254 : {
1255 90 : Node *or_expr = NULL;
1256 :
1257 : /*
1258 : * Had we found incompatible expression in the arguments, treat
1259 : * the whole expression as incompatible.
1260 : */
1261 90 : if (!dependency_is_compatible_expression((Node *) lfirst(lc), relid,
1262 : statlist, &or_expr))
1263 12 : return false;
1264 :
1265 84 : if (*expr == NULL)
1266 36 : *expr = or_expr;
1267 :
1268 : /* ensure all the expressions are the same */
1269 84 : if (!equal(or_expr, *expr))
1270 6 : return false;
1271 : }
1272 :
1273 : /* the expression is already checked by the recursive call */
1274 30 : return true;
1275 : }
1276 0 : else if (is_notclause(clause))
1277 : {
1278 : /*
1279 : * "NOT x" can be interpreted as "x = false", so get the argument and
1280 : * proceed with seeing if it's a suitable Var.
1281 : */
1282 0 : clause_expr = (Node *) get_notclausearg(clause);
1283 : }
1284 : else
1285 : {
1286 : /*
1287 : * A boolean expression "x" can be interpreted as "x = true", so
1288 : * proceed with seeing if it's a suitable Var.
1289 : */
1290 0 : clause_expr = clause;
1291 : }
1292 :
1293 : /*
1294 : * We may ignore any RelabelType node above the operand. (There won't be
1295 : * more than one, since eval_const_expressions has been applied already.)
1296 : */
1297 342 : if (IsA(clause_expr, RelabelType))
1298 0 : clause_expr = (Node *) ((RelabelType *) clause_expr)->arg;
1299 :
1300 : /*
1301 : * Search for a matching statistics expression.
1302 : */
1303 348 : foreach(lc, statlist)
1304 : {
1305 342 : StatisticExtInfo *info = (StatisticExtInfo *) lfirst(lc);
1306 :
1307 : /* ignore stats without dependencies */
1308 342 : if (info->kind != STATS_EXT_DEPENDENCIES)
1309 0 : continue;
1310 :
1311 558 : foreach(lc2, info->exprs)
1312 : {
1313 552 : Node *stat_expr = (Node *) lfirst(lc2);
1314 :
1315 552 : if (equal(clause_expr, stat_expr))
1316 : {
1317 336 : *expr = stat_expr;
1318 336 : return true;
1319 : }
1320 : }
1321 : }
1322 :
1323 6 : return false;
1324 : }
1325 :
1326 : /*
1327 : * dependencies_clauselist_selectivity
1328 : * Return the estimated selectivity of (a subset of) the given clauses
1329 : * using functional dependency statistics, or 1.0 if no useful functional
1330 : * dependency statistic exists.
1331 : *
1332 : * 'estimatedclauses' is an input/output argument that gets a bit set
1333 : * corresponding to the (zero-based) list index of each clause that is included
1334 : * in the estimated selectivity.
1335 : *
1336 : * Given equality clauses on attributes (a,b) we find the strongest dependency
1337 : * between them, i.e. either (a=>b) or (b=>a). Assuming (a=>b) is the selected
1338 : * dependency, we then combine the per-clause selectivities using the formula
1339 : *
1340 : * P(a,b) = f * P(a) + (1-f) * P(a) * P(b)
1341 : *
1342 : * where 'f' is the degree of the dependency. (Actually we use a slightly
1343 : * modified version of this formula -- see clauselist_apply_dependencies()).
1344 : *
1345 : * With clauses on more than two attributes, the dependencies are applied
1346 : * recursively, starting with the widest/strongest dependencies. For example
1347 : * P(a,b,c) is first split like this:
1348 : *
1349 : * P(a,b,c) = f * P(a,b) + (1-f) * P(a,b) * P(c)
1350 : *
1351 : * assuming (a,b=>c) is the strongest dependency.
1352 : */
1353 : Selectivity
1354 2436 : dependencies_clauselist_selectivity(PlannerInfo *root,
1355 : List *clauses,
1356 : int varRelid,
1357 : JoinType jointype,
1358 : SpecialJoinInfo *sjinfo,
1359 : RelOptInfo *rel,
1360 : Bitmapset **estimatedclauses)
1361 : {
1362 2436 : Selectivity s1 = 1.0;
1363 : ListCell *l;
1364 2436 : Bitmapset *clauses_attnums = NULL;
1365 : AttrNumber *list_attnums;
1366 : int listidx;
1367 : MVDependencies **func_dependencies;
1368 : int nfunc_dependencies;
1369 : int total_ndeps;
1370 : MVDependency **dependencies;
1371 : int ndependencies;
1372 : int i;
1373 : AttrNumber attnum_offset;
1374 2436 : RangeTblEntry *rte = planner_rt_fetch(rel->relid, root);
1375 :
1376 : /* unique expressions */
1377 : Node **unique_exprs;
1378 : int unique_exprs_cnt;
1379 :
1380 : /* check if there's any stats that might be useful for us. */
1381 2436 : if (!has_stats_of_kind(rel->statlist, STATS_EXT_DEPENDENCIES))
1382 648 : return 1.0;
1383 :
1384 1788 : list_attnums = palloc_array(AttrNumber, list_length(clauses));
1385 :
1386 : /*
1387 : * We allocate space as if every clause was a unique expression, although
1388 : * that's probably overkill. Some will be simple column references that
1389 : * we'll translate to attnums, and there might be duplicates. But it's
1390 : * easier and cheaper to just do one allocation than repalloc later.
1391 : */
1392 1788 : unique_exprs = palloc_array(Node *, list_length(clauses));
1393 1788 : unique_exprs_cnt = 0;
1394 :
1395 : /*
1396 : * Pre-process the clauses list to extract the attnums seen in each item.
1397 : * We need to determine if there's any clauses which will be useful for
1398 : * dependency selectivity estimations. Along the way we'll record all of
1399 : * the attnums for each clause in a list which we'll reference later so we
1400 : * don't need to repeat the same work again. We'll also keep track of all
1401 : * attnums seen.
1402 : *
1403 : * We also skip clauses that we already estimated using different types of
1404 : * statistics (we treat them as incompatible).
1405 : *
1406 : * To handle expressions, we assign them negative attnums, as if it was a
1407 : * system attribute (this is fine, as we only allow extended stats on user
1408 : * attributes). And then we offset everything by the number of
1409 : * expressions, so that we can store the values in a bitmapset.
1410 : */
1411 1788 : listidx = 0;
1412 5670 : foreach(l, clauses)
1413 : {
1414 3882 : Node *clause = (Node *) lfirst(l);
1415 : AttrNumber attnum;
1416 3882 : Node *expr = NULL;
1417 :
1418 : /* ignore clause by default */
1419 3882 : list_attnums[listidx] = InvalidAttrNumber;
1420 :
1421 3882 : if (!bms_is_member(listidx, *estimatedclauses))
1422 : {
1423 : /*
1424 : * If it's a simple column reference, just extract the attnum. If
1425 : * it's an expression, assign a negative attnum as if it was a
1426 : * system attribute.
1427 : */
1428 3840 : if (dependency_is_compatible_clause(clause, rel->relid, &attnum))
1429 : {
1430 3288 : list_attnums[listidx] = attnum;
1431 : }
1432 552 : else if (dependency_is_compatible_expression(clause, rel->relid,
1433 : rel->statlist,
1434 : &expr))
1435 : {
1436 : /* special attnum assigned to this expression */
1437 282 : attnum = InvalidAttrNumber;
1438 :
1439 : Assert(expr != NULL);
1440 :
1441 : /* If the expression is duplicate, use the same attnum. */
1442 474 : for (i = 0; i < unique_exprs_cnt; i++)
1443 : {
1444 192 : if (equal(unique_exprs[i], expr))
1445 : {
1446 : /* negative attribute number to expression */
1447 0 : attnum = -(i + 1);
1448 0 : break;
1449 : }
1450 : }
1451 :
1452 : /* not found in the list, so add it */
1453 282 : if (attnum == InvalidAttrNumber)
1454 : {
1455 282 : unique_exprs[unique_exprs_cnt++] = expr;
1456 :
1457 : /* after incrementing the value, to get -1, -2, ... */
1458 282 : attnum = (-unique_exprs_cnt);
1459 : }
1460 :
1461 : /* remember which attnum was assigned to this clause */
1462 282 : list_attnums[listidx] = attnum;
1463 : }
1464 : }
1465 :
1466 3882 : listidx++;
1467 : }
1468 :
1469 : Assert(listidx == list_length(clauses));
1470 :
1471 : /*
1472 : * How much we need to offset the attnums? If there are no expressions,
1473 : * then no offset is needed. Otherwise we need to offset enough for the
1474 : * lowest value (-unique_exprs_cnt) to become 1.
1475 : */
1476 1788 : if (unique_exprs_cnt > 0)
1477 132 : attnum_offset = (unique_exprs_cnt + 1);
1478 : else
1479 1656 : attnum_offset = 0;
1480 :
1481 : /*
1482 : * Now that we know how many expressions there are, we can offset the
1483 : * values just enough to build the bitmapset.
1484 : */
1485 5670 : for (i = 0; i < list_length(clauses); i++)
1486 : {
1487 : AttrNumber attnum;
1488 :
1489 : /* ignore incompatible or already estimated clauses */
1490 3882 : if (list_attnums[i] == InvalidAttrNumber)
1491 312 : continue;
1492 :
1493 : /* make sure the attnum is in the expected range */
1494 : Assert(list_attnums[i] >= (-unique_exprs_cnt));
1495 : Assert(list_attnums[i] <= MaxHeapAttributeNumber);
1496 :
1497 : /* make sure the attnum is positive (valid AttrNumber) */
1498 3570 : attnum = list_attnums[i] + attnum_offset;
1499 :
1500 : /*
1501 : * Either it's a regular attribute, or it's an expression, in which
1502 : * case we must not have seen it before (expressions are unique).
1503 : *
1504 : * XXX Check whether it's a regular attribute has to be done using the
1505 : * original attnum, while the second check has to use the value with
1506 : * an offset.
1507 : */
1508 : Assert(AttrNumberIsForUserDefinedAttr(list_attnums[i]) ||
1509 : !bms_is_member(attnum, clauses_attnums));
1510 :
1511 : /*
1512 : * Remember the offset attnum, both for attributes and expressions.
1513 : * We'll pass list_attnums to clauselist_apply_dependencies, which
1514 : * uses it to identify clauses in a bitmap. We could also pass the
1515 : * offset, but this is more convenient.
1516 : */
1517 3570 : list_attnums[i] = attnum;
1518 :
1519 3570 : clauses_attnums = bms_add_member(clauses_attnums, attnum);
1520 : }
1521 :
1522 : /*
1523 : * If there's not at least two distinct attnums and expressions, then
1524 : * reject the whole list of clauses. We must return 1.0 so the calling
1525 : * function's selectivity is unaffected.
1526 : */
1527 1788 : if (bms_membership(clauses_attnums) != BMS_MULTIPLE)
1528 : {
1529 192 : bms_free(clauses_attnums);
1530 192 : pfree(list_attnums);
1531 192 : return 1.0;
1532 : }
1533 :
1534 : /*
1535 : * Load all functional dependencies matching at least two parameters. We
1536 : * can simply consider all dependencies at once, without having to search
1537 : * for the best statistics object.
1538 : *
1539 : * To not waste cycles and memory, we deserialize dependencies only for
1540 : * statistics that match at least two attributes. The array is allocated
1541 : * with the assumption that all objects match - we could grow the array to
1542 : * make it just the right size, but it's likely wasteful anyway thanks to
1543 : * moving the freed chunks to freelists etc.
1544 : */
1545 1596 : func_dependencies = palloc_array(MVDependencies *, list_length(rel->statlist));
1546 1596 : nfunc_dependencies = 0;
1547 1596 : total_ndeps = 0;
1548 :
1549 3330 : foreach(l, rel->statlist)
1550 : {
1551 1734 : StatisticExtInfo *stat = (StatisticExtInfo *) lfirst(l);
1552 : int nmatched;
1553 : int nexprs;
1554 : int k;
1555 : MVDependencies *deps;
1556 :
1557 : /* skip statistics that are not of the correct type */
1558 1734 : if (stat->kind != STATS_EXT_DEPENDENCIES)
1559 108 : continue;
1560 :
1561 : /* skip statistics with mismatching stxdinherit value */
1562 1626 : if (stat->inherit != rte->inh)
1563 0 : continue;
1564 :
1565 : /*
1566 : * Count matching attributes - we have to undo the attnum offsets. The
1567 : * input attribute numbers are not offset (expressions are not
1568 : * included in stat->keys, so it's not necessary). But we need to
1569 : * offset it before checking against clauses_attnums.
1570 : */
1571 1626 : nmatched = 0;
1572 1626 : k = -1;
1573 6120 : while ((k = bms_next_member(stat->keys, k)) >= 0)
1574 : {
1575 4494 : AttrNumber attnum = (AttrNumber) k;
1576 :
1577 : /* skip expressions */
1578 4494 : if (!AttrNumberIsForUserDefinedAttr(attnum))
1579 0 : continue;
1580 :
1581 : /* apply the same offset as above */
1582 4494 : attnum += attnum_offset;
1583 :
1584 4494 : if (bms_is_member(attnum, clauses_attnums))
1585 3240 : nmatched++;
1586 : }
1587 :
1588 : /* count matching expressions */
1589 1626 : nexprs = 0;
1590 1884 : for (i = 0; i < unique_exprs_cnt; i++)
1591 : {
1592 : ListCell *lc;
1593 :
1594 1032 : foreach(lc, stat->exprs)
1595 : {
1596 774 : Node *stat_expr = (Node *) lfirst(lc);
1597 :
1598 : /* try to match it */
1599 774 : if (equal(stat_expr, unique_exprs[i]))
1600 258 : nexprs++;
1601 : }
1602 : }
1603 :
1604 : /*
1605 : * Skip objects matching fewer than two attributes/expressions from
1606 : * clauses.
1607 : */
1608 1626 : if (nmatched + nexprs < 2)
1609 18 : continue;
1610 :
1611 1608 : deps = statext_dependencies_load(stat->statOid, rte->inh);
1612 :
1613 : /*
1614 : * The expressions may be represented by different attnums in the
1615 : * stats, we need to remap them to be consistent with the clauses.
1616 : * That will make the later steps (e.g. picking the strongest item and
1617 : * so on) much simpler and cheaper, because it won't need to care
1618 : * about the offset at all.
1619 : *
1620 : * When we're at it, we can ignore dependencies that are not fully
1621 : * matched by clauses (i.e. referencing attributes or expressions that
1622 : * are not in the clauses).
1623 : *
1624 : * We have to do this for all statistics, as long as there are any
1625 : * expressions - we need to shift the attnums in all dependencies.
1626 : *
1627 : * XXX Maybe we should do this always, because it also eliminates some
1628 : * of the dependencies early. It might be cheaper than having to walk
1629 : * the longer list in find_strongest_dependency later, especially as
1630 : * we need to do that repeatedly?
1631 : *
1632 : * XXX We have to do this even when there are no expressions in
1633 : * clauses, otherwise find_strongest_dependency may fail for stats
1634 : * with expressions (due to lookup of negative value in bitmap). So we
1635 : * need to at least filter out those dependencies. Maybe we could do
1636 : * it in a cheaper way (if there are no expr clauses, we can just
1637 : * discard all negative attnums without any lookups).
1638 : */
1639 1608 : if (unique_exprs_cnt > 0 || stat->exprs != NIL)
1640 : {
1641 108 : int ndeps = 0;
1642 :
1643 648 : for (i = 0; i < deps->ndeps; i++)
1644 : {
1645 540 : bool skip = false;
1646 540 : MVDependency *dep = deps->deps[i];
1647 : int j;
1648 :
1649 1506 : for (j = 0; j < dep->nattributes; j++)
1650 : {
1651 : int idx;
1652 : Node *expr;
1653 1230 : AttrNumber unique_attnum = InvalidAttrNumber;
1654 : AttrNumber attnum;
1655 :
1656 : /* undo the per-statistics offset */
1657 1230 : attnum = dep->attributes[j];
1658 :
1659 : /*
1660 : * For regular attributes we can simply check if it
1661 : * matches any clause. If there's no matching clause, we
1662 : * can just ignore it. We need to offset the attnum
1663 : * though.
1664 : */
1665 1230 : if (AttrNumberIsForUserDefinedAttr(attnum))
1666 : {
1667 0 : dep->attributes[j] = attnum + attnum_offset;
1668 :
1669 0 : if (!bms_is_member(dep->attributes[j], clauses_attnums))
1670 : {
1671 0 : skip = true;
1672 0 : break;
1673 : }
1674 :
1675 0 : continue;
1676 : }
1677 :
1678 : /*
1679 : * the attnum should be a valid system attnum (-1, -2,
1680 : * ...)
1681 : */
1682 : Assert(AttributeNumberIsValid(attnum));
1683 :
1684 : /*
1685 : * For expressions, we need to do two translations. First
1686 : * we have to translate the negative attnum to index in
1687 : * the list of expressions (in the statistics object).
1688 : * Then we need to see if there's a matching clause. The
1689 : * index of the unique expression determines the attnum
1690 : * (and we offset it).
1691 : */
1692 1230 : idx = -(1 + attnum);
1693 :
1694 : /* Is the expression index is valid? */
1695 : Assert((idx >= 0) && (idx < list_length(stat->exprs)));
1696 :
1697 1230 : expr = (Node *) list_nth(stat->exprs, idx);
1698 :
1699 : /* try to find the expression in the unique list */
1700 2460 : for (int m = 0; m < unique_exprs_cnt; m++)
1701 : {
1702 : /*
1703 : * found a matching unique expression, use the attnum
1704 : * (derived from index of the unique expression)
1705 : */
1706 2196 : if (equal(unique_exprs[m], expr))
1707 : {
1708 966 : unique_attnum = -(m + 1) + attnum_offset;
1709 966 : break;
1710 : }
1711 : }
1712 :
1713 : /*
1714 : * Found no matching expression, so we can simply skip
1715 : * this dependency, because there's no chance it will be
1716 : * fully covered.
1717 : */
1718 1230 : if (unique_attnum == InvalidAttrNumber)
1719 : {
1720 264 : skip = true;
1721 264 : break;
1722 : }
1723 :
1724 : /* otherwise remap it to the new attnum */
1725 966 : dep->attributes[j] = unique_attnum;
1726 : }
1727 :
1728 : /* if found a matching dependency, keep it */
1729 540 : if (!skip)
1730 : {
1731 : /* maybe we've skipped something earlier, so move it */
1732 276 : if (ndeps != i)
1733 0 : deps->deps[ndeps] = deps->deps[i];
1734 :
1735 276 : ndeps++;
1736 : }
1737 : }
1738 :
1739 108 : deps->ndeps = ndeps;
1740 : }
1741 :
1742 : /*
1743 : * It's possible we've removed all dependencies, in which case we
1744 : * don't bother adding it to the list.
1745 : */
1746 1608 : if (deps->ndeps > 0)
1747 : {
1748 1608 : func_dependencies[nfunc_dependencies] = deps;
1749 1608 : total_ndeps += deps->ndeps;
1750 1608 : nfunc_dependencies++;
1751 : }
1752 : }
1753 :
1754 : /* if no matching stats could be found then we've nothing to do */
1755 1596 : if (nfunc_dependencies == 0)
1756 : {
1757 0 : pfree(func_dependencies);
1758 0 : bms_free(clauses_attnums);
1759 0 : pfree(list_attnums);
1760 0 : pfree(unique_exprs);
1761 0 : return 1.0;
1762 : }
1763 :
1764 : /*
1765 : * Work out which dependencies we can apply, starting with the
1766 : * widest/strongest ones, and proceeding to smaller/weaker ones.
1767 : */
1768 1596 : dependencies = palloc_array(MVDependency *, total_ndeps);
1769 1596 : ndependencies = 0;
1770 :
1771 : while (true)
1772 1890 : {
1773 : MVDependency *dependency;
1774 : AttrNumber attnum;
1775 :
1776 : /* the widest/strongest dependency, fully matched by clauses */
1777 3486 : dependency = find_strongest_dependency(func_dependencies,
1778 : nfunc_dependencies,
1779 : clauses_attnums);
1780 3486 : if (!dependency)
1781 1596 : break;
1782 :
1783 1890 : dependencies[ndependencies++] = dependency;
1784 :
1785 : /* Ignore dependencies using this implied attribute in later loops */
1786 1890 : attnum = dependency->attributes[dependency->nattributes - 1];
1787 1890 : clauses_attnums = bms_del_member(clauses_attnums, attnum);
1788 : }
1789 :
1790 : /*
1791 : * If we found applicable dependencies, use them to estimate all
1792 : * compatible clauses on attributes that they refer to.
1793 : */
1794 1596 : if (ndependencies != 0)
1795 1596 : s1 = clauselist_apply_dependencies(root, clauses, varRelid, jointype,
1796 : sjinfo, dependencies, ndependencies,
1797 : list_attnums, estimatedclauses);
1798 :
1799 : /* free deserialized functional dependencies (and then the array) */
1800 3204 : for (i = 0; i < nfunc_dependencies; i++)
1801 1608 : pfree(func_dependencies[i]);
1802 :
1803 1596 : pfree(dependencies);
1804 1596 : pfree(func_dependencies);
1805 1596 : bms_free(clauses_attnums);
1806 1596 : pfree(list_attnums);
1807 1596 : pfree(unique_exprs);
1808 :
1809 1596 : return s1;
1810 : }
|