Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * mvdistinct.c
4 : * POSTGRES multivariate ndistinct coefficients
5 : *
6 : * Estimating number of groups in a combination of columns (e.g. for GROUP BY)
7 : * is tricky, and the estimation error is often significant.
8 :
9 : * The multivariate ndistinct coefficients address this by storing ndistinct
10 : * estimates for combinations of the user-specified columns. So for example
11 : * given a statistics object on three columns (a,b,c), this module estimates
12 : * and stores n-distinct for (a,b), (a,c), (b,c) and (a,b,c). The per-column
13 : * estimates are already available in pg_statistic.
14 : *
15 : *
16 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
17 : * Portions Copyright (c) 1994, Regents of the University of California
18 : *
19 : * IDENTIFICATION
20 : * src/backend/statistics/mvdistinct.c
21 : *
22 : *-------------------------------------------------------------------------
23 : */
24 : #include "postgres.h"
25 :
26 : #include <math.h>
27 :
28 : #include "catalog/pg_statistic_ext.h"
29 : #include "catalog/pg_statistic_ext_data.h"
30 : #include "statistics/extended_stats_internal.h"
31 : #include "utils/syscache.h"
32 : #include "utils/typcache.h"
33 : #include "varatt.h"
34 :
35 : static double ndistinct_for_combination(double totalrows, StatsBuildData *data,
36 : int k, int *combination);
37 : static double estimate_ndistinct(double totalrows, int numrows, int d, int f1);
38 : static int n_choose_k(int n, int k);
39 : static int num_combinations(int n);
40 :
41 : /* size of the struct header fields (magic, type, nitems) */
42 : #define SizeOfHeader (3 * sizeof(uint32))
43 :
44 : /* size of a serialized ndistinct item (coefficient, natts, atts) */
45 : #define SizeOfItem(natts) \
46 : (sizeof(double) + sizeof(int) + (natts) * sizeof(AttrNumber))
47 :
48 : /* minimal size of a ndistinct item (with two attributes) */
49 : #define MinSizeOfItem SizeOfItem(2)
50 :
51 : /* minimal size of mvndistinct, when all items are minimal */
52 : #define MinSizeOfItems(nitems) \
53 : (SizeOfHeader + (nitems) * MinSizeOfItem)
54 :
55 : /* Combination generator API */
56 :
57 : /* internal state for generator of k-combinations of n elements */
58 : typedef struct CombinationGenerator
59 : {
60 : int k; /* size of the combination */
61 : int n; /* total number of elements */
62 : int current; /* index of the next combination to return */
63 : int ncombinations; /* number of combinations (size of array) */
64 : int *combinations; /* array of pre-built combinations */
65 : } CombinationGenerator;
66 :
67 : static CombinationGenerator *generator_init(int n, int k);
68 : static void generator_free(CombinationGenerator *state);
69 : static int *generator_next(CombinationGenerator *state);
70 : static void generate_combinations(CombinationGenerator *state);
71 :
72 :
73 : /*
74 : * statext_ndistinct_build
75 : * Compute ndistinct coefficient for the combination of attributes.
76 : *
77 : * This computes the ndistinct estimate using the same estimator used
78 : * in analyze.c and then computes the coefficient.
79 : *
80 : * To handle expressions easily, we treat them as system attributes with
81 : * negative attnums, and offset everything by number of expressions to
82 : * allow using Bitmapsets.
83 : */
84 : MVNDistinct *
85 204 : statext_ndistinct_build(double totalrows, StatsBuildData *data)
86 : {
87 : MVNDistinct *result;
88 : int k;
89 : int itemcnt;
90 204 : int numattrs = data->nattnums;
91 204 : int numcombs = num_combinations(numattrs);
92 :
93 204 : result = palloc(offsetof(MVNDistinct, items) +
94 204 : numcombs * sizeof(MVNDistinctItem));
95 204 : result->magic = STATS_NDISTINCT_MAGIC;
96 204 : result->type = STATS_NDISTINCT_TYPE_BASIC;
97 204 : result->nitems = numcombs;
98 :
99 204 : itemcnt = 0;
100 504 : for (k = 2; k <= numattrs; k++)
101 : {
102 : int *combination;
103 : CombinationGenerator *generator;
104 :
105 : /* generate combinations of K out of N elements */
106 300 : generator = generator_init(numattrs, k);
107 :
108 888 : while ((combination = generator_next(generator)))
109 : {
110 588 : MVNDistinctItem *item = &result->items[itemcnt];
111 : int j;
112 :
113 588 : item->attributes = palloc(sizeof(AttrNumber) * k);
114 588 : item->nattributes = k;
115 :
116 : /* translate the indexes to attnums */
117 1956 : for (j = 0; j < k; j++)
118 : {
119 1368 : item->attributes[j] = data->attnums[combination[j]];
120 :
121 : Assert(AttributeNumberIsValid(item->attributes[j]));
122 : }
123 :
124 588 : item->ndistinct =
125 588 : ndistinct_for_combination(totalrows, data, k, combination);
126 :
127 588 : itemcnt++;
128 : Assert(itemcnt <= result->nitems);
129 : }
130 :
131 300 : generator_free(generator);
132 : }
133 :
134 : /* must consume exactly the whole output array */
135 : Assert(itemcnt == result->nitems);
136 :
137 204 : return result;
138 : }
139 :
140 : /*
141 : * statext_ndistinct_load
142 : * Load the ndistinct value for the indicated pg_statistic_ext tuple
143 : */
144 : MVNDistinct *
145 426 : statext_ndistinct_load(Oid mvoid, bool inh)
146 : {
147 : MVNDistinct *result;
148 : bool isnull;
149 : Datum ndist;
150 : HeapTuple htup;
151 :
152 426 : htup = SearchSysCache2(STATEXTDATASTXOID,
153 : ObjectIdGetDatum(mvoid), BoolGetDatum(inh));
154 426 : if (!HeapTupleIsValid(htup))
155 0 : elog(ERROR, "cache lookup failed for statistics object %u", mvoid);
156 :
157 426 : ndist = SysCacheGetAttr(STATEXTDATASTXOID, htup,
158 : Anum_pg_statistic_ext_data_stxdndistinct, &isnull);
159 426 : if (isnull)
160 0 : elog(ERROR,
161 : "requested statistics kind \"%c\" is not yet built for statistics object %u",
162 : STATS_EXT_NDISTINCT, mvoid);
163 :
164 426 : result = statext_ndistinct_deserialize(DatumGetByteaPP(ndist));
165 :
166 426 : ReleaseSysCache(htup);
167 :
168 426 : return result;
169 : }
170 :
171 : /*
172 : * statext_ndistinct_serialize
173 : * serialize ndistinct to the on-disk bytea format
174 : */
175 : bytea *
176 204 : statext_ndistinct_serialize(MVNDistinct *ndistinct)
177 : {
178 : int i;
179 : bytea *output;
180 : char *tmp;
181 : Size len;
182 :
183 : Assert(ndistinct->magic == STATS_NDISTINCT_MAGIC);
184 : Assert(ndistinct->type == STATS_NDISTINCT_TYPE_BASIC);
185 :
186 : /*
187 : * Base size is size of scalar fields in the struct, plus one base struct
188 : * for each item, including number of items for each.
189 : */
190 204 : len = VARHDRSZ + SizeOfHeader;
191 :
192 : /* and also include space for the actual attribute numbers */
193 792 : for (i = 0; i < ndistinct->nitems; i++)
194 : {
195 : int nmembers;
196 :
197 588 : nmembers = ndistinct->items[i].nattributes;
198 : Assert(nmembers >= 2);
199 :
200 588 : len += SizeOfItem(nmembers);
201 : }
202 :
203 204 : output = (bytea *) palloc(len);
204 204 : SET_VARSIZE(output, len);
205 :
206 204 : tmp = VARDATA(output);
207 :
208 : /* Store the base struct values (magic, type, nitems) */
209 204 : memcpy(tmp, &ndistinct->magic, sizeof(uint32));
210 204 : tmp += sizeof(uint32);
211 204 : memcpy(tmp, &ndistinct->type, sizeof(uint32));
212 204 : tmp += sizeof(uint32);
213 204 : memcpy(tmp, &ndistinct->nitems, sizeof(uint32));
214 204 : tmp += sizeof(uint32);
215 :
216 : /*
217 : * store number of attributes and attribute numbers for each entry
218 : */
219 792 : for (i = 0; i < ndistinct->nitems; i++)
220 : {
221 588 : MVNDistinctItem item = ndistinct->items[i];
222 588 : int nmembers = item.nattributes;
223 :
224 588 : memcpy(tmp, &item.ndistinct, sizeof(double));
225 588 : tmp += sizeof(double);
226 588 : memcpy(tmp, &nmembers, sizeof(int));
227 588 : tmp += sizeof(int);
228 :
229 588 : memcpy(tmp, item.attributes, sizeof(AttrNumber) * nmembers);
230 588 : tmp += nmembers * sizeof(AttrNumber);
231 :
232 : /* protect against overflows */
233 : Assert(tmp <= ((char *) output + len));
234 : }
235 :
236 : /* check we used exactly the expected space */
237 : Assert(tmp == ((char *) output + len));
238 :
239 204 : return output;
240 : }
241 :
242 : /*
243 : * statext_ndistinct_deserialize
244 : * Read an on-disk bytea format MVNDistinct to in-memory format
245 : */
246 : MVNDistinct *
247 450 : statext_ndistinct_deserialize(bytea *data)
248 : {
249 : int i;
250 : Size minimum_size;
251 : MVNDistinct ndist;
252 : MVNDistinct *ndistinct;
253 : char *tmp;
254 :
255 450 : if (data == NULL)
256 0 : return NULL;
257 :
258 : /* we expect at least the basic fields of MVNDistinct struct */
259 450 : if (VARSIZE_ANY_EXHDR(data) < SizeOfHeader)
260 0 : elog(ERROR, "invalid MVNDistinct size %zu (expected at least %zu)",
261 : VARSIZE_ANY_EXHDR(data), SizeOfHeader);
262 :
263 : /* initialize pointer to the data part (skip the varlena header) */
264 450 : tmp = VARDATA_ANY(data);
265 :
266 : /* read the header fields and perform basic sanity checks */
267 450 : memcpy(&ndist.magic, tmp, sizeof(uint32));
268 450 : tmp += sizeof(uint32);
269 450 : memcpy(&ndist.type, tmp, sizeof(uint32));
270 450 : tmp += sizeof(uint32);
271 450 : memcpy(&ndist.nitems, tmp, sizeof(uint32));
272 450 : tmp += sizeof(uint32);
273 :
274 450 : if (ndist.magic != STATS_NDISTINCT_MAGIC)
275 0 : elog(ERROR, "invalid ndistinct magic %08x (expected %08x)",
276 : ndist.magic, STATS_NDISTINCT_MAGIC);
277 450 : if (ndist.type != STATS_NDISTINCT_TYPE_BASIC)
278 0 : elog(ERROR, "invalid ndistinct type %d (expected %d)",
279 : ndist.type, STATS_NDISTINCT_TYPE_BASIC);
280 450 : if (ndist.nitems == 0)
281 0 : elog(ERROR, "invalid zero-length item array in MVNDistinct");
282 :
283 : /* what minimum bytea size do we expect for those parameters */
284 450 : minimum_size = MinSizeOfItems(ndist.nitems);
285 450 : if (VARSIZE_ANY_EXHDR(data) < minimum_size)
286 0 : elog(ERROR, "invalid MVNDistinct size %zu (expected at least %zu)",
287 : VARSIZE_ANY_EXHDR(data), minimum_size);
288 :
289 : /*
290 : * Allocate space for the ndistinct items (no space for each item's
291 : * attnos: those live in bitmapsets allocated separately)
292 : */
293 450 : ndistinct = palloc0(MAXALIGN(offsetof(MVNDistinct, items)) +
294 450 : (ndist.nitems * sizeof(MVNDistinctItem)));
295 450 : ndistinct->magic = ndist.magic;
296 450 : ndistinct->type = ndist.type;
297 450 : ndistinct->nitems = ndist.nitems;
298 :
299 2412 : for (i = 0; i < ndistinct->nitems; i++)
300 : {
301 1962 : MVNDistinctItem *item = &ndistinct->items[i];
302 :
303 : /* ndistinct value */
304 1962 : memcpy(&item->ndistinct, tmp, sizeof(double));
305 1962 : tmp += sizeof(double);
306 :
307 : /* number of attributes */
308 1962 : memcpy(&item->nattributes, tmp, sizeof(int));
309 1962 : tmp += sizeof(int);
310 : Assert((item->nattributes >= 2) && (item->nattributes <= STATS_MAX_DIMENSIONS));
311 :
312 : item->attributes
313 1962 : = (AttrNumber *) palloc(item->nattributes * sizeof(AttrNumber));
314 :
315 1962 : memcpy(item->attributes, tmp, sizeof(AttrNumber) * item->nattributes);
316 1962 : tmp += sizeof(AttrNumber) * item->nattributes;
317 :
318 : /* still within the bytea */
319 : Assert(tmp <= ((char *) data + VARSIZE_ANY(data)));
320 : }
321 :
322 : /* we should have consumed the whole bytea exactly */
323 : Assert(tmp == ((char *) data + VARSIZE_ANY(data)));
324 :
325 450 : return ndistinct;
326 : }
327 :
328 : /*
329 : * ndistinct_for_combination
330 : * Estimates number of distinct values in a combination of columns.
331 : *
332 : * This uses the same ndistinct estimator as compute_scalar_stats() in
333 : * ANALYZE, i.e.,
334 : * n*d / (n - f1 + f1*n/N)
335 : *
336 : * except that instead of values in a single column we are dealing with
337 : * combination of multiple columns.
338 : */
339 : static double
340 588 : ndistinct_for_combination(double totalrows, StatsBuildData *data,
341 : int k, int *combination)
342 : {
343 : int i,
344 : j;
345 : int f1,
346 : cnt,
347 : d;
348 : bool *isnull;
349 : Datum *values;
350 : SortItem *items;
351 : MultiSortSupport mss;
352 588 : int numrows = data->numrows;
353 :
354 588 : mss = multi_sort_init(k);
355 :
356 : /*
357 : * In order to determine the number of distinct elements, create separate
358 : * values[]/isnull[] arrays with all the data we have, then sort them
359 : * using the specified column combination as dimensions. We could try to
360 : * sort in place, but it'd probably be more complex and bug-prone.
361 : */
362 588 : items = (SortItem *) palloc(numrows * sizeof(SortItem));
363 588 : values = (Datum *) palloc0(sizeof(Datum) * numrows * k);
364 588 : isnull = (bool *) palloc0(sizeof(bool) * numrows * k);
365 :
366 963818 : for (i = 0; i < numrows; i++)
367 : {
368 963230 : items[i].values = &values[i * k];
369 963230 : items[i].isnull = &isnull[i * k];
370 : }
371 :
372 : /*
373 : * For each dimension, set up sort-support and fill in the values from the
374 : * sample data.
375 : *
376 : * We use the column data types' default sort operators and collations;
377 : * perhaps at some point it'd be worth using column-specific collations?
378 : */
379 1956 : for (i = 0; i < k; i++)
380 : {
381 : Oid typid;
382 : TypeCacheEntry *type;
383 1368 : Oid collid = InvalidOid;
384 1368 : VacAttrStats *colstat = data->stats[combination[i]];
385 :
386 1368 : typid = colstat->attrtypid;
387 1368 : collid = colstat->attrcollid;
388 :
389 1368 : type = lookup_type_cache(typid, TYPECACHE_LT_OPR);
390 1368 : if (type->lt_opr == InvalidOid) /* shouldn't happen */
391 0 : elog(ERROR, "cache lookup failed for ordering operator for type %u",
392 : typid);
393 :
394 : /* prepare the sort function for this dimension */
395 1368 : multi_sort_add_dimension(mss, i, type->lt_opr, collid);
396 :
397 : /* accumulate all the data for this dimension into the arrays */
398 2227822 : for (j = 0; j < numrows; j++)
399 : {
400 2226454 : items[j].values[i] = data->values[combination[i]][j];
401 2226454 : items[j].isnull[i] = data->nulls[combination[i]][j];
402 : }
403 : }
404 :
405 : /* We can sort the array now ... */
406 588 : qsort_interruptible(items, numrows, sizeof(SortItem),
407 : multi_sort_compare, mss);
408 :
409 : /* ... and count the number of distinct combinations */
410 :
411 588 : f1 = 0;
412 588 : cnt = 1;
413 588 : d = 1;
414 963230 : for (i = 1; i < numrows; i++)
415 : {
416 962642 : if (multi_sort_compare(&items[i], &items[i - 1], mss) != 0)
417 : {
418 284412 : if (cnt == 1)
419 145984 : f1 += 1;
420 :
421 284412 : d++;
422 284412 : cnt = 0;
423 : }
424 :
425 962642 : cnt += 1;
426 : }
427 :
428 588 : if (cnt == 1)
429 138 : f1 += 1;
430 :
431 588 : return estimate_ndistinct(totalrows, numrows, d, f1);
432 : }
433 :
434 : /* The Duj1 estimator (already used in analyze.c). */
435 : static double
436 588 : estimate_ndistinct(double totalrows, int numrows, int d, int f1)
437 : {
438 : double numer,
439 : denom,
440 : ndistinct;
441 :
442 588 : numer = (double) numrows * (double) d;
443 :
444 588 : denom = (double) (numrows - f1) +
445 588 : (double) f1 * (double) numrows / totalrows;
446 :
447 588 : ndistinct = numer / denom;
448 :
449 : /* Clamp to sane range in case of roundoff error */
450 588 : if (ndistinct < (double) d)
451 0 : ndistinct = (double) d;
452 :
453 588 : if (ndistinct > totalrows)
454 0 : ndistinct = totalrows;
455 :
456 588 : return floor(ndistinct + 0.5);
457 : }
458 :
459 : /*
460 : * n_choose_k
461 : * computes binomial coefficients using an algorithm that is both
462 : * efficient and prevents overflows
463 : */
464 : static int
465 300 : n_choose_k(int n, int k)
466 : {
467 : int d,
468 : r;
469 :
470 : Assert((k > 0) && (n >= k));
471 :
472 : /* use symmetry of the binomial coefficients */
473 300 : k = Min(k, n - k);
474 :
475 300 : r = 1;
476 420 : for (d = 1; d <= k; ++d)
477 : {
478 120 : r *= n--;
479 120 : r /= d;
480 : }
481 :
482 300 : return r;
483 : }
484 :
485 : /*
486 : * num_combinations
487 : * number of combinations, excluding single-value combinations
488 : */
489 : static int
490 204 : num_combinations(int n)
491 : {
492 204 : return (1 << n) - (n + 1);
493 : }
494 :
495 : /*
496 : * generator_init
497 : * initialize the generator of combinations
498 : *
499 : * The generator produces combinations of K elements in the interval (0..N).
500 : * We prebuild all the combinations in this method, which is simpler than
501 : * generating them on the fly.
502 : */
503 : static CombinationGenerator *
504 300 : generator_init(int n, int k)
505 : {
506 : CombinationGenerator *state;
507 :
508 : Assert((n >= k) && (k > 0));
509 :
510 : /* allocate the generator state as a single chunk of memory */
511 300 : state = (CombinationGenerator *) palloc(sizeof(CombinationGenerator));
512 :
513 300 : state->ncombinations = n_choose_k(n, k);
514 :
515 : /* pre-allocate space for all combinations */
516 300 : state->combinations = (int *) palloc(sizeof(int) * k * state->ncombinations);
517 :
518 300 : state->current = 0;
519 300 : state->k = k;
520 300 : state->n = n;
521 :
522 : /* now actually pre-generate all the combinations of K elements */
523 300 : generate_combinations(state);
524 :
525 : /* make sure we got the expected number of combinations */
526 : Assert(state->current == state->ncombinations);
527 :
528 : /* reset the number, so we start with the first one */
529 300 : state->current = 0;
530 :
531 300 : return state;
532 : }
533 :
534 : /*
535 : * generator_next
536 : * returns the next combination from the prebuilt list
537 : *
538 : * Returns a combination of K array indexes (0 .. N), as specified to
539 : * generator_init), or NULL when there are no more combination.
540 : */
541 : static int *
542 888 : generator_next(CombinationGenerator *state)
543 : {
544 888 : if (state->current == state->ncombinations)
545 300 : return NULL;
546 :
547 588 : return &state->combinations[state->k * state->current++];
548 : }
549 :
550 : /*
551 : * generator_free
552 : * free the internal state of the generator
553 : *
554 : * Releases the generator internal state (pre-built combinations).
555 : */
556 : static void
557 300 : generator_free(CombinationGenerator *state)
558 : {
559 300 : pfree(state->combinations);
560 300 : pfree(state);
561 300 : }
562 :
563 : /*
564 : * generate_combinations_recurse
565 : * given a prefix, generate all possible combinations
566 : *
567 : * Given a prefix (first few elements of the combination), generate following
568 : * elements recursively. We generate the combinations in lexicographic order,
569 : * which eliminates permutations of the same combination.
570 : */
571 : static void
572 2256 : generate_combinations_recurse(CombinationGenerator *state,
573 : int index, int start, int *current)
574 : {
575 : /* If we haven't filled all the elements, simply recurse. */
576 2256 : if (index < state->k)
577 : {
578 : int i;
579 :
580 : /*
581 : * The values have to be in ascending order, so make sure we start
582 : * with the value passed by parameter.
583 : */
584 :
585 3624 : for (i = start; i < state->n; i++)
586 : {
587 1956 : current[index] = i;
588 1956 : generate_combinations_recurse(state, (index + 1), (i + 1), current);
589 : }
590 :
591 1668 : return;
592 : }
593 : else
594 : {
595 : /* we got a valid combination, add it to the array */
596 588 : memcpy(&state->combinations[(state->k * state->current)],
597 588 : current, state->k * sizeof(int));
598 588 : state->current++;
599 : }
600 : }
601 :
602 : /*
603 : * generate_combinations
604 : * generate all k-combinations of N elements
605 : */
606 : static void
607 300 : generate_combinations(CombinationGenerator *state)
608 : {
609 300 : int *current = (int *) palloc0(sizeof(int) * state->k);
610 :
611 300 : generate_combinations_recurse(state, 0, 0, current);
612 :
613 300 : pfree(current);
614 300 : }
|