Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * rangetypes_typanalyze.c
4 : * Functions for gathering statistics from range columns
5 : *
6 : * For a range type column, histograms of lower and upper bounds, and
7 : * the fraction of NULL and empty ranges are collected.
8 : *
9 : * Both histograms have the same length, and they are combined into a
10 : * single array of ranges. This has the same shape as the histogram that
11 : * std_typanalyze would collect, but the values are different. Each range
12 : * in the array is a valid range, even though the lower and upper bounds
13 : * come from different tuples. In theory, the standard scalar selectivity
14 : * functions could be used with the combined histogram.
15 : *
16 : * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
17 : * Portions Copyright (c) 1994, Regents of the University of California
18 : *
19 : *
20 : * IDENTIFICATION
21 : * src/backend/utils/adt/rangetypes_typanalyze.c
22 : *
23 : *-------------------------------------------------------------------------
24 : */
25 : #include "postgres.h"
26 :
27 : #include "catalog/pg_operator.h"
28 : #include "commands/vacuum.h"
29 : #include "utils/float.h"
30 : #include "utils/fmgrprotos.h"
31 : #include "utils/lsyscache.h"
32 : #include "utils/multirangetypes.h"
33 : #include "utils/rangetypes.h"
34 : #include "varatt.h"
35 :
36 : static int float8_qsort_cmp(const void *a1, const void *a2, void *arg);
37 : static int range_bound_qsort_cmp(const void *a1, const void *a2, void *arg);
38 : static void compute_range_stats(VacAttrStats *stats,
39 : AnalyzeAttrFetchFunc fetchfunc, int samplerows,
40 : double totalrows);
41 :
42 : /*
43 : * range_typanalyze -- typanalyze function for range columns
44 : */
45 : Datum
46 18 : range_typanalyze(PG_FUNCTION_ARGS)
47 : {
48 18 : VacAttrStats *stats = (VacAttrStats *) PG_GETARG_POINTER(0);
49 : TypeCacheEntry *typcache;
50 :
51 : /* Get information about range type; note column might be a domain */
52 18 : typcache = range_get_typcache(fcinfo, getBaseType(stats->attrtypid));
53 :
54 18 : if (stats->attstattarget < 0)
55 18 : stats->attstattarget = default_statistics_target;
56 :
57 18 : stats->compute_stats = compute_range_stats;
58 18 : stats->extra_data = typcache;
59 : /* same as in std_typanalyze */
60 18 : stats->minrows = 300 * stats->attstattarget;
61 :
62 18 : PG_RETURN_BOOL(true);
63 : }
64 :
65 : /*
66 : * multirange_typanalyze -- typanalyze function for multirange columns
67 : *
68 : * We do the same analysis as for ranges, but on the smallest range that
69 : * completely includes the multirange.
70 : */
71 : Datum
72 12 : multirange_typanalyze(PG_FUNCTION_ARGS)
73 : {
74 12 : VacAttrStats *stats = (VacAttrStats *) PG_GETARG_POINTER(0);
75 : TypeCacheEntry *typcache;
76 :
77 : /* Get information about multirange type; note column might be a domain */
78 12 : typcache = multirange_get_typcache(fcinfo, getBaseType(stats->attrtypid));
79 :
80 12 : if (stats->attstattarget < 0)
81 12 : stats->attstattarget = default_statistics_target;
82 :
83 12 : stats->compute_stats = compute_range_stats;
84 12 : stats->extra_data = typcache;
85 : /* same as in std_typanalyze */
86 12 : stats->minrows = 300 * stats->attstattarget;
87 :
88 12 : PG_RETURN_BOOL(true);
89 : }
90 :
91 : /*
92 : * Comparison function for sorting float8s, used for range lengths.
93 : */
94 : static int
95 77016 : float8_qsort_cmp(const void *a1, const void *a2, void *arg)
96 : {
97 77016 : const float8 *f1 = (const float8 *) a1;
98 77016 : const float8 *f2 = (const float8 *) a2;
99 :
100 77016 : if (*f1 < *f2)
101 66 : return -1;
102 76950 : else if (*f1 == *f2)
103 69660 : return 0;
104 : else
105 7290 : return 1;
106 : }
107 :
108 : /*
109 : * Comparison function for sorting RangeBounds.
110 : */
111 : static int
112 1382130 : range_bound_qsort_cmp(const void *a1, const void *a2, void *arg)
113 : {
114 1382130 : RangeBound *b1 = (RangeBound *) a1;
115 1382130 : RangeBound *b2 = (RangeBound *) a2;
116 1382130 : TypeCacheEntry *typcache = (TypeCacheEntry *) arg;
117 :
118 1382130 : return range_cmp_bounds(typcache, b1, b2);
119 : }
120 :
121 : /*
122 : * compute_range_stats() -- compute statistics for a range column
123 : */
124 : static void
125 30 : compute_range_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc,
126 : int samplerows, double totalrows)
127 : {
128 30 : TypeCacheEntry *typcache = (TypeCacheEntry *) stats->extra_data;
129 30 : TypeCacheEntry *mltrng_typcache = NULL;
130 : bool has_subdiff;
131 30 : int null_cnt = 0;
132 30 : int non_null_cnt = 0;
133 30 : int non_empty_cnt = 0;
134 30 : int empty_cnt = 0;
135 : int range_no;
136 : int slot_idx;
137 30 : int num_bins = stats->attstattarget;
138 : int num_hist;
139 : float8 *lengths;
140 : RangeBound *lowers,
141 : *uppers;
142 30 : double total_width = 0;
143 :
144 30 : if (typcache->typtype == TYPTYPE_MULTIRANGE)
145 : {
146 12 : mltrng_typcache = typcache;
147 12 : typcache = typcache->rngtype;
148 : }
149 : else
150 : Assert(typcache->typtype == TYPTYPE_RANGE);
151 30 : has_subdiff = OidIsValid(typcache->rng_subdiff_finfo.fn_oid);
152 :
153 : /* Allocate memory to hold range bounds and lengths of the sample ranges. */
154 30 : lowers = (RangeBound *) palloc(sizeof(RangeBound) * samplerows);
155 30 : uppers = (RangeBound *) palloc(sizeof(RangeBound) * samplerows);
156 30 : lengths = (float8 *) palloc(sizeof(float8) * samplerows);
157 :
158 : /* Loop over the sample ranges. */
159 59556 : for (range_no = 0; range_no < samplerows; range_no++)
160 : {
161 : Datum value;
162 : bool isnull,
163 : empty;
164 : MultirangeType *multirange;
165 : RangeType *range;
166 : RangeBound lower,
167 : upper;
168 : float8 length;
169 :
170 59526 : vacuum_delay_point();
171 :
172 59526 : value = fetchfunc(stats, range_no, &isnull);
173 59526 : if (isnull)
174 : {
175 : /* range is null, just count that */
176 0 : null_cnt++;
177 0 : continue;
178 : }
179 :
180 : /*
181 : * XXX: should we ignore wide values, like std_typanalyze does, to
182 : * avoid bloating the statistics table?
183 : */
184 59526 : total_width += VARSIZE_ANY(DatumGetPointer(value));
185 :
186 : /* Get range and deserialize it for further analysis. */
187 59526 : if (mltrng_typcache != NULL)
188 : {
189 : /* Treat multiranges like a big range without gaps. */
190 22266 : multirange = DatumGetMultirangeTypeP(value);
191 22266 : if (!MultirangeIsEmpty(multirange))
192 : {
193 : RangeBound tmp;
194 :
195 19242 : multirange_get_bounds(typcache, multirange, 0,
196 : &lower, &tmp);
197 19242 : multirange_get_bounds(typcache, multirange,
198 19242 : multirange->rangeCount - 1,
199 : &tmp, &upper);
200 19242 : empty = false;
201 : }
202 : else
203 : {
204 3024 : empty = true;
205 : }
206 : }
207 : else
208 : {
209 37260 : range = DatumGetRangeTypeP(value);
210 37260 : range_deserialize(typcache, range, &lower, &upper, &empty);
211 : }
212 :
213 59526 : if (!empty)
214 : {
215 : /* Remember bounds and length for further usage in histograms */
216 50496 : lowers[non_empty_cnt] = lower;
217 50496 : uppers[non_empty_cnt] = upper;
218 :
219 50496 : if (lower.infinite || upper.infinite)
220 : {
221 : /* Length of any kind of an infinite range is infinite */
222 2442 : length = get_float8_infinity();
223 : }
224 48054 : else if (has_subdiff)
225 : {
226 : /*
227 : * For an ordinary range, use subdiff function between upper
228 : * and lower bound values.
229 : */
230 48054 : length = DatumGetFloat8(FunctionCall2Coll(&typcache->rng_subdiff_finfo,
231 : typcache->rng_collation,
232 : upper.val, lower.val));
233 : }
234 : else
235 : {
236 : /* Use default value of 1.0 if no subdiff is available. */
237 0 : length = 1.0;
238 : }
239 50496 : lengths[non_empty_cnt] = length;
240 :
241 50496 : non_empty_cnt++;
242 : }
243 : else
244 9030 : empty_cnt++;
245 :
246 59526 : non_null_cnt++;
247 : }
248 :
249 30 : slot_idx = 0;
250 :
251 : /* We can only compute real stats if we found some non-null values. */
252 30 : if (non_null_cnt > 0)
253 : {
254 : Datum *bound_hist_values;
255 : Datum *length_hist_values;
256 : int pos,
257 : posfrac,
258 : delta,
259 : deltafrac,
260 : i;
261 : MemoryContext old_cxt;
262 : float4 *emptyfrac;
263 :
264 30 : stats->stats_valid = true;
265 : /* Do the simple null-frac and width stats */
266 30 : stats->stanullfrac = (double) null_cnt / (double) samplerows;
267 30 : stats->stawidth = total_width / (double) non_null_cnt;
268 :
269 : /* Estimate that non-null values are unique */
270 30 : stats->stadistinct = -1.0 * (1.0 - stats->stanullfrac);
271 :
272 : /* Must copy the target values into anl_context */
273 30 : old_cxt = MemoryContextSwitchTo(stats->anl_context);
274 :
275 : /*
276 : * Generate a bounds histogram slot entry if there are at least two
277 : * values.
278 : */
279 30 : if (non_empty_cnt >= 2)
280 : {
281 : /* Sort bound values */
282 30 : qsort_interruptible(lowers, non_empty_cnt, sizeof(RangeBound),
283 : range_bound_qsort_cmp, typcache);
284 30 : qsort_interruptible(uppers, non_empty_cnt, sizeof(RangeBound),
285 : range_bound_qsort_cmp, typcache);
286 :
287 30 : num_hist = non_empty_cnt;
288 30 : if (num_hist > num_bins)
289 12 : num_hist = num_bins + 1;
290 :
291 30 : bound_hist_values = (Datum *) palloc(num_hist * sizeof(Datum));
292 :
293 : /*
294 : * The object of this loop is to construct ranges from first and
295 : * last entries in lowers[] and uppers[] along with evenly-spaced
296 : * values in between. So the i'th value is a range of lowers[(i *
297 : * (nvals - 1)) / (num_hist - 1)] and uppers[(i * (nvals - 1)) /
298 : * (num_hist - 1)]. But computing that subscript directly risks
299 : * integer overflow when the stats target is more than a couple
300 : * thousand. Instead we add (nvals - 1) / (num_hist - 1) to pos
301 : * at each step, tracking the integral and fractional parts of the
302 : * sum separately.
303 : */
304 30 : delta = (non_empty_cnt - 1) / (num_hist - 1);
305 30 : deltafrac = (non_empty_cnt - 1) % (num_hist - 1);
306 30 : pos = posfrac = 0;
307 :
308 1338 : for (i = 0; i < num_hist; i++)
309 : {
310 1308 : bound_hist_values[i] = PointerGetDatum(range_serialize(typcache,
311 1308 : &lowers[pos],
312 1308 : &uppers[pos],
313 : false,
314 : NULL));
315 1308 : pos += delta;
316 1308 : posfrac += deltafrac;
317 1308 : if (posfrac >= (num_hist - 1))
318 : {
319 : /* fractional part exceeds 1, carry to integer part */
320 1188 : pos++;
321 1188 : posfrac -= (num_hist - 1);
322 : }
323 : }
324 :
325 30 : stats->stakind[slot_idx] = STATISTIC_KIND_BOUNDS_HISTOGRAM;
326 30 : stats->stavalues[slot_idx] = bound_hist_values;
327 30 : stats->numvalues[slot_idx] = num_hist;
328 :
329 : /* Store ranges even if we're analyzing a multirange column */
330 30 : stats->statypid[slot_idx] = typcache->type_id;
331 30 : stats->statyplen[slot_idx] = typcache->typlen;
332 30 : stats->statypbyval[slot_idx] = typcache->typbyval;
333 30 : stats->statypalign[slot_idx] = typcache->typalign;
334 :
335 30 : slot_idx++;
336 : }
337 :
338 : /*
339 : * Generate a length histogram slot entry if there are at least two
340 : * values.
341 : */
342 30 : if (non_empty_cnt >= 2)
343 : {
344 : /*
345 : * Ascending sort of range lengths for further filling of
346 : * histogram
347 : */
348 30 : qsort_interruptible(lengths, non_empty_cnt, sizeof(float8),
349 : float8_qsort_cmp, NULL);
350 :
351 30 : num_hist = non_empty_cnt;
352 30 : if (num_hist > num_bins)
353 12 : num_hist = num_bins + 1;
354 :
355 30 : length_hist_values = (Datum *) palloc(num_hist * sizeof(Datum));
356 :
357 : /*
358 : * The object of this loop is to copy the first and last lengths[]
359 : * entries along with evenly-spaced values in between. So the i'th
360 : * value is lengths[(i * (nvals - 1)) / (num_hist - 1)]. But
361 : * computing that subscript directly risks integer overflow when
362 : * the stats target is more than a couple thousand. Instead we
363 : * add (nvals - 1) / (num_hist - 1) to pos at each step, tracking
364 : * the integral and fractional parts of the sum separately.
365 : */
366 30 : delta = (non_empty_cnt - 1) / (num_hist - 1);
367 30 : deltafrac = (non_empty_cnt - 1) % (num_hist - 1);
368 30 : pos = posfrac = 0;
369 :
370 1338 : for (i = 0; i < num_hist; i++)
371 : {
372 1308 : length_hist_values[i] = Float8GetDatum(lengths[pos]);
373 1308 : pos += delta;
374 1308 : posfrac += deltafrac;
375 1308 : if (posfrac >= (num_hist - 1))
376 : {
377 : /* fractional part exceeds 1, carry to integer part */
378 1188 : pos++;
379 1188 : posfrac -= (num_hist - 1);
380 : }
381 : }
382 : }
383 : else
384 : {
385 : /*
386 : * Even when we don't create the histogram, store an empty array
387 : * to mean "no histogram". We can't just leave stavalues NULL,
388 : * because get_attstatsslot() errors if you ask for stavalues, and
389 : * it's NULL. We'll still store the empty fraction in stanumbers.
390 : */
391 0 : length_hist_values = palloc(0);
392 0 : num_hist = 0;
393 : }
394 30 : stats->staop[slot_idx] = Float8LessOperator;
395 30 : stats->stacoll[slot_idx] = InvalidOid;
396 30 : stats->stavalues[slot_idx] = length_hist_values;
397 30 : stats->numvalues[slot_idx] = num_hist;
398 30 : stats->statypid[slot_idx] = FLOAT8OID;
399 30 : stats->statyplen[slot_idx] = sizeof(float8);
400 30 : stats->statypbyval[slot_idx] = FLOAT8PASSBYVAL;
401 30 : stats->statypalign[slot_idx] = 'd';
402 :
403 : /* Store the fraction of empty ranges */
404 30 : emptyfrac = (float4 *) palloc(sizeof(float4));
405 30 : *emptyfrac = ((double) empty_cnt) / ((double) non_null_cnt);
406 30 : stats->stanumbers[slot_idx] = emptyfrac;
407 30 : stats->numnumbers[slot_idx] = 1;
408 :
409 30 : stats->stakind[slot_idx] = STATISTIC_KIND_RANGE_LENGTH_HISTOGRAM;
410 30 : slot_idx++;
411 :
412 30 : MemoryContextSwitchTo(old_cxt);
413 : }
414 0 : else if (null_cnt > 0)
415 : {
416 : /* We found only nulls; assume the column is entirely null */
417 0 : stats->stats_valid = true;
418 0 : stats->stanullfrac = 1.0;
419 0 : stats->stawidth = 0; /* "unknown" */
420 0 : stats->stadistinct = 0.0; /* "unknown" */
421 : }
422 :
423 : /*
424 : * We don't need to bother cleaning up any of our temporary palloc's. The
425 : * hashtable should also go away, as it used a child memory context.
426 : */
427 30 : }
|