Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * rangetypes_typanalyze.c
4 : * Functions for gathering statistics from range columns
5 : *
6 : * For a range type column, histograms of lower and upper bounds, and
7 : * the fraction of NULL and empty ranges are collected.
8 : *
9 : * Both histograms have the same length, and they are combined into a
10 : * single array of ranges. This has the same shape as the histogram that
11 : * std_typanalyze would collect, but the values are different. Each range
12 : * in the array is a valid range, even though the lower and upper bounds
13 : * come from different tuples. In theory, the standard scalar selectivity
14 : * functions could be used with the combined histogram.
15 : *
16 : * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
17 : * Portions Copyright (c) 1994, Regents of the University of California
18 : *
19 : *
20 : * IDENTIFICATION
21 : * src/backend/utils/adt/rangetypes_typanalyze.c
22 : *
23 : *-------------------------------------------------------------------------
24 : */
25 : #include "postgres.h"
26 :
27 : #include "catalog/pg_operator.h"
28 : #include "commands/vacuum.h"
29 : #include "utils/float.h"
30 : #include "utils/fmgrprotos.h"
31 : #include "utils/lsyscache.h"
32 : #include "utils/rangetypes.h"
33 : #include "utils/multirangetypes.h"
34 :
35 : static int float8_qsort_cmp(const void *a1, const void *a2);
36 : static int range_bound_qsort_cmp(const void *a1, const void *a2, void *arg);
37 : static void compute_range_stats(VacAttrStats *stats,
38 : AnalyzeAttrFetchFunc fetchfunc, int samplerows,
39 : double totalrows);
40 :
41 : /*
42 : * range_typanalyze -- typanalyze function for range columns
43 : */
44 : Datum
45 16 : range_typanalyze(PG_FUNCTION_ARGS)
46 : {
47 16 : VacAttrStats *stats = (VacAttrStats *) PG_GETARG_POINTER(0);
48 : TypeCacheEntry *typcache;
49 16 : Form_pg_attribute attr = stats->attr;
50 :
51 : /* Get information about range type; note column might be a domain */
52 16 : typcache = range_get_typcache(fcinfo, getBaseType(stats->attrtypid));
53 :
54 16 : if (attr->attstattarget < 0)
55 16 : attr->attstattarget = default_statistics_target;
56 :
57 16 : stats->compute_stats = compute_range_stats;
58 16 : stats->extra_data = typcache;
59 : /* same as in std_typanalyze */
60 16 : stats->minrows = 300 * attr->attstattarget;
61 :
62 16 : PG_RETURN_BOOL(true);
63 : }
64 :
65 : /*
66 : * multirange_typanalyze -- typanalyze function for multirange columns
67 : *
68 : * We do the same analysis as for ranges, but on the smallest range that
69 : * completely includes the multirange.
70 : */
71 : Datum
72 12 : multirange_typanalyze(PG_FUNCTION_ARGS)
73 : {
74 12 : VacAttrStats *stats = (VacAttrStats *) PG_GETARG_POINTER(0);
75 : TypeCacheEntry *typcache;
76 12 : Form_pg_attribute attr = stats->attr;
77 :
78 : /* Get information about multirange type; note column might be a domain */
79 12 : typcache = multirange_get_typcache(fcinfo, getBaseType(stats->attrtypid));
80 :
81 12 : if (attr->attstattarget < 0)
82 12 : attr->attstattarget = default_statistics_target;
83 :
84 12 : stats->compute_stats = compute_range_stats;
85 12 : stats->extra_data = typcache;
86 : /* same as in std_typanalyze */
87 12 : stats->minrows = 300 * attr->attstattarget;
88 :
89 12 : PG_RETURN_BOOL(true);
90 : }
91 :
92 : /*
93 : * Comparison function for sorting float8s, used for range lengths.
94 : */
95 : static int
96 115436 : float8_qsort_cmp(const void *a1, const void *a2)
97 : {
98 115436 : const float8 *f1 = (const float8 *) a1;
99 115436 : const float8 *f2 = (const float8 *) a2;
100 :
101 115436 : if (*f1 < *f2)
102 76 : return -1;
103 115360 : else if (*f1 == *f2)
104 103270 : return 0;
105 : else
106 12090 : return 1;
107 : }
108 :
109 : /*
110 : * Comparison function for sorting RangeBounds.
111 : */
112 : static int
113 1917202 : range_bound_qsort_cmp(const void *a1, const void *a2, void *arg)
114 : {
115 1917202 : RangeBound *b1 = (RangeBound *) a1;
116 1917202 : RangeBound *b2 = (RangeBound *) a2;
117 1917202 : TypeCacheEntry *typcache = (TypeCacheEntry *) arg;
118 :
119 1917202 : return range_cmp_bounds(typcache, b1, b2);
120 : }
121 :
122 : /*
123 : * compute_range_stats() -- compute statistics for a range column
124 : */
125 : static void
126 28 : compute_range_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc,
127 : int samplerows, double totalrows)
128 : {
129 28 : TypeCacheEntry *typcache = (TypeCacheEntry *) stats->extra_data;
130 28 : TypeCacheEntry *mltrng_typcache = NULL;
131 : bool has_subdiff;
132 28 : int null_cnt = 0;
133 28 : int non_null_cnt = 0;
134 28 : int non_empty_cnt = 0;
135 28 : int empty_cnt = 0;
136 : int range_no;
137 : int slot_idx;
138 28 : int num_bins = stats->attr->attstattarget;
139 : int num_hist;
140 : float8 *lengths;
141 : RangeBound *lowers,
142 : *uppers;
143 28 : double total_width = 0;
144 :
145 28 : if (typcache->typtype == TYPTYPE_MULTIRANGE)
146 : {
147 12 : mltrng_typcache = typcache;
148 12 : typcache = typcache->rngtype;
149 : }
150 : else
151 : Assert(typcache->typtype == TYPTYPE_RANGE);
152 28 : has_subdiff = OidIsValid(typcache->rng_subdiff_finfo.fn_oid);
153 :
154 : /* Allocate memory to hold range bounds and lengths of the sample ranges. */
155 28 : lowers = (RangeBound *) palloc(sizeof(RangeBound) * samplerows);
156 28 : uppers = (RangeBound *) palloc(sizeof(RangeBound) * samplerows);
157 28 : lengths = (float8 *) palloc(sizeof(float8) * samplerows);
158 :
159 : /* Loop over the sample ranges. */
160 84330 : for (range_no = 0; range_no < samplerows; range_no++)
161 : {
162 : Datum value;
163 : bool isnull,
164 : empty;
165 : MultirangeType *multirange;
166 : RangeType *range;
167 : RangeBound lower,
168 : upper;
169 : float8 length;
170 :
171 84302 : vacuum_delay_point();
172 :
173 84302 : value = fetchfunc(stats, range_no, &isnull);
174 84302 : if (isnull)
175 : {
176 : /* range is null, just count that */
177 0 : null_cnt++;
178 0 : continue;
179 : }
180 :
181 : /*
182 : * XXX: should we ignore wide values, like std_typanalyze does, to
183 : * avoid bloating the statistics table?
184 : */
185 84302 : total_width += VARSIZE_ANY(DatumGetPointer(value));
186 :
187 : /* Get range and deserialize it for further analysis. */
188 84302 : if (mltrng_typcache != NULL)
189 : {
190 : /* Treat multiranges like a big range without gaps. */
191 22266 : multirange = DatumGetMultirangeTypeP(value);
192 22266 : if (!MultirangeIsEmpty(multirange))
193 : {
194 : RangeBound tmp;
195 :
196 19242 : multirange_get_bounds(typcache, multirange, 0,
197 : &lower, &tmp);
198 19242 : multirange_get_bounds(typcache, multirange,
199 19242 : multirange->rangeCount - 1,
200 : &tmp, &upper);
201 19242 : empty = false;
202 : }
203 : else
204 : {
205 3024 : empty = true;
206 : }
207 : }
208 : else
209 : {
210 62036 : range = DatumGetRangeTypeP(value);
211 62036 : range_deserialize(typcache, range, &lower, &upper, &empty);
212 : }
213 :
214 84302 : if (!empty)
215 : {
216 : /* Remember bounds and length for further usage in histograms */
217 71272 : lowers[non_empty_cnt] = lower;
218 71272 : uppers[non_empty_cnt] = upper;
219 :
220 71272 : if (lower.infinite || upper.infinite)
221 : {
222 : /* Length of any kind of an infinite range is infinite */
223 3242 : length = get_float8_infinity();
224 : }
225 68030 : else if (has_subdiff)
226 : {
227 : /*
228 : * For an ordinary range, use subdiff function between upper
229 : * and lower bound values.
230 : */
231 68030 : length = DatumGetFloat8(FunctionCall2Coll(&typcache->rng_subdiff_finfo,
232 : typcache->rng_collation,
233 : upper.val, lower.val));
234 : }
235 : else
236 : {
237 : /* Use default value of 1.0 if no subdiff is available. */
238 0 : length = 1.0;
239 : }
240 71272 : lengths[non_empty_cnt] = length;
241 :
242 71272 : non_empty_cnt++;
243 : }
244 : else
245 13030 : empty_cnt++;
246 :
247 84302 : non_null_cnt++;
248 : }
249 :
250 28 : slot_idx = 0;
251 :
252 : /* We can only compute real stats if we found some non-null values. */
253 28 : if (non_null_cnt > 0)
254 : {
255 : Datum *bound_hist_values;
256 : Datum *length_hist_values;
257 : int pos,
258 : posfrac,
259 : delta,
260 : deltafrac,
261 : i;
262 : MemoryContext old_cxt;
263 : float4 *emptyfrac;
264 :
265 28 : stats->stats_valid = true;
266 : /* Do the simple null-frac and width stats */
267 28 : stats->stanullfrac = (double) null_cnt / (double) samplerows;
268 28 : stats->stawidth = total_width / (double) non_null_cnt;
269 :
270 : /* Estimate that non-null values are unique */
271 28 : stats->stadistinct = -1.0 * (1.0 - stats->stanullfrac);
272 :
273 : /* Must copy the target values into anl_context */
274 28 : old_cxt = MemoryContextSwitchTo(stats->anl_context);
275 :
276 : /*
277 : * Generate a bounds histogram slot entry if there are at least two
278 : * values.
279 : */
280 28 : if (non_empty_cnt >= 2)
281 : {
282 : /* Sort bound values */
283 28 : qsort_arg(lowers, non_empty_cnt, sizeof(RangeBound),
284 : range_bound_qsort_cmp, typcache);
285 28 : qsort_arg(uppers, non_empty_cnt, sizeof(RangeBound),
286 : range_bound_qsort_cmp, typcache);
287 :
288 28 : num_hist = non_empty_cnt;
289 28 : if (num_hist > num_bins)
290 16 : num_hist = num_bins + 1;
291 :
292 28 : bound_hist_values = (Datum *) palloc(num_hist * sizeof(Datum));
293 :
294 : /*
295 : * The object of this loop is to construct ranges from first and
296 : * last entries in lowers[] and uppers[] along with evenly-spaced
297 : * values in between. So the i'th value is a range of lowers[(i *
298 : * (nvals - 1)) / (num_hist - 1)] and uppers[(i * (nvals - 1)) /
299 : * (num_hist - 1)]. But computing that subscript directly risks
300 : * integer overflow when the stats target is more than a couple
301 : * thousand. Instead we add (nvals - 1) / (num_hist - 1) to pos
302 : * at each step, tracking the integral and fractional parts of the
303 : * sum separately.
304 : */
305 28 : delta = (non_empty_cnt - 1) / (num_hist - 1);
306 28 : deltafrac = (non_empty_cnt - 1) % (num_hist - 1);
307 28 : pos = posfrac = 0;
308 :
309 1716 : for (i = 0; i < num_hist; i++)
310 : {
311 1688 : bound_hist_values[i] = PointerGetDatum(range_serialize(typcache,
312 : &lowers[pos],
313 : &uppers[pos],
314 : false));
315 1688 : pos += delta;
316 1688 : posfrac += deltafrac;
317 1688 : if (posfrac >= (num_hist - 1))
318 : {
319 : /* fractional part exceeds 1, carry to integer part */
320 1584 : pos++;
321 1584 : posfrac -= (num_hist - 1);
322 : }
323 : }
324 :
325 28 : stats->stakind[slot_idx] = STATISTIC_KIND_BOUNDS_HISTOGRAM;
326 28 : stats->stavalues[slot_idx] = bound_hist_values;
327 28 : stats->numvalues[slot_idx] = num_hist;
328 :
329 : /* Store ranges even if we're analyzing a multirange column */
330 28 : stats->statypid[slot_idx] = typcache->type_id;
331 28 : stats->statyplen[slot_idx] = typcache->typlen;
332 28 : stats->statypbyval[slot_idx] = typcache->typbyval;
333 28 : stats->statypalign[slot_idx] = typcache->typalign;
334 :
335 28 : slot_idx++;
336 : }
337 :
338 : /*
339 : * Generate a length histogram slot entry if there are at least two
340 : * values.
341 : */
342 28 : if (non_empty_cnt >= 2)
343 : {
344 : /*
345 : * Ascending sort of range lengths for further filling of
346 : * histogram
347 : */
348 28 : qsort(lengths, non_empty_cnt, sizeof(float8), float8_qsort_cmp);
349 :
350 28 : num_hist = non_empty_cnt;
351 28 : if (num_hist > num_bins)
352 16 : num_hist = num_bins + 1;
353 :
354 28 : length_hist_values = (Datum *) palloc(num_hist * sizeof(Datum));
355 :
356 : /*
357 : * The object of this loop is to copy the first and last lengths[]
358 : * entries along with evenly-spaced values in between. So the i'th
359 : * value is lengths[(i * (nvals - 1)) / (num_hist - 1)]. But
360 : * computing that subscript directly risks integer overflow when
361 : * the stats target is more than a couple thousand. Instead we
362 : * add (nvals - 1) / (num_hist - 1) to pos at each step, tracking
363 : * the integral and fractional parts of the sum separately.
364 : */
365 28 : delta = (non_empty_cnt - 1) / (num_hist - 1);
366 28 : deltafrac = (non_empty_cnt - 1) % (num_hist - 1);
367 28 : pos = posfrac = 0;
368 :
369 1716 : for (i = 0; i < num_hist; i++)
370 : {
371 1688 : length_hist_values[i] = Float8GetDatum(lengths[pos]);
372 1688 : pos += delta;
373 1688 : posfrac += deltafrac;
374 1688 : if (posfrac >= (num_hist - 1))
375 : {
376 : /* fractional part exceeds 1, carry to integer part */
377 1584 : pos++;
378 1584 : posfrac -= (num_hist - 1);
379 : }
380 : }
381 : }
382 : else
383 : {
384 : /*
385 : * Even when we don't create the histogram, store an empty array
386 : * to mean "no histogram". We can't just leave stavalues NULL,
387 : * because get_attstatsslot() errors if you ask for stavalues, and
388 : * it's NULL. We'll still store the empty fraction in stanumbers.
389 : */
390 0 : length_hist_values = palloc(0);
391 0 : num_hist = 0;
392 : }
393 28 : stats->staop[slot_idx] = Float8LessOperator;
394 28 : stats->stacoll[slot_idx] = InvalidOid;
395 28 : stats->stavalues[slot_idx] = length_hist_values;
396 28 : stats->numvalues[slot_idx] = num_hist;
397 28 : stats->statypid[slot_idx] = FLOAT8OID;
398 28 : stats->statyplen[slot_idx] = sizeof(float8);
399 28 : stats->statypbyval[slot_idx] = FLOAT8PASSBYVAL;
400 28 : stats->statypalign[slot_idx] = 'd';
401 :
402 : /* Store the fraction of empty ranges */
403 28 : emptyfrac = (float4 *) palloc(sizeof(float4));
404 28 : *emptyfrac = ((double) empty_cnt) / ((double) non_null_cnt);
405 28 : stats->stanumbers[slot_idx] = emptyfrac;
406 28 : stats->numnumbers[slot_idx] = 1;
407 :
408 28 : stats->stakind[slot_idx] = STATISTIC_KIND_RANGE_LENGTH_HISTOGRAM;
409 28 : slot_idx++;
410 :
411 28 : MemoryContextSwitchTo(old_cxt);
412 : }
413 0 : else if (null_cnt > 0)
414 : {
415 : /* We found only nulls; assume the column is entirely null */
416 0 : stats->stats_valid = true;
417 0 : stats->stanullfrac = 1.0;
418 0 : stats->stawidth = 0; /* "unknown" */
419 0 : stats->stadistinct = 0.0; /* "unknown" */
420 : }
421 :
422 : /*
423 : * We don't need to bother cleaning up any of our temporary palloc's. The
424 : * hashtable should also go away, as it used a child memory context.
425 : */
426 28 : }
|