LCOV - code coverage report
Current view: top level - src/backend/utils/adt - rangetypes_typanalyze.c (source / functions) Coverage Total Hit
Test: PostgreSQL 19devel Lines: 93.3 % 149 139
Test Date: 2026-03-03 16:15:26 Functions: 100.0 % 5 5
Legend: Lines:     hit not hit

            Line data    Source code
       1              : /*-------------------------------------------------------------------------
       2              :  *
       3              :  * rangetypes_typanalyze.c
       4              :  *    Functions for gathering statistics from range columns
       5              :  *
       6              :  * For a range type column, histograms of lower and upper bounds, and
       7              :  * the fraction of NULL and empty ranges are collected.
       8              :  *
       9              :  * Both histograms have the same length, and they are combined into a
      10              :  * single array of ranges. This has the same shape as the histogram that
      11              :  * std_typanalyze would collect, but the values are different. Each range
      12              :  * in the array is a valid range, even though the lower and upper bounds
      13              :  * come from different tuples. In theory, the standard scalar selectivity
      14              :  * functions could be used with the combined histogram.
      15              :  *
      16              :  * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
      17              :  * Portions Copyright (c) 1994, Regents of the University of California
      18              :  *
      19              :  *
      20              :  * IDENTIFICATION
      21              :  *    src/backend/utils/adt/rangetypes_typanalyze.c
      22              :  *
      23              :  *-------------------------------------------------------------------------
      24              :  */
      25              : #include "postgres.h"
      26              : 
      27              : #include "catalog/pg_operator.h"
      28              : #include "commands/vacuum.h"
      29              : #include "utils/float.h"
      30              : #include "utils/fmgrprotos.h"
      31              : #include "utils/lsyscache.h"
      32              : #include "utils/multirangetypes.h"
      33              : #include "utils/rangetypes.h"
      34              : #include "varatt.h"
      35              : 
      36              : static int  float8_qsort_cmp(const void *a1, const void *a2, void *arg);
      37              : static int  range_bound_qsort_cmp(const void *a1, const void *a2, void *arg);
      38              : static void compute_range_stats(VacAttrStats *stats,
      39              :                                 AnalyzeAttrFetchFunc fetchfunc, int samplerows,
      40              :                                 double totalrows);
      41              : 
      42              : /*
      43              :  * range_typanalyze -- typanalyze function for range columns
      44              :  */
      45              : Datum
      46           44 : range_typanalyze(PG_FUNCTION_ARGS)
      47              : {
      48           44 :     VacAttrStats *stats = (VacAttrStats *) PG_GETARG_POINTER(0);
      49              :     TypeCacheEntry *typcache;
      50              : 
      51              :     /* Get information about range type; note column might be a domain */
      52           44 :     typcache = range_get_typcache(fcinfo, getBaseType(stats->attrtypid));
      53              : 
      54           44 :     if (stats->attstattarget < 0)
      55           32 :         stats->attstattarget = default_statistics_target;
      56              : 
      57           44 :     stats->compute_stats = compute_range_stats;
      58           44 :     stats->extra_data = typcache;
      59              :     /* same as in std_typanalyze */
      60           44 :     stats->minrows = 300 * stats->attstattarget;
      61              : 
      62           44 :     PG_RETURN_BOOL(true);
      63              : }
      64              : 
      65              : /*
      66              :  * multirange_typanalyze -- typanalyze function for multirange columns
      67              :  *
      68              :  * We do the same analysis as for ranges, but on the smallest range that
      69              :  * completely includes the multirange.
      70              :  */
      71              : Datum
      72           21 : multirange_typanalyze(PG_FUNCTION_ARGS)
      73              : {
      74           21 :     VacAttrStats *stats = (VacAttrStats *) PG_GETARG_POINTER(0);
      75              :     TypeCacheEntry *typcache;
      76              : 
      77              :     /* Get information about multirange type; note column might be a domain */
      78           21 :     typcache = multirange_get_typcache(fcinfo, getBaseType(stats->attrtypid));
      79              : 
      80           21 :     if (stats->attstattarget < 0)
      81           15 :         stats->attstattarget = default_statistics_target;
      82              : 
      83           21 :     stats->compute_stats = compute_range_stats;
      84           21 :     stats->extra_data = typcache;
      85              :     /* same as in std_typanalyze */
      86           21 :     stats->minrows = 300 * stats->attstattarget;
      87              : 
      88           21 :     PG_RETURN_BOOL(true);
      89              : }
      90              : 
      91              : /*
      92              :  * Comparison function for sorting float8s, used for range lengths.
      93              :  */
      94              : static int
      95        58249 : float8_qsort_cmp(const void *a1, const void *a2, void *arg)
      96              : {
      97        58249 :     const float8 *f1 = (const float8 *) a1;
      98        58249 :     const float8 *f2 = (const float8 *) a2;
      99              : 
     100        58249 :     if (*f1 < *f2)
     101           77 :         return -1;
     102        58172 :     else if (*f1 == *f2)
     103        52091 :         return 0;
     104              :     else
     105         6081 :         return 1;
     106              : }
     107              : 
     108              : /*
     109              :  * Comparison function for sorting RangeBounds.
     110              :  */
     111              : static int
     112       959657 : range_bound_qsort_cmp(const void *a1, const void *a2, void *arg)
     113              : {
     114       959657 :     const RangeBound *b1 = a1;
     115       959657 :     const RangeBound *b2 = a2;
     116       959657 :     TypeCacheEntry *typcache = arg;
     117              : 
     118       959657 :     return range_cmp_bounds(typcache, b1, b2);
     119              : }
     120              : 
     121              : /*
     122              :  * compute_range_stats() -- compute statistics for a range column
     123              :  */
     124              : static void
     125           38 : compute_range_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc,
     126              :                     int samplerows, double totalrows)
     127              : {
     128           38 :     TypeCacheEntry *typcache = (TypeCacheEntry *) stats->extra_data;
     129           38 :     TypeCacheEntry *mltrng_typcache = NULL;
     130              :     bool        has_subdiff;
     131           38 :     int         null_cnt = 0;
     132           38 :     int         non_null_cnt = 0;
     133           38 :     int         non_empty_cnt = 0;
     134           38 :     int         empty_cnt = 0;
     135              :     int         range_no;
     136              :     int         slot_idx;
     137           38 :     int         num_bins = stats->attstattarget;
     138              :     int         num_hist;
     139              :     float8     *lengths;
     140              :     RangeBound *lowers,
     141              :                *uppers;
     142           38 :     double      total_width = 0;
     143              : 
     144           38 :     if (typcache->typtype == TYPTYPE_MULTIRANGE)
     145              :     {
     146           12 :         mltrng_typcache = typcache;
     147           12 :         typcache = typcache->rngtype;
     148              :     }
     149              :     else
     150              :         Assert(typcache->typtype == TYPTYPE_RANGE);
     151           38 :     has_subdiff = OidIsValid(typcache->rng_subdiff_finfo.fn_oid);
     152              : 
     153              :     /* Allocate memory to hold range bounds and lengths of the sample ranges. */
     154           38 :     lowers = palloc_array(RangeBound, samplerows);
     155           38 :     uppers = palloc_array(RangeBound, samplerows);
     156           38 :     lengths = palloc_array(float8, samplerows);
     157              : 
     158              :     /* Loop over the sample ranges. */
     159        42849 :     for (range_no = 0; range_no < samplerows; range_no++)
     160              :     {
     161              :         Datum       value;
     162              :         bool        isnull,
     163              :                     empty;
     164              :         MultirangeType *multirange;
     165              :         RangeType  *range;
     166              :         RangeBound  lower,
     167              :                     upper;
     168              :         float8      length;
     169              : 
     170        42811 :         vacuum_delay_point(true);
     171              : 
     172        42811 :         value = fetchfunc(stats, range_no, &isnull);
     173        42811 :         if (isnull)
     174              :         {
     175              :             /* range is null, just count that */
     176            0 :             null_cnt++;
     177            0 :             continue;
     178              :         }
     179              : 
     180              :         /*
     181              :          * XXX: should we ignore wide values, like std_typanalyze does, to
     182              :          * avoid bloating the statistics table?
     183              :          */
     184        42811 :         total_width += VARSIZE_ANY(DatumGetPointer(value));
     185              : 
     186              :         /* Get range and deserialize it for further analysis. */
     187        42811 :         if (mltrng_typcache != NULL)
     188              :         {
     189              :             /* Treat multiranges like a big range without gaps. */
     190        11151 :             multirange = DatumGetMultirangeTypeP(value);
     191        11151 :             if (!MultirangeIsEmpty(multirange))
     192              :             {
     193              :                 RangeBound  tmp;
     194              : 
     195         9639 :                 multirange_get_bounds(typcache, multirange, 0,
     196              :                                       &lower, &tmp);
     197         9639 :                 multirange_get_bounds(typcache, multirange,
     198         9639 :                                       multirange->rangeCount - 1,
     199              :                                       &tmp, &upper);
     200         9639 :                 empty = false;
     201              :             }
     202              :             else
     203              :             {
     204         1512 :                 empty = true;
     205              :             }
     206              :         }
     207              :         else
     208              :         {
     209        31660 :             range = DatumGetRangeTypeP(value);
     210        31660 :             range_deserialize(typcache, range, &lower, &upper, &empty);
     211              :         }
     212              : 
     213        42811 :         if (!empty)
     214              :         {
     215              :             /* Remember bounds and length for further usage in histograms */
     216        36176 :             lowers[non_empty_cnt] = lower;
     217        36176 :             uppers[non_empty_cnt] = upper;
     218              : 
     219        36176 :             if (lower.infinite || upper.infinite)
     220              :             {
     221              :                 /* Length of any kind of an infinite range is infinite */
     222         1621 :                 length = get_float8_infinity();
     223              :             }
     224        34555 :             else if (has_subdiff)
     225              :             {
     226              :                 /*
     227              :                  * For an ordinary range, use subdiff function between upper
     228              :                  * and lower bound values.
     229              :                  */
     230        34555 :                 length = DatumGetFloat8(FunctionCall2Coll(&typcache->rng_subdiff_finfo,
     231              :                                                           typcache->rng_collation,
     232              :                                                           upper.val, lower.val));
     233              :             }
     234              :             else
     235              :             {
     236              :                 /* Use default value of 1.0 if no subdiff is available. */
     237            0 :                 length = 1.0;
     238              :             }
     239        36176 :             lengths[non_empty_cnt] = length;
     240              : 
     241        36176 :             non_empty_cnt++;
     242              :         }
     243              :         else
     244         6635 :             empty_cnt++;
     245              : 
     246        42811 :         non_null_cnt++;
     247              :     }
     248              : 
     249           38 :     slot_idx = 0;
     250              : 
     251              :     /* We can only compute real stats if we found some non-null values. */
     252           38 :     if (non_null_cnt > 0)
     253              :     {
     254              :         Datum      *bound_hist_values;
     255              :         Datum      *length_hist_values;
     256              :         int         pos,
     257              :                     posfrac,
     258              :                     delta,
     259              :                     deltafrac,
     260              :                     i;
     261              :         MemoryContext old_cxt;
     262              :         float4     *emptyfrac;
     263              : 
     264           38 :         stats->stats_valid = true;
     265              :         /* Do the simple null-frac and width stats */
     266           38 :         stats->stanullfrac = (double) null_cnt / (double) samplerows;
     267           38 :         stats->stawidth = total_width / (double) non_null_cnt;
     268              : 
     269              :         /* Estimate that non-null values are unique */
     270           38 :         stats->stadistinct = -1.0 * (1.0 - stats->stanullfrac);
     271              : 
     272              :         /* Must copy the target values into anl_context */
     273           38 :         old_cxt = MemoryContextSwitchTo(stats->anl_context);
     274              : 
     275              :         /*
     276              :          * Generate a bounds histogram slot entry if there are at least two
     277              :          * values.
     278              :          */
     279           38 :         if (non_empty_cnt >= 2)
     280              :         {
     281              :             /* Sort bound values */
     282           38 :             qsort_interruptible(lowers, non_empty_cnt, sizeof(RangeBound),
     283              :                                 range_bound_qsort_cmp, typcache);
     284           38 :             qsort_interruptible(uppers, non_empty_cnt, sizeof(RangeBound),
     285              :                                 range_bound_qsort_cmp, typcache);
     286              : 
     287           38 :             num_hist = non_empty_cnt;
     288           38 :             if (num_hist > num_bins)
     289            8 :                 num_hist = num_bins + 1;
     290              : 
     291           38 :             bound_hist_values = (Datum *) palloc(num_hist * sizeof(Datum));
     292              : 
     293              :             /*
     294              :              * The object of this loop is to construct ranges from first and
     295              :              * last entries in lowers[] and uppers[] along with evenly-spaced
     296              :              * values in between. So the i'th value is a range of lowers[(i *
     297              :              * (nvals - 1)) / (num_hist - 1)] and uppers[(i * (nvals - 1)) /
     298              :              * (num_hist - 1)]. But computing that subscript directly risks
     299              :              * integer overflow when the stats target is more than a couple
     300              :              * thousand.  Instead we add (nvals - 1) / (num_hist - 1) to pos
     301              :              * at each step, tracking the integral and fractional parts of the
     302              :              * sum separately.
     303              :              */
     304           38 :             delta = (non_empty_cnt - 1) / (num_hist - 1);
     305           38 :             deltafrac = (non_empty_cnt - 1) % (num_hist - 1);
     306           38 :             pos = posfrac = 0;
     307              : 
     308         1422 :             for (i = 0; i < num_hist; i++)
     309              :             {
     310         1384 :                 bound_hist_values[i] = PointerGetDatum(range_serialize(typcache,
     311         1384 :                                                                        &lowers[pos],
     312         1384 :                                                                        &uppers[pos],
     313              :                                                                        false,
     314              :                                                                        NULL));
     315         1384 :                 pos += delta;
     316         1384 :                 posfrac += deltafrac;
     317         1384 :                 if (posfrac >= (num_hist - 1))
     318              :                 {
     319              :                     /* fractional part exceeds 1, carry to integer part */
     320          792 :                     pos++;
     321          792 :                     posfrac -= (num_hist - 1);
     322              :                 }
     323              :             }
     324              : 
     325           38 :             stats->stakind[slot_idx] = STATISTIC_KIND_BOUNDS_HISTOGRAM;
     326           38 :             stats->stavalues[slot_idx] = bound_hist_values;
     327           38 :             stats->numvalues[slot_idx] = num_hist;
     328              : 
     329              :             /* Store ranges even if we're analyzing a multirange column */
     330           38 :             stats->statypid[slot_idx] = typcache->type_id;
     331           38 :             stats->statyplen[slot_idx] = typcache->typlen;
     332           38 :             stats->statypbyval[slot_idx] = typcache->typbyval;
     333           38 :             stats->statypalign[slot_idx] = typcache->typalign;
     334              : 
     335           38 :             slot_idx++;
     336              :         }
     337              : 
     338              :         /*
     339              :          * Generate a length histogram slot entry if there are at least two
     340              :          * values.
     341              :          */
     342           38 :         if (non_empty_cnt >= 2)
     343              :         {
     344              :             /*
     345              :              * Ascending sort of range lengths for further filling of
     346              :              * histogram
     347              :              */
     348           38 :             qsort_interruptible(lengths, non_empty_cnt, sizeof(float8),
     349              :                                 float8_qsort_cmp, NULL);
     350              : 
     351           38 :             num_hist = non_empty_cnt;
     352           38 :             if (num_hist > num_bins)
     353            8 :                 num_hist = num_bins + 1;
     354              : 
     355           38 :             length_hist_values = (Datum *) palloc(num_hist * sizeof(Datum));
     356              : 
     357              :             /*
     358              :              * The object of this loop is to copy the first and last lengths[]
     359              :              * entries along with evenly-spaced values in between. So the i'th
     360              :              * value is lengths[(i * (nvals - 1)) / (num_hist - 1)]. But
     361              :              * computing that subscript directly risks integer overflow when
     362              :              * the stats target is more than a couple thousand.  Instead we
     363              :              * add (nvals - 1) / (num_hist - 1) to pos at each step, tracking
     364              :              * the integral and fractional parts of the sum separately.
     365              :              */
     366           38 :             delta = (non_empty_cnt - 1) / (num_hist - 1);
     367           38 :             deltafrac = (non_empty_cnt - 1) % (num_hist - 1);
     368           38 :             pos = posfrac = 0;
     369              : 
     370         1422 :             for (i = 0; i < num_hist; i++)
     371              :             {
     372         1384 :                 length_hist_values[i] = Float8GetDatum(lengths[pos]);
     373         1384 :                 pos += delta;
     374         1384 :                 posfrac += deltafrac;
     375         1384 :                 if (posfrac >= (num_hist - 1))
     376              :                 {
     377              :                     /* fractional part exceeds 1, carry to integer part */
     378          792 :                     pos++;
     379          792 :                     posfrac -= (num_hist - 1);
     380              :                 }
     381              :             }
     382              :         }
     383              :         else
     384              :         {
     385              :             /*
     386              :              * Even when we don't create the histogram, store an empty array
     387              :              * to mean "no histogram". We can't just leave stavalues NULL,
     388              :              * because get_attstatsslot() errors if you ask for stavalues, and
     389              :              * it's NULL. We'll still store the empty fraction in stanumbers.
     390              :              */
     391            0 :             length_hist_values = palloc(0);
     392            0 :             num_hist = 0;
     393              :         }
     394           38 :         stats->staop[slot_idx] = Float8LessOperator;
     395           38 :         stats->stacoll[slot_idx] = InvalidOid;
     396           38 :         stats->stavalues[slot_idx] = length_hist_values;
     397           38 :         stats->numvalues[slot_idx] = num_hist;
     398           38 :         stats->statypid[slot_idx] = FLOAT8OID;
     399           38 :         stats->statyplen[slot_idx] = sizeof(float8);
     400           38 :         stats->statypbyval[slot_idx] = true;
     401           38 :         stats->statypalign[slot_idx] = TYPALIGN_DOUBLE;
     402              : 
     403              :         /* Store the fraction of empty ranges */
     404           38 :         emptyfrac = palloc_object(float4);
     405           38 :         *emptyfrac = ((double) empty_cnt) / ((double) non_null_cnt);
     406           38 :         stats->stanumbers[slot_idx] = emptyfrac;
     407           38 :         stats->numnumbers[slot_idx] = 1;
     408              : 
     409           38 :         stats->stakind[slot_idx] = STATISTIC_KIND_RANGE_LENGTH_HISTOGRAM;
     410           38 :         slot_idx++;
     411              : 
     412           38 :         MemoryContextSwitchTo(old_cxt);
     413              :     }
     414            0 :     else if (null_cnt > 0)
     415              :     {
     416              :         /* We found only nulls; assume the column is entirely null */
     417            0 :         stats->stats_valid = true;
     418            0 :         stats->stanullfrac = 1.0;
     419            0 :         stats->stawidth = 0; /* "unknown" */
     420            0 :         stats->stadistinct = 0.0;    /* "unknown" */
     421              :     }
     422              : 
     423              :     /*
     424              :      * We don't need to bother cleaning up any of our temporary palloc's. The
     425              :      * hashtable should also go away, as it used a child memory context.
     426              :      */
     427           38 : }
        

Generated by: LCOV version 2.0-1