LCOV - code coverage report
Current view: top level - src/backend/utils/adt - rangetypes_typanalyze.c (source / functions) Hit Total Coverage
Test: PostgreSQL 16beta1 Lines: 141 151 93.4 %
Date: 2023-05-31 04:12:22 Functions: 5 5 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * rangetypes_typanalyze.c
       4             :  *    Functions for gathering statistics from range columns
       5             :  *
       6             :  * For a range type column, histograms of lower and upper bounds, and
       7             :  * the fraction of NULL and empty ranges are collected.
       8             :  *
       9             :  * Both histograms have the same length, and they are combined into a
      10             :  * single array of ranges. This has the same shape as the histogram that
      11             :  * std_typanalyze would collect, but the values are different. Each range
      12             :  * in the array is a valid range, even though the lower and upper bounds
      13             :  * come from different tuples. In theory, the standard scalar selectivity
      14             :  * functions could be used with the combined histogram.
      15             :  *
      16             :  * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
      17             :  * Portions Copyright (c) 1994, Regents of the University of California
      18             :  *
      19             :  *
      20             :  * IDENTIFICATION
      21             :  *    src/backend/utils/adt/rangetypes_typanalyze.c
      22             :  *
      23             :  *-------------------------------------------------------------------------
      24             :  */
      25             : #include "postgres.h"
      26             : 
      27             : #include "catalog/pg_operator.h"
      28             : #include "commands/vacuum.h"
      29             : #include "utils/float.h"
      30             : #include "utils/fmgrprotos.h"
      31             : #include "utils/lsyscache.h"
      32             : #include "utils/rangetypes.h"
      33             : #include "utils/multirangetypes.h"
      34             : #include "varatt.h"
      35             : 
      36             : static int  float8_qsort_cmp(const void *a1, const void *a2, void *arg);
      37             : static int  range_bound_qsort_cmp(const void *a1, const void *a2, void *arg);
      38             : static void compute_range_stats(VacAttrStats *stats,
      39             :                                 AnalyzeAttrFetchFunc fetchfunc, int samplerows,
      40             :                                 double totalrows);
      41             : 
      42             : /*
      43             :  * range_typanalyze -- typanalyze function for range columns
      44             :  */
      45             : Datum
      46          20 : range_typanalyze(PG_FUNCTION_ARGS)
      47             : {
      48          20 :     VacAttrStats *stats = (VacAttrStats *) PG_GETARG_POINTER(0);
      49             :     TypeCacheEntry *typcache;
      50          20 :     Form_pg_attribute attr = stats->attr;
      51             : 
      52             :     /* Get information about range type; note column might be a domain */
      53          20 :     typcache = range_get_typcache(fcinfo, getBaseType(stats->attrtypid));
      54             : 
      55          20 :     if (attr->attstattarget < 0)
      56          20 :         attr->attstattarget = default_statistics_target;
      57             : 
      58          20 :     stats->compute_stats = compute_range_stats;
      59          20 :     stats->extra_data = typcache;
      60             :     /* same as in std_typanalyze */
      61          20 :     stats->minrows = 300 * attr->attstattarget;
      62             : 
      63          20 :     PG_RETURN_BOOL(true);
      64             : }
      65             : 
      66             : /*
      67             :  * multirange_typanalyze -- typanalyze function for multirange columns
      68             :  *
      69             :  * We do the same analysis as for ranges, but on the smallest range that
      70             :  * completely includes the multirange.
      71             :  */
      72             : Datum
      73          12 : multirange_typanalyze(PG_FUNCTION_ARGS)
      74             : {
      75          12 :     VacAttrStats *stats = (VacAttrStats *) PG_GETARG_POINTER(0);
      76             :     TypeCacheEntry *typcache;
      77          12 :     Form_pg_attribute attr = stats->attr;
      78             : 
      79             :     /* Get information about multirange type; note column might be a domain */
      80          12 :     typcache = multirange_get_typcache(fcinfo, getBaseType(stats->attrtypid));
      81             : 
      82          12 :     if (attr->attstattarget < 0)
      83          12 :         attr->attstattarget = default_statistics_target;
      84             : 
      85          12 :     stats->compute_stats = compute_range_stats;
      86          12 :     stats->extra_data = typcache;
      87             :     /* same as in std_typanalyze */
      88          12 :     stats->minrows = 300 * attr->attstattarget;
      89             : 
      90          12 :     PG_RETURN_BOOL(true);
      91             : }
      92             : 
      93             : /*
      94             :  * Comparison function for sorting float8s, used for range lengths.
      95             :  */
      96             : static int
      97      153880 : float8_qsort_cmp(const void *a1, const void *a2, void *arg)
      98             : {
      99      153880 :     const float8 *f1 = (const float8 *) a1;
     100      153880 :     const float8 *f2 = (const float8 *) a2;
     101             : 
     102      153880 :     if (*f1 < *f2)
     103          92 :         return -1;
     104      153788 :     else if (*f1 == *f2)
     105      136886 :         return 0;
     106             :     else
     107       16902 :         return 1;
     108             : }
     109             : 
     110             : /*
     111             :  * Comparison function for sorting RangeBounds.
     112             :  */
     113             : static int
     114     2452334 : range_bound_qsort_cmp(const void *a1, const void *a2, void *arg)
     115             : {
     116     2452334 :     RangeBound *b1 = (RangeBound *) a1;
     117     2452334 :     RangeBound *b2 = (RangeBound *) a2;
     118     2452334 :     TypeCacheEntry *typcache = (TypeCacheEntry *) arg;
     119             : 
     120     2452334 :     return range_cmp_bounds(typcache, b1, b2);
     121             : }
     122             : 
     123             : /*
     124             :  * compute_range_stats() -- compute statistics for a range column
     125             :  */
     126             : static void
     127          32 : compute_range_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc,
     128             :                     int samplerows, double totalrows)
     129             : {
     130          32 :     TypeCacheEntry *typcache = (TypeCacheEntry *) stats->extra_data;
     131          32 :     TypeCacheEntry *mltrng_typcache = NULL;
     132             :     bool        has_subdiff;
     133          32 :     int         null_cnt = 0;
     134          32 :     int         non_null_cnt = 0;
     135          32 :     int         non_empty_cnt = 0;
     136          32 :     int         empty_cnt = 0;
     137             :     int         range_no;
     138             :     int         slot_idx;
     139          32 :     int         num_bins = stats->attr->attstattarget;
     140             :     int         num_hist;
     141             :     float8     *lengths;
     142             :     RangeBound *lowers,
     143             :                *uppers;
     144          32 :     double      total_width = 0;
     145             : 
     146          32 :     if (typcache->typtype == TYPTYPE_MULTIRANGE)
     147             :     {
     148          12 :         mltrng_typcache = typcache;
     149          12 :         typcache = typcache->rngtype;
     150             :     }
     151             :     else
     152             :         Assert(typcache->typtype == TYPTYPE_RANGE);
     153          32 :     has_subdiff = OidIsValid(typcache->rng_subdiff_finfo.fn_oid);
     154             : 
     155             :     /* Allocate memory to hold range bounds and lengths of the sample ranges. */
     156          32 :     lowers = (RangeBound *) palloc(sizeof(RangeBound) * samplerows);
     157          32 :     uppers = (RangeBound *) palloc(sizeof(RangeBound) * samplerows);
     158          32 :     lengths = (float8 *) palloc(sizeof(float8) * samplerows);
     159             : 
     160             :     /* Loop over the sample ranges. */
     161      109134 :     for (range_no = 0; range_no < samplerows; range_no++)
     162             :     {
     163             :         Datum       value;
     164             :         bool        isnull,
     165             :                     empty;
     166             :         MultirangeType *multirange;
     167             :         RangeType  *range;
     168             :         RangeBound  lower,
     169             :                     upper;
     170             :         float8      length;
     171             : 
     172      109102 :         vacuum_delay_point();
     173             : 
     174      109102 :         value = fetchfunc(stats, range_no, &isnull);
     175      109102 :         if (isnull)
     176             :         {
     177             :             /* range is null, just count that */
     178           0 :             null_cnt++;
     179           0 :             continue;
     180             :         }
     181             : 
     182             :         /*
     183             :          * XXX: should we ignore wide values, like std_typanalyze does, to
     184             :          * avoid bloating the statistics table?
     185             :          */
     186      109102 :         total_width += VARSIZE_ANY(DatumGetPointer(value));
     187             : 
     188             :         /* Get range and deserialize it for further analysis. */
     189      109102 :         if (mltrng_typcache != NULL)
     190             :         {
     191             :             /* Treat multiranges like a big range without gaps. */
     192       22266 :             multirange = DatumGetMultirangeTypeP(value);
     193       22266 :             if (!MultirangeIsEmpty(multirange))
     194             :             {
     195             :                 RangeBound  tmp;
     196             : 
     197       19242 :                 multirange_get_bounds(typcache, multirange, 0,
     198             :                                       &lower, &tmp);
     199       19242 :                 multirange_get_bounds(typcache, multirange,
     200       19242 :                                       multirange->rangeCount - 1,
     201             :                                       &tmp, &upper);
     202       19242 :                 empty = false;
     203             :             }
     204             :             else
     205             :             {
     206        3024 :                 empty = true;
     207             :             }
     208             :         }
     209             :         else
     210             :         {
     211       86836 :             range = DatumGetRangeTypeP(value);
     212       86836 :             range_deserialize(typcache, range, &lower, &upper, &empty);
     213             :         }
     214             : 
     215      109102 :         if (!empty)
     216             :         {
     217             :             /* Remember bounds and length for further usage in histograms */
     218       92072 :             lowers[non_empty_cnt] = lower;
     219       92072 :             uppers[non_empty_cnt] = upper;
     220             : 
     221       92072 :             if (lower.infinite || upper.infinite)
     222             :             {
     223             :                 /* Length of any kind of an infinite range is infinite */
     224        4042 :                 length = get_float8_infinity();
     225             :             }
     226       88030 :             else if (has_subdiff)
     227             :             {
     228             :                 /*
     229             :                  * For an ordinary range, use subdiff function between upper
     230             :                  * and lower bound values.
     231             :                  */
     232       88030 :                 length = DatumGetFloat8(FunctionCall2Coll(&typcache->rng_subdiff_finfo,
     233             :                                                           typcache->rng_collation,
     234             :                                                           upper.val, lower.val));
     235             :             }
     236             :             else
     237             :             {
     238             :                 /* Use default value of 1.0 if no subdiff is available. */
     239           0 :                 length = 1.0;
     240             :             }
     241       92072 :             lengths[non_empty_cnt] = length;
     242             : 
     243       92072 :             non_empty_cnt++;
     244             :         }
     245             :         else
     246       17030 :             empty_cnt++;
     247             : 
     248      109102 :         non_null_cnt++;
     249             :     }
     250             : 
     251          32 :     slot_idx = 0;
     252             : 
     253             :     /* We can only compute real stats if we found some non-null values. */
     254          32 :     if (non_null_cnt > 0)
     255             :     {
     256             :         Datum      *bound_hist_values;
     257             :         Datum      *length_hist_values;
     258             :         int         pos,
     259             :                     posfrac,
     260             :                     delta,
     261             :                     deltafrac,
     262             :                     i;
     263             :         MemoryContext old_cxt;
     264             :         float4     *emptyfrac;
     265             : 
     266          32 :         stats->stats_valid = true;
     267             :         /* Do the simple null-frac and width stats */
     268          32 :         stats->stanullfrac = (double) null_cnt / (double) samplerows;
     269          32 :         stats->stawidth = total_width / (double) non_null_cnt;
     270             : 
     271             :         /* Estimate that non-null values are unique */
     272          32 :         stats->stadistinct = -1.0 * (1.0 - stats->stanullfrac);
     273             : 
     274             :         /* Must copy the target values into anl_context */
     275          32 :         old_cxt = MemoryContextSwitchTo(stats->anl_context);
     276             : 
     277             :         /*
     278             :          * Generate a bounds histogram slot entry if there are at least two
     279             :          * values.
     280             :          */
     281          32 :         if (non_empty_cnt >= 2)
     282             :         {
     283             :             /* Sort bound values */
     284          32 :             qsort_interruptible(lowers, non_empty_cnt, sizeof(RangeBound),
     285             :                                 range_bound_qsort_cmp, typcache);
     286          32 :             qsort_interruptible(uppers, non_empty_cnt, sizeof(RangeBound),
     287             :                                 range_bound_qsort_cmp, typcache);
     288             : 
     289          32 :             num_hist = non_empty_cnt;
     290          32 :             if (num_hist > num_bins)
     291          20 :                 num_hist = num_bins + 1;
     292             : 
     293          32 :             bound_hist_values = (Datum *) palloc(num_hist * sizeof(Datum));
     294             : 
     295             :             /*
     296             :              * The object of this loop is to construct ranges from first and
     297             :              * last entries in lowers[] and uppers[] along with evenly-spaced
     298             :              * values in between. So the i'th value is a range of lowers[(i *
     299             :              * (nvals - 1)) / (num_hist - 1)] and uppers[(i * (nvals - 1)) /
     300             :              * (num_hist - 1)]. But computing that subscript directly risks
     301             :              * integer overflow when the stats target is more than a couple
     302             :              * thousand.  Instead we add (nvals - 1) / (num_hist - 1) to pos
     303             :              * at each step, tracking the integral and fractional parts of the
     304             :              * sum separately.
     305             :              */
     306          32 :             delta = (non_empty_cnt - 1) / (num_hist - 1);
     307          32 :             deltafrac = (non_empty_cnt - 1) % (num_hist - 1);
     308          32 :             pos = posfrac = 0;
     309             : 
     310        2124 :             for (i = 0; i < num_hist; i++)
     311             :             {
     312        2092 :                 bound_hist_values[i] = PointerGetDatum(range_serialize(typcache,
     313        2092 :                                                                        &lowers[pos],
     314        2092 :                                                                        &uppers[pos],
     315             :                                                                        false,
     316             :                                                                        NULL));
     317        2092 :                 pos += delta;
     318        2092 :                 posfrac += deltafrac;
     319        2092 :                 if (posfrac >= (num_hist - 1))
     320             :                 {
     321             :                     /* fractional part exceeds 1, carry to integer part */
     322        1980 :                     pos++;
     323        1980 :                     posfrac -= (num_hist - 1);
     324             :                 }
     325             :             }
     326             : 
     327          32 :             stats->stakind[slot_idx] = STATISTIC_KIND_BOUNDS_HISTOGRAM;
     328          32 :             stats->stavalues[slot_idx] = bound_hist_values;
     329          32 :             stats->numvalues[slot_idx] = num_hist;
     330             : 
     331             :             /* Store ranges even if we're analyzing a multirange column */
     332          32 :             stats->statypid[slot_idx] = typcache->type_id;
     333          32 :             stats->statyplen[slot_idx] = typcache->typlen;
     334          32 :             stats->statypbyval[slot_idx] = typcache->typbyval;
     335          32 :             stats->statypalign[slot_idx] = typcache->typalign;
     336             : 
     337          32 :             slot_idx++;
     338             :         }
     339             : 
     340             :         /*
     341             :          * Generate a length histogram slot entry if there are at least two
     342             :          * values.
     343             :          */
     344          32 :         if (non_empty_cnt >= 2)
     345             :         {
     346             :             /*
     347             :              * Ascending sort of range lengths for further filling of
     348             :              * histogram
     349             :              */
     350          32 :             qsort_interruptible(lengths, non_empty_cnt, sizeof(float8),
     351             :                                 float8_qsort_cmp, NULL);
     352             : 
     353          32 :             num_hist = non_empty_cnt;
     354          32 :             if (num_hist > num_bins)
     355          20 :                 num_hist = num_bins + 1;
     356             : 
     357          32 :             length_hist_values = (Datum *) palloc(num_hist * sizeof(Datum));
     358             : 
     359             :             /*
     360             :              * The object of this loop is to copy the first and last lengths[]
     361             :              * entries along with evenly-spaced values in between. So the i'th
     362             :              * value is lengths[(i * (nvals - 1)) / (num_hist - 1)]. But
     363             :              * computing that subscript directly risks integer overflow when
     364             :              * the stats target is more than a couple thousand.  Instead we
     365             :              * add (nvals - 1) / (num_hist - 1) to pos at each step, tracking
     366             :              * the integral and fractional parts of the sum separately.
     367             :              */
     368          32 :             delta = (non_empty_cnt - 1) / (num_hist - 1);
     369          32 :             deltafrac = (non_empty_cnt - 1) % (num_hist - 1);
     370          32 :             pos = posfrac = 0;
     371             : 
     372        2124 :             for (i = 0; i < num_hist; i++)
     373             :             {
     374        2092 :                 length_hist_values[i] = Float8GetDatum(lengths[pos]);
     375        2092 :                 pos += delta;
     376        2092 :                 posfrac += deltafrac;
     377        2092 :                 if (posfrac >= (num_hist - 1))
     378             :                 {
     379             :                     /* fractional part exceeds 1, carry to integer part */
     380        1980 :                     pos++;
     381        1980 :                     posfrac -= (num_hist - 1);
     382             :                 }
     383             :             }
     384             :         }
     385             :         else
     386             :         {
     387             :             /*
     388             :              * Even when we don't create the histogram, store an empty array
     389             :              * to mean "no histogram". We can't just leave stavalues NULL,
     390             :              * because get_attstatsslot() errors if you ask for stavalues, and
     391             :              * it's NULL. We'll still store the empty fraction in stanumbers.
     392             :              */
     393           0 :             length_hist_values = palloc(0);
     394           0 :             num_hist = 0;
     395             :         }
     396          32 :         stats->staop[slot_idx] = Float8LessOperator;
     397          32 :         stats->stacoll[slot_idx] = InvalidOid;
     398          32 :         stats->stavalues[slot_idx] = length_hist_values;
     399          32 :         stats->numvalues[slot_idx] = num_hist;
     400          32 :         stats->statypid[slot_idx] = FLOAT8OID;
     401          32 :         stats->statyplen[slot_idx] = sizeof(float8);
     402          32 :         stats->statypbyval[slot_idx] = FLOAT8PASSBYVAL;
     403          32 :         stats->statypalign[slot_idx] = 'd';
     404             : 
     405             :         /* Store the fraction of empty ranges */
     406          32 :         emptyfrac = (float4 *) palloc(sizeof(float4));
     407          32 :         *emptyfrac = ((double) empty_cnt) / ((double) non_null_cnt);
     408          32 :         stats->stanumbers[slot_idx] = emptyfrac;
     409          32 :         stats->numnumbers[slot_idx] = 1;
     410             : 
     411          32 :         stats->stakind[slot_idx] = STATISTIC_KIND_RANGE_LENGTH_HISTOGRAM;
     412          32 :         slot_idx++;
     413             : 
     414          32 :         MemoryContextSwitchTo(old_cxt);
     415             :     }
     416           0 :     else if (null_cnt > 0)
     417             :     {
     418             :         /* We found only nulls; assume the column is entirely null */
     419           0 :         stats->stats_valid = true;
     420           0 :         stats->stanullfrac = 1.0;
     421           0 :         stats->stawidth = 0; /* "unknown" */
     422           0 :         stats->stadistinct = 0.0;    /* "unknown" */
     423             :     }
     424             : 
     425             :     /*
     426             :      * We don't need to bother cleaning up any of our temporary palloc's. The
     427             :      * hashtable should also go away, as it used a child memory context.
     428             :      */
     429          32 : }

Generated by: LCOV version 1.14