LCOV - code coverage report
Current view: top level - src/backend/utils/adt - rangetypes_typanalyze.c (source / functions) Hit Total Coverage
Test: PostgreSQL 12beta2 Lines: 113 123 91.9 %
Date: 2019-06-18 07:06:57 Functions: 4 4 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * ragetypes_typanalyze.c
       4             :  *    Functions for gathering statistics from range columns
       5             :  *
       6             :  * For a range type column, histograms of lower and upper bounds, and
       7             :  * the fraction of NULL and empty ranges are collected.
       8             :  *
       9             :  * Both histograms have the same length, and they are combined into a
      10             :  * single array of ranges. This has the same shape as the histogram that
      11             :  * std_typanalyze would collect, but the values are different. Each range
      12             :  * in the array is a valid range, even though the lower and upper bounds
      13             :  * come from different tuples. In theory, the standard scalar selectivity
      14             :  * functions could be used with the combined histogram.
      15             :  *
      16             :  * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
      17             :  * Portions Copyright (c) 1994, Regents of the University of California
      18             :  *
      19             :  *
      20             :  * IDENTIFICATION
      21             :  *    src/backend/utils/adt/rangetypes_typanalyze.c
      22             :  *
      23             :  *-------------------------------------------------------------------------
      24             :  */
      25             : #include "postgres.h"
      26             : 
      27             : #include "catalog/pg_operator.h"
      28             : #include "commands/vacuum.h"
      29             : #include "utils/float.h"
      30             : #include "utils/fmgrprotos.h"
      31             : #include "utils/lsyscache.h"
      32             : #include "utils/rangetypes.h"
      33             : 
      34             : static int  float8_qsort_cmp(const void *a1, const void *a2);
      35             : static int  range_bound_qsort_cmp(const void *a1, const void *a2, void *arg);
      36             : static void compute_range_stats(VacAttrStats *stats,
      37             :                                 AnalyzeAttrFetchFunc fetchfunc, int samplerows, double totalrows);
      38             : 
      39             : /*
      40             :  * range_typanalyze -- typanalyze function for range columns
      41             :  */
      42             : Datum
      43          48 : range_typanalyze(PG_FUNCTION_ARGS)
      44             : {
      45          48 :     VacAttrStats *stats = (VacAttrStats *) PG_GETARG_POINTER(0);
      46             :     TypeCacheEntry *typcache;
      47          48 :     Form_pg_attribute attr = stats->attr;
      48             : 
      49             :     /* Get information about range type; note column might be a domain */
      50          48 :     typcache = range_get_typcache(fcinfo, getBaseType(stats->attrtypid));
      51             : 
      52          48 :     if (attr->attstattarget < 0)
      53          48 :         attr->attstattarget = default_statistics_target;
      54             : 
      55          48 :     stats->compute_stats = compute_range_stats;
      56          48 :     stats->extra_data = typcache;
      57             :     /* same as in std_typanalyze */
      58          48 :     stats->minrows = 300 * attr->attstattarget;
      59             : 
      60          48 :     PG_RETURN_BOOL(true);
      61             : }
      62             : 
      63             : /*
      64             :  * Comparison function for sorting float8s, used for range lengths.
      65             :  */
      66             : static int
      67      136668 : float8_qsort_cmp(const void *a1, const void *a2)
      68             : {
      69      136668 :     const float8 *f1 = (const float8 *) a1;
      70      136668 :     const float8 *f2 = (const float8 *) a2;
      71             : 
      72      136668 :     if (*f1 < *f2)
      73          84 :         return -1;
      74      136584 :     else if (*f1 == *f2)
      75      119520 :         return 0;
      76             :     else
      77       17064 :         return 1;
      78             : }
      79             : 
      80             : /*
      81             :  * Comparison function for sorting RangeBounds.
      82             :  */
      83             : static int
      84     1884402 : range_bound_qsort_cmp(const void *a1, const void *a2, void *arg)
      85             : {
      86     1884402 :     RangeBound *b1 = (RangeBound *) a1;
      87     1884402 :     RangeBound *b2 = (RangeBound *) a2;
      88     1884402 :     TypeCacheEntry *typcache = (TypeCacheEntry *) arg;
      89             : 
      90     1884402 :     return range_cmp_bounds(typcache, b1, b2);
      91             : }
      92             : 
      93             : /*
      94             :  * compute_range_stats() -- compute statistics for a range column
      95             :  */
      96             : static void
      97          48 : compute_range_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc,
      98             :                     int samplerows, double totalrows)
      99             : {
     100          48 :     TypeCacheEntry *typcache = (TypeCacheEntry *) stats->extra_data;
     101          48 :     bool        has_subdiff = OidIsValid(typcache->rng_subdiff_finfo.fn_oid);
     102          48 :     int         null_cnt = 0;
     103          48 :     int         non_null_cnt = 0;
     104          48 :     int         non_empty_cnt = 0;
     105          48 :     int         empty_cnt = 0;
     106             :     int         range_no;
     107             :     int         slot_idx;
     108          48 :     int         num_bins = stats->attr->attstattarget;
     109             :     int         num_hist;
     110             :     float8     *lengths;
     111             :     RangeBound *lowers,
     112             :                *uppers;
     113          48 :     double      total_width = 0;
     114             : 
     115             :     /* Allocate memory to hold range bounds and lengths of the sample ranges. */
     116          48 :     lowers = (RangeBound *) palloc(sizeof(RangeBound) * samplerows);
     117          48 :     uppers = (RangeBound *) palloc(sizeof(RangeBound) * samplerows);
     118          48 :     lengths = (float8 *) palloc(sizeof(float8) * samplerows);
     119             : 
     120             :     /* Loop over the sample ranges. */
     121       89002 :     for (range_no = 0; range_no < samplerows; range_no++)
     122             :     {
     123             :         Datum       value;
     124             :         bool        isnull,
     125             :                     empty;
     126             :         RangeType  *range;
     127             :         RangeBound  lower,
     128             :                     upper;
     129             :         float8      length;
     130             : 
     131       88954 :         vacuum_delay_point();
     132             : 
     133       88954 :         value = fetchfunc(stats, range_no, &isnull);
     134       88954 :         if (isnull)
     135             :         {
     136             :             /* range is null, just count that */
     137           0 :             null_cnt++;
     138           0 :             continue;
     139             :         }
     140             : 
     141             :         /*
     142             :          * XXX: should we ignore wide values, like std_typanalyze does, to
     143             :          * avoid bloating the statistics table?
     144             :          */
     145       88954 :         total_width += VARSIZE_ANY(DatumGetPointer(value));
     146             : 
     147             :         /* Get range and deserialize it for further analysis. */
     148       88954 :         range = DatumGetRangeTypeP(value);
     149       88954 :         range_deserialize(typcache, range, &lower, &upper, &empty);
     150             : 
     151       88954 :         if (!empty)
     152             :         {
     153             :             /* Remember bounds and length for further usage in histograms */
     154       74158 :             lowers[non_empty_cnt] = lower;
     155       74158 :             uppers[non_empty_cnt] = upper;
     156             : 
     157       74158 :             if (lower.infinite || upper.infinite)
     158             :             {
     159             :                 /* Length of any kind of an infinite range is infinite */
     160        2844 :                 length = get_float8_infinity();
     161             :             }
     162       71314 :             else if (has_subdiff)
     163             :             {
     164             :                 /*
     165             :                  * For an ordinary range, use subdiff function between upper
     166             :                  * and lower bound values.
     167             :                  */
     168       71314 :                 length = DatumGetFloat8(FunctionCall2Coll(
     169             :                                                           &typcache->rng_subdiff_finfo,
     170             :                                                           typcache->rng_collation,
     171             :                                                           upper.val, lower.val));
     172             :             }
     173             :             else
     174             :             {
     175             :                 /* Use default value of 1.0 if no subdiff is available. */
     176           0 :                 length = 1.0;
     177             :             }
     178       74158 :             lengths[non_empty_cnt] = length;
     179             : 
     180       74158 :             non_empty_cnt++;
     181             :         }
     182             :         else
     183       14796 :             empty_cnt++;
     184             : 
     185       88954 :         non_null_cnt++;
     186             :     }
     187             : 
     188          48 :     slot_idx = 0;
     189             : 
     190             :     /* We can only compute real stats if we found some non-null values. */
     191          48 :     if (non_null_cnt > 0)
     192             :     {
     193             :         Datum      *bound_hist_values;
     194             :         Datum      *length_hist_values;
     195             :         int         pos,
     196             :                     posfrac,
     197             :                     delta,
     198             :                     deltafrac,
     199             :                     i;
     200             :         MemoryContext old_cxt;
     201             :         float4     *emptyfrac;
     202             : 
     203          48 :         stats->stats_valid = true;
     204             :         /* Do the simple null-frac and width stats */
     205          48 :         stats->stanullfrac = (double) null_cnt / (double) samplerows;
     206          48 :         stats->stawidth = total_width / (double) non_null_cnt;
     207             : 
     208             :         /* Estimate that non-null values are unique */
     209          48 :         stats->stadistinct = -1.0 * (1.0 - stats->stanullfrac);
     210             : 
     211             :         /* Must copy the target values into anl_context */
     212          48 :         old_cxt = MemoryContextSwitchTo(stats->anl_context);
     213             : 
     214             :         /*
     215             :          * Generate a bounds histogram slot entry if there are at least two
     216             :          * values.
     217             :          */
     218          48 :         if (non_empty_cnt >= 2)
     219             :         {
     220             :             /* Sort bound values */
     221          48 :             qsort_arg(lowers, non_empty_cnt, sizeof(RangeBound),
     222             :                       range_bound_qsort_cmp, typcache);
     223          48 :             qsort_arg(uppers, non_empty_cnt, sizeof(RangeBound),
     224             :                       range_bound_qsort_cmp, typcache);
     225             : 
     226          48 :             num_hist = non_empty_cnt;
     227          48 :             if (num_hist > num_bins)
     228          30 :                 num_hist = num_bins + 1;
     229             : 
     230          48 :             bound_hist_values = (Datum *) palloc(num_hist * sizeof(Datum));
     231             : 
     232             :             /*
     233             :              * The object of this loop is to construct ranges from first and
     234             :              * last entries in lowers[] and uppers[] along with evenly-spaced
     235             :              * values in between. So the i'th value is a range of lowers[(i *
     236             :              * (nvals - 1)) / (num_hist - 1)] and uppers[(i * (nvals - 1)) /
     237             :              * (num_hist - 1)]. But computing that subscript directly risks
     238             :              * integer overflow when the stats target is more than a couple
     239             :              * thousand.  Instead we add (nvals - 1) / (num_hist - 1) to pos
     240             :              * at each step, tracking the integral and fractional parts of the
     241             :              * sum separately.
     242             :              */
     243          48 :             delta = (non_empty_cnt - 1) / (num_hist - 1);
     244          48 :             deltafrac = (non_empty_cnt - 1) % (num_hist - 1);
     245          48 :             pos = posfrac = 0;
     246             : 
     247        1716 :             for (i = 0; i < num_hist; i++)
     248             :             {
     249        1668 :                 bound_hist_values[i] = PointerGetDatum(range_serialize(
     250             :                                                                        typcache, &lowers[pos], &uppers[pos], false));
     251        1668 :                 pos += delta;
     252        1668 :                 posfrac += deltafrac;
     253        1668 :                 if (posfrac >= (num_hist - 1))
     254             :                 {
     255             :                     /* fractional part exceeds 1, carry to integer part */
     256        1218 :                     pos++;
     257        1218 :                     posfrac -= (num_hist - 1);
     258             :                 }
     259             :             }
     260             : 
     261          48 :             stats->stakind[slot_idx] = STATISTIC_KIND_BOUNDS_HISTOGRAM;
     262          48 :             stats->stavalues[slot_idx] = bound_hist_values;
     263          48 :             stats->numvalues[slot_idx] = num_hist;
     264          48 :             slot_idx++;
     265             :         }
     266             : 
     267             :         /*
     268             :          * Generate a length histogram slot entry if there are at least two
     269             :          * values.
     270             :          */
     271          48 :         if (non_empty_cnt >= 2)
     272             :         {
     273             :             /*
     274             :              * Ascending sort of range lengths for further filling of
     275             :              * histogram
     276             :              */
     277          48 :             qsort(lengths, non_empty_cnt, sizeof(float8), float8_qsort_cmp);
     278             : 
     279          48 :             num_hist = non_empty_cnt;
     280          48 :             if (num_hist > num_bins)
     281          30 :                 num_hist = num_bins + 1;
     282             : 
     283          48 :             length_hist_values = (Datum *) palloc(num_hist * sizeof(Datum));
     284             : 
     285             :             /*
     286             :              * The object of this loop is to copy the first and last lengths[]
     287             :              * entries along with evenly-spaced values in between. So the i'th
     288             :              * value is lengths[(i * (nvals - 1)) / (num_hist - 1)]. But
     289             :              * computing that subscript directly risks integer overflow when
     290             :              * the stats target is more than a couple thousand.  Instead we
     291             :              * add (nvals - 1) / (num_hist - 1) to pos at each step, tracking
     292             :              * the integral and fractional parts of the sum separately.
     293             :              */
     294          48 :             delta = (non_empty_cnt - 1) / (num_hist - 1);
     295          48 :             deltafrac = (non_empty_cnt - 1) % (num_hist - 1);
     296          48 :             pos = posfrac = 0;
     297             : 
     298        1716 :             for (i = 0; i < num_hist; i++)
     299             :             {
     300        1668 :                 length_hist_values[i] = Float8GetDatum(lengths[pos]);
     301        1668 :                 pos += delta;
     302        1668 :                 posfrac += deltafrac;
     303        1668 :                 if (posfrac >= (num_hist - 1))
     304             :                 {
     305             :                     /* fractional part exceeds 1, carry to integer part */
     306        1218 :                     pos++;
     307        1218 :                     posfrac -= (num_hist - 1);
     308             :                 }
     309             :             }
     310             :         }
     311             :         else
     312             :         {
     313             :             /*
     314             :              * Even when we don't create the histogram, store an empty array
     315             :              * to mean "no histogram". We can't just leave stavalues NULL,
     316             :              * because get_attstatsslot() errors if you ask for stavalues, and
     317             :              * it's NULL. We'll still store the empty fraction in stanumbers.
     318             :              */
     319           0 :             length_hist_values = palloc(0);
     320           0 :             num_hist = 0;
     321             :         }
     322          48 :         stats->staop[slot_idx] = Float8LessOperator;
     323          48 :         stats->stacoll[slot_idx] = InvalidOid;
     324          48 :         stats->stavalues[slot_idx] = length_hist_values;
     325          48 :         stats->numvalues[slot_idx] = num_hist;
     326          48 :         stats->statypid[slot_idx] = FLOAT8OID;
     327          48 :         stats->statyplen[slot_idx] = sizeof(float8);
     328             : #ifdef USE_FLOAT8_BYVAL
     329          48 :         stats->statypbyval[slot_idx] = true;
     330             : #else
     331             :         stats->statypbyval[slot_idx] = false;
     332             : #endif
     333          48 :         stats->statypalign[slot_idx] = 'd';
     334             : 
     335             :         /* Store the fraction of empty ranges */
     336          48 :         emptyfrac = (float4 *) palloc(sizeof(float4));
     337          48 :         *emptyfrac = ((double) empty_cnt) / ((double) non_null_cnt);
     338          48 :         stats->stanumbers[slot_idx] = emptyfrac;
     339          48 :         stats->numnumbers[slot_idx] = 1;
     340             : 
     341          48 :         stats->stakind[slot_idx] = STATISTIC_KIND_RANGE_LENGTH_HISTOGRAM;
     342          48 :         slot_idx++;
     343             : 
     344          48 :         MemoryContextSwitchTo(old_cxt);
     345             :     }
     346           0 :     else if (null_cnt > 0)
     347             :     {
     348             :         /* We found only nulls; assume the column is entirely null */
     349           0 :         stats->stats_valid = true;
     350           0 :         stats->stanullfrac = 1.0;
     351           0 :         stats->stawidth = 0; /* "unknown" */
     352           0 :         stats->stadistinct = 0.0;    /* "unknown" */
     353             :     }
     354             : 
     355             :     /*
     356             :      * We don't need to bother cleaning up any of our temporary palloc's. The
     357             :      * hashtable should also go away, as it used a child memory context.
     358             :      */
     359          48 : }

Generated by: LCOV version 1.13