LCOV - code coverage report
Current view: top level - src/backend/statistics - mcv.c (source / functions) Hit Total Coverage
Test: PostgreSQL 13devel Lines: 432 491 88.0 %
Date: 2019-11-21 13:06:38 Functions: 14 18 77.8 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * mcv.c
       4             :  *    POSTGRES multivariate MCV lists
       5             :  *
       6             :  *
       7             :  * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
       8             :  * Portions Copyright (c) 1994, Regents of the University of California
       9             :  *
      10             :  * IDENTIFICATION
      11             :  *    src/backend/statistics/mcv.c
      12             :  *
      13             :  *-------------------------------------------------------------------------
      14             :  */
      15             : #include "postgres.h"
      16             : 
      17             : #include <math.h>
      18             : 
      19             : #include "access/htup_details.h"
      20             : #include "catalog/pg_collation.h"
      21             : #include "catalog/pg_statistic_ext.h"
      22             : #include "catalog/pg_statistic_ext_data.h"
      23             : #include "fmgr.h"
      24             : #include "funcapi.h"
      25             : #include "nodes/nodeFuncs.h"
      26             : #include "optimizer/clauses.h"
      27             : #include "statistics/extended_stats_internal.h"
      28             : #include "statistics/statistics.h"
      29             : #include "utils/array.h"
      30             : #include "utils/builtins.h"
      31             : #include "utils/bytea.h"
      32             : #include "utils/fmgroids.h"
      33             : #include "utils/fmgrprotos.h"
      34             : #include "utils/lsyscache.h"
      35             : #include "utils/syscache.h"
      36             : #include "utils/typcache.h"
      37             : 
      38             : /*
      39             :  * Computes size of a serialized MCV item, depending on the number of
      40             :  * dimensions (columns) the statistic is defined on. The datum values are
      41             :  * stored in a separate array (deduplicated, to minimize the size), and
      42             :  * so the serialized items only store uint16 indexes into that array.
      43             :  *
      44             :  * Each serialized item stores (in this order):
      45             :  *
      46             :  * - indexes to values    (ndim * sizeof(uint16))
      47             :  * - null flags           (ndim * sizeof(bool))
      48             :  * - frequency            (sizeof(double))
      49             :  * - base_frequency       (sizeof(double))
      50             :  *
      51             :  * There is no alignment padding within an MCV item.
      52             :  * So in total each MCV item requires this many bytes:
      53             :  *
      54             :  *   ndim * (sizeof(uint16) + sizeof(bool)) + 2 * sizeof(double)
      55             :  */
      56             : #define ITEM_SIZE(ndims)    \
      57             :     ((ndims) * (sizeof(uint16) + sizeof(bool)) + 2 * sizeof(double))
      58             : 
      59             : /*
      60             :  * Used to compute size of serialized MCV list representation.
      61             :  */
      62             : #define MinSizeOfMCVList        \
      63             :     (VARHDRSZ + sizeof(uint32) * 3 + sizeof(AttrNumber))
      64             : 
      65             : /*
      66             :  * Size of the serialized MCV list, excluding the space needed for
      67             :  * deduplicated per-dimension values. The macro is meant to be used
      68             :  * when it's not yet safe to access the serialized info about amount
      69             :  * of data for each column.
      70             :  */
      71             : #define SizeOfMCVList(ndims,nitems) \
      72             :     ((MinSizeOfMCVList + sizeof(Oid) * (ndims)) + \
      73             :      ((ndims) * sizeof(DimensionInfo)) + \
      74             :      ((nitems) * ITEM_SIZE(ndims)))
      75             : 
      76             : static MultiSortSupport build_mss(VacAttrStats **stats, int numattrs);
      77             : 
      78             : static SortItem *build_distinct_groups(int numrows, SortItem *items,
      79             :                                        MultiSortSupport mss, int *ndistinct);
      80             : 
      81             : static SortItem **build_column_frequencies(SortItem *groups, int ngroups,
      82             :                                            MultiSortSupport mss, int *ncounts);
      83             : 
      84             : static int  count_distinct_groups(int numrows, SortItem *items,
      85             :                                   MultiSortSupport mss);
      86             : 
      87             : /*
      88             :  * Compute new value for bitmap item, considering whether it's used for
      89             :  * clauses connected by AND/OR.
      90             :  */
      91             : #define RESULT_MERGE(value, is_or, match) \
      92             :     ((is_or) ? ((value) || (match)) : ((value) && (match)))
      93             : 
      94             : /*
      95             :  * When processing a list of clauses, the bitmap item may get set to a value
      96             :  * such that additional clauses can't change it. For example, when processing
      97             :  * a list of clauses connected to AND, as soon as the item gets set to 'false'
      98             :  * then it'll remain like that. Similarly clauses connected by OR and 'true'.
      99             :  *
     100             :  * Returns true when the value in the bitmap can't change no matter how the
     101             :  * remaining clauses are evaluated.
     102             :  */
     103             : #define RESULT_IS_FINAL(value, is_or)   ((is_or) ? (value) : (!(value)))
     104             : 
     105             : /*
     106             :  * get_mincount_for_mcv_list
     107             :  *      Determine the minimum number of times a value needs to appear in
     108             :  *      the sample for it to be included in the MCV list.
     109             :  *
     110             :  * We want to keep only values that appear sufficiently often in the
     111             :  * sample that it is reasonable to extrapolate their sample frequencies to
     112             :  * the entire table.  We do this by placing an upper bound on the relative
     113             :  * standard error of the sample frequency, so that any estimates the
     114             :  * planner generates from the MCV statistics can be expected to be
     115             :  * reasonably accurate.
     116             :  *
     117             :  * Since we are sampling without replacement, the sample frequency of a
     118             :  * particular value is described by a hypergeometric distribution.  A
     119             :  * common rule of thumb when estimating errors in this situation is to
     120             :  * require at least 10 instances of the value in the sample, in which case
     121             :  * the distribution can be approximated by a normal distribution, and
     122             :  * standard error analysis techniques can be applied.  Given a sample size
     123             :  * of n, a population size of N, and a sample frequency of p=cnt/n, the
     124             :  * standard error of the proportion p is given by
     125             :  *      SE = sqrt(p*(1-p)/n) * sqrt((N-n)/(N-1))
     126             :  * where the second term is the finite population correction.  To get
     127             :  * reasonably accurate planner estimates, we impose an upper bound on the
     128             :  * relative standard error of 20% -- i.e., SE/p < 0.2.  This 20% relative
     129             :  * error bound is fairly arbitrary, but has been found empirically to work
     130             :  * well.  Rearranging this formula gives a lower bound on the number of
     131             :  * instances of the value seen:
     132             :  *      cnt > n*(N-n) / (N-n+0.04*n*(N-1))
     133             :  * This bound is at most 25, and approaches 0 as n approaches 0 or N. The
     134             :  * case where n approaches 0 cannot happen in practice, since the sample
     135             :  * size is at least 300.  The case where n approaches N corresponds to
     136             :  * sampling the whole the table, in which case it is reasonable to keep
     137             :  * the whole MCV list (have no lower bound), so it makes sense to apply
     138             :  * this formula for all inputs, even though the above derivation is
     139             :  * technically only valid when the right hand side is at least around 10.
     140             :  *
     141             :  * An alternative way to look at this formula is as follows -- assume that
     142             :  * the number of instances of the value seen scales up to the entire
     143             :  * table, so that the population count is K=N*cnt/n. Then the distribution
     144             :  * in the sample is a hypergeometric distribution parameterised by N, n
     145             :  * and K, and the bound above is mathematically equivalent to demanding
     146             :  * that the standard deviation of that distribution is less than 20% of
     147             :  * its mean.  Thus the relative errors in any planner estimates produced
     148             :  * from the MCV statistics are likely to be not too large.
     149             :  */
     150             : static double
     151          76 : get_mincount_for_mcv_list(int samplerows, double totalrows)
     152             : {
     153          76 :     double      n = samplerows;
     154          76 :     double      N = totalrows;
     155             :     double      numer,
     156             :                 denom;
     157             : 
     158          76 :     numer = n * (N - n);
     159          76 :     denom = N - n + 0.04 * n * (N - 1);
     160             : 
     161             :     /* Guard against division by zero (possible if n = N = 1) */
     162          76 :     if (denom == 0.0)
     163           4 :         return 0.0;
     164             : 
     165          72 :     return numer / denom;
     166             : }
     167             : 
     168             : /*
     169             :  * Builds MCV list from the set of sampled rows.
     170             :  *
     171             :  * The algorithm is quite simple:
     172             :  *
     173             :  *     (1) sort the data (default collation, '<' for the data type)
     174             :  *
     175             :  *     (2) count distinct groups, decide how many to keep
     176             :  *
     177             :  *     (3) build the MCV list using the threshold determined in (2)
     178             :  *
     179             :  *     (4) remove rows represented by the MCV from the sample
     180             :  *
     181             :  */
     182             : MCVList *
     183          76 : statext_mcv_build(int numrows, HeapTuple *rows, Bitmapset *attrs,
     184             :                   VacAttrStats **stats, double totalrows, int stattarget)
     185             : {
     186             :     int         i,
     187             :                 numattrs,
     188             :                 ngroups,
     189             :                 nitems;
     190             :     AttrNumber *attnums;
     191             :     double      mincount;
     192             :     SortItem   *items;
     193             :     SortItem   *groups;
     194          76 :     MCVList    *mcvlist = NULL;
     195             :     MultiSortSupport mss;
     196             : 
     197          76 :     attnums = build_attnums_array(attrs, &numattrs);
     198             : 
     199             :     /* comparator for all the columns */
     200          76 :     mss = build_mss(stats, numattrs);
     201             : 
     202             :     /* sort the rows */
     203          76 :     items = build_sorted_items(numrows, &nitems, rows, stats[0]->tupDesc,
     204             :                                mss, numattrs, attnums);
     205             : 
     206          76 :     if (!items)
     207           0 :         return NULL;
     208             : 
     209             :     /* transform the sorted rows into groups (sorted by frequency) */
     210          76 :     groups = build_distinct_groups(nitems, items, mss, &ngroups);
     211             : 
     212             :     /*
     213             :      * Maximum number of MCV items to store, based on the statistics target
     214             :      * we computed for the statistics object (from target set for the object
     215             :      * itself, attributes and the system default). In any case, we can't keep
     216             :      * more groups than we have available.
     217             :      */
     218          76 :     nitems = stattarget;
     219          76 :     if (nitems > ngroups)
     220          44 :         nitems = ngroups;
     221             : 
     222             :     /*
     223             :      * Decide how many items to keep in the MCV list. We can't use the same
     224             :      * algorithm as per-column MCV lists, because that only considers the
     225             :      * actual group frequency - but we're primarily interested in how the
     226             :      * actual frequency differs from the base frequency (product of simple
     227             :      * per-column frequencies, as if the columns were independent).
     228             :      *
     229             :      * Using the same algorithm might exclude items that are close to the
     230             :      * "average" frequency of the sample. But that does not say whether the
     231             :      * observed frequency is close to the base frequency or not. We also need
     232             :      * to consider unexpectedly uncommon items (again, compared to the base
     233             :      * frequency), and the single-column algorithm does not have to.
     234             :      *
     235             :      * We simply decide how many items to keep by computing minimum count
     236             :      * using get_mincount_for_mcv_list() and then keep all items that seem to
     237             :      * be more common than that.
     238             :      */
     239          76 :     mincount = get_mincount_for_mcv_list(numrows, totalrows);
     240             : 
     241             :     /*
     242             :      * Walk the groups until we find the first group with a count below the
     243             :      * mincount threshold (the index of that group is the number of groups we
     244             :      * want to keep).
     245             :      */
     246        3060 :     for (i = 0; i < nitems; i++)
     247             :     {
     248        2986 :         if (groups[i].count < mincount)
     249             :         {
     250           2 :             nitems = i;
     251           2 :             break;
     252             :         }
     253             :     }
     254             : 
     255             :     /*
     256             :      * At this point we know the number of items for the MCV list. There might
     257             :      * be none (for uniform distribution with many groups), and in that case
     258             :      * there will be no MCV list. Otherwise construct the MCV list.
     259             :      */
     260          76 :     if (nitems > 0)
     261             :     {
     262             :         int         j;
     263             :         SortItem    key;
     264             :         MultiSortSupport    tmp;
     265             : 
     266             :         /* frequencies for values in each attribute */
     267             :         SortItem  **freqs;
     268             :         int        *nfreqs;
     269             : 
     270             :         /* used to search values */
     271          74 :         tmp = (MultiSortSupport) palloc(offsetof(MultiSortSupportData, ssup)
     272             :                                         + sizeof(SortSupportData));
     273             : 
     274             :         /* compute frequencies for values in each column */
     275          74 :         nfreqs = (int *) palloc0(sizeof(int) * numattrs);
     276          74 :         freqs = build_column_frequencies(groups, ngroups, mss, nfreqs);
     277             : 
     278             :         /*
     279             :          * Allocate the MCV list structure, set the global parameters.
     280             :          */
     281          74 :         mcvlist = (MCVList *) palloc0(offsetof(MCVList, items) +
     282          74 :                                       sizeof(MCVItem) * nitems);
     283             : 
     284          74 :         mcvlist->magic = STATS_MCV_MAGIC;
     285          74 :         mcvlist->type = STATS_MCV_TYPE_BASIC;
     286          74 :         mcvlist->ndimensions = numattrs;
     287          74 :         mcvlist->nitems = nitems;
     288             : 
     289             :         /* store info about data type OIDs */
     290         272 :         for (i = 0; i < numattrs; i++)
     291         198 :             mcvlist->types[i] = stats[i]->attrtypid;
     292             : 
     293             :         /* Copy the first chunk of groups into the result. */
     294        3058 :         for (i = 0; i < nitems; i++)
     295             :         {
     296             :             /* just pointer to the proper place in the list */
     297        2984 :             MCVItem    *item = &mcvlist->items[i];
     298             : 
     299        2984 :             item->values = (Datum *) palloc(sizeof(Datum) * numattrs);
     300        2984 :             item->isnull = (bool *) palloc(sizeof(bool) * numattrs);
     301             : 
     302             :             /* copy values for the group */
     303        2984 :             memcpy(item->values, groups[i].values, sizeof(Datum) * numattrs);
     304        2984 :             memcpy(item->isnull, groups[i].isnull, sizeof(bool) * numattrs);
     305             : 
     306             :             /* groups should be sorted by frequency in descending order */
     307             :             Assert((i == 0) || (groups[i - 1].count >= groups[i].count));
     308             : 
     309             :             /* group frequency */
     310        2984 :             item->frequency = (double) groups[i].count / numrows;
     311             : 
     312             :             /* base frequency, if the attributes were independent */
     313        2984 :             item->base_frequency = 1.0;
     314       11470 :             for (j = 0; j < numattrs; j++)
     315             :             {
     316             :                 SortItem   *freq;
     317             : 
     318             :                 /* single dimension */
     319        8486 :                 tmp->ndims = 1;
     320        8486 :                 tmp->ssup[0] = mss->ssup[j];
     321             : 
     322             :                 /* fill search key */
     323        8486 :                 key.values = &groups[i].values[j];
     324        8486 :                 key.isnull = &groups[i].isnull[j];
     325             : 
     326        8486 :                 freq = (SortItem *) bsearch_arg(&key, freqs[j], nfreqs[j],
     327             :                                                 sizeof(SortItem),
     328             :                                                 multi_sort_compare, tmp);
     329             : 
     330        8486 :                 item->base_frequency *= ((double) freq->count) / numrows;
     331             :             }
     332             :         }
     333             : 
     334          74 :         pfree(nfreqs);
     335          74 :         pfree(freqs);
     336             :     }
     337             : 
     338          76 :     pfree(items);
     339          76 :     pfree(groups);
     340             : 
     341          76 :     return mcvlist;
     342             : }
     343             : 
     344             : /*
     345             :  * build_mss
     346             :  *  build MultiSortSupport for the attributes passed in attrs
     347             :  */
     348             : static MultiSortSupport
     349          76 : build_mss(VacAttrStats **stats, int numattrs)
     350             : {
     351             :     int         i;
     352             : 
     353             :     /* Sort by multiple columns (using array of SortSupport) */
     354          76 :     MultiSortSupport mss = multi_sort_init(numattrs);
     355             : 
     356             :     /* prepare the sort functions for all the attributes */
     357         280 :     for (i = 0; i < numattrs; i++)
     358             :     {
     359         204 :         VacAttrStats *colstat = stats[i];
     360             :         TypeCacheEntry *type;
     361             : 
     362         204 :         type = lookup_type_cache(colstat->attrtypid, TYPECACHE_LT_OPR);
     363         204 :         if (type->lt_opr == InvalidOid) /* shouldn't happen */
     364           0 :             elog(ERROR, "cache lookup failed for ordering operator for type %u",
     365             :                  colstat->attrtypid);
     366             : 
     367         204 :         multi_sort_add_dimension(mss, i, type->lt_opr, colstat->attrcollid);
     368             :     }
     369             : 
     370          76 :     return mss;
     371             : }
     372             : 
     373             : /*
     374             :  * count_distinct_groups
     375             :  *  count distinct combinations of SortItems in the array
     376             :  *
     377             :  * The array is assumed to be sorted according to the MultiSortSupport.
     378             :  */
     379             : static int
     380          76 : count_distinct_groups(int numrows, SortItem *items, MultiSortSupport mss)
     381             : {
     382             :     int         i;
     383             :     int         ndistinct;
     384             : 
     385          76 :     ndistinct = 1;
     386      292204 :     for (i = 1; i < numrows; i++)
     387             :     {
     388             :         /* make sure the array really is sorted */
     389             :         Assert(multi_sort_compare(&items[i], &items[i - 1], mss) >= 0);
     390             : 
     391      292128 :         if (multi_sort_compare(&items[i], &items[i - 1], mss) != 0)
     392       45898 :             ndistinct += 1;
     393             :     }
     394             : 
     395          76 :     return ndistinct;
     396             : }
     397             : 
     398             : /*
     399             :  * compare_sort_item_count
     400             :  *  comparator for sorting items by count (frequencies) in descending order
     401             :  */
     402             : static int
     403       46898 : compare_sort_item_count(const void *a, const void *b)
     404             : {
     405       46898 :     SortItem   *ia = (SortItem *) a;
     406       46898 :     SortItem   *ib = (SortItem *) b;
     407             : 
     408       46898 :     if (ia->count == ib->count)
     409       46164 :         return 0;
     410         734 :     else if (ia->count > ib->count)
     411         392 :         return -1;
     412             : 
     413         342 :     return 1;
     414             : }
     415             : 
     416             : /*
     417             :  * build_distinct_groups
     418             :  *  build an array of SortItems for distinct groups and counts matching items
     419             :  *
     420             :  * The input array is assumed to be sorted
     421             :  */
     422             : static SortItem *
     423          76 : build_distinct_groups(int numrows, SortItem *items, MultiSortSupport mss,
     424             :                       int *ndistinct)
     425             : {
     426             :     int         i,
     427             :                 j;
     428          76 :     int         ngroups = count_distinct_groups(numrows, items, mss);
     429             : 
     430          76 :     SortItem   *groups = (SortItem *) palloc(ngroups * sizeof(SortItem));
     431             : 
     432          76 :     j = 0;
     433          76 :     groups[0] = items[0];
     434          76 :     groups[0].count = 1;
     435             : 
     436      292204 :     for (i = 1; i < numrows; i++)
     437             :     {
     438             :         /* Assume sorted in ascending order. */
     439             :         Assert(multi_sort_compare(&items[i], &items[i - 1], mss) >= 0);
     440             : 
     441             :         /* New distinct group detected. */
     442      292128 :         if (multi_sort_compare(&items[i], &items[i - 1], mss) != 0)
     443             :         {
     444       45898 :             groups[++j] = items[i];
     445       45898 :             groups[j].count = 0;
     446             :         }
     447             : 
     448      292128 :         groups[j].count++;
     449             :     }
     450             : 
     451             :     /* ensure we filled the expected number of distinct groups */
     452             :     Assert(j + 1 == ngroups);
     453             : 
     454             :     /* Sort the distinct groups by frequency (in descending order). */
     455          76 :     pg_qsort((void *) groups, ngroups, sizeof(SortItem),
     456             :              compare_sort_item_count);
     457             : 
     458          76 :     *ndistinct = ngroups;
     459          76 :     return groups;
     460             : }
     461             : 
     462             : /* compare sort items (single dimension) */
     463             : static int
     464      620864 : sort_item_compare(const void *a, const void *b, void *arg)
     465             : {
     466      620864 :     SortSupport ssup = (SortSupport) arg;
     467      620864 :     SortItem   *ia = (SortItem *) a;
     468      620864 :     SortItem   *ib = (SortItem *) b;
     469             : 
     470     1241728 :     return ApplySortComparator(ia->values[0], ia->isnull[0],
     471     1241728 :                                ib->values[0], ib->isnull[0],
     472             :                                ssup);
     473             : }
     474             : 
     475             : /*
     476             :  * build_column_frequencies
     477             :  *  compute frequencies of values in each column
     478             :  *
     479             :  * This returns an array of SortItems for each attribute the MCV is built
     480             :  * on, with a frequency (number of occurrences) for each value. This is
     481             :  * then used to compute "base" frequency of MCV items.
     482             :  *
     483             :  * All the memory is allocated in a single chunk, so that a single pfree
     484             :  * is enough to release it. We do not allocate space for values/isnull
     485             :  * arrays in the SortItems, because we can simply point into the input
     486             :  * groups directly.
     487             :  */
     488             : static SortItem **
     489          74 : build_column_frequencies(SortItem *groups, int ngroups,
     490             :                          MultiSortSupport mss, int *ncounts)
     491             : {
     492             :     int         i,
     493             :                 dim;
     494             :     SortItem  **result;
     495             :     char       *ptr;
     496             : 
     497             :     Assert(groups);
     498             :     Assert(ncounts);
     499             : 
     500             :     /* allocate arrays for all columns as a single chunk */
     501         148 :     ptr = palloc(MAXALIGN(sizeof(SortItem *) * mss->ndims) +
     502          74 :           mss->ndims * MAXALIGN(sizeof(SortItem) * ngroups));
     503             : 
     504             :     /* initial array of pointers */
     505          74 :     result = (SortItem **) ptr;
     506          74 :     ptr += MAXALIGN(sizeof(SortItem *) * mss->ndims);
     507             : 
     508         272 :     for (dim = 0; dim < mss->ndims; dim++)
     509             :     {
     510         198 :         SortSupport ssup = &mss->ssup[dim];
     511             : 
     512             :         /* array of values for a single column */
     513         198 :         result[dim] = (SortItem *) ptr;
     514         198 :         ptr += MAXALIGN(sizeof(SortItem) * ngroups);
     515             : 
     516             :         /* extract data for the dimension */
     517      133752 :         for (i = 0; i < ngroups; i++)
     518             :         {
     519             :             /* point into the input groups */
     520      133554 :             result[dim][i].values = &groups[i].values[dim];
     521      133554 :             result[dim][i].isnull = &groups[i].isnull[dim];
     522      133554 :             result[dim][i].count = groups[i].count;
     523             :         }
     524             : 
     525             :         /* sort the values, deduplicate */
     526         198 :         qsort_arg((void *) result[dim], ngroups, sizeof(SortItem),
     527             :                   sort_item_compare, ssup);
     528             : 
     529             :         /*
     530             :          * Identify distinct values, compute frequency (there might be
     531             :          * multiple MCV items containing this value, so we need to sum
     532             :          * counts from all of them.
     533             :          */
     534         198 :         ncounts[dim] = 1;
     535      133554 :         for (i = 1; i < ngroups; i++)
     536             :         {
     537      133356 :             if (sort_item_compare(&result[dim][i-1], &result[dim][i], ssup) == 0)
     538             :             {
     539      124484 :                 result[dim][ncounts[dim]-1].count += result[dim][i].count;
     540      124484 :                 continue;
     541             :             }
     542             : 
     543        8872 :             result[dim][ncounts[dim]] = result[dim][i];
     544             : 
     545        8872 :             ncounts[dim]++;
     546             :         }
     547             :     }
     548             : 
     549          74 :     return result;
     550             : }
     551             : 
     552             : /*
     553             :  * statext_mcv_load
     554             :  *      Load the MCV list for the indicated pg_statistic_ext tuple
     555             :  */
     556             : MCVList *
     557          64 : statext_mcv_load(Oid mvoid)
     558             : {
     559             :     MCVList    *result;
     560             :     bool        isnull;
     561             :     Datum       mcvlist;
     562          64 :     HeapTuple   htup = SearchSysCache1(STATEXTDATASTXOID, ObjectIdGetDatum(mvoid));
     563             : 
     564          64 :     if (!HeapTupleIsValid(htup))
     565           0 :         elog(ERROR, "cache lookup failed for statistics object %u", mvoid);
     566             : 
     567          64 :     mcvlist = SysCacheGetAttr(STATEXTDATASTXOID, htup,
     568             :                               Anum_pg_statistic_ext_data_stxdmcv, &isnull);
     569             : 
     570          64 :     if (isnull)
     571           0 :         elog(ERROR,
     572             :              "requested statistic kind \"%c\" is not yet built for statistics object %u",
     573             :              STATS_EXT_DEPENDENCIES, mvoid);
     574             : 
     575          64 :     result = statext_mcv_deserialize(DatumGetByteaP(mcvlist));
     576             : 
     577          64 :     ReleaseSysCache(htup);
     578             : 
     579          64 :     return result;
     580             : }
     581             : 
     582             : 
     583             : /*
     584             :  * statext_mcv_serialize
     585             :  *      Serialize MCV list into a pg_mcv_list value.
     586             :  *
     587             :  * The MCV items may include values of various data types, and it's reasonable
     588             :  * to expect redundancy (values for a given attribute, repeated for multiple
     589             :  * MCV list items). So we deduplicate the values into arrays, and then replace
     590             :  * the values by indexes into those arrays.
     591             :  *
     592             :  * The overall structure of the serialized representation looks like this:
     593             :  *
     594             :  * +---------------+----------------+---------------------+-------+
     595             :  * | header fields | dimension info | deduplicated values | items |
     596             :  * +---------------+----------------+---------------------+-------+
     597             :  *
     598             :  * Where dimension info stores information about type of K-th attribute (e.g.
     599             :  * typlen, typbyval and length of deduplicated values).  Deduplicated values
     600             :  * store deduplicated values for each attribute.  And items store the actual
     601             :  * MCV list items, with values replaced by indexes into the arrays.
     602             :  *
     603             :  * When serializing the items, we use uint16 indexes. The number of MCV items
     604             :  * is limited by the statistics target (which is capped to 10k at the moment).
     605             :  * We might increase this to 65k and still fit into uint16, so there's a bit of
     606             :  * slack. Furthermore, this limit is on the number of distinct values per column,
     607             :  * and we usually have few of those (and various combinations of them for the
     608             :  * those MCV list). So uint16 seems fine for now.
     609             :  *
     610             :  * We don't really expect the serialization to save as much space as for
     611             :  * histograms, as we are not doing any bucket splits (which is the source
     612             :  * of high redundancy in histograms).
     613             :  *
     614             :  * TODO: Consider packing boolean flags (NULL) for each item into a single char
     615             :  * (or a longer type) instead of using an array of bool items.
     616             :  */
     617             : bytea *
     618          74 : statext_mcv_serialize(MCVList *mcvlist, VacAttrStats **stats)
     619             : {
     620             :     int         i;
     621             :     int         dim;
     622          74 :     int         ndims = mcvlist->ndimensions;
     623             : 
     624             :     SortSupport ssup;
     625             :     DimensionInfo *info;
     626             : 
     627             :     Size        total_length;
     628             : 
     629             :     /* serialized items (indexes into arrays, etc.) */
     630             :     bytea      *raw;
     631             :     char       *ptr;
     632             :     char       *endptr PG_USED_FOR_ASSERTS_ONLY;
     633             : 
     634             :     /* values per dimension (and number of non-NULL values) */
     635          74 :     Datum     **values = (Datum **) palloc0(sizeof(Datum *) * ndims);
     636          74 :     int        *counts = (int *) palloc0(sizeof(int) * ndims);
     637             : 
     638             :     /*
     639             :      * We'll include some rudimentary information about the attribute types
     640             :      * (length, by-val flag), so that we don't have to look them up while
     641             :      * deserializating the MCV list (we already have the type OID in the
     642             :      * header).  This is safe, because when changing type of the attribute the
     643             :      * statistics gets dropped automatically.  We need to store the info about
     644             :      * the arrays of deduplicated values anyway.
     645             :      */
     646          74 :     info = (DimensionInfo *) palloc0(sizeof(DimensionInfo) * ndims);
     647             : 
     648             :     /* sort support data for all attributes included in the MCV list */
     649          74 :     ssup = (SortSupport) palloc0(sizeof(SortSupportData) * ndims);
     650             : 
     651             :     /* collect and deduplicate values for each dimension (attribute) */
     652         272 :     for (dim = 0; dim < ndims; dim++)
     653             :     {
     654             :         int         ndistinct;
     655             :         TypeCacheEntry *typentry;
     656             : 
     657             :         /*
     658             :          * Lookup the LT operator (can't get it from stats extra_data, as we
     659             :          * don't know how to interpret that - scalar vs. array etc.).
     660             :          */
     661         198 :         typentry = lookup_type_cache(stats[dim]->attrtypid, TYPECACHE_LT_OPR);
     662             : 
     663             :         /* copy important info about the data type (length, by-value) */
     664         198 :         info[dim].typlen = stats[dim]->attrtype->typlen;
     665         198 :         info[dim].typbyval = stats[dim]->attrtype->typbyval;
     666             : 
     667             :         /* allocate space for values in the attribute and collect them */
     668         198 :         values[dim] = (Datum *) palloc0(sizeof(Datum) * mcvlist->nitems);
     669             : 
     670        8684 :         for (i = 0; i < mcvlist->nitems; i++)
     671             :         {
     672             :             /* skip NULL values - we don't need to deduplicate those */
     673        8486 :             if (mcvlist->items[i].isnull[dim])
     674          52 :                 continue;
     675             : 
     676             :             /* append the value at the end */
     677        8434 :             values[dim][counts[dim]] = mcvlist->items[i].values[dim];
     678        8434 :             counts[dim] += 1;
     679             :         }
     680             : 
     681             :         /* if there are just NULL values in this dimension, we're done */
     682         198 :         if (counts[dim] == 0)
     683           4 :             continue;
     684             : 
     685             :         /* sort and deduplicate the data */
     686         194 :         ssup[dim].ssup_cxt = CurrentMemoryContext;
     687         194 :         ssup[dim].ssup_collation = stats[dim]->attrcollid;
     688         194 :         ssup[dim].ssup_nulls_first = false;
     689             : 
     690         194 :         PrepareSortSupportFromOrderingOp(typentry->lt_opr, &ssup[dim]);
     691             : 
     692         194 :         qsort_arg(values[dim], counts[dim], sizeof(Datum),
     693         194 :                   compare_scalars_simple, &ssup[dim]);
     694             : 
     695             :         /*
     696             :          * Walk through the array and eliminate duplicate values, but keep the
     697             :          * ordering (so that we can do bsearch later). We know there's at
     698             :          * least one item as (counts[dim] != 0), so we can skip the first
     699             :          * element.
     700             :          */
     701         194 :         ndistinct = 1;          /* number of distinct values */
     702        8434 :         for (i = 1; i < counts[dim]; i++)
     703             :         {
     704             :             /* expect sorted array */
     705             :             Assert(compare_datums_simple(values[dim][i - 1], values[dim][i], &ssup[dim]) <= 0);
     706             : 
     707             :             /* if the value is the same as the previous one, we can skip it */
     708        8240 :             if (!compare_datums_simple(values[dim][i - 1], values[dim][i], &ssup[dim]))
     709        3700 :                 continue;
     710             : 
     711        4540 :             values[dim][ndistinct] = values[dim][i];
     712        4540 :             ndistinct += 1;
     713             :         }
     714             : 
     715             :         /* we must not exceed PG_UINT16_MAX, as we use uint16 indexes */
     716             :         Assert(ndistinct <= PG_UINT16_MAX);
     717             : 
     718             :         /*
     719             :          * Store additional info about the attribute - number of deduplicated
     720             :          * values, and also size of the serialized data. For fixed-length data
     721             :          * types this is trivial to compute, for varwidth types we need to
     722             :          * actually walk the array and sum the sizes.
     723             :          */
     724         194 :         info[dim].nvalues = ndistinct;
     725             : 
     726         194 :         if (info[dim].typbyval) /* by-value data types */
     727             :         {
     728         112 :             info[dim].nbytes = info[dim].nvalues * info[dim].typlen;
     729             : 
     730             :             /*
     731             :              * We copy the data into the MCV item during deserialization, so
     732             :              * we don't need to allocate any extra space.
     733             :             */
     734         112 :             info[dim].nbytes_aligned = 0;
     735             :         }
     736          82 :         else if (info[dim].typlen > 0)       /* fixed-length by-ref */
     737             :         {
     738             :             /*
     739             :              * We don't care about alignment in the serialized data, so we
     740             :              * pack the data as much as possible. But we also track how much
     741             :              * data will be needed after deserialization, and in that case
     742             :              * we need to account for alignment of each item.
     743             :              *
     744             :              * Note: As the items are fixed-length, we could easily compute
     745             :              * this during deserialization, but we do it here anyway.
     746             :              */
     747           0 :             info[dim].nbytes = info[dim].nvalues * info[dim].typlen;
     748           0 :             info[dim].nbytes_aligned = info[dim].nvalues * MAXALIGN(info[dim].typlen);
     749             :         }
     750          82 :         else if (info[dim].typlen == -1)    /* varlena */
     751             :         {
     752          82 :             info[dim].nbytes = 0;
     753          82 :             info[dim].nbytes_aligned = 0;
     754        2310 :             for (i = 0; i < info[dim].nvalues; i++)
     755             :             {
     756             :                 Size        len;
     757             : 
     758             :                 /*
     759             :                  * For varlena values, we detoast the values and store the
     760             :                  * length and data separately. We don't bother with alignment
     761             :                  * here, which means that during deserialization we need to
     762             :                  * copy the fields and only access the copies.
     763             :                  */
     764        2228 :                 values[dim][i] = PointerGetDatum(PG_DETOAST_DATUM(values[dim][i]));
     765             : 
     766             :                 /* serialized length (uint32 length + data) */
     767        2228 :                 len = VARSIZE_ANY_EXHDR(values[dim][i]);
     768        2228 :                 info[dim].nbytes += sizeof(uint32); /* length */
     769        2228 :                 info[dim].nbytes += len;            /* value (no header) */
     770             : 
     771             :                 /*
     772             :                  * During deserialization we'll build regular varlena values
     773             :                  * with full headers, and we need to align them properly.
     774             :                  */
     775        2228 :                 info[dim].nbytes_aligned += MAXALIGN(VARHDRSZ + len);
     776             :             }
     777             :         }
     778           0 :         else if (info[dim].typlen == -2)    /* cstring */
     779             :         {
     780           0 :             info[dim].nbytes = 0;
     781           0 :             info[dim].nbytes_aligned = 0;
     782           0 :             for (i = 0; i < info[dim].nvalues; i++)
     783             :             {
     784             :                 Size        len;
     785             : 
     786             :                 /*
     787             :                  * For cstring, we do similar thing as for varlena - first we
     788             :                  * store the length as uint32 and then the data. We don't care
     789             :                  * about alignment, which means that during deserialization we
     790             :                  * need to copy the fields and only access the copies.
     791             :                  */
     792             : 
     793             :                 /* c-strings include terminator, so +1 byte */
     794           0 :                 len = strlen(DatumGetCString(values[dim][i])) + 1;
     795           0 :                 info[dim].nbytes += sizeof(uint32); /* length */
     796           0 :                 info[dim].nbytes += len;            /* value */
     797             : 
     798             :                 /* space needed for properly aligned deserialized copies */
     799           0 :                 info[dim].nbytes_aligned += MAXALIGN(len);
     800             :             }
     801             :         }
     802             : 
     803             :         /* we know (count>0) so there must be some data */
     804             :         Assert(info[dim].nbytes > 0);
     805             :     }
     806             : 
     807             :     /*
     808             :      * Now we can finally compute how much space we'll actually need for the
     809             :      * whole serialized MCV list (varlena header, MCV header, dimension info
     810             :      * for each attribute, deduplicated values and items).
     811             :      */
     812          74 :     total_length = (3 * sizeof(uint32))         /* magic + type + nitems */
     813             :                     + sizeof(AttrNumber)        /* ndimensions */
     814          74 :                     + (ndims * sizeof(Oid));    /* attribute types */
     815             : 
     816             :     /* dimension info */
     817          74 :     total_length += ndims * sizeof(DimensionInfo);
     818             : 
     819             :     /* add space for the arrays of deduplicated values */
     820         272 :     for (i = 0; i < ndims; i++)
     821         198 :         total_length += info[i].nbytes;
     822             : 
     823             :     /*
     824             :      * And finally account for the items (those are fixed-length, thanks to
     825             :      * replacing values with uint16 indexes into the deduplicated arrays).
     826             :      */
     827          74 :     total_length += mcvlist->nitems * ITEM_SIZE(dim);
     828             : 
     829             :     /*
     830             :      * Allocate space for the whole serialized MCV list (we'll skip bytes, so
     831             :      * we set them to zero to make the result more compressible).
     832             :      */
     833          74 :     raw = (bytea *) palloc0(VARHDRSZ + total_length);
     834          74 :     SET_VARSIZE(raw, VARHDRSZ + total_length);
     835             : 
     836          74 :     ptr = VARDATA(raw);
     837          74 :     endptr = ptr + total_length;
     838             : 
     839             :     /* copy the MCV list header fields, one by one */
     840          74 :     memcpy(ptr, &mcvlist->magic, sizeof(uint32));
     841          74 :     ptr += sizeof(uint32);
     842             : 
     843          74 :     memcpy(ptr, &mcvlist->type, sizeof(uint32));
     844          74 :     ptr += sizeof(uint32);
     845             : 
     846          74 :     memcpy(ptr, &mcvlist->nitems, sizeof(uint32));
     847          74 :     ptr += sizeof(uint32);
     848             : 
     849          74 :     memcpy(ptr, &mcvlist->ndimensions, sizeof(AttrNumber));
     850          74 :     ptr += sizeof(AttrNumber);
     851             : 
     852          74 :     memcpy(ptr, mcvlist->types, sizeof(Oid) * ndims);
     853          74 :     ptr += (sizeof(Oid) * ndims);
     854             : 
     855             :     /* store information about the attributes (data amounts, ...) */
     856          74 :     memcpy(ptr, info, sizeof(DimensionInfo) * ndims);
     857          74 :     ptr += sizeof(DimensionInfo) * ndims;
     858             : 
     859             :     /* Copy the deduplicated values for all attributes to the output. */
     860         272 :     for (dim = 0; dim < ndims; dim++)
     861             :     {
     862             :         /* remember the starting point for Asserts later */
     863         198 :         char       *start PG_USED_FOR_ASSERTS_ONLY = ptr;
     864             : 
     865        4932 :         for (i = 0; i < info[dim].nvalues; i++)
     866             :         {
     867        4734 :             Datum       value = values[dim][i];
     868             : 
     869        4734 :             if (info[dim].typbyval) /* passed by value */
     870             :             {
     871             :                 Datum       tmp;
     872             : 
     873             :                 /*
     874             :                  * For values passed by value, we need to copy just the
     875             :                  * significant bytes - we can't use memcpy directly, as that
     876             :                  * assumes little endian behavior.  store_att_byval does
     877             :                  * almost what we need, but it requires properly aligned
     878             :                  * buffer - the output buffer does not guarantee that. So we
     879             :                  * simply use a local Datum variable (which guarantees proper
     880             :                  * alignment), and then copy the value from it.
     881             :                  */
     882        2506 :                 store_att_byval(&tmp, value, info[dim].typlen);
     883             : 
     884        2506 :                 memcpy(ptr, &tmp, info[dim].typlen);
     885        2506 :                 ptr += info[dim].typlen;
     886             :             }
     887        2228 :             else if (info[dim].typlen > 0)   /* passed by reference */
     888             :             {
     889             :                 /* no special alignment needed, treated as char array */
     890           0 :                 memcpy(ptr, DatumGetPointer(value), info[dim].typlen);
     891           0 :                 ptr += info[dim].typlen;
     892             :             }
     893        2228 :             else if (info[dim].typlen == -1)    /* varlena */
     894             :             {
     895        2228 :                 uint32      len = VARSIZE_ANY_EXHDR(DatumGetPointer(value));
     896             : 
     897             :                 /* copy the length */
     898        2228 :                 memcpy(ptr, &len, sizeof(uint32));
     899        2228 :                 ptr += sizeof(uint32);
     900             : 
     901             :                 /* data from the varlena value (without the header) */
     902        2228 :                 memcpy(ptr, VARDATA_ANY(DatumGetPointer(value)), len);
     903        2228 :                 ptr += len;
     904             :             }
     905           0 :             else if (info[dim].typlen == -2)    /* cstring */
     906             :             {
     907           0 :                 uint32      len = (uint32) strlen(DatumGetCString(value)) + 1;
     908             : 
     909             :                 /* copy the length */
     910           0 :                 memcpy(ptr, &len, sizeof(uint32));
     911           0 :                 ptr += sizeof(uint32);
     912             : 
     913             :                 /* value */
     914           0 :                 memcpy(ptr, DatumGetCString(value), len);
     915           0 :                 ptr += len;
     916             :             }
     917             : 
     918             :             /* no underflows or overflows */
     919             :             Assert((ptr > start) && ((ptr - start) <= info[dim].nbytes));
     920             :         }
     921             : 
     922             :         /* we should get exactly nbytes of data for this dimension */
     923             :         Assert((ptr - start) == info[dim].nbytes);
     924             :     }
     925             : 
     926             :     /* Serialize the items, with uint16 indexes instead of the values. */
     927        3058 :     for (i = 0; i < mcvlist->nitems; i++)
     928             :     {
     929        2984 :         MCVItem    *mcvitem = &mcvlist->items[i];
     930             : 
     931             :         /* don't write beyond the allocated space */
     932             :         Assert(ptr <= (endptr - ITEM_SIZE(dim)));
     933             : 
     934             :         /* copy NULL and frequency flags into the serialized MCV */
     935        2984 :         memcpy(ptr, mcvitem->isnull, sizeof(bool) * ndims);
     936        2984 :         ptr += sizeof(bool) * ndims;
     937             : 
     938        2984 :         memcpy(ptr, &mcvitem->frequency, sizeof(double));
     939        2984 :         ptr += sizeof(double);
     940             : 
     941        2984 :         memcpy(ptr, &mcvitem->base_frequency, sizeof(double));
     942        2984 :         ptr += sizeof(double);
     943             : 
     944             :         /* store the indexes last */
     945       11470 :         for (dim = 0; dim < ndims; dim++)
     946             :         {
     947        8486 :             uint16      index = 0;
     948             :             Datum      *value;
     949             : 
     950             :             /* do the lookup only for non-NULL values */
     951        8486 :             if (!mcvitem->isnull[dim])
     952             :             {
     953        8434 :                 value = (Datum *) bsearch_arg(&mcvitem->values[dim], values[dim],
     954        8434 :                                               info[dim].nvalues, sizeof(Datum),
     955        8434 :                                               compare_scalars_simple, &ssup[dim]);
     956             : 
     957             :                 Assert(value != NULL);  /* serialization or deduplication error */
     958             : 
     959             :                 /* compute index within the deduplicated array */
     960        8434 :                 index = (uint16) (value - values[dim]);
     961             : 
     962             :                 /* check the index is within expected bounds */
     963             :                 Assert(index < info[dim].nvalues);
     964             :             }
     965             : 
     966             :             /* copy the index into the serialized MCV */
     967        8486 :             memcpy(ptr, &index, sizeof(uint16));
     968        8486 :             ptr += sizeof(uint16);
     969             :         }
     970             : 
     971             :         /* make sure we don't overflow the allocated value */
     972             :         Assert(ptr <= endptr);
     973             :     }
     974             : 
     975             :     /* at this point we expect to match the total_length exactly */
     976             :     Assert(ptr == endptr);
     977             : 
     978          74 :     pfree(values);
     979          74 :     pfree(counts);
     980             : 
     981          74 :     return raw;
     982             : }
     983             : 
     984             : /*
     985             :  * statext_mcv_deserialize
     986             :  *      Reads serialized MCV list into MCVList structure.
     987             :  *
     988             :  * All the memory needed by the MCV list is allocated as a single chunk, so
     989             :  * it's possible to simply pfree() it at once.
     990             :  */
     991             : MCVList *
     992          68 : statext_mcv_deserialize(bytea *data)
     993             : {
     994             :     int         dim,
     995             :                 i;
     996             :     Size        expected_size;
     997             :     MCVList    *mcvlist;
     998             :     char       *raw;
     999             :     char       *ptr;
    1000             :     char       *endptr PG_USED_FOR_ASSERTS_ONLY;
    1001             : 
    1002             :     int         ndims,
    1003             :                 nitems;
    1004          68 :     DimensionInfo *info = NULL;
    1005             : 
    1006             :     /* local allocation buffer (used only for deserialization) */
    1007          68 :     Datum     **map = NULL;
    1008             : 
    1009             :     /* MCV list */
    1010             :     Size        mcvlen;
    1011             : 
    1012             :     /* buffer used for the result */
    1013             :     Size        datalen;
    1014             :     char       *dataptr;
    1015             :     char       *valuesptr;
    1016             :     char       *isnullptr;
    1017             : 
    1018          68 :     if (data == NULL)
    1019           0 :         return NULL;
    1020             : 
    1021             :     /*
    1022             :      * We can't possibly deserialize a MCV list if there's not even a complete
    1023             :      * header. We need an explicit formula here, because we serialize the
    1024             :      * header fields one by one, so we need to ignore struct alignment.
    1025             :      */
    1026          68 :     if (VARSIZE_ANY(data) < MinSizeOfMCVList)
    1027           0 :         elog(ERROR, "invalid MCV size %zd (expected at least %zu)",
    1028             :              VARSIZE_ANY(data), MinSizeOfMCVList);
    1029             : 
    1030             :     /* read the MCV list header */
    1031          68 :     mcvlist = (MCVList *) palloc0(offsetof(MCVList, items));
    1032             : 
    1033             :     /* pointer to the data part (skip the varlena header) */
    1034          68 :     raw = (char *) data;
    1035          68 :     ptr = VARDATA_ANY(raw);
    1036          68 :     endptr = (char *) raw + VARSIZE_ANY(data);
    1037             : 
    1038             :     /* get the header and perform further sanity checks */
    1039          68 :     memcpy(&mcvlist->magic, ptr, sizeof(uint32));
    1040          68 :     ptr += sizeof(uint32);
    1041             : 
    1042          68 :     memcpy(&mcvlist->type, ptr, sizeof(uint32));
    1043          68 :     ptr += sizeof(uint32);
    1044             : 
    1045          68 :     memcpy(&mcvlist->nitems, ptr, sizeof(uint32));
    1046          68 :     ptr += sizeof(uint32);
    1047             : 
    1048          68 :     memcpy(&mcvlist->ndimensions, ptr, sizeof(AttrNumber));
    1049          68 :     ptr += sizeof(AttrNumber);
    1050             : 
    1051          68 :     if (mcvlist->magic != STATS_MCV_MAGIC)
    1052           0 :         elog(ERROR, "invalid MCV magic %u (expected %u)",
    1053             :              mcvlist->magic, STATS_MCV_MAGIC);
    1054             : 
    1055          68 :     if (mcvlist->type != STATS_MCV_TYPE_BASIC)
    1056           0 :         elog(ERROR, "invalid MCV type %u (expected %u)",
    1057             :              mcvlist->type, STATS_MCV_TYPE_BASIC);
    1058             : 
    1059          68 :     if (mcvlist->ndimensions == 0)
    1060           0 :         elog(ERROR, "invalid zero-length dimension array in MCVList");
    1061         136 :     else if ((mcvlist->ndimensions > STATS_MAX_DIMENSIONS) ||
    1062          68 :              (mcvlist->ndimensions < 0))
    1063           0 :         elog(ERROR, "invalid length (%d) dimension array in MCVList",
    1064             :              mcvlist->ndimensions);
    1065             : 
    1066          68 :     if (mcvlist->nitems == 0)
    1067           0 :         elog(ERROR, "invalid zero-length item array in MCVList");
    1068          68 :     else if (mcvlist->nitems > STATS_MCVLIST_MAX_ITEMS)
    1069           0 :         elog(ERROR, "invalid length (%u) item array in MCVList",
    1070             :              mcvlist->nitems);
    1071             : 
    1072          68 :     nitems = mcvlist->nitems;
    1073          68 :     ndims = mcvlist->ndimensions;
    1074             : 
    1075             :     /*
    1076             :      * Check amount of data including DimensionInfo for all dimensions and
    1077             :      * also the serialized items (including uint16 indexes). Also, walk
    1078             :      * through the dimension information and add it to the sum.
    1079             :      */
    1080          68 :     expected_size = SizeOfMCVList(ndims, nitems);
    1081             : 
    1082             :     /*
    1083             :      * Check that we have at least the dimension and info records, along with
    1084             :      * the items. We don't know the size of the serialized values yet. We need
    1085             :      * to do this check first, before accessing the dimension info.
    1086             :      */
    1087          68 :     if (VARSIZE_ANY(data) < expected_size)
    1088           0 :         elog(ERROR, "invalid MCV size %zd (expected %zu)",
    1089             :              VARSIZE_ANY(data), expected_size);
    1090             : 
    1091             :     /* Now copy the array of type Oids. */
    1092          68 :     memcpy(mcvlist->types, ptr, sizeof(Oid) * ndims);
    1093          68 :     ptr += (sizeof(Oid) * ndims);
    1094             : 
    1095             :     /* Now it's safe to access the dimension info. */
    1096          68 :     info = palloc(ndims * sizeof(DimensionInfo));
    1097             : 
    1098          68 :     memcpy(info, ptr, ndims * sizeof(DimensionInfo));
    1099          68 :     ptr += (ndims * sizeof(DimensionInfo));
    1100             : 
    1101             :     /* account for the value arrays */
    1102         268 :     for (dim = 0; dim < ndims; dim++)
    1103             :     {
    1104             :         /*
    1105             :          * XXX I wonder if we can/should rely on asserts here. Maybe those
    1106             :          * checks should be done every time?
    1107             :          */
    1108             :         Assert(info[dim].nvalues >= 0);
    1109             :         Assert(info[dim].nbytes >= 0);
    1110             : 
    1111         200 :         expected_size += info[dim].nbytes;
    1112             :     }
    1113             : 
    1114             :     /*
    1115             :      * Now we know the total expected MCV size, including all the pieces
    1116             :      * (header, dimension info. items and deduplicated data). So do the final
    1117             :      * check on size.
    1118             :      */
    1119          68 :     if (VARSIZE_ANY(data) != expected_size)
    1120           0 :         elog(ERROR, "invalid MCV size %zd (expected %zu)",
    1121             :              VARSIZE_ANY(data), expected_size);
    1122             : 
    1123             :     /*
    1124             :      * We need an array of Datum values for each dimension, so that we can
    1125             :      * easily translate the uint16 indexes later. We also need a top-level
    1126             :      * array of pointers to those per-dimension arrays.
    1127             :      *
    1128             :      * While allocating the arrays for dimensions, compute how much space we
    1129             :      * need for a copy of the by-ref data, as we can't simply point to the
    1130             :      * original values (it might go away).
    1131             :      */
    1132          68 :     datalen = 0;                /* space for by-ref data */
    1133          68 :     map = (Datum **) palloc(ndims * sizeof(Datum *));
    1134             : 
    1135         268 :     for (dim = 0; dim < ndims; dim++)
    1136             :     {
    1137         200 :         map[dim] = (Datum *) palloc(sizeof(Datum) * info[dim].nvalues);
    1138             : 
    1139             :         /* space needed for a copy of data for by-ref types */
    1140         200 :         datalen += info[dim].nbytes_aligned;
    1141             :     }
    1142             : 
    1143             :     /*
    1144             :      * Now resize the MCV list so that the allocation includes all the data.
    1145             :      *
    1146             :      * Allocate space for a copy of the data, as we can't simply reference the
    1147             :      * serialized data - it's not aligned properly, and it may disappear while
    1148             :      * we're still using the MCV list, e.g. due to catcache release.
    1149             :      *
    1150             :      * We do care about alignment here, because we will allocate all the pieces
    1151             :      * at once, but then use pointers to different parts.
    1152             :      */
    1153          68 :     mcvlen = MAXALIGN(offsetof(MCVList, items) + (sizeof(MCVItem) * nitems));
    1154             : 
    1155             :     /* arrays of values and isnull flags for all MCV items */
    1156          68 :     mcvlen += nitems * MAXALIGN(sizeof(Datum) * ndims);
    1157          68 :     mcvlen += nitems * MAXALIGN(sizeof(bool) * ndims);
    1158             : 
    1159             :     /* we don't quite need to align this, but it makes some asserts easier */
    1160          68 :     mcvlen += MAXALIGN(datalen);
    1161             : 
    1162             :     /* now resize the deserialized MCV list, and compute pointers to parts */
    1163          68 :     mcvlist = repalloc(mcvlist, mcvlen);
    1164             : 
    1165             :     /* pointer to the beginning of values/isnull arrays */
    1166          68 :     valuesptr = (char *) mcvlist
    1167          68 :         + MAXALIGN(offsetof(MCVList, items) + (sizeof(MCVItem) * nitems));
    1168             : 
    1169          68 :     isnullptr = valuesptr + (nitems * MAXALIGN(sizeof(Datum) * ndims));
    1170             : 
    1171          68 :     dataptr = isnullptr + (nitems * MAXALIGN(sizeof(bool) * ndims));
    1172             : 
    1173             :     /*
    1174             :      * Build mapping (index => value) for translating the serialized data into
    1175             :      * the in-memory representation.
    1176             :      */
    1177         268 :     for (dim = 0; dim < ndims; dim++)
    1178             :     {
    1179             :         /* remember start position in the input array */
    1180         200 :         char       *start PG_USED_FOR_ASSERTS_ONLY = ptr;
    1181             : 
    1182         200 :         if (info[dim].typbyval)
    1183             :         {
    1184             :             /* for by-val types we simply copy data into the mapping */
    1185        4772 :             for (i = 0; i < info[dim].nvalues; i++)
    1186             :             {
    1187        4644 :                 Datum       v = 0;
    1188             : 
    1189        4644 :                 memcpy(&v, ptr, info[dim].typlen);
    1190        4644 :                 ptr += info[dim].typlen;
    1191             : 
    1192        4644 :                 map[dim][i] = fetch_att(&v, true, info[dim].typlen);
    1193             : 
    1194             :                 /* no under/overflow of input array */
    1195             :                 Assert(ptr <= (start + info[dim].nbytes));
    1196             :             }
    1197             :         }
    1198             :         else
    1199             :         {
    1200             :             /* for by-ref types we need to also make a copy of the data */
    1201             : 
    1202             :             /* passed by reference, but fixed length (name, tid, ...) */
    1203          72 :             if (info[dim].typlen > 0)
    1204             :             {
    1205           0 :                 for (i = 0; i < info[dim].nvalues; i++)
    1206             :                 {
    1207           0 :                     memcpy(dataptr, ptr, info[dim].typlen);
    1208           0 :                     ptr += info[dim].typlen;
    1209             : 
    1210             :                     /* just point into the array */
    1211           0 :                     map[dim][i] = PointerGetDatum(dataptr);
    1212           0 :                     dataptr += MAXALIGN(info[dim].typlen);
    1213             :                 }
    1214             :             }
    1215          72 :             else if (info[dim].typlen == -1)
    1216             :             {
    1217             :                 /* varlena */
    1218        2420 :                 for (i = 0; i < info[dim].nvalues; i++)
    1219             :                 {
    1220             :                     uint32      len;
    1221             : 
    1222             :                     /* read the uint32 length */
    1223        2348 :                     memcpy(&len, ptr, sizeof(uint32));
    1224        2348 :                     ptr += sizeof(uint32);
    1225             : 
    1226             :                     /* the length is data-only */
    1227        2348 :                     SET_VARSIZE(dataptr, len + VARHDRSZ);
    1228        2348 :                     memcpy(VARDATA(dataptr), ptr, len);
    1229        2348 :                     ptr += len;
    1230             : 
    1231             :                     /* just point into the array */
    1232        2348 :                     map[dim][i] = PointerGetDatum(dataptr);
    1233             : 
    1234             :                     /* skip to place of the next deserialized value */
    1235        2348 :                     dataptr += MAXALIGN(len + VARHDRSZ);
    1236             :                 }
    1237             :             }
    1238           0 :             else if (info[dim].typlen == -2)
    1239             :             {
    1240             :                 /* cstring */
    1241           0 :                 for (i = 0; i < info[dim].nvalues; i++)
    1242             :                 {
    1243             :                     uint32      len;
    1244             : 
    1245           0 :                     memcpy(&len, ptr, sizeof(uint32));
    1246           0 :                     ptr += sizeof(uint32);
    1247             : 
    1248           0 :                     memcpy(dataptr, ptr, len);
    1249           0 :                     ptr += len;
    1250             : 
    1251             :                     /* just point into the array */
    1252           0 :                     map[dim][i] = PointerGetDatum(dataptr);
    1253           0 :                     dataptr += MAXALIGN(len);
    1254             :                 }
    1255             :             }
    1256             : 
    1257             :             /* no under/overflow of input array */
    1258             :             Assert(ptr <= (start + info[dim].nbytes));
    1259             : 
    1260             :             /* no overflow of the output mcv value */
    1261             :             Assert(dataptr <= ((char *) mcvlist + mcvlen));
    1262             :         }
    1263             : 
    1264             :         /* check we consumed input data for this dimension exactly */
    1265             :         Assert(ptr == (start + info[dim].nbytes));
    1266             :     }
    1267             : 
    1268             :     /* we should have also filled the MCV list exactly */
    1269             :     Assert(dataptr == ((char *) mcvlist + mcvlen));
    1270             : 
    1271             :     /* deserialize the MCV items and translate the indexes to Datums */
    1272        4544 :     for (i = 0; i < nitems; i++)
    1273             :     {
    1274        4476 :         MCVItem    *item = &mcvlist->items[i];
    1275             : 
    1276        4476 :         item->values = (Datum *) valuesptr;
    1277        4476 :         valuesptr += MAXALIGN(sizeof(Datum) * ndims);
    1278             : 
    1279        4476 :         item->isnull = (bool *) isnullptr;
    1280        4476 :         isnullptr += MAXALIGN(sizeof(bool) * ndims);
    1281             : 
    1282        4476 :         memcpy(item->isnull, ptr, sizeof(bool) * ndims);
    1283        4476 :         ptr += sizeof(bool) * ndims;
    1284             : 
    1285        4476 :         memcpy(&item->frequency, ptr, sizeof(double));
    1286        4476 :         ptr += sizeof(double);
    1287             : 
    1288        4476 :         memcpy(&item->base_frequency, ptr, sizeof(double));
    1289        4476 :         ptr += sizeof(double);
    1290             : 
    1291             :         /* finally translate the indexes (for non-NULL only) */
    1292       17896 :         for (dim = 0; dim < ndims; dim++)
    1293             :         {
    1294             :             uint16  index;
    1295             : 
    1296       13420 :             memcpy(&index, ptr, sizeof(uint16));
    1297       13420 :             ptr += sizeof(uint16);
    1298             : 
    1299       13420 :             if (item->isnull[dim])
    1300          64 :                 continue;
    1301             : 
    1302       13356 :             item->values[dim] = map[dim][index];
    1303             :         }
    1304             : 
    1305             :         /* check we're not overflowing the input */
    1306             :         Assert(ptr <= endptr);
    1307             :     }
    1308             : 
    1309             :     /* check that we processed all the data */
    1310             :     Assert(ptr == endptr);
    1311             : 
    1312             :     /* release the buffers used for mapping */
    1313         268 :     for (dim = 0; dim < ndims; dim++)
    1314         200 :         pfree(map[dim]);
    1315             : 
    1316          68 :     pfree(map);
    1317             : 
    1318          68 :     return mcvlist;
    1319             : }
    1320             : 
    1321             : /*
    1322             :  * SRF with details about buckets of a histogram:
    1323             :  *
    1324             :  * - item ID (0...nitems)
    1325             :  * - values (string array)
    1326             :  * - nulls only (boolean array)
    1327             :  * - frequency (double precision)
    1328             :  * - base_frequency (double precision)
    1329             :  *
    1330             :  * The input is the OID of the statistics, and there are no rows returned if
    1331             :  * the statistics contains no histogram.
    1332             :  */
    1333             : Datum
    1334           8 : pg_stats_ext_mcvlist_items(PG_FUNCTION_ARGS)
    1335             : {
    1336             :     FuncCallContext *funcctx;
    1337             : 
    1338             :     /* stuff done only on the first call of the function */
    1339           8 :     if (SRF_IS_FIRSTCALL())
    1340             :     {
    1341             :         MemoryContext oldcontext;
    1342             :         MCVList    *mcvlist;
    1343             :         TupleDesc   tupdesc;
    1344             : 
    1345             :         /* create a function context for cross-call persistence */
    1346           4 :         funcctx = SRF_FIRSTCALL_INIT();
    1347             : 
    1348             :         /* switch to memory context appropriate for multiple function calls */
    1349           4 :         oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
    1350             : 
    1351           4 :         mcvlist = statext_mcv_deserialize(PG_GETARG_BYTEA_P(0));
    1352             : 
    1353           4 :         funcctx->user_fctx = mcvlist;
    1354             : 
    1355             :         /* total number of tuples to be returned */
    1356           4 :         funcctx->max_calls = 0;
    1357           4 :         if (funcctx->user_fctx != NULL)
    1358           4 :             funcctx->max_calls = mcvlist->nitems;
    1359             : 
    1360             :         /* Build a tuple descriptor for our result type */
    1361           4 :         if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
    1362           0 :             ereport(ERROR,
    1363             :                     (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
    1364             :                      errmsg("function returning record called in context "
    1365             :                             "that cannot accept type record")));
    1366           4 :         tupdesc = BlessTupleDesc(tupdesc);
    1367             : 
    1368             :         /*
    1369             :          * generate attribute metadata needed later to produce tuples from raw
    1370             :          * C strings
    1371             :          */
    1372           4 :         funcctx->attinmeta = TupleDescGetAttInMetadata(tupdesc);
    1373             : 
    1374           4 :         MemoryContextSwitchTo(oldcontext);
    1375             :     }
    1376             : 
    1377             :     /* stuff done on every call of the function */
    1378           8 :     funcctx = SRF_PERCALL_SETUP();
    1379             : 
    1380           8 :     if (funcctx->call_cntr < funcctx->max_calls)   /* do when there is more left to send */
    1381             :     {
    1382             :         Datum       values[5];
    1383             :         bool        nulls[5];
    1384             :         HeapTuple   tuple;
    1385             :         Datum       result;
    1386           4 :         ArrayBuildState *astate_values = NULL;
    1387           4 :         ArrayBuildState *astate_nulls = NULL;
    1388             : 
    1389             :         int         i;
    1390             :         MCVList    *mcvlist;
    1391             :         MCVItem    *item;
    1392             : 
    1393           4 :         mcvlist = (MCVList *) funcctx->user_fctx;
    1394             : 
    1395             :         Assert(funcctx->call_cntr < mcvlist->nitems);
    1396             : 
    1397           4 :         item = &mcvlist->items[funcctx->call_cntr];
    1398             : 
    1399          16 :         for (i = 0; i < mcvlist->ndimensions; i++)
    1400             :         {
    1401             : 
    1402          24 :             astate_nulls = accumArrayResult(astate_nulls,
    1403          12 :                                   BoolGetDatum(item->isnull[i]),
    1404             :                                   false,
    1405             :                                   BOOLOID,
    1406             :                                   CurrentMemoryContext);
    1407             : 
    1408          12 :             if (!item->isnull[i])
    1409             :             {
    1410             :                 bool        isvarlena;
    1411             :                 Oid         outfunc;
    1412             :                 FmgrInfo    fmgrinfo;
    1413             :                 Datum       val;
    1414             :                 text       *txt;
    1415             : 
    1416             :                 /* lookup output func for the type */
    1417          12 :                 getTypeOutputInfo(mcvlist->types[i], &outfunc, &isvarlena);
    1418          12 :                 fmgr_info(outfunc, &fmgrinfo);
    1419             : 
    1420          12 :                 val = FunctionCall1(&fmgrinfo, item->values[i]);
    1421          12 :                 txt = cstring_to_text(DatumGetPointer(val));
    1422             : 
    1423          12 :                 astate_values = accumArrayResult(astate_values,
    1424             :                                   PointerGetDatum(txt),
    1425             :                                   false,
    1426             :                                   TEXTOID,
    1427             :                                   CurrentMemoryContext);
    1428             :             }
    1429             :             else
    1430           0 :                 astate_values = accumArrayResult(astate_values,
    1431             :                                   (Datum) 0,
    1432             :                                   true,
    1433             :                                   TEXTOID,
    1434             :                                   CurrentMemoryContext);
    1435             :         }
    1436             : 
    1437           4 :         values[0] = Int32GetDatum(funcctx->call_cntr);
    1438           4 :         values[1] = PointerGetDatum(makeArrayResult(astate_values, CurrentMemoryContext));
    1439           4 :         values[2] = PointerGetDatum(makeArrayResult(astate_nulls, CurrentMemoryContext));
    1440           4 :         values[3] = Float8GetDatum(item->frequency);
    1441           4 :         values[4] = Float8GetDatum(item->base_frequency);
    1442             : 
    1443             :         /* no NULLs in the tuple */
    1444           4 :         memset(nulls, 0, sizeof(nulls));
    1445             : 
    1446             :         /* build a tuple */
    1447           4 :         tuple = heap_form_tuple(funcctx->attinmeta->tupdesc, values, nulls);
    1448             : 
    1449             :         /* make the tuple into a datum */
    1450           4 :         result = HeapTupleGetDatum(tuple);
    1451             : 
    1452           4 :         SRF_RETURN_NEXT(funcctx, result);
    1453             :     }
    1454             :     else                        /* do when there is no more left */
    1455             :     {
    1456           4 :         SRF_RETURN_DONE(funcctx);
    1457             :     }
    1458             : }
    1459             : 
    1460             : /*
    1461             :  * pg_mcv_list_in       - input routine for type pg_mcv_list.
    1462             :  *
    1463             :  * pg_mcv_list is real enough to be a table column, but it has no operations
    1464             :  * of its own, and disallows input too
    1465             :  */
    1466             : Datum
    1467           0 : pg_mcv_list_in(PG_FUNCTION_ARGS)
    1468             : {
    1469             :     /*
    1470             :      * pg_mcv_list stores the data in binary form and parsing text input is
    1471             :      * not needed, so disallow this.
    1472             :      */
    1473           0 :     ereport(ERROR,
    1474             :             (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
    1475             :              errmsg("cannot accept a value of type %s", "pg_mcv_list")));
    1476             : 
    1477             :     PG_RETURN_VOID();           /* keep compiler quiet */
    1478             : }
    1479             : 
    1480             : 
    1481             : /*
    1482             :  * pg_mcv_list_out      - output routine for type pg_mcv_list.
    1483             :  *
    1484             :  * MCV lists are serialized into a bytea value, so we simply call byteaout()
    1485             :  * to serialize the value into text. But it'd be nice to serialize that into
    1486             :  * a meaningful representation (e.g. for inspection by people).
    1487             :  *
    1488             :  * XXX This should probably return something meaningful, similar to what
    1489             :  * pg_dependencies_out does. Not sure how to deal with the deduplicated
    1490             :  * values, though - do we want to expand that or not?
    1491             :  */
    1492             : Datum
    1493           0 : pg_mcv_list_out(PG_FUNCTION_ARGS)
    1494             : {
    1495           0 :     return byteaout(fcinfo);
    1496             : }
    1497             : 
    1498             : /*
    1499             :  * pg_mcv_list_recv     - binary input routine for type pg_mcv_list.
    1500             :  */
    1501             : Datum
    1502           0 : pg_mcv_list_recv(PG_FUNCTION_ARGS)
    1503             : {
    1504           0 :     ereport(ERROR,
    1505             :             (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
    1506             :              errmsg("cannot accept a value of type %s", "pg_mcv_list")));
    1507             : 
    1508             :     PG_RETURN_VOID();           /* keep compiler quiet */
    1509             : }
    1510             : 
    1511             : /*
    1512             :  * pg_mcv_list_send     - binary output routine for type pg_mcv_list.
    1513             :  *
    1514             :  * MCV lists are serialized in a bytea value (although the type is named
    1515             :  * differently), so let's just send that.
    1516             :  */
    1517             : Datum
    1518           0 : pg_mcv_list_send(PG_FUNCTION_ARGS)
    1519             : {
    1520           0 :     return byteasend(fcinfo);
    1521             : }
    1522             : 
    1523             : /*
    1524             :  * mcv_get_match_bitmap
    1525             :  *  Evaluate clauses using the MCV list, and update the match bitmap.
    1526             :  *
    1527             :  * A match bitmap keeps match/mismatch status for each MCV item, and we
    1528             :  * update it based on additional clauses. We also use it to skip items
    1529             :  * that can't possibly match (e.g. item marked as "mismatch" can't change
    1530             :  * to "match" when evaluating AND clause list).
    1531             :  *
    1532             :  * The function also returns a flag indicating whether there was an
    1533             :  * equality condition for all attributes, the minimum frequency in the MCV
    1534             :  * list, and a total MCV frequency (sum of frequencies for all items).
    1535             :  *
    1536             :  * XXX Currently the match bitmap uses a bool for each MCV item, which is
    1537             :  * somewhat wasteful as we could do with just a single bit, thus reducing
    1538             :  * the size to ~1/8. It would also allow us to combine bitmaps simply using
    1539             :  * & and |, which should be faster than min/max. The bitmaps are fairly
    1540             :  * small, though (thanks to the cap on the MCV list size).
    1541             :  */
    1542             : static bool *
    1543          88 : mcv_get_match_bitmap(PlannerInfo *root, List *clauses,
    1544             :                      Bitmapset *keys, MCVList *mcvlist, bool is_or)
    1545             : {
    1546             :     int         i;
    1547             :     ListCell   *l;
    1548             :     bool       *matches;
    1549             : 
    1550             :     /* The bitmap may be partially built. */
    1551             :     Assert(clauses != NIL);
    1552             :     Assert(list_length(clauses) >= 1);
    1553             :     Assert(mcvlist != NULL);
    1554             :     Assert(mcvlist->nitems > 0);
    1555             :     Assert(mcvlist->nitems <= STATS_MCVLIST_MAX_ITEMS);
    1556             : 
    1557          88 :     matches = palloc(sizeof(bool) * mcvlist->nitems);
    1558          88 :     memset(matches, (is_or) ? false : true,
    1559          88 :            sizeof(bool) * mcvlist->nitems);
    1560             : 
    1561             :     /*
    1562             :      * Loop through the list of clauses, and for each of them evaluate all the
    1563             :      * MCV items not yet eliminated by the preceding clauses.
    1564             :      */
    1565         276 :     foreach(l, clauses)
    1566             :     {
    1567         188 :         Node       *clause = (Node *) lfirst(l);
    1568             : 
    1569             :         /* if it's a RestrictInfo, then extract the clause */
    1570         188 :         if (IsA(clause, RestrictInfo))
    1571         160 :             clause = (Node *) ((RestrictInfo *) clause)->clause;
    1572             : 
    1573             :         /*
    1574             :          * Handle the various types of clauses - OpClause, NullTest and
    1575             :          * AND/OR/NOT
    1576             :          */
    1577         188 :         if (is_opclause(clause))
    1578             :         {
    1579          96 :             OpExpr     *expr = (OpExpr *) clause;
    1580             :             FmgrInfo    opproc;
    1581             : 
    1582             :             /* valid only after examine_opclause_expression returns true */
    1583             :             Var        *var;
    1584             :             Const      *cst;
    1585             :             bool        varonleft;
    1586             : 
    1587          96 :             fmgr_info(get_opcode(expr->opno), &opproc);
    1588             : 
    1589             :             /* extract the var and const from the expression */
    1590          96 :             if (examine_opclause_expression(expr, &var, &cst, &varonleft))
    1591             :             {
    1592             :                 int         idx;
    1593             : 
    1594             :                 /* match the attribute to a dimension of the statistic */
    1595          96 :                 idx = bms_member_index(keys, var->varattno);
    1596             : 
    1597             :                 /*
    1598             :                  * Walk through the MCV items and evaluate the current clause.
    1599             :                  * We can skip items that were already ruled out, and
    1600             :                  * terminate if there are no remaining MCV items that might
    1601             :                  * possibly match.
    1602             :                  */
    1603        8912 :                 for (i = 0; i < mcvlist->nitems; i++)
    1604             :                 {
    1605        8816 :                     bool        match = true;
    1606        8816 :                     MCVItem    *item = &mcvlist->items[i];
    1607             : 
    1608             :                     /*
    1609             :                      * When the MCV item or the Const value is NULL we can treat
    1610             :                      * this as a mismatch. We must not call the operator because
    1611             :                      * of strictness.
    1612             :                      */
    1613        8816 :                     if (item->isnull[idx] || cst->constisnull)
    1614             :                     {
    1615           8 :                         matches[i] = RESULT_MERGE(matches[i], is_or, false);
    1616           8 :                         continue;
    1617             :                     }
    1618             : 
    1619             :                     /*
    1620             :                      * Skip MCV items that can't change result in the bitmap.
    1621             :                      * Once the value gets false for AND-lists, or true for
    1622             :                      * OR-lists, we don't need to look at more clauses.
    1623             :                      */
    1624        8808 :                     if (RESULT_IS_FINAL(matches[i], is_or))
    1625        5132 :                         continue;
    1626             : 
    1627             :                     /*
    1628             :                      * First check whether the constant is below the lower
    1629             :                      * boundary (in that case we can skip the bucket, because
    1630             :                      * there's no overlap).
    1631             :                      *
    1632             :                      * We don't store collations used to build the statistics,
    1633             :                      * but we can use the collation for the attribute itself,
    1634             :                      * as stored in varcollid. We do reset the statistics after
    1635             :                      * a type change (including collation change), so this is
    1636             :                      * OK. We may need to relax this after allowing extended
    1637             :                      * statistics on expressions.
    1638             :                      */
    1639        3676 :                     if (varonleft)
    1640        3676 :                         match = DatumGetBool(FunctionCall2Coll(&opproc,
    1641             :                                                                var->varcollid,
    1642             :                                                                item->values[idx],
    1643             :                                                                cst->constvalue));
    1644             :                     else
    1645           0 :                         match = DatumGetBool(FunctionCall2Coll(&opproc,
    1646             :                                                                var->varcollid,
    1647             :                                                                cst->constvalue,
    1648             :                                                                item->values[idx]));
    1649             : 
    1650             :                     /* update the match bitmap with the result */
    1651        3676 :                     matches[i] = RESULT_MERGE(matches[i], is_or, match);
    1652             :                 }
    1653             :             }
    1654             :         }
    1655          92 :         else if (IsA(clause, NullTest))
    1656             :         {
    1657          20 :             NullTest   *expr = (NullTest *) clause;
    1658          20 :             Var        *var = (Var *) (expr->arg);
    1659             : 
    1660             :             /* match the attribute to a dimension of the statistic */
    1661          20 :             int         idx = bms_member_index(keys, var->varattno);
    1662             : 
    1663             :             /*
    1664             :              * Walk through the MCV items and evaluate the current clause. We
    1665             :              * can skip items that were already ruled out, and terminate if
    1666             :              * there are no remaining MCV items that might possibly match.
    1667             :              */
    1668        2020 :             for (i = 0; i < mcvlist->nitems; i++)
    1669             :             {
    1670        2000 :                 bool        match = false;  /* assume mismatch */
    1671        2000 :                 MCVItem    *item = &mcvlist->items[i];
    1672             : 
    1673             :                 /* if the clause mismatches the MCV item, update the bitmap */
    1674        2000 :                 switch (expr->nulltesttype)
    1675             :                 {
    1676             :                     case IS_NULL:
    1677        2000 :                         match = (item->isnull[idx]) ? true : match;
    1678        2000 :                         break;
    1679             : 
    1680             :                     case IS_NOT_NULL:
    1681           0 :                         match = (!item->isnull[idx]) ? true : match;
    1682           0 :                         break;
    1683             :                 }
    1684             : 
    1685             :                 /* now, update the match bitmap, depending on OR/AND type */
    1686        2000 :                 matches[i] = RESULT_MERGE(matches[i], is_or, match);
    1687             :             }
    1688             :         }
    1689          72 :         else if (is_orclause(clause) || is_andclause(clause))
    1690           4 :         {
    1691             :             /* AND/OR clause, with all subclauses being compatible */
    1692             : 
    1693             :             int         i;
    1694           4 :             BoolExpr   *bool_clause = ((BoolExpr *) clause);
    1695           4 :             List       *bool_clauses = bool_clause->args;
    1696             : 
    1697             :             /* match/mismatch bitmap for each MCV item */
    1698           4 :             bool       *bool_matches = NULL;
    1699             : 
    1700             :             Assert(bool_clauses != NIL);
    1701             :             Assert(list_length(bool_clauses) >= 2);
    1702             : 
    1703             :             /* build the match bitmap for the OR-clauses */
    1704           4 :             bool_matches = mcv_get_match_bitmap(root, bool_clauses, keys,
    1705           4 :                                                 mcvlist, is_orclause(clause));
    1706             : 
    1707             :             /*
    1708             :              * Merge the bitmap produced by mcv_get_match_bitmap into the
    1709             :              * current one. We need to consider if we're evaluating AND or OR
    1710             :              * condition when merging the results.
    1711             :              */
    1712          12 :             for (i = 0; i < mcvlist->nitems; i++)
    1713           8 :                 matches[i] = RESULT_MERGE(matches[i], is_or, bool_matches[i]);
    1714             : 
    1715           4 :             pfree(bool_matches);
    1716             :         }
    1717          68 :         else if (is_notclause(clause))
    1718             :         {
    1719             :             /* NOT clause, with all subclauses compatible */
    1720             : 
    1721             :             int         i;
    1722          20 :             BoolExpr   *not_clause = ((BoolExpr *) clause);
    1723          20 :             List       *not_args = not_clause->args;
    1724             : 
    1725             :             /* match/mismatch bitmap for each MCV item */
    1726          20 :             bool       *not_matches = NULL;
    1727             : 
    1728             :             Assert(not_args != NIL);
    1729             :             Assert(list_length(not_args) == 1);
    1730             : 
    1731             :             /* build the match bitmap for the NOT-clause */
    1732          20 :             not_matches = mcv_get_match_bitmap(root, not_args, keys,
    1733             :                                                mcvlist, false);
    1734             : 
    1735             :             /*
    1736             :              * Merge the bitmap produced by mcv_get_match_bitmap into the
    1737             :              * current one. We're handling a NOT clause, so invert the result
    1738             :              * before merging it into the global bitmap.
    1739             :              */
    1740         100 :             for (i = 0; i < mcvlist->nitems; i++)
    1741          80 :                 matches[i] = RESULT_MERGE(matches[i], is_or, !not_matches[i]);
    1742             : 
    1743          20 :             pfree(not_matches);
    1744             :         }
    1745          48 :         else if (IsA(clause, Var))
    1746             :         {
    1747             :             /* Var (has to be a boolean Var, possibly from below NOT) */
    1748             : 
    1749          48 :             Var        *var = (Var *) (clause);
    1750             : 
    1751             :             /* match the attribute to a dimension of the statistic */
    1752          48 :             int         idx = bms_member_index(keys, var->varattno);
    1753             : 
    1754             :             Assert(var->vartype == BOOLOID);
    1755             : 
    1756             :             /*
    1757             :              * Walk through the MCV items and evaluate the current clause. We
    1758             :              * can skip items that were already ruled out, and terminate if
    1759             :              * there are no remaining MCV items that might possibly match.
    1760             :              */
    1761         240 :             for (i = 0; i < mcvlist->nitems; i++)
    1762             :             {
    1763         192 :                 MCVItem    *item = &mcvlist->items[i];
    1764         192 :                 bool        match = false;
    1765             : 
    1766             :                 /* if the item is NULL, it's a mismatch */
    1767         192 :                 if (!item->isnull[idx] && DatumGetBool(item->values[idx]))
    1768          96 :                     match = true;
    1769             : 
    1770             :                 /* update the result bitmap */
    1771         192 :                 matches[i] = RESULT_MERGE(matches[i], is_or, match);
    1772             :             }
    1773             :         }
    1774             :         else
    1775           0 :             elog(ERROR, "unknown clause type: %d", clause->type);
    1776             :     }
    1777             : 
    1778          88 :     return matches;
    1779             : }
    1780             : 
    1781             : 
    1782             : /*
    1783             :  * mcv_clauselist_selectivity
    1784             :  *      Return the selectivity estimate computed using an MCV list.
    1785             :  *
    1786             :  * First builds a bitmap of MCV items matching the clauses, and then sums
    1787             :  * the frequencies of matching items.
    1788             :  *
    1789             :  * It also produces two additional interesting selectivities - total
    1790             :  * selectivity of all the MCV items (not just the matching ones), and the
    1791             :  * base frequency computed on the assumption of independence.
    1792             :  */
    1793             : Selectivity
    1794          64 : mcv_clauselist_selectivity(PlannerInfo *root, StatisticExtInfo *stat,
    1795             :                            List *clauses, int varRelid,
    1796             :                            JoinType jointype, SpecialJoinInfo *sjinfo,
    1797             :                            RelOptInfo *rel,
    1798             :                            Selectivity *basesel, Selectivity *totalsel)
    1799             : {
    1800             :     int         i;
    1801             :     MCVList    *mcv;
    1802          64 :     Selectivity s = 0.0;
    1803             : 
    1804             :     /* match/mismatch bitmap for each MCV item */
    1805          64 :     bool       *matches = NULL;
    1806             : 
    1807             :     /* load the MCV list stored in the statistics object */
    1808          64 :     mcv = statext_mcv_load(stat->statOid);
    1809             : 
    1810             :     /* build a match bitmap for the clauses */
    1811          64 :     matches = mcv_get_match_bitmap(root, clauses, stat->keys, mcv, false);
    1812             : 
    1813             :     /* sum frequencies for all the matching MCV items */
    1814          64 :     *basesel = 0.0;
    1815          64 :     *totalsel = 0.0;
    1816        4536 :     for (i = 0; i < mcv->nitems; i++)
    1817             :     {
    1818        4472 :         *totalsel += mcv->items[i].frequency;
    1819             : 
    1820        4472 :         if (matches[i] != false)
    1821             :         {
    1822             :             /* XXX Shouldn't the basesel be outside the if condition? */
    1823          44 :             *basesel += mcv->items[i].base_frequency;
    1824          44 :             s += mcv->items[i].frequency;
    1825             :         }
    1826             :     }
    1827             : 
    1828          64 :     return s;
    1829             : }

Generated by: LCOV version 1.13