LCOV - code coverage report
Current view: top level - src/backend/statistics - mcv.c (source / functions) Hit Total Coverage
Test: PostgreSQL 13beta1 Lines: 475 521 91.2 %
Date: 2020-05-25 05:06:35 Functions: 14 18 77.8 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * mcv.c
       4             :  *    POSTGRES multivariate MCV lists
       5             :  *
       6             :  *
       7             :  * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
       8             :  * Portions Copyright (c) 1994, Regents of the University of California
       9             :  *
      10             :  * IDENTIFICATION
      11             :  *    src/backend/statistics/mcv.c
      12             :  *
      13             :  *-------------------------------------------------------------------------
      14             :  */
      15             : #include "postgres.h"
      16             : 
      17             : #include <math.h>
      18             : 
      19             : #include "access/htup_details.h"
      20             : #include "catalog/pg_collation.h"
      21             : #include "catalog/pg_statistic_ext.h"
      22             : #include "catalog/pg_statistic_ext_data.h"
      23             : #include "fmgr.h"
      24             : #include "funcapi.h"
      25             : #include "nodes/nodeFuncs.h"
      26             : #include "optimizer/clauses.h"
      27             : #include "statistics/extended_stats_internal.h"
      28             : #include "statistics/statistics.h"
      29             : #include "utils/array.h"
      30             : #include "utils/builtins.h"
      31             : #include "utils/bytea.h"
      32             : #include "utils/fmgroids.h"
      33             : #include "utils/fmgrprotos.h"
      34             : #include "utils/lsyscache.h"
      35             : #include "utils/syscache.h"
      36             : #include "utils/typcache.h"
      37             : 
      38             : /*
      39             :  * Computes size of a serialized MCV item, depending on the number of
      40             :  * dimensions (columns) the statistic is defined on. The datum values are
      41             :  * stored in a separate array (deduplicated, to minimize the size), and
      42             :  * so the serialized items only store uint16 indexes into that array.
      43             :  *
      44             :  * Each serialized item stores (in this order):
      45             :  *
      46             :  * - indexes to values    (ndim * sizeof(uint16))
      47             :  * - null flags           (ndim * sizeof(bool))
      48             :  * - frequency            (sizeof(double))
      49             :  * - base_frequency       (sizeof(double))
      50             :  *
      51             :  * There is no alignment padding within an MCV item.
      52             :  * So in total each MCV item requires this many bytes:
      53             :  *
      54             :  *   ndim * (sizeof(uint16) + sizeof(bool)) + 2 * sizeof(double)
      55             :  */
      56             : #define ITEM_SIZE(ndims)    \
      57             :     ((ndims) * (sizeof(uint16) + sizeof(bool)) + 2 * sizeof(double))
      58             : 
      59             : /*
      60             :  * Used to compute size of serialized MCV list representation.
      61             :  */
      62             : #define MinSizeOfMCVList        \
      63             :     (VARHDRSZ + sizeof(uint32) * 3 + sizeof(AttrNumber))
      64             : 
      65             : /*
      66             :  * Size of the serialized MCV list, excluding the space needed for
      67             :  * deduplicated per-dimension values. The macro is meant to be used
      68             :  * when it's not yet safe to access the serialized info about amount
      69             :  * of data for each column.
      70             :  */
      71             : #define SizeOfMCVList(ndims,nitems) \
      72             :     ((MinSizeOfMCVList + sizeof(Oid) * (ndims)) + \
      73             :      ((ndims) * sizeof(DimensionInfo)) + \
      74             :      ((nitems) * ITEM_SIZE(ndims)))
      75             : 
      76             : static MultiSortSupport build_mss(VacAttrStats **stats, int numattrs);
      77             : 
      78             : static SortItem *build_distinct_groups(int numrows, SortItem *items,
      79             :                                        MultiSortSupport mss, int *ndistinct);
      80             : 
      81             : static SortItem **build_column_frequencies(SortItem *groups, int ngroups,
      82             :                                            MultiSortSupport mss, int *ncounts);
      83             : 
      84             : static int  count_distinct_groups(int numrows, SortItem *items,
      85             :                                   MultiSortSupport mss);
      86             : 
      87             : /*
      88             :  * Compute new value for bitmap item, considering whether it's used for
      89             :  * clauses connected by AND/OR.
      90             :  */
      91             : #define RESULT_MERGE(value, is_or, match) \
      92             :     ((is_or) ? ((value) || (match)) : ((value) && (match)))
      93             : 
      94             : /*
      95             :  * When processing a list of clauses, the bitmap item may get set to a value
      96             :  * such that additional clauses can't change it. For example, when processing
      97             :  * a list of clauses connected to AND, as soon as the item gets set to 'false'
      98             :  * then it'll remain like that. Similarly clauses connected by OR and 'true'.
      99             :  *
     100             :  * Returns true when the value in the bitmap can't change no matter how the
     101             :  * remaining clauses are evaluated.
     102             :  */
     103             : #define RESULT_IS_FINAL(value, is_or)   ((is_or) ? (value) : (!(value)))
     104             : 
     105             : /*
     106             :  * get_mincount_for_mcv_list
     107             :  *      Determine the minimum number of times a value needs to appear in
     108             :  *      the sample for it to be included in the MCV list.
     109             :  *
     110             :  * We want to keep only values that appear sufficiently often in the
     111             :  * sample that it is reasonable to extrapolate their sample frequencies to
     112             :  * the entire table.  We do this by placing an upper bound on the relative
     113             :  * standard error of the sample frequency, so that any estimates the
     114             :  * planner generates from the MCV statistics can be expected to be
     115             :  * reasonably accurate.
     116             :  *
     117             :  * Since we are sampling without replacement, the sample frequency of a
     118             :  * particular value is described by a hypergeometric distribution.  A
     119             :  * common rule of thumb when estimating errors in this situation is to
     120             :  * require at least 10 instances of the value in the sample, in which case
     121             :  * the distribution can be approximated by a normal distribution, and
     122             :  * standard error analysis techniques can be applied.  Given a sample size
     123             :  * of n, a population size of N, and a sample frequency of p=cnt/n, the
     124             :  * standard error of the proportion p is given by
     125             :  *      SE = sqrt(p*(1-p)/n) * sqrt((N-n)/(N-1))
     126             :  * where the second term is the finite population correction.  To get
     127             :  * reasonably accurate planner estimates, we impose an upper bound on the
     128             :  * relative standard error of 20% -- i.e., SE/p < 0.2.  This 20% relative
     129             :  * error bound is fairly arbitrary, but has been found empirically to work
     130             :  * well.  Rearranging this formula gives a lower bound on the number of
     131             :  * instances of the value seen:
     132             :  *      cnt > n*(N-n) / (N-n+0.04*n*(N-1))
     133             :  * This bound is at most 25, and approaches 0 as n approaches 0 or N. The
     134             :  * case where n approaches 0 cannot happen in practice, since the sample
     135             :  * size is at least 300.  The case where n approaches N corresponds to
     136             :  * sampling the whole the table, in which case it is reasonable to keep
     137             :  * the whole MCV list (have no lower bound), so it makes sense to apply
     138             :  * this formula for all inputs, even though the above derivation is
     139             :  * technically only valid when the right hand side is at least around 10.
     140             :  *
     141             :  * An alternative way to look at this formula is as follows -- assume that
     142             :  * the number of instances of the value seen scales up to the entire
     143             :  * table, so that the population count is K=N*cnt/n. Then the distribution
     144             :  * in the sample is a hypergeometric distribution parameterised by N, n
     145             :  * and K, and the bound above is mathematically equivalent to demanding
     146             :  * that the standard deviation of that distribution is less than 20% of
     147             :  * its mean.  Thus the relative errors in any planner estimates produced
     148             :  * from the MCV statistics are likely to be not too large.
     149             :  */
     150             : static double
     151          82 : get_mincount_for_mcv_list(int samplerows, double totalrows)
     152             : {
     153          82 :     double      n = samplerows;
     154          82 :     double      N = totalrows;
     155             :     double      numer,
     156             :                 denom;
     157             : 
     158          82 :     numer = n * (N - n);
     159          82 :     denom = N - n + 0.04 * n * (N - 1);
     160             : 
     161             :     /* Guard against division by zero (possible if n = N = 1) */
     162          82 :     if (denom == 0.0)
     163           4 :         return 0.0;
     164             : 
     165          78 :     return numer / denom;
     166             : }
     167             : 
     168             : /*
     169             :  * Builds MCV list from the set of sampled rows.
     170             :  *
     171             :  * The algorithm is quite simple:
     172             :  *
     173             :  *     (1) sort the data (default collation, '<' for the data type)
     174             :  *
     175             :  *     (2) count distinct groups, decide how many to keep
     176             :  *
     177             :  *     (3) build the MCV list using the threshold determined in (2)
     178             :  *
     179             :  *     (4) remove rows represented by the MCV from the sample
     180             :  *
     181             :  */
     182             : MCVList *
     183          82 : statext_mcv_build(int numrows, HeapTuple *rows, Bitmapset *attrs,
     184             :                   VacAttrStats **stats, double totalrows, int stattarget)
     185             : {
     186             :     int         i,
     187             :                 numattrs,
     188             :                 ngroups,
     189             :                 nitems;
     190             :     AttrNumber *attnums;
     191             :     double      mincount;
     192             :     SortItem   *items;
     193             :     SortItem   *groups;
     194          82 :     MCVList    *mcvlist = NULL;
     195             :     MultiSortSupport mss;
     196             : 
     197          82 :     attnums = build_attnums_array(attrs, &numattrs);
     198             : 
     199             :     /* comparator for all the columns */
     200          82 :     mss = build_mss(stats, numattrs);
     201             : 
     202             :     /* sort the rows */
     203          82 :     items = build_sorted_items(numrows, &nitems, rows, stats[0]->tupDesc,
     204             :                                mss, numattrs, attnums);
     205             : 
     206          82 :     if (!items)
     207           0 :         return NULL;
     208             : 
     209             :     /* transform the sorted rows into groups (sorted by frequency) */
     210          82 :     groups = build_distinct_groups(nitems, items, mss, &ngroups);
     211             : 
     212             :     /*
     213             :      * Maximum number of MCV items to store, based on the statistics target we
     214             :      * computed for the statistics object (from target set for the object
     215             :      * itself, attributes and the system default). In any case, we can't keep
     216             :      * more groups than we have available.
     217             :      */
     218          82 :     nitems = stattarget;
     219          82 :     if (nitems > ngroups)
     220          46 :         nitems = ngroups;
     221             : 
     222             :     /*
     223             :      * Decide how many items to keep in the MCV list. We can't use the same
     224             :      * algorithm as per-column MCV lists, because that only considers the
     225             :      * actual group frequency - but we're primarily interested in how the
     226             :      * actual frequency differs from the base frequency (product of simple
     227             :      * per-column frequencies, as if the columns were independent).
     228             :      *
     229             :      * Using the same algorithm might exclude items that are close to the
     230             :      * "average" frequency of the sample. But that does not say whether the
     231             :      * observed frequency is close to the base frequency or not. We also need
     232             :      * to consider unexpectedly uncommon items (again, compared to the base
     233             :      * frequency), and the single-column algorithm does not have to.
     234             :      *
     235             :      * We simply decide how many items to keep by computing minimum count
     236             :      * using get_mincount_for_mcv_list() and then keep all items that seem to
     237             :      * be more common than that.
     238             :      */
     239          82 :     mincount = get_mincount_for_mcv_list(numrows, totalrows);
     240             : 
     241             :     /*
     242             :      * Walk the groups until we find the first group with a count below the
     243             :      * mincount threshold (the index of that group is the number of groups we
     244             :      * want to keep).
     245             :      */
     246        3400 :     for (i = 0; i < nitems; i++)
     247             :     {
     248        3320 :         if (groups[i].count < mincount)
     249             :         {
     250           2 :             nitems = i;
     251           2 :             break;
     252             :         }
     253             :     }
     254             : 
     255             :     /*
     256             :      * At this point we know the number of items for the MCV list. There might
     257             :      * be none (for uniform distribution with many groups), and in that case
     258             :      * there will be no MCV list. Otherwise construct the MCV list.
     259             :      */
     260          82 :     if (nitems > 0)
     261             :     {
     262             :         int         j;
     263             :         SortItem    key;
     264             :         MultiSortSupport tmp;
     265             : 
     266             :         /* frequencies for values in each attribute */
     267             :         SortItem  **freqs;
     268             :         int        *nfreqs;
     269             : 
     270             :         /* used to search values */
     271          80 :         tmp = (MultiSortSupport) palloc(offsetof(MultiSortSupportData, ssup)
     272             :                                         + sizeof(SortSupportData));
     273             : 
     274             :         /* compute frequencies for values in each column */
     275          80 :         nfreqs = (int *) palloc0(sizeof(int) * numattrs);
     276          80 :         freqs = build_column_frequencies(groups, ngroups, mss, nfreqs);
     277             : 
     278             :         /*
     279             :          * Allocate the MCV list structure, set the global parameters.
     280             :          */
     281          80 :         mcvlist = (MCVList *) palloc0(offsetof(MCVList, items) +
     282          80 :                                       sizeof(MCVItem) * nitems);
     283             : 
     284          80 :         mcvlist->magic = STATS_MCV_MAGIC;
     285          80 :         mcvlist->type = STATS_MCV_TYPE_BASIC;
     286          80 :         mcvlist->ndimensions = numattrs;
     287          80 :         mcvlist->nitems = nitems;
     288             : 
     289             :         /* store info about data type OIDs */
     290         300 :         for (i = 0; i < numattrs; i++)
     291         220 :             mcvlist->types[i] = stats[i]->attrtypid;
     292             : 
     293             :         /* Copy the first chunk of groups into the result. */
     294        3398 :         for (i = 0; i < nitems; i++)
     295             :         {
     296             :             /* just pointer to the proper place in the list */
     297        3318 :             MCVItem    *item = &mcvlist->items[i];
     298             : 
     299        3318 :             item->values = (Datum *) palloc(sizeof(Datum) * numattrs);
     300        3318 :             item->isnull = (bool *) palloc(sizeof(bool) * numattrs);
     301             : 
     302             :             /* copy values for the group */
     303        3318 :             memcpy(item->values, groups[i].values, sizeof(Datum) * numattrs);
     304        3318 :             memcpy(item->isnull, groups[i].isnull, sizeof(bool) * numattrs);
     305             : 
     306             :             /* groups should be sorted by frequency in descending order */
     307             :             Assert((i == 0) || (groups[i - 1].count >= groups[i].count));
     308             : 
     309             :             /* group frequency */
     310        3318 :             item->frequency = (double) groups[i].count / numrows;
     311             : 
     312             :             /* base frequency, if the attributes were independent */
     313        3318 :             item->base_frequency = 1.0;
     314       12780 :             for (j = 0; j < numattrs; j++)
     315             :             {
     316             :                 SortItem   *freq;
     317             : 
     318             :                 /* single dimension */
     319        9462 :                 tmp->ndims = 1;
     320        9462 :                 tmp->ssup[0] = mss->ssup[j];
     321             : 
     322             :                 /* fill search key */
     323        9462 :                 key.values = &groups[i].values[j];
     324        9462 :                 key.isnull = &groups[i].isnull[j];
     325             : 
     326        9462 :                 freq = (SortItem *) bsearch_arg(&key, freqs[j], nfreqs[j],
     327             :                                                 sizeof(SortItem),
     328             :                                                 multi_sort_compare, tmp);
     329             : 
     330        9462 :                 item->base_frequency *= ((double) freq->count) / numrows;
     331             :             }
     332             :         }
     333             : 
     334          80 :         pfree(nfreqs);
     335          80 :         pfree(freqs);
     336             :     }
     337             : 
     338          82 :     pfree(items);
     339          82 :     pfree(groups);
     340             : 
     341          82 :     return mcvlist;
     342             : }
     343             : 
     344             : /*
     345             :  * build_mss
     346             :  *  build MultiSortSupport for the attributes passed in attrs
     347             :  */
     348             : static MultiSortSupport
     349          82 : build_mss(VacAttrStats **stats, int numattrs)
     350             : {
     351             :     int         i;
     352             : 
     353             :     /* Sort by multiple columns (using array of SortSupport) */
     354          82 :     MultiSortSupport mss = multi_sort_init(numattrs);
     355             : 
     356             :     /* prepare the sort functions for all the attributes */
     357         308 :     for (i = 0; i < numattrs; i++)
     358             :     {
     359         226 :         VacAttrStats *colstat = stats[i];
     360             :         TypeCacheEntry *type;
     361             : 
     362         226 :         type = lookup_type_cache(colstat->attrtypid, TYPECACHE_LT_OPR);
     363         226 :         if (type->lt_opr == InvalidOid) /* shouldn't happen */
     364           0 :             elog(ERROR, "cache lookup failed for ordering operator for type %u",
     365             :                  colstat->attrtypid);
     366             : 
     367         226 :         multi_sort_add_dimension(mss, i, type->lt_opr, colstat->attrcollid);
     368             :     }
     369             : 
     370          82 :     return mss;
     371             : }
     372             : 
     373             : /*
     374             :  * count_distinct_groups
     375             :  *  count distinct combinations of SortItems in the array
     376             :  *
     377             :  * The array is assumed to be sorted according to the MultiSortSupport.
     378             :  */
     379             : static int
     380          82 : count_distinct_groups(int numrows, SortItem *items, MultiSortSupport mss)
     381             : {
     382             :     int         i;
     383             :     int         ndistinct;
     384             : 
     385          82 :     ndistinct = 1;
     386      312204 :     for (i = 1; i < numrows; i++)
     387             :     {
     388             :         /* make sure the array really is sorted */
     389             :         Assert(multi_sort_compare(&items[i], &items[i - 1], mss) >= 0);
     390             : 
     391      312122 :         if (multi_sort_compare(&items[i], &items[i - 1], mss) != 0)
     392       46224 :             ndistinct += 1;
     393             :     }
     394             : 
     395          82 :     return ndistinct;
     396             : }
     397             : 
     398             : /*
     399             :  * compare_sort_item_count
     400             :  *  comparator for sorting items by count (frequencies) in descending order
     401             :  */
     402             : static int
     403       47110 : compare_sort_item_count(const void *a, const void *b)
     404             : {
     405       47110 :     SortItem   *ia = (SortItem *) a;
     406       47110 :     SortItem   *ib = (SortItem *) b;
     407             : 
     408       47110 :     if (ia->count == ib->count)
     409       46408 :         return 0;
     410         702 :     else if (ia->count > ib->count)
     411         290 :         return -1;
     412             : 
     413         412 :     return 1;
     414             : }
     415             : 
     416             : /*
     417             :  * build_distinct_groups
     418             :  *  build an array of SortItems for distinct groups and counts matching items
     419             :  *
     420             :  * The input array is assumed to be sorted
     421             :  */
     422             : static SortItem *
     423          82 : build_distinct_groups(int numrows, SortItem *items, MultiSortSupport mss,
     424             :                       int *ndistinct)
     425             : {
     426             :     int         i,
     427             :                 j;
     428          82 :     int         ngroups = count_distinct_groups(numrows, items, mss);
     429             : 
     430          82 :     SortItem   *groups = (SortItem *) palloc(ngroups * sizeof(SortItem));
     431             : 
     432          82 :     j = 0;
     433          82 :     groups[0] = items[0];
     434          82 :     groups[0].count = 1;
     435             : 
     436      312204 :     for (i = 1; i < numrows; i++)
     437             :     {
     438             :         /* Assume sorted in ascending order. */
     439             :         Assert(multi_sort_compare(&items[i], &items[i - 1], mss) >= 0);
     440             : 
     441             :         /* New distinct group detected. */
     442      312122 :         if (multi_sort_compare(&items[i], &items[i - 1], mss) != 0)
     443             :         {
     444       46224 :             groups[++j] = items[i];
     445       46224 :             groups[j].count = 0;
     446             :         }
     447             : 
     448      312122 :         groups[j].count++;
     449             :     }
     450             : 
     451             :     /* ensure we filled the expected number of distinct groups */
     452             :     Assert(j + 1 == ngroups);
     453             : 
     454             :     /* Sort the distinct groups by frequency (in descending order). */
     455          82 :     pg_qsort((void *) groups, ngroups, sizeof(SortItem),
     456             :              compare_sort_item_count);
     457             : 
     458          82 :     *ndistinct = ngroups;
     459          82 :     return groups;
     460             : }
     461             : 
     462             : /* compare sort items (single dimension) */
     463             : static int
     464      625394 : sort_item_compare(const void *a, const void *b, void *arg)
     465             : {
     466      625394 :     SortSupport ssup = (SortSupport) arg;
     467      625394 :     SortItem   *ia = (SortItem *) a;
     468      625394 :     SortItem   *ib = (SortItem *) b;
     469             : 
     470     1876182 :     return ApplySortComparator(ia->values[0], ia->isnull[0],
     471      625394 :                                ib->values[0], ib->isnull[0],
     472             :                                ssup);
     473             : }
     474             : 
     475             : /*
     476             :  * build_column_frequencies
     477             :  *  compute frequencies of values in each column
     478             :  *
     479             :  * This returns an array of SortItems for each attribute the MCV is built
     480             :  * on, with a frequency (number of occurrences) for each value. This is
     481             :  * then used to compute "base" frequency of MCV items.
     482             :  *
     483             :  * All the memory is allocated in a single chunk, so that a single pfree
     484             :  * is enough to release it. We do not allocate space for values/isnull
     485             :  * arrays in the SortItems, because we can simply point into the input
     486             :  * groups directly.
     487             :  */
     488             : static SortItem **
     489          80 : build_column_frequencies(SortItem *groups, int ngroups,
     490             :                          MultiSortSupport mss, int *ncounts)
     491             : {
     492             :     int         i,
     493             :                 dim;
     494             :     SortItem  **result;
     495             :     char       *ptr;
     496             : 
     497             :     Assert(groups);
     498             :     Assert(ncounts);
     499             : 
     500             :     /* allocate arrays for all columns as a single chunk */
     501         160 :     ptr = palloc(MAXALIGN(sizeof(SortItem *) * mss->ndims) +
     502          80 :                  mss->ndims * MAXALIGN(sizeof(SortItem) * ngroups));
     503             : 
     504             :     /* initial array of pointers */
     505          80 :     result = (SortItem **) ptr;
     506          80 :     ptr += MAXALIGN(sizeof(SortItem *) * mss->ndims);
     507             : 
     508         300 :     for (dim = 0; dim < mss->ndims; dim++)
     509             :     {
     510         220 :         SortSupport ssup = &mss->ssup[dim];
     511             : 
     512             :         /* array of values for a single column */
     513         220 :         result[dim] = (SortItem *) ptr;
     514         220 :         ptr += MAXALIGN(sizeof(SortItem) * ngroups);
     515             : 
     516             :         /* extract data for the dimension */
     517      134746 :         for (i = 0; i < ngroups; i++)
     518             :         {
     519             :             /* point into the input groups */
     520      134526 :             result[dim][i].values = &groups[i].values[dim];
     521      134526 :             result[dim][i].isnull = &groups[i].isnull[dim];
     522      134526 :             result[dim][i].count = groups[i].count;
     523             :         }
     524             : 
     525             :         /* sort the values, deduplicate */
     526         220 :         qsort_arg((void *) result[dim], ngroups, sizeof(SortItem),
     527             :                   sort_item_compare, ssup);
     528             : 
     529             :         /*
     530             :          * Identify distinct values, compute frequency (there might be
     531             :          * multiple MCV items containing this value, so we need to sum counts
     532             :          * from all of them.
     533             :          */
     534         220 :         ncounts[dim] = 1;
     535      134526 :         for (i = 1; i < ngroups; i++)
     536             :         {
     537      134306 :             if (sort_item_compare(&result[dim][i - 1], &result[dim][i], ssup) == 0)
     538             :             {
     539      124982 :                 result[dim][ncounts[dim] - 1].count += result[dim][i].count;
     540      124982 :                 continue;
     541             :             }
     542             : 
     543        9324 :             result[dim][ncounts[dim]] = result[dim][i];
     544             : 
     545        9324 :             ncounts[dim]++;
     546             :         }
     547             :     }
     548             : 
     549          80 :     return result;
     550             : }
     551             : 
     552             : /*
     553             :  * statext_mcv_load
     554             :  *      Load the MCV list for the indicated pg_statistic_ext tuple
     555             :  */
     556             : MCVList *
     557         172 : statext_mcv_load(Oid mvoid)
     558             : {
     559             :     MCVList    *result;
     560             :     bool        isnull;
     561             :     Datum       mcvlist;
     562         172 :     HeapTuple   htup = SearchSysCache1(STATEXTDATASTXOID, ObjectIdGetDatum(mvoid));
     563             : 
     564         172 :     if (!HeapTupleIsValid(htup))
     565           0 :         elog(ERROR, "cache lookup failed for statistics object %u", mvoid);
     566             : 
     567         172 :     mcvlist = SysCacheGetAttr(STATEXTDATASTXOID, htup,
     568             :                               Anum_pg_statistic_ext_data_stxdmcv, &isnull);
     569             : 
     570         172 :     if (isnull)
     571           0 :         elog(ERROR,
     572             :              "requested statistic kind \"%c\" is not yet built for statistics object %u",
     573             :              STATS_EXT_DEPENDENCIES, mvoid);
     574             : 
     575         172 :     result = statext_mcv_deserialize(DatumGetByteaP(mcvlist));
     576             : 
     577         172 :     ReleaseSysCache(htup);
     578             : 
     579         172 :     return result;
     580             : }
     581             : 
     582             : 
     583             : /*
     584             :  * statext_mcv_serialize
     585             :  *      Serialize MCV list into a pg_mcv_list value.
     586             :  *
     587             :  * The MCV items may include values of various data types, and it's reasonable
     588             :  * to expect redundancy (values for a given attribute, repeated for multiple
     589             :  * MCV list items). So we deduplicate the values into arrays, and then replace
     590             :  * the values by indexes into those arrays.
     591             :  *
     592             :  * The overall structure of the serialized representation looks like this:
     593             :  *
     594             :  * +---------------+----------------+---------------------+-------+
     595             :  * | header fields | dimension info | deduplicated values | items |
     596             :  * +---------------+----------------+---------------------+-------+
     597             :  *
     598             :  * Where dimension info stores information about type of K-th attribute (e.g.
     599             :  * typlen, typbyval and length of deduplicated values).  Deduplicated values
     600             :  * store deduplicated values for each attribute.  And items store the actual
     601             :  * MCV list items, with values replaced by indexes into the arrays.
     602             :  *
     603             :  * When serializing the items, we use uint16 indexes. The number of MCV items
     604             :  * is limited by the statistics target (which is capped to 10k at the moment).
     605             :  * We might increase this to 65k and still fit into uint16, so there's a bit of
     606             :  * slack. Furthermore, this limit is on the number of distinct values per column,
     607             :  * and we usually have few of those (and various combinations of them for the
     608             :  * those MCV list). So uint16 seems fine for now.
     609             :  *
     610             :  * We don't really expect the serialization to save as much space as for
     611             :  * histograms, as we are not doing any bucket splits (which is the source
     612             :  * of high redundancy in histograms).
     613             :  *
     614             :  * TODO: Consider packing boolean flags (NULL) for each item into a single char
     615             :  * (or a longer type) instead of using an array of bool items.
     616             :  */
     617             : bytea *
     618          80 : statext_mcv_serialize(MCVList *mcvlist, VacAttrStats **stats)
     619             : {
     620             :     int         i;
     621             :     int         dim;
     622          80 :     int         ndims = mcvlist->ndimensions;
     623             : 
     624             :     SortSupport ssup;
     625             :     DimensionInfo *info;
     626             : 
     627             :     Size        total_length;
     628             : 
     629             :     /* serialized items (indexes into arrays, etc.) */
     630             :     bytea      *raw;
     631             :     char       *ptr;
     632             :     char       *endptr PG_USED_FOR_ASSERTS_ONLY;
     633             : 
     634             :     /* values per dimension (and number of non-NULL values) */
     635          80 :     Datum     **values = (Datum **) palloc0(sizeof(Datum *) * ndims);
     636          80 :     int        *counts = (int *) palloc0(sizeof(int) * ndims);
     637             : 
     638             :     /*
     639             :      * We'll include some rudimentary information about the attribute types
     640             :      * (length, by-val flag), so that we don't have to look them up while
     641             :      * deserializating the MCV list (we already have the type OID in the
     642             :      * header).  This is safe, because when changing type of the attribute the
     643             :      * statistics gets dropped automatically.  We need to store the info about
     644             :      * the arrays of deduplicated values anyway.
     645             :      */
     646          80 :     info = (DimensionInfo *) palloc0(sizeof(DimensionInfo) * ndims);
     647             : 
     648             :     /* sort support data for all attributes included in the MCV list */
     649          80 :     ssup = (SortSupport) palloc0(sizeof(SortSupportData) * ndims);
     650             : 
     651             :     /* collect and deduplicate values for each dimension (attribute) */
     652         300 :     for (dim = 0; dim < ndims; dim++)
     653             :     {
     654             :         int         ndistinct;
     655             :         TypeCacheEntry *typentry;
     656             : 
     657             :         /*
     658             :          * Lookup the LT operator (can't get it from stats extra_data, as we
     659             :          * don't know how to interpret that - scalar vs. array etc.).
     660             :          */
     661         220 :         typentry = lookup_type_cache(stats[dim]->attrtypid, TYPECACHE_LT_OPR);
     662             : 
     663             :         /* copy important info about the data type (length, by-value) */
     664         220 :         info[dim].typlen = stats[dim]->attrtype->typlen;
     665         220 :         info[dim].typbyval = stats[dim]->attrtype->typbyval;
     666             : 
     667             :         /* allocate space for values in the attribute and collect them */
     668         220 :         values[dim] = (Datum *) palloc0(sizeof(Datum) * mcvlist->nitems);
     669             : 
     670        9682 :         for (i = 0; i < mcvlist->nitems; i++)
     671             :         {
     672             :             /* skip NULL values - we don't need to deduplicate those */
     673        9462 :             if (mcvlist->items[i].isnull[dim])
     674          62 :                 continue;
     675             : 
     676             :             /* append the value at the end */
     677        9400 :             values[dim][counts[dim]] = mcvlist->items[i].values[dim];
     678        9400 :             counts[dim] += 1;
     679             :         }
     680             : 
     681             :         /* if there are just NULL values in this dimension, we're done */
     682         220 :         if (counts[dim] == 0)
     683          10 :             continue;
     684             : 
     685             :         /* sort and deduplicate the data */
     686         210 :         ssup[dim].ssup_cxt = CurrentMemoryContext;
     687         210 :         ssup[dim].ssup_collation = stats[dim]->attrcollid;
     688         210 :         ssup[dim].ssup_nulls_first = false;
     689             : 
     690         210 :         PrepareSortSupportFromOrderingOp(typentry->lt_opr, &ssup[dim]);
     691             : 
     692         210 :         qsort_arg(values[dim], counts[dim], sizeof(Datum),
     693         210 :                   compare_scalars_simple, &ssup[dim]);
     694             : 
     695             :         /*
     696             :          * Walk through the array and eliminate duplicate values, but keep the
     697             :          * ordering (so that we can do bsearch later). We know there's at
     698             :          * least one item as (counts[dim] != 0), so we can skip the first
     699             :          * element.
     700             :          */
     701         210 :         ndistinct = 1;          /* number of distinct values */
     702        9400 :         for (i = 1; i < counts[dim]; i++)
     703             :         {
     704             :             /* expect sorted array */
     705             :             Assert(compare_datums_simple(values[dim][i - 1], values[dim][i], &ssup[dim]) <= 0);
     706             : 
     707             :             /* if the value is the same as the previous one, we can skip it */
     708        9190 :             if (!compare_datums_simple(values[dim][i - 1], values[dim][i], &ssup[dim]))
     709        4188 :                 continue;
     710             : 
     711        5002 :             values[dim][ndistinct] = values[dim][i];
     712        5002 :             ndistinct += 1;
     713             :         }
     714             : 
     715             :         /* we must not exceed PG_UINT16_MAX, as we use uint16 indexes */
     716             :         Assert(ndistinct <= PG_UINT16_MAX);
     717             : 
     718             :         /*
     719             :          * Store additional info about the attribute - number of deduplicated
     720             :          * values, and also size of the serialized data. For fixed-length data
     721             :          * types this is trivial to compute, for varwidth types we need to
     722             :          * actually walk the array and sum the sizes.
     723             :          */
     724         210 :         info[dim].nvalues = ndistinct;
     725             : 
     726         210 :         if (info[dim].typbyval) /* by-value data types */
     727             :         {
     728         122 :             info[dim].nbytes = info[dim].nvalues * info[dim].typlen;
     729             : 
     730             :             /*
     731             :              * We copy the data into the MCV item during deserialization, so
     732             :              * we don't need to allocate any extra space.
     733             :              */
     734         122 :             info[dim].nbytes_aligned = 0;
     735             :         }
     736          88 :         else if (info[dim].typlen > 0)   /* fixed-length by-ref */
     737             :         {
     738             :             /*
     739             :              * We don't care about alignment in the serialized data, so we
     740             :              * pack the data as much as possible. But we also track how much
     741             :              * data will be needed after deserialization, and in that case we
     742             :              * need to account for alignment of each item.
     743             :              *
     744             :              * Note: As the items are fixed-length, we could easily compute
     745             :              * this during deserialization, but we do it here anyway.
     746             :              */
     747          12 :             info[dim].nbytes = info[dim].nvalues * info[dim].typlen;
     748          12 :             info[dim].nbytes_aligned = info[dim].nvalues * MAXALIGN(info[dim].typlen);
     749             :         }
     750          76 :         else if (info[dim].typlen == -1)    /* varlena */
     751             :         {
     752          76 :             info[dim].nbytes = 0;
     753          76 :             info[dim].nbytes_aligned = 0;
     754        1998 :             for (i = 0; i < info[dim].nvalues; i++)
     755             :             {
     756             :                 Size        len;
     757             : 
     758             :                 /*
     759             :                  * For varlena values, we detoast the values and store the
     760             :                  * length and data separately. We don't bother with alignment
     761             :                  * here, which means that during deserialization we need to
     762             :                  * copy the fields and only access the copies.
     763             :                  */
     764        1922 :                 values[dim][i] = PointerGetDatum(PG_DETOAST_DATUM(values[dim][i]));
     765             : 
     766             :                 /* serialized length (uint32 length + data) */
     767        1922 :                 len = VARSIZE_ANY_EXHDR(values[dim][i]);
     768        1922 :                 info[dim].nbytes += sizeof(uint32); /* length */
     769        1922 :                 info[dim].nbytes += len;    /* value (no header) */
     770             : 
     771             :                 /*
     772             :                  * During deserialization we'll build regular varlena values
     773             :                  * with full headers, and we need to align them properly.
     774             :                  */
     775        1922 :                 info[dim].nbytes_aligned += MAXALIGN(VARHDRSZ + len);
     776             :             }
     777             :         }
     778           0 :         else if (info[dim].typlen == -2)    /* cstring */
     779             :         {
     780           0 :             info[dim].nbytes = 0;
     781           0 :             info[dim].nbytes_aligned = 0;
     782           0 :             for (i = 0; i < info[dim].nvalues; i++)
     783             :             {
     784             :                 Size        len;
     785             : 
     786             :                 /*
     787             :                  * For cstring, we do similar thing as for varlena - first we
     788             :                  * store the length as uint32 and then the data. We don't care
     789             :                  * about alignment, which means that during deserialization we
     790             :                  * need to copy the fields and only access the copies.
     791             :                  */
     792             : 
     793             :                 /* c-strings include terminator, so +1 byte */
     794           0 :                 len = strlen(DatumGetCString(values[dim][i])) + 1;
     795           0 :                 info[dim].nbytes += sizeof(uint32); /* length */
     796           0 :                 info[dim].nbytes += len;    /* value */
     797             : 
     798             :                 /* space needed for properly aligned deserialized copies */
     799           0 :                 info[dim].nbytes_aligned += MAXALIGN(len);
     800             :             }
     801             :         }
     802             : 
     803             :         /* we know (count>0) so there must be some data */
     804             :         Assert(info[dim].nbytes > 0);
     805             :     }
     806             : 
     807             :     /*
     808             :      * Now we can finally compute how much space we'll actually need for the
     809             :      * whole serialized MCV list (varlena header, MCV header, dimension info
     810             :      * for each attribute, deduplicated values and items).
     811             :      */
     812          80 :     total_length = (3 * sizeof(uint32)) /* magic + type + nitems */
     813             :         + sizeof(AttrNumber)    /* ndimensions */
     814          80 :         + (ndims * sizeof(Oid));    /* attribute types */
     815             : 
     816             :     /* dimension info */
     817          80 :     total_length += ndims * sizeof(DimensionInfo);
     818             : 
     819             :     /* add space for the arrays of deduplicated values */
     820         300 :     for (i = 0; i < ndims; i++)
     821         220 :         total_length += info[i].nbytes;
     822             : 
     823             :     /*
     824             :      * And finally account for the items (those are fixed-length, thanks to
     825             :      * replacing values with uint16 indexes into the deduplicated arrays).
     826             :      */
     827          80 :     total_length += mcvlist->nitems * ITEM_SIZE(dim);
     828             : 
     829             :     /*
     830             :      * Allocate space for the whole serialized MCV list (we'll skip bytes, so
     831             :      * we set them to zero to make the result more compressible).
     832             :      */
     833          80 :     raw = (bytea *) palloc0(VARHDRSZ + total_length);
     834          80 :     SET_VARSIZE(raw, VARHDRSZ + total_length);
     835             : 
     836          80 :     ptr = VARDATA(raw);
     837          80 :     endptr = ptr + total_length;
     838             : 
     839             :     /* copy the MCV list header fields, one by one */
     840          80 :     memcpy(ptr, &mcvlist->magic, sizeof(uint32));
     841          80 :     ptr += sizeof(uint32);
     842             : 
     843          80 :     memcpy(ptr, &mcvlist->type, sizeof(uint32));
     844          80 :     ptr += sizeof(uint32);
     845             : 
     846          80 :     memcpy(ptr, &mcvlist->nitems, sizeof(uint32));
     847          80 :     ptr += sizeof(uint32);
     848             : 
     849          80 :     memcpy(ptr, &mcvlist->ndimensions, sizeof(AttrNumber));
     850          80 :     ptr += sizeof(AttrNumber);
     851             : 
     852          80 :     memcpy(ptr, mcvlist->types, sizeof(Oid) * ndims);
     853          80 :     ptr += (sizeof(Oid) * ndims);
     854             : 
     855             :     /* store information about the attributes (data amounts, ...) */
     856          80 :     memcpy(ptr, info, sizeof(DimensionInfo) * ndims);
     857          80 :     ptr += sizeof(DimensionInfo) * ndims;
     858             : 
     859             :     /* Copy the deduplicated values for all attributes to the output. */
     860         300 :     for (dim = 0; dim < ndims; dim++)
     861             :     {
     862             :         /* remember the starting point for Asserts later */
     863         220 :         char       *start PG_USED_FOR_ASSERTS_ONLY = ptr;
     864             : 
     865        5432 :         for (i = 0; i < info[dim].nvalues; i++)
     866             :         {
     867        5212 :             Datum       value = values[dim][i];
     868             : 
     869        5212 :             if (info[dim].typbyval) /* passed by value */
     870             :             {
     871             :                 Datum       tmp;
     872             : 
     873             :                 /*
     874             :                  * For values passed by value, we need to copy just the
     875             :                  * significant bytes - we can't use memcpy directly, as that
     876             :                  * assumes little endian behavior.  store_att_byval does
     877             :                  * almost what we need, but it requires properly aligned
     878             :                  * buffer - the output buffer does not guarantee that. So we
     879             :                  * simply use a local Datum variable (which guarantees proper
     880             :                  * alignment), and then copy the value from it.
     881             :                  */
     882        2590 :                 store_att_byval(&tmp, value, info[dim].typlen);
     883             : 
     884        2590 :                 memcpy(ptr, &tmp, info[dim].typlen);
     885        2590 :                 ptr += info[dim].typlen;
     886             :             }
     887        2622 :             else if (info[dim].typlen > 0)   /* passed by reference */
     888             :             {
     889             :                 /* no special alignment needed, treated as char array */
     890         700 :                 memcpy(ptr, DatumGetPointer(value), info[dim].typlen);
     891         700 :                 ptr += info[dim].typlen;
     892             :             }
     893        1922 :             else if (info[dim].typlen == -1)    /* varlena */
     894             :             {
     895        1922 :                 uint32      len = VARSIZE_ANY_EXHDR(DatumGetPointer(value));
     896             : 
     897             :                 /* copy the length */
     898        1922 :                 memcpy(ptr, &len, sizeof(uint32));
     899        1922 :                 ptr += sizeof(uint32);
     900             : 
     901             :                 /* data from the varlena value (without the header) */
     902        1922 :                 memcpy(ptr, VARDATA_ANY(DatumGetPointer(value)), len);
     903        1922 :                 ptr += len;
     904             :             }
     905           0 :             else if (info[dim].typlen == -2)    /* cstring */
     906             :             {
     907           0 :                 uint32      len = (uint32) strlen(DatumGetCString(value)) + 1;
     908             : 
     909             :                 /* copy the length */
     910           0 :                 memcpy(ptr, &len, sizeof(uint32));
     911           0 :                 ptr += sizeof(uint32);
     912             : 
     913             :                 /* value */
     914           0 :                 memcpy(ptr, DatumGetCString(value), len);
     915           0 :                 ptr += len;
     916             :             }
     917             : 
     918             :             /* no underflows or overflows */
     919             :             Assert((ptr > start) && ((ptr - start) <= info[dim].nbytes));
     920             :         }
     921             : 
     922             :         /* we should get exactly nbytes of data for this dimension */
     923             :         Assert((ptr - start) == info[dim].nbytes);
     924             :     }
     925             : 
     926             :     /* Serialize the items, with uint16 indexes instead of the values. */
     927        3398 :     for (i = 0; i < mcvlist->nitems; i++)
     928             :     {
     929        3318 :         MCVItem    *mcvitem = &mcvlist->items[i];
     930             : 
     931             :         /* don't write beyond the allocated space */
     932             :         Assert(ptr <= (endptr - ITEM_SIZE(dim)));
     933             : 
     934             :         /* copy NULL and frequency flags into the serialized MCV */
     935        3318 :         memcpy(ptr, mcvitem->isnull, sizeof(bool) * ndims);
     936        3318 :         ptr += sizeof(bool) * ndims;
     937             : 
     938        3318 :         memcpy(ptr, &mcvitem->frequency, sizeof(double));
     939        3318 :         ptr += sizeof(double);
     940             : 
     941        3318 :         memcpy(ptr, &mcvitem->base_frequency, sizeof(double));
     942        3318 :         ptr += sizeof(double);
     943             : 
     944             :         /* store the indexes last */
     945       12780 :         for (dim = 0; dim < ndims; dim++)
     946             :         {
     947        9462 :             uint16      index = 0;
     948             :             Datum      *value;
     949             : 
     950             :             /* do the lookup only for non-NULL values */
     951        9462 :             if (!mcvitem->isnull[dim])
     952             :             {
     953        9400 :                 value = (Datum *) bsearch_arg(&mcvitem->values[dim], values[dim],
     954        9400 :                                               info[dim].nvalues, sizeof(Datum),
     955        9400 :                                               compare_scalars_simple, &ssup[dim]);
     956             : 
     957             :                 Assert(value != NULL);  /* serialization or deduplication
     958             :                                          * error */
     959             : 
     960             :                 /* compute index within the deduplicated array */
     961        9400 :                 index = (uint16) (value - values[dim]);
     962             : 
     963             :                 /* check the index is within expected bounds */
     964             :                 Assert(index < info[dim].nvalues);
     965             :             }
     966             : 
     967             :             /* copy the index into the serialized MCV */
     968        9462 :             memcpy(ptr, &index, sizeof(uint16));
     969        9462 :             ptr += sizeof(uint16);
     970             :         }
     971             : 
     972             :         /* make sure we don't overflow the allocated value */
     973             :         Assert(ptr <= endptr);
     974             :     }
     975             : 
     976             :     /* at this point we expect to match the total_length exactly */
     977             :     Assert(ptr == endptr);
     978             : 
     979          80 :     pfree(values);
     980          80 :     pfree(counts);
     981             : 
     982          80 :     return raw;
     983             : }
     984             : 
     985             : /*
     986             :  * statext_mcv_deserialize
     987             :  *      Reads serialized MCV list into MCVList structure.
     988             :  *
     989             :  * All the memory needed by the MCV list is allocated as a single chunk, so
     990             :  * it's possible to simply pfree() it at once.
     991             :  */
     992             : MCVList *
     993         180 : statext_mcv_deserialize(bytea *data)
     994             : {
     995             :     int         dim,
     996             :                 i;
     997             :     Size        expected_size;
     998             :     MCVList    *mcvlist;
     999             :     char       *raw;
    1000             :     char       *ptr;
    1001             :     char       *endptr PG_USED_FOR_ASSERTS_ONLY;
    1002             : 
    1003             :     int         ndims,
    1004             :                 nitems;
    1005         180 :     DimensionInfo *info = NULL;
    1006             : 
    1007             :     /* local allocation buffer (used only for deserialization) */
    1008         180 :     Datum     **map = NULL;
    1009             : 
    1010             :     /* MCV list */
    1011             :     Size        mcvlen;
    1012             : 
    1013             :     /* buffer used for the result */
    1014             :     Size        datalen;
    1015             :     char       *dataptr;
    1016             :     char       *valuesptr;
    1017             :     char       *isnullptr;
    1018             : 
    1019         180 :     if (data == NULL)
    1020           0 :         return NULL;
    1021             : 
    1022             :     /*
    1023             :      * We can't possibly deserialize a MCV list if there's not even a complete
    1024             :      * header. We need an explicit formula here, because we serialize the
    1025             :      * header fields one by one, so we need to ignore struct alignment.
    1026             :      */
    1027         180 :     if (VARSIZE_ANY(data) < MinSizeOfMCVList)
    1028           0 :         elog(ERROR, "invalid MCV size %zd (expected at least %zu)",
    1029             :              VARSIZE_ANY(data), MinSizeOfMCVList);
    1030             : 
    1031             :     /* read the MCV list header */
    1032         180 :     mcvlist = (MCVList *) palloc0(offsetof(MCVList, items));
    1033             : 
    1034             :     /* pointer to the data part (skip the varlena header) */
    1035         180 :     raw = (char *) data;
    1036         180 :     ptr = VARDATA_ANY(raw);
    1037         180 :     endptr = (char *) raw + VARSIZE_ANY(data);
    1038             : 
    1039             :     /* get the header and perform further sanity checks */
    1040         180 :     memcpy(&mcvlist->magic, ptr, sizeof(uint32));
    1041         180 :     ptr += sizeof(uint32);
    1042             : 
    1043         180 :     memcpy(&mcvlist->type, ptr, sizeof(uint32));
    1044         180 :     ptr += sizeof(uint32);
    1045             : 
    1046         180 :     memcpy(&mcvlist->nitems, ptr, sizeof(uint32));
    1047         180 :     ptr += sizeof(uint32);
    1048             : 
    1049         180 :     memcpy(&mcvlist->ndimensions, ptr, sizeof(AttrNumber));
    1050         180 :     ptr += sizeof(AttrNumber);
    1051             : 
    1052         180 :     if (mcvlist->magic != STATS_MCV_MAGIC)
    1053           0 :         elog(ERROR, "invalid MCV magic %u (expected %u)",
    1054             :              mcvlist->magic, STATS_MCV_MAGIC);
    1055             : 
    1056         180 :     if (mcvlist->type != STATS_MCV_TYPE_BASIC)
    1057           0 :         elog(ERROR, "invalid MCV type %u (expected %u)",
    1058             :              mcvlist->type, STATS_MCV_TYPE_BASIC);
    1059             : 
    1060         180 :     if (mcvlist->ndimensions == 0)
    1061           0 :         elog(ERROR, "invalid zero-length dimension array in MCVList");
    1062         180 :     else if ((mcvlist->ndimensions > STATS_MAX_DIMENSIONS) ||
    1063         180 :              (mcvlist->ndimensions < 0))
    1064           0 :         elog(ERROR, "invalid length (%d) dimension array in MCVList",
    1065             :              mcvlist->ndimensions);
    1066             : 
    1067         180 :     if (mcvlist->nitems == 0)
    1068           0 :         elog(ERROR, "invalid zero-length item array in MCVList");
    1069         180 :     else if (mcvlist->nitems > STATS_MCVLIST_MAX_ITEMS)
    1070           0 :         elog(ERROR, "invalid length (%u) item array in MCVList",
    1071             :              mcvlist->nitems);
    1072             : 
    1073         180 :     nitems = mcvlist->nitems;
    1074         180 :     ndims = mcvlist->ndimensions;
    1075             : 
    1076             :     /*
    1077             :      * Check amount of data including DimensionInfo for all dimensions and
    1078             :      * also the serialized items (including uint16 indexes). Also, walk
    1079             :      * through the dimension information and add it to the sum.
    1080             :      */
    1081         180 :     expected_size = SizeOfMCVList(ndims, nitems);
    1082             : 
    1083             :     /*
    1084             :      * Check that we have at least the dimension and info records, along with
    1085             :      * the items. We don't know the size of the serialized values yet. We need
    1086             :      * to do this check first, before accessing the dimension info.
    1087             :      */
    1088         180 :     if (VARSIZE_ANY(data) < expected_size)
    1089           0 :         elog(ERROR, "invalid MCV size %zd (expected %zu)",
    1090             :              VARSIZE_ANY(data), expected_size);
    1091             : 
    1092             :     /* Now copy the array of type Oids. */
    1093         180 :     memcpy(mcvlist->types, ptr, sizeof(Oid) * ndims);
    1094         180 :     ptr += (sizeof(Oid) * ndims);
    1095             : 
    1096             :     /* Now it's safe to access the dimension info. */
    1097         180 :     info = palloc(ndims * sizeof(DimensionInfo));
    1098             : 
    1099         180 :     memcpy(info, ptr, ndims * sizeof(DimensionInfo));
    1100         180 :     ptr += (ndims * sizeof(DimensionInfo));
    1101             : 
    1102             :     /* account for the value arrays */
    1103         704 :     for (dim = 0; dim < ndims; dim++)
    1104             :     {
    1105             :         /*
    1106             :          * XXX I wonder if we can/should rely on asserts here. Maybe those
    1107             :          * checks should be done every time?
    1108             :          */
    1109             :         Assert(info[dim].nvalues >= 0);
    1110             :         Assert(info[dim].nbytes >= 0);
    1111             : 
    1112         524 :         expected_size += info[dim].nbytes;
    1113             :     }
    1114             : 
    1115             :     /*
    1116             :      * Now we know the total expected MCV size, including all the pieces
    1117             :      * (header, dimension info. items and deduplicated data). So do the final
    1118             :      * check on size.
    1119             :      */
    1120         180 :     if (VARSIZE_ANY(data) != expected_size)
    1121           0 :         elog(ERROR, "invalid MCV size %zd (expected %zu)",
    1122             :              VARSIZE_ANY(data), expected_size);
    1123             : 
    1124             :     /*
    1125             :      * We need an array of Datum values for each dimension, so that we can
    1126             :      * easily translate the uint16 indexes later. We also need a top-level
    1127             :      * array of pointers to those per-dimension arrays.
    1128             :      *
    1129             :      * While allocating the arrays for dimensions, compute how much space we
    1130             :      * need for a copy of the by-ref data, as we can't simply point to the
    1131             :      * original values (it might go away).
    1132             :      */
    1133         180 :     datalen = 0;                /* space for by-ref data */
    1134         180 :     map = (Datum **) palloc(ndims * sizeof(Datum *));
    1135             : 
    1136         704 :     for (dim = 0; dim < ndims; dim++)
    1137             :     {
    1138         524 :         map[dim] = (Datum *) palloc(sizeof(Datum) * info[dim].nvalues);
    1139             : 
    1140             :         /* space needed for a copy of data for by-ref types */
    1141         524 :         datalen += info[dim].nbytes_aligned;
    1142             :     }
    1143             : 
    1144             :     /*
    1145             :      * Now resize the MCV list so that the allocation includes all the data.
    1146             :      *
    1147             :      * Allocate space for a copy of the data, as we can't simply reference the
    1148             :      * serialized data - it's not aligned properly, and it may disappear while
    1149             :      * we're still using the MCV list, e.g. due to catcache release.
    1150             :      *
    1151             :      * We do care about alignment here, because we will allocate all the
    1152             :      * pieces at once, but then use pointers to different parts.
    1153             :      */
    1154         180 :     mcvlen = MAXALIGN(offsetof(MCVList, items) + (sizeof(MCVItem) * nitems));
    1155             : 
    1156             :     /* arrays of values and isnull flags for all MCV items */
    1157         180 :     mcvlen += nitems * MAXALIGN(sizeof(Datum) * ndims);
    1158         180 :     mcvlen += nitems * MAXALIGN(sizeof(bool) * ndims);
    1159             : 
    1160             :     /* we don't quite need to align this, but it makes some asserts easier */
    1161         180 :     mcvlen += MAXALIGN(datalen);
    1162             : 
    1163             :     /* now resize the deserialized MCV list, and compute pointers to parts */
    1164         180 :     mcvlist = repalloc(mcvlist, mcvlen);
    1165             : 
    1166             :     /* pointer to the beginning of values/isnull arrays */
    1167         180 :     valuesptr = (char *) mcvlist
    1168         180 :         + MAXALIGN(offsetof(MCVList, items) + (sizeof(MCVItem) * nitems));
    1169             : 
    1170         180 :     isnullptr = valuesptr + (nitems * MAXALIGN(sizeof(Datum) * ndims));
    1171             : 
    1172         180 :     dataptr = isnullptr + (nitems * MAXALIGN(sizeof(bool) * ndims));
    1173             : 
    1174             :     /*
    1175             :      * Build mapping (index => value) for translating the serialized data into
    1176             :      * the in-memory representation.
    1177             :      */
    1178         704 :     for (dim = 0; dim < ndims; dim++)
    1179             :     {
    1180             :         /* remember start position in the input array */
    1181         524 :         char       *start PG_USED_FOR_ASSERTS_ONLY = ptr;
    1182             : 
    1183         524 :         if (info[dim].typbyval)
    1184             :         {
    1185             :             /* for by-val types we simply copy data into the mapping */
    1186       14340 :             for (i = 0; i < info[dim].nvalues; i++)
    1187             :             {
    1188       14024 :                 Datum       v = 0;
    1189             : 
    1190       14024 :                 memcpy(&v, ptr, info[dim].typlen);
    1191       14024 :                 ptr += info[dim].typlen;
    1192             : 
    1193       14024 :                 map[dim][i] = fetch_att(&v, true, info[dim].typlen);
    1194             : 
    1195             :                 /* no under/overflow of input array */
    1196             :                 Assert(ptr <= (start + info[dim].nbytes));
    1197             :             }
    1198             :         }
    1199             :         else
    1200             :         {
    1201             :             /* for by-ref types we need to also make a copy of the data */
    1202             : 
    1203             :             /* passed by reference, but fixed length (name, tid, ...) */
    1204         208 :             if (info[dim].typlen > 0)
    1205             :             {
    1206        1424 :                 for (i = 0; i < info[dim].nvalues; i++)
    1207             :                 {
    1208        1400 :                     memcpy(dataptr, ptr, info[dim].typlen);
    1209        1400 :                     ptr += info[dim].typlen;
    1210             : 
    1211             :                     /* just point into the array */
    1212        1400 :                     map[dim][i] = PointerGetDatum(dataptr);
    1213        1400 :                     dataptr += MAXALIGN(info[dim].typlen);
    1214             :                 }
    1215             :             }
    1216         184 :             else if (info[dim].typlen == -1)
    1217             :             {
    1218             :                 /* varlena */
    1219        6632 :                 for (i = 0; i < info[dim].nvalues; i++)
    1220             :                 {
    1221             :                     uint32      len;
    1222             : 
    1223             :                     /* read the uint32 length */
    1224        6448 :                     memcpy(&len, ptr, sizeof(uint32));
    1225        6448 :                     ptr += sizeof(uint32);
    1226             : 
    1227             :                     /* the length is data-only */
    1228        6448 :                     SET_VARSIZE(dataptr, len + VARHDRSZ);
    1229        6448 :                     memcpy(VARDATA(dataptr), ptr, len);
    1230        6448 :                     ptr += len;
    1231             : 
    1232             :                     /* just point into the array */
    1233        6448 :                     map[dim][i] = PointerGetDatum(dataptr);
    1234             : 
    1235             :                     /* skip to place of the next deserialized value */
    1236        6448 :                     dataptr += MAXALIGN(len + VARHDRSZ);
    1237             :                 }
    1238             :             }
    1239           0 :             else if (info[dim].typlen == -2)
    1240             :             {
    1241             :                 /* cstring */
    1242           0 :                 for (i = 0; i < info[dim].nvalues; i++)
    1243             :                 {
    1244             :                     uint32      len;
    1245             : 
    1246           0 :                     memcpy(&len, ptr, sizeof(uint32));
    1247           0 :                     ptr += sizeof(uint32);
    1248             : 
    1249           0 :                     memcpy(dataptr, ptr, len);
    1250           0 :                     ptr += len;
    1251             : 
    1252             :                     /* just point into the array */
    1253           0 :                     map[dim][i] = PointerGetDatum(dataptr);
    1254           0 :                     dataptr += MAXALIGN(len);
    1255             :                 }
    1256             :             }
    1257             : 
    1258             :             /* no under/overflow of input array */
    1259             :             Assert(ptr <= (start + info[dim].nbytes));
    1260             : 
    1261             :             /* no overflow of the output mcv value */
    1262             :             Assert(dataptr <= ((char *) mcvlist + mcvlen));
    1263             :         }
    1264             : 
    1265             :         /* check we consumed input data for this dimension exactly */
    1266             :         Assert(ptr == (start + info[dim].nbytes));
    1267             :     }
    1268             : 
    1269             :     /* we should have also filled the MCV list exactly */
    1270             :     Assert(dataptr == ((char *) mcvlist + mcvlen));
    1271             : 
    1272             :     /* deserialize the MCV items and translate the indexes to Datums */
    1273       13176 :     for (i = 0; i < nitems; i++)
    1274             :     {
    1275       12996 :         MCVItem    *item = &mcvlist->items[i];
    1276             : 
    1277       12996 :         item->values = (Datum *) valuesptr;
    1278       12996 :         valuesptr += MAXALIGN(sizeof(Datum) * ndims);
    1279             : 
    1280       12996 :         item->isnull = (bool *) isnullptr;
    1281       12996 :         isnullptr += MAXALIGN(sizeof(bool) * ndims);
    1282             : 
    1283       12996 :         memcpy(item->isnull, ptr, sizeof(bool) * ndims);
    1284       12996 :         ptr += sizeof(bool) * ndims;
    1285             : 
    1286       12996 :         memcpy(&item->frequency, ptr, sizeof(double));
    1287       12996 :         ptr += sizeof(double);
    1288             : 
    1289       12996 :         memcpy(&item->base_frequency, ptr, sizeof(double));
    1290       12996 :         ptr += sizeof(double);
    1291             : 
    1292             :         /* finally translate the indexes (for non-NULL only) */
    1293       51888 :         for (dim = 0; dim < ndims; dim++)
    1294             :         {
    1295             :             uint16      index;
    1296             : 
    1297       38892 :             memcpy(&index, ptr, sizeof(uint16));
    1298       38892 :             ptr += sizeof(uint16);
    1299             : 
    1300       38892 :             if (item->isnull[dim])
    1301         204 :                 continue;
    1302             : 
    1303       38688 :             item->values[dim] = map[dim][index];
    1304             :         }
    1305             : 
    1306             :         /* check we're not overflowing the input */
    1307             :         Assert(ptr <= endptr);
    1308             :     }
    1309             : 
    1310             :     /* check that we processed all the data */
    1311             :     Assert(ptr == endptr);
    1312             : 
    1313             :     /* release the buffers used for mapping */
    1314         704 :     for (dim = 0; dim < ndims; dim++)
    1315         524 :         pfree(map[dim]);
    1316             : 
    1317         180 :     pfree(map);
    1318             : 
    1319         180 :     return mcvlist;
    1320             : }
    1321             : 
    1322             : /*
    1323             :  * SRF with details about buckets of a histogram:
    1324             :  *
    1325             :  * - item ID (0...nitems)
    1326             :  * - values (string array)
    1327             :  * - nulls only (boolean array)
    1328             :  * - frequency (double precision)
    1329             :  * - base_frequency (double precision)
    1330             :  *
    1331             :  * The input is the OID of the statistics, and there are no rows returned if
    1332             :  * the statistics contains no histogram.
    1333             :  */
    1334             : Datum
    1335          20 : pg_stats_ext_mcvlist_items(PG_FUNCTION_ARGS)
    1336             : {
    1337             :     FuncCallContext *funcctx;
    1338             : 
    1339             :     /* stuff done only on the first call of the function */
    1340          20 :     if (SRF_IS_FIRSTCALL())
    1341             :     {
    1342             :         MemoryContext oldcontext;
    1343             :         MCVList    *mcvlist;
    1344             :         TupleDesc   tupdesc;
    1345             : 
    1346             :         /* create a function context for cross-call persistence */
    1347           8 :         funcctx = SRF_FIRSTCALL_INIT();
    1348             : 
    1349             :         /* switch to memory context appropriate for multiple function calls */
    1350           8 :         oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
    1351             : 
    1352           8 :         mcvlist = statext_mcv_deserialize(PG_GETARG_BYTEA_P(0));
    1353             : 
    1354           8 :         funcctx->user_fctx = mcvlist;
    1355             : 
    1356             :         /* total number of tuples to be returned */
    1357           8 :         funcctx->max_calls = 0;
    1358           8 :         if (funcctx->user_fctx != NULL)
    1359           8 :             funcctx->max_calls = mcvlist->nitems;
    1360             : 
    1361             :         /* Build a tuple descriptor for our result type */
    1362           8 :         if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
    1363           0 :             ereport(ERROR,
    1364             :                     (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
    1365             :                      errmsg("function returning record called in context "
    1366             :                             "that cannot accept type record")));
    1367           8 :         tupdesc = BlessTupleDesc(tupdesc);
    1368             : 
    1369             :         /*
    1370             :          * generate attribute metadata needed later to produce tuples from raw
    1371             :          * C strings
    1372             :          */
    1373           8 :         funcctx->attinmeta = TupleDescGetAttInMetadata(tupdesc);
    1374             : 
    1375           8 :         MemoryContextSwitchTo(oldcontext);
    1376             :     }
    1377             : 
    1378             :     /* stuff done on every call of the function */
    1379          20 :     funcctx = SRF_PERCALL_SETUP();
    1380             : 
    1381          20 :     if (funcctx->call_cntr < funcctx->max_calls)   /* do when there is more
    1382             :                                                      * left to send */
    1383             :     {
    1384             :         Datum       values[5];
    1385             :         bool        nulls[5];
    1386             :         HeapTuple   tuple;
    1387             :         Datum       result;
    1388          12 :         ArrayBuildState *astate_values = NULL;
    1389          12 :         ArrayBuildState *astate_nulls = NULL;
    1390             : 
    1391             :         int         i;
    1392             :         MCVList    *mcvlist;
    1393             :         MCVItem    *item;
    1394             : 
    1395          12 :         mcvlist = (MCVList *) funcctx->user_fctx;
    1396             : 
    1397             :         Assert(funcctx->call_cntr < mcvlist->nitems);
    1398             : 
    1399          12 :         item = &mcvlist->items[funcctx->call_cntr];
    1400             : 
    1401          48 :         for (i = 0; i < mcvlist->ndimensions; i++)
    1402             :         {
    1403             : 
    1404          36 :             astate_nulls = accumArrayResult(astate_nulls,
    1405          36 :                                             BoolGetDatum(item->isnull[i]),
    1406             :                                             false,
    1407             :                                             BOOLOID,
    1408             :                                             CurrentMemoryContext);
    1409             : 
    1410          36 :             if (!item->isnull[i])
    1411             :             {
    1412             :                 bool        isvarlena;
    1413             :                 Oid         outfunc;
    1414             :                 FmgrInfo    fmgrinfo;
    1415             :                 Datum       val;
    1416             :                 text       *txt;
    1417             : 
    1418             :                 /* lookup output func for the type */
    1419          20 :                 getTypeOutputInfo(mcvlist->types[i], &outfunc, &isvarlena);
    1420          20 :                 fmgr_info(outfunc, &fmgrinfo);
    1421             : 
    1422          20 :                 val = FunctionCall1(&fmgrinfo, item->values[i]);
    1423          20 :                 txt = cstring_to_text(DatumGetPointer(val));
    1424             : 
    1425          20 :                 astate_values = accumArrayResult(astate_values,
    1426             :                                                  PointerGetDatum(txt),
    1427             :                                                  false,
    1428             :                                                  TEXTOID,
    1429             :                                                  CurrentMemoryContext);
    1430             :             }
    1431             :             else
    1432          16 :                 astate_values = accumArrayResult(astate_values,
    1433             :                                                  (Datum) 0,
    1434             :                                                  true,
    1435             :                                                  TEXTOID,
    1436             :                                                  CurrentMemoryContext);
    1437             :         }
    1438             : 
    1439          12 :         values[0] = Int32GetDatum(funcctx->call_cntr);
    1440          12 :         values[1] = PointerGetDatum(makeArrayResult(astate_values, CurrentMemoryContext));
    1441          12 :         values[2] = PointerGetDatum(makeArrayResult(astate_nulls, CurrentMemoryContext));
    1442          12 :         values[3] = Float8GetDatum(item->frequency);
    1443          12 :         values[4] = Float8GetDatum(item->base_frequency);
    1444             : 
    1445             :         /* no NULLs in the tuple */
    1446          12 :         memset(nulls, 0, sizeof(nulls));
    1447             : 
    1448             :         /* build a tuple */
    1449          12 :         tuple = heap_form_tuple(funcctx->attinmeta->tupdesc, values, nulls);
    1450             : 
    1451             :         /* make the tuple into a datum */
    1452          12 :         result = HeapTupleGetDatum(tuple);
    1453             : 
    1454          12 :         SRF_RETURN_NEXT(funcctx, result);
    1455             :     }
    1456             :     else                        /* do when there is no more left */
    1457             :     {
    1458           8 :         SRF_RETURN_DONE(funcctx);
    1459             :     }
    1460             : }
    1461             : 
    1462             : /*
    1463             :  * pg_mcv_list_in       - input routine for type pg_mcv_list.
    1464             :  *
    1465             :  * pg_mcv_list is real enough to be a table column, but it has no operations
    1466             :  * of its own, and disallows input too
    1467             :  */
    1468             : Datum
    1469           0 : pg_mcv_list_in(PG_FUNCTION_ARGS)
    1470             : {
    1471             :     /*
    1472             :      * pg_mcv_list stores the data in binary form and parsing text input is
    1473             :      * not needed, so disallow this.
    1474             :      */
    1475           0 :     ereport(ERROR,
    1476             :             (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
    1477             :              errmsg("cannot accept a value of type %s", "pg_mcv_list")));
    1478             : 
    1479             :     PG_RETURN_VOID();           /* keep compiler quiet */
    1480             : }
    1481             : 
    1482             : 
    1483             : /*
    1484             :  * pg_mcv_list_out      - output routine for type pg_mcv_list.
    1485             :  *
    1486             :  * MCV lists are serialized into a bytea value, so we simply call byteaout()
    1487             :  * to serialize the value into text. But it'd be nice to serialize that into
    1488             :  * a meaningful representation (e.g. for inspection by people).
    1489             :  *
    1490             :  * XXX This should probably return something meaningful, similar to what
    1491             :  * pg_dependencies_out does. Not sure how to deal with the deduplicated
    1492             :  * values, though - do we want to expand that or not?
    1493             :  */
    1494             : Datum
    1495           0 : pg_mcv_list_out(PG_FUNCTION_ARGS)
    1496             : {
    1497           0 :     return byteaout(fcinfo);
    1498             : }
    1499             : 
    1500             : /*
    1501             :  * pg_mcv_list_recv     - binary input routine for type pg_mcv_list.
    1502             :  */
    1503             : Datum
    1504           0 : pg_mcv_list_recv(PG_FUNCTION_ARGS)
    1505             : {
    1506           0 :     ereport(ERROR,
    1507             :             (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
    1508             :              errmsg("cannot accept a value of type %s", "pg_mcv_list")));
    1509             : 
    1510             :     PG_RETURN_VOID();           /* keep compiler quiet */
    1511             : }
    1512             : 
    1513             : /*
    1514             :  * pg_mcv_list_send     - binary output routine for type pg_mcv_list.
    1515             :  *
    1516             :  * MCV lists are serialized in a bytea value (although the type is named
    1517             :  * differently), so let's just send that.
    1518             :  */
    1519             : Datum
    1520           0 : pg_mcv_list_send(PG_FUNCTION_ARGS)
    1521             : {
    1522           0 :     return byteasend(fcinfo);
    1523             : }
    1524             : 
    1525             : /*
    1526             :  * mcv_get_match_bitmap
    1527             :  *  Evaluate clauses using the MCV list, and update the match bitmap.
    1528             :  *
    1529             :  * A match bitmap keeps match/mismatch status for each MCV item, and we
    1530             :  * update it based on additional clauses. We also use it to skip items
    1531             :  * that can't possibly match (e.g. item marked as "mismatch" can't change
    1532             :  * to "match" when evaluating AND clause list).
    1533             :  *
    1534             :  * The function also returns a flag indicating whether there was an
    1535             :  * equality condition for all attributes, the minimum frequency in the MCV
    1536             :  * list, and a total MCV frequency (sum of frequencies for all items).
    1537             :  *
    1538             :  * XXX Currently the match bitmap uses a bool for each MCV item, which is
    1539             :  * somewhat wasteful as we could do with just a single bit, thus reducing
    1540             :  * the size to ~1/8. It would also allow us to combine bitmaps simply using
    1541             :  * & and |, which should be faster than min/max. The bitmaps are fairly
    1542             :  * small, though (thanks to the cap on the MCV list size).
    1543             :  */
    1544             : static bool *
    1545         208 : mcv_get_match_bitmap(PlannerInfo *root, List *clauses,
    1546             :                      Bitmapset *keys, MCVList *mcvlist, bool is_or)
    1547             : {
    1548             :     int         i;
    1549             :     ListCell   *l;
    1550             :     bool       *matches;
    1551             : 
    1552             :     /* The bitmap may be partially built. */
    1553             :     Assert(clauses != NIL);
    1554             :     Assert(list_length(clauses) >= 1);
    1555             :     Assert(mcvlist != NULL);
    1556             :     Assert(mcvlist->nitems > 0);
    1557             :     Assert(mcvlist->nitems <= STATS_MCVLIST_MAX_ITEMS);
    1558             : 
    1559         208 :     matches = palloc(sizeof(bool) * mcvlist->nitems);
    1560         208 :     memset(matches, (is_or) ? false : true,
    1561         208 :            sizeof(bool) * mcvlist->nitems);
    1562             : 
    1563             :     /*
    1564             :      * Loop through the list of clauses, and for each of them evaluate all the
    1565             :      * MCV items not yet eliminated by the preceding clauses.
    1566             :      */
    1567         660 :     foreach(l, clauses)
    1568             :     {
    1569         452 :         Node       *clause = (Node *) lfirst(l);
    1570             : 
    1571             :         /* if it's a RestrictInfo, then extract the clause */
    1572         452 :         if (IsA(clause, RestrictInfo))
    1573         392 :             clause = (Node *) ((RestrictInfo *) clause)->clause;
    1574             : 
    1575             :         /*
    1576             :          * Handle the various types of clauses - OpClause, NullTest and
    1577             :          * AND/OR/NOT
    1578             :          */
    1579         452 :         if (is_opclause(clause))
    1580             :         {
    1581         228 :             OpExpr     *expr = (OpExpr *) clause;
    1582             :             FmgrInfo    opproc;
    1583             : 
    1584             :             /* valid only after examine_clause_args returns true */
    1585             :             Var        *var;
    1586             :             Const      *cst;
    1587             :             bool        varonleft;
    1588             : 
    1589         228 :             fmgr_info(get_opcode(expr->opno), &opproc);
    1590             : 
    1591             :             /* extract the var and const from the expression */
    1592         228 :             if (examine_clause_args(expr->args, &var, &cst, &varonleft))
    1593             :             {
    1594             :                 int         idx;
    1595             : 
    1596             :                 /* match the attribute to a dimension of the statistic */
    1597         228 :                 idx = bms_member_index(keys, var->varattno);
    1598             : 
    1599             :                 /*
    1600             :                  * Walk through the MCV items and evaluate the current clause.
    1601             :                  * We can skip items that were already ruled out, and
    1602             :                  * terminate if there are no remaining MCV items that might
    1603             :                  * possibly match.
    1604             :                  */
    1605       17276 :                 for (i = 0; i < mcvlist->nitems; i++)
    1606             :                 {
    1607       17048 :                     bool        match = true;
    1608       17048 :                     MCVItem    *item = &mcvlist->items[i];
    1609             : 
    1610             :                     /*
    1611             :                      * When the MCV item or the Const value is NULL we can
    1612             :                      * treat this as a mismatch. We must not call the operator
    1613             :                      * because of strictness.
    1614             :                      */
    1615       17048 :                     if (item->isnull[idx] || cst->constisnull)
    1616             :                     {
    1617          32 :                         matches[i] = RESULT_MERGE(matches[i], is_or, false);
    1618          32 :                         continue;
    1619             :                     }
    1620             : 
    1621             :                     /*
    1622             :                      * Skip MCV items that can't change result in the bitmap.
    1623             :                      * Once the value gets false for AND-lists, or true for
    1624             :                      * OR-lists, we don't need to look at more clauses.
    1625             :                      */
    1626       17016 :                     if (RESULT_IS_FINAL(matches[i], is_or))
    1627        9160 :                         continue;
    1628             : 
    1629             :                     /*
    1630             :                      * First check whether the constant is below the lower
    1631             :                      * boundary (in that case we can skip the bucket, because
    1632             :                      * there's no overlap).
    1633             :                      *
    1634             :                      * We don't store collations used to build the statistics,
    1635             :                      * but we can use the collation for the attribute itself,
    1636             :                      * as stored in varcollid. We do reset the statistics
    1637             :                      * after a type change (including collation change), so
    1638             :                      * this is OK. We may need to relax this after allowing
    1639             :                      * extended statistics on expressions.
    1640             :                      */
    1641        7856 :                     if (varonleft)
    1642        6196 :                         match = DatumGetBool(FunctionCall2Coll(&opproc,
    1643             :                                                                var->varcollid,
    1644             :                                                                item->values[idx],
    1645             :                                                                cst->constvalue));
    1646             :                     else
    1647        1660 :                         match = DatumGetBool(FunctionCall2Coll(&opproc,
    1648             :                                                                var->varcollid,
    1649             :                                                                cst->constvalue,
    1650             :                                                                item->values[idx]));
    1651             : 
    1652             :                     /* update the match bitmap with the result */
    1653        7856 :                     matches[i] = RESULT_MERGE(matches[i], is_or, match);
    1654             :                 }
    1655             :             }
    1656             :         }
    1657         224 :         else if (IsA(clause, ScalarArrayOpExpr))
    1658             :         {
    1659          96 :             ScalarArrayOpExpr *expr = (ScalarArrayOpExpr *) clause;
    1660             :             FmgrInfo    opproc;
    1661             : 
    1662             :             /* valid only after examine_clause_args returns true */
    1663             :             Var        *var;
    1664             :             Const      *cst;
    1665             :             bool        varonleft;
    1666             : 
    1667          96 :             fmgr_info(get_opcode(expr->opno), &opproc);
    1668             : 
    1669             :             /* extract the var and const from the expression */
    1670          96 :             if (examine_clause_args(expr->args, &var, &cst, &varonleft))
    1671             :             {
    1672             :                 int         idx;
    1673             : 
    1674             :                 ArrayType  *arrayval;
    1675             :                 int16       elmlen;
    1676             :                 bool        elmbyval;
    1677             :                 char        elmalign;
    1678             :                 int         num_elems;
    1679             :                 Datum      *elem_values;
    1680             :                 bool       *elem_nulls;
    1681             : 
    1682             :                 /* ScalarArrayOpExpr has the Var always on the left */
    1683             :                 Assert(varonleft);
    1684             : 
    1685          96 :                 if (!cst->constisnull)
    1686             :                 {
    1687          96 :                     arrayval = DatumGetArrayTypeP(cst->constvalue);
    1688          96 :                     get_typlenbyvalalign(ARR_ELEMTYPE(arrayval),
    1689             :                                          &elmlen, &elmbyval, &elmalign);
    1690          96 :                     deconstruct_array(arrayval,
    1691             :                                       ARR_ELEMTYPE(arrayval),
    1692             :                                       elmlen, elmbyval, elmalign,
    1693             :                                       &elem_values, &elem_nulls, &num_elems);
    1694             :                 }
    1695             : 
    1696             :                 /* match the attribute to a dimension of the statistic */
    1697          96 :                 idx = bms_member_index(keys, var->varattno);
    1698             : 
    1699             :                 /*
    1700             :                  * Walk through the MCV items and evaluate the current clause.
    1701             :                  * We can skip items that were already ruled out, and
    1702             :                  * terminate if there are no remaining MCV items that might
    1703             :                  * possibly match.
    1704             :                  */
    1705        9696 :                 for (i = 0; i < mcvlist->nitems; i++)
    1706             :                 {
    1707             :                     int         j;
    1708        9600 :                     bool        match = (expr->useOr ? false : true);
    1709        9600 :                     MCVItem    *item = &mcvlist->items[i];
    1710             : 
    1711             :                     /*
    1712             :                      * When the MCV item or the Const value is NULL we can
    1713             :                      * treat this as a mismatch. We must not call the operator
    1714             :                      * because of strictness.
    1715             :                      */
    1716        9600 :                     if (item->isnull[idx] || cst->constisnull)
    1717             :                     {
    1718          12 :                         matches[i] = RESULT_MERGE(matches[i], is_or, false);
    1719          12 :                         continue;
    1720             :                     }
    1721             : 
    1722             :                     /*
    1723             :                      * Skip MCV items that can't change result in the bitmap.
    1724             :                      * Once the value gets false for AND-lists, or true for
    1725             :                      * OR-lists, we don't need to look at more clauses.
    1726             :                      */
    1727        9588 :                     if (RESULT_IS_FINAL(matches[i], is_or))
    1728        5004 :                         continue;
    1729             : 
    1730       17288 :                     for (j = 0; j < num_elems; j++)
    1731             :                     {
    1732       14416 :                         Datum       elem_value = elem_values[j];
    1733       14416 :                         bool        elem_isnull = elem_nulls[j];
    1734             :                         bool        elem_match;
    1735             : 
    1736             :                         /* NULL values always evaluate as not matching. */
    1737       14416 :                         if (elem_isnull)
    1738             :                         {
    1739        1240 :                             match = RESULT_MERGE(match, expr->useOr, false);
    1740        1240 :                             continue;
    1741             :                         }
    1742             : 
    1743             :                         /*
    1744             :                          * Stop evaluating the array elements once we reach
    1745             :                          * match value that can't change - ALL() is the same
    1746             :                          * as AND-list, ANY() is the same as OR-list.
    1747             :                          */
    1748       13176 :                         if (RESULT_IS_FINAL(match, expr->useOr))
    1749        1712 :                             break;
    1750             : 
    1751       11464 :                         elem_match = DatumGetBool(FunctionCall2Coll(&opproc,
    1752             :                                                                     var->varcollid,
    1753             :                                                                     item->values[idx],
    1754             :                                                                     elem_value));
    1755             : 
    1756       11464 :                         match = RESULT_MERGE(match, expr->useOr, elem_match);
    1757             :                     }
    1758             : 
    1759             :                     /* update the match bitmap with the result */
    1760        4584 :                     matches[i] = RESULT_MERGE(matches[i], is_or, match);
    1761             :                 }
    1762             :             }
    1763             :         }
    1764         128 :         else if (IsA(clause, NullTest))
    1765             :         {
    1766          44 :             NullTest   *expr = (NullTest *) clause;
    1767          44 :             Var        *var = (Var *) (expr->arg);
    1768             : 
    1769             :             /* match the attribute to a dimension of the statistic */
    1770          44 :             int         idx = bms_member_index(keys, var->varattno);
    1771             : 
    1772             :             /*
    1773             :              * Walk through the MCV items and evaluate the current clause. We
    1774             :              * can skip items that were already ruled out, and terminate if
    1775             :              * there are no remaining MCV items that might possibly match.
    1776             :              */
    1777        4052 :             for (i = 0; i < mcvlist->nitems; i++)
    1778             :             {
    1779        4008 :                 bool        match = false;  /* assume mismatch */
    1780        4008 :                 MCVItem    *item = &mcvlist->items[i];
    1781             : 
    1782             :                 /* if the clause mismatches the MCV item, update the bitmap */
    1783        4008 :                 switch (expr->nulltesttype)
    1784             :                 {
    1785        2808 :                     case IS_NULL:
    1786        2808 :                         match = (item->isnull[idx]) ? true : match;
    1787        2808 :                         break;
    1788             : 
    1789        1200 :                     case IS_NOT_NULL:
    1790        1200 :                         match = (!item->isnull[idx]) ? true : match;
    1791        1200 :                         break;
    1792             :                 }
    1793             : 
    1794             :                 /* now, update the match bitmap, depending on OR/AND type */
    1795        4008 :                 matches[i] = RESULT_MERGE(matches[i], is_or, match);
    1796             :             }
    1797             :         }
    1798          84 :         else if (is_orclause(clause) || is_andclause(clause))
    1799          16 :         {
    1800             :             /* AND/OR clause, with all subclauses being compatible */
    1801             : 
    1802             :             int         i;
    1803          16 :             BoolExpr   *bool_clause = ((BoolExpr *) clause);
    1804          16 :             List       *bool_clauses = bool_clause->args;
    1805             : 
    1806             :             /* match/mismatch bitmap for each MCV item */
    1807          16 :             bool       *bool_matches = NULL;
    1808             : 
    1809             :             Assert(bool_clauses != NIL);
    1810             :             Assert(list_length(bool_clauses) >= 2);
    1811             : 
    1812             :             /* build the match bitmap for the OR-clauses */
    1813          16 :             bool_matches = mcv_get_match_bitmap(root, bool_clauses, keys,
    1814          16 :                                                 mcvlist, is_orclause(clause));
    1815             : 
    1816             :             /*
    1817             :              * Merge the bitmap produced by mcv_get_match_bitmap into the
    1818             :              * current one. We need to consider if we're evaluating AND or OR
    1819             :              * condition when merging the results.
    1820             :              */
    1821         440 :             for (i = 0; i < mcvlist->nitems; i++)
    1822         424 :                 matches[i] = RESULT_MERGE(matches[i], is_or, bool_matches[i]);
    1823             : 
    1824          16 :             pfree(bool_matches);
    1825             :         }
    1826          68 :         else if (is_notclause(clause))
    1827             :         {
    1828             :             /* NOT clause, with all subclauses compatible */
    1829             : 
    1830             :             int         i;
    1831          20 :             BoolExpr   *not_clause = ((BoolExpr *) clause);
    1832          20 :             List       *not_args = not_clause->args;
    1833             : 
    1834             :             /* match/mismatch bitmap for each MCV item */
    1835          20 :             bool       *not_matches = NULL;
    1836             : 
    1837             :             Assert(not_args != NIL);
    1838             :             Assert(list_length(not_args) == 1);
    1839             : 
    1840             :             /* build the match bitmap for the NOT-clause */
    1841          20 :             not_matches = mcv_get_match_bitmap(root, not_args, keys,
    1842             :                                                mcvlist, false);
    1843             : 
    1844             :             /*
    1845             :              * Merge the bitmap produced by mcv_get_match_bitmap into the
    1846             :              * current one. We're handling a NOT clause, so invert the result
    1847             :              * before merging it into the global bitmap.
    1848             :              */
    1849         100 :             for (i = 0; i < mcvlist->nitems; i++)
    1850          80 :                 matches[i] = RESULT_MERGE(matches[i], is_or, !not_matches[i]);
    1851             : 
    1852          20 :             pfree(not_matches);
    1853             :         }
    1854          48 :         else if (IsA(clause, Var))
    1855             :         {
    1856             :             /* Var (has to be a boolean Var, possibly from below NOT) */
    1857             : 
    1858          48 :             Var        *var = (Var *) (clause);
    1859             : 
    1860             :             /* match the attribute to a dimension of the statistic */
    1861          48 :             int         idx = bms_member_index(keys, var->varattno);
    1862             : 
    1863             :             Assert(var->vartype == BOOLOID);
    1864             : 
    1865             :             /*
    1866             :              * Walk through the MCV items and evaluate the current clause. We
    1867             :              * can skip items that were already ruled out, and terminate if
    1868             :              * there are no remaining MCV items that might possibly match.
    1869             :              */
    1870         240 :             for (i = 0; i < mcvlist->nitems; i++)
    1871             :             {
    1872         192 :                 MCVItem    *item = &mcvlist->items[i];
    1873         192 :                 bool        match = false;
    1874             : 
    1875             :                 /* if the item is NULL, it's a mismatch */
    1876         192 :                 if (!item->isnull[idx] && DatumGetBool(item->values[idx]))
    1877          96 :                     match = true;
    1878             : 
    1879             :                 /* update the result bitmap */
    1880         192 :                 matches[i] = RESULT_MERGE(matches[i], is_or, match);
    1881             :             }
    1882             :         }
    1883             :         else
    1884           0 :             elog(ERROR, "unknown clause type: %d", clause->type);
    1885             :     }
    1886             : 
    1887         208 :     return matches;
    1888             : }
    1889             : 
    1890             : 
    1891             : /*
    1892             :  * mcv_clauselist_selectivity
    1893             :  *      Return the selectivity estimate computed using an MCV list.
    1894             :  *
    1895             :  * First builds a bitmap of MCV items matching the clauses, and then sums
    1896             :  * the frequencies of matching items.
    1897             :  *
    1898             :  * It also produces two additional interesting selectivities - total
    1899             :  * selectivity of all the MCV items (not just the matching ones), and the
    1900             :  * base frequency computed on the assumption of independence.
    1901             :  */
    1902             : Selectivity
    1903         172 : mcv_clauselist_selectivity(PlannerInfo *root, StatisticExtInfo *stat,
    1904             :                            List *clauses, int varRelid,
    1905             :                            JoinType jointype, SpecialJoinInfo *sjinfo,
    1906             :                            RelOptInfo *rel,
    1907             :                            Selectivity *basesel, Selectivity *totalsel)
    1908             : {
    1909             :     int         i;
    1910             :     MCVList    *mcv;
    1911         172 :     Selectivity s = 0.0;
    1912             : 
    1913             :     /* match/mismatch bitmap for each MCV item */
    1914         172 :     bool       *matches = NULL;
    1915             : 
    1916             :     /* load the MCV list stored in the statistics object */
    1917         172 :     mcv = statext_mcv_load(stat->statOid);
    1918             : 
    1919             :     /* build a match bitmap for the clauses */
    1920         172 :     matches = mcv_get_match_bitmap(root, clauses, stat->keys, mcv, false);
    1921             : 
    1922             :     /* sum frequencies for all the matching MCV items */
    1923         172 :     *basesel = 0.0;
    1924         172 :     *totalsel = 0.0;
    1925       13156 :     for (i = 0; i < mcv->nitems; i++)
    1926             :     {
    1927       12984 :         *totalsel += mcv->items[i].frequency;
    1928             : 
    1929       12984 :         if (matches[i] != false)
    1930             :         {
    1931             :             /* XXX Shouldn't the basesel be outside the if condition? */
    1932         236 :             *basesel += mcv->items[i].base_frequency;
    1933         236 :             s += mcv->items[i].frequency;
    1934             :         }
    1935             :     }
    1936             : 
    1937         172 :     return s;
    1938             : }

Generated by: LCOV version 1.13