LCOV - code coverage report
Current view: top level - src/backend/utils/misc - sampling.c (source / functions) Coverage Total Hit
Test: PostgreSQL 19devel Lines: 43.5 % 85 37
Test Date: 2026-03-03 03:15:11 Functions: 70.0 % 10 7
Legend: Lines:     hit not hit

            Line data    Source code
       1              : /*-------------------------------------------------------------------------
       2              :  *
       3              :  * sampling.c
       4              :  *    Relation block sampling routines.
       5              :  *
       6              :  * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
       7              :  * Portions Copyright (c) 1994, Regents of the University of California
       8              :  *
       9              :  *
      10              :  * IDENTIFICATION
      11              :  *    src/backend/utils/misc/sampling.c
      12              :  *
      13              :  *-------------------------------------------------------------------------
      14              :  */
      15              : 
      16              : #include "postgres.h"
      17              : 
      18              : #include <math.h>
      19              : 
      20              : #include "utils/sampling.h"
      21              : 
      22              : 
      23              : /*
      24              :  * BlockSampler_Init -- prepare for random sampling of blocknumbers
      25              :  *
      26              :  * BlockSampler provides algorithm for block level sampling of a relation
      27              :  * as discussed on pgsql-hackers 2004-04-02 (subject "Large DB")
      28              :  * It selects a random sample of samplesize blocks out of
      29              :  * the nblocks blocks in the table. If the table has less than
      30              :  * samplesize blocks, all blocks are selected.
      31              :  *
      32              :  * Since we know the total number of blocks in advance, we can use the
      33              :  * straightforward Algorithm S from Knuth 3.4.2, rather than Vitter's
      34              :  * algorithm.
      35              :  *
      36              :  * Returns the number of blocks that BlockSampler_Next will return.
      37              :  */
      38              : BlockNumber
      39         9021 : BlockSampler_Init(BlockSampler bs, BlockNumber nblocks, int samplesize,
      40              :                   uint32 randseed)
      41              : {
      42         9021 :     bs->N = nblocks;         /* measured table size */
      43              : 
      44              :     /*
      45              :      * If we decide to reduce samplesize for tables that have less or not much
      46              :      * more than samplesize blocks, here is the place to do it.
      47              :      */
      48         9021 :     bs->n = samplesize;
      49         9021 :     bs->t = 0;                   /* blocks scanned so far */
      50         9021 :     bs->m = 0;                   /* blocks selected so far */
      51              : 
      52         9021 :     sampler_random_init_state(randseed, &bs->randstate);
      53              : 
      54         9021 :     return Min(bs->n, bs->N);
      55              : }
      56              : 
      57              : bool
      58        83283 : BlockSampler_HasMore(BlockSampler bs)
      59              : {
      60        83283 :     return (bs->t < bs->N) && (bs->m < bs->n);
      61              : }
      62              : 
      63              : BlockNumber
      64        74262 : BlockSampler_Next(BlockSampler bs)
      65              : {
      66        74262 :     BlockNumber K = bs->N - bs->t;    /* remaining blocks */
      67        74262 :     int         k = bs->n - bs->m;    /* blocks still to sample */
      68              :     double      p;              /* probability to skip block */
      69              :     double      V;              /* random */
      70              : 
      71              :     Assert(BlockSampler_HasMore(bs));   /* hence K > 0 and k > 0 */
      72              : 
      73        74262 :     if ((BlockNumber) k >= K)
      74              :     {
      75              :         /* need all the rest */
      76        74262 :         bs->m++;
      77        74262 :         return bs->t++;
      78              :     }
      79              : 
      80              :     /*----------
      81              :      * It is not obvious that this code matches Knuth's Algorithm S.
      82              :      * Knuth says to skip the current block with probability 1 - k/K.
      83              :      * If we are to skip, we should advance t (hence decrease K), and
      84              :      * repeat the same probabilistic test for the next block.  The naive
      85              :      * implementation thus requires a sampler_random_fract() call for each
      86              :      * block number.  But we can reduce this to one sampler_random_fract()
      87              :      * call per selected block, by noting that each time the while-test
      88              :      * succeeds, we can reinterpret V as a uniform random number in the range
      89              :      * 0 to p. Therefore, instead of choosing a new V, we just adjust p to be
      90              :      * the appropriate fraction of its former value, and our next loop
      91              :      * makes the appropriate probabilistic test.
      92              :      *
      93              :      * We have initially K > k > 0.  If the loop reduces K to equal k,
      94              :      * the next while-test must fail since p will become exactly zero
      95              :      * (we assume there will not be roundoff error in the division).
      96              :      * (Note: Knuth suggests a "<=" loop condition, but we use "<" just
      97              :      * to be doubly sure about roundoff error.)  Therefore K cannot become
      98              :      * less than k, which means that we cannot fail to select enough blocks.
      99              :      *----------
     100              :      */
     101            0 :     V = sampler_random_fract(&bs->randstate);
     102            0 :     p = 1.0 - (double) k / (double) K;
     103            0 :     while (V < p)
     104              :     {
     105              :         /* skip */
     106            0 :         bs->t++;
     107            0 :         K--;                    /* keep K == N - t */
     108              : 
     109              :         /* adjust p to be new cutoff point in reduced range */
     110            0 :         p *= 1.0 - (double) k / (double) K;
     111              :     }
     112              : 
     113              :     /* select */
     114            0 :     bs->m++;
     115            0 :     return bs->t++;
     116              : }
     117              : 
     118              : /*
     119              :  * These two routines embody Algorithm Z from "Random sampling with a
     120              :  * reservoir" by Jeffrey S. Vitter, in ACM Trans. Math. Softw. 11, 1
     121              :  * (Mar. 1985), Pages 37-57.  Vitter describes his algorithm in terms
     122              :  * of the count S of records to skip before processing another record.
     123              :  * It is computed primarily based on t, the number of records already read.
     124              :  * The only extra state needed between calls is W, a random state variable.
     125              :  *
     126              :  * reservoir_init_selection_state computes the initial W value.
     127              :  *
     128              :  * Given that we've already read t records (t >= n), reservoir_get_next_S
     129              :  * determines the number of records to skip before the next record is
     130              :  * processed.
     131              :  */
     132              : void
     133         9069 : reservoir_init_selection_state(ReservoirState rs, int n)
     134              : {
     135              :     /*
     136              :      * Reservoir sampling is not used anywhere where it would need to return
     137              :      * repeatable results so we can initialize it randomly.
     138              :      */
     139         9069 :     sampler_random_init_state(pg_prng_uint32(&pg_global_prng_state),
     140              :                               &rs->randstate);
     141              : 
     142              :     /* Initial value of W (for use when Algorithm Z is first applied) */
     143         9069 :     rs->W = exp(-log(sampler_random_fract(&rs->randstate)) / n);
     144         9069 : }
     145              : 
     146              : double
     147        58461 : reservoir_get_next_S(ReservoirState rs, double t, int n)
     148              : {
     149              :     double      S;
     150              : 
     151              :     /* The magic constant here is T from Vitter's paper */
     152        58461 :     if (t <= (22.0 * n))
     153              :     {
     154              :         /* Process records using Algorithm X until t is large enough */
     155              :         double      V,
     156              :                     quot;
     157              : 
     158        58461 :         V = sampler_random_fract(&rs->randstate);    /* Generate V */
     159        58461 :         S = 0;
     160        58461 :         t += 1;
     161              :         /* Note: "num" in Vitter's code is always equal to t - n */
     162        58461 :         quot = (t - (double) n) / t;
     163              :         /* Find min S satisfying (4.1) */
     164       127909 :         while (quot > V)
     165              :         {
     166        69448 :             S += 1;
     167        69448 :             t += 1;
     168        69448 :             quot *= (t - (double) n) / t;
     169              :         }
     170              :     }
     171              :     else
     172              :     {
     173              :         /* Now apply Algorithm Z */
     174            0 :         double      W = rs->W;
     175            0 :         double      term = t - (double) n + 1;
     176              : 
     177              :         for (;;)
     178            0 :         {
     179              :             double      numer,
     180              :                         numer_lim,
     181              :                         denom;
     182              :             double      U,
     183              :                         X,
     184              :                         lhs,
     185              :                         rhs,
     186              :                         y,
     187              :                         tmp;
     188              : 
     189              :             /* Generate U and X */
     190            0 :             U = sampler_random_fract(&rs->randstate);
     191            0 :             X = t * (W - 1.0);
     192            0 :             S = floor(X);       /* S is tentatively set to floor(X) */
     193              :             /* Test if U <= h(S)/cg(X) in the manner of (6.3) */
     194            0 :             tmp = (t + 1) / term;
     195            0 :             lhs = exp(log(((U * tmp * tmp) * (term + S)) / (t + X)) / n);
     196            0 :             rhs = (((t + X) / (term + S)) * term) / t;
     197            0 :             if (lhs <= rhs)
     198              :             {
     199            0 :                 W = rhs / lhs;
     200            0 :                 break;
     201              :             }
     202              :             /* Test if U <= f(S)/cg(X) */
     203            0 :             y = (((U * (t + 1)) / term) * (t + S + 1)) / (t + X);
     204            0 :             if ((double) n < S)
     205              :             {
     206            0 :                 denom = t;
     207            0 :                 numer_lim = term + S;
     208              :             }
     209              :             else
     210              :             {
     211            0 :                 denom = t - (double) n + S;
     212            0 :                 numer_lim = t + 1;
     213              :             }
     214            0 :             for (numer = t + S; numer >= numer_lim; numer -= 1)
     215              :             {
     216            0 :                 y *= numer / denom;
     217            0 :                 denom -= 1;
     218              :             }
     219            0 :             W = exp(-log(sampler_random_fract(&rs->randstate)) / n); /* Generate W in advance */
     220            0 :             if (exp(log(y) / n) <= (t + X) / t)
     221            0 :                 break;
     222              :         }
     223            0 :         rs->W = W;
     224              :     }
     225        58461 :     return S;
     226              : }
     227              : 
     228              : 
     229              : /*----------
     230              :  * Random number generator used by sampling
     231              :  *----------
     232              :  */
     233              : void
     234        18100 : sampler_random_init_state(uint32 seed, pg_prng_state *randstate)
     235              : {
     236        18100 :     pg_prng_seed(randstate, (uint64) seed);
     237        18100 : }
     238              : 
     239              : /* Select a random value R uniformly distributed in (0 - 1) */
     240              : double
     241       125982 : sampler_random_fract(pg_prng_state *randstate)
     242              : {
     243              :     double      res;
     244              : 
     245              :     /* pg_prng_double returns a value in [0.0 - 1.0), so we must reject 0.0 */
     246              :     do
     247              :     {
     248       125982 :         res = pg_prng_double(randstate);
     249       125982 :     } while (unlikely(res == 0.0));
     250       125982 :     return res;
     251              : }
     252              : 
     253              : 
     254              : /*
     255              :  * Backwards-compatible API for block sampling
     256              :  *
     257              :  * This code is now deprecated, but since it's still in use by many FDWs,
     258              :  * we should keep it for awhile at least.  The functionality is the same as
     259              :  * sampler_random_fract/reservoir_init_selection_state/reservoir_get_next_S,
     260              :  * except that a common random state is used across all callers.
     261              :  */
     262              : static ReservoirStateData oldrs;
     263              : static bool oldrs_initialized = false;
     264              : 
     265              : double
     266            0 : anl_random_fract(void)
     267              : {
     268              :     /* initialize if first time through */
     269            0 :     if (unlikely(!oldrs_initialized))
     270              :     {
     271            0 :         sampler_random_init_state(pg_prng_uint32(&pg_global_prng_state),
     272              :                                   &oldrs.randstate);
     273            0 :         oldrs_initialized = true;
     274              :     }
     275              : 
     276              :     /* and compute a random fraction */
     277            0 :     return sampler_random_fract(&oldrs.randstate);
     278              : }
     279              : 
     280              : double
     281            0 : anl_init_selection_state(int n)
     282              : {
     283              :     /* initialize if first time through */
     284            0 :     if (unlikely(!oldrs_initialized))
     285              :     {
     286            0 :         sampler_random_init_state(pg_prng_uint32(&pg_global_prng_state),
     287              :                                   &oldrs.randstate);
     288            0 :         oldrs_initialized = true;
     289              :     }
     290              : 
     291              :     /* Initial value of W (for use when Algorithm Z is first applied) */
     292            0 :     return exp(-log(sampler_random_fract(&oldrs.randstate)) / n);
     293              : }
     294              : 
     295              : double
     296            0 : anl_get_next_S(double t, int n, double *stateptr)
     297              : {
     298              :     double      result;
     299              : 
     300            0 :     oldrs.W = *stateptr;
     301            0 :     result = reservoir_get_next_S(&oldrs, t, n);
     302            0 :     *stateptr = oldrs.W;
     303            0 :     return result;
     304              : }
        

Generated by: LCOV version 2.0-1