LCOV - code coverage report
Current view: top level - src/backend/utils/misc - sampling.c (source / functions) Hit Total Coverage
Test: PostgreSQL 18devel Lines: 37 85 43.5 %
Date: 2024-11-21 08:14:44 Functions: 7 10 70.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * sampling.c
       4             :  *    Relation block sampling routines.
       5             :  *
       6             :  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
       7             :  * Portions Copyright (c) 1994, Regents of the University of California
       8             :  *
       9             :  *
      10             :  * IDENTIFICATION
      11             :  *    src/backend/utils/misc/sampling.c
      12             :  *
      13             :  *-------------------------------------------------------------------------
      14             :  */
      15             : 
      16             : #include "postgres.h"
      17             : 
      18             : #include <math.h>
      19             : 
      20             : #include "utils/sampling.h"
      21             : 
      22             : 
      23             : /*
      24             :  * BlockSampler_Init -- prepare for random sampling of blocknumbers
      25             :  *
      26             :  * BlockSampler provides algorithm for block level sampling of a relation
      27             :  * as discussed on pgsql-hackers 2004-04-02 (subject "Large DB")
      28             :  * It selects a random sample of samplesize blocks out of
      29             :  * the nblocks blocks in the table. If the table has less than
      30             :  * samplesize blocks, all blocks are selected.
      31             :  *
      32             :  * Since we know the total number of blocks in advance, we can use the
      33             :  * straightforward Algorithm S from Knuth 3.4.2, rather than Vitter's
      34             :  * algorithm.
      35             :  *
      36             :  * Returns the number of blocks that BlockSampler_Next will return.
      37             :  */
      38             : BlockNumber
      39       14514 : BlockSampler_Init(BlockSampler bs, BlockNumber nblocks, int samplesize,
      40             :                   uint32 randseed)
      41             : {
      42       14514 :     bs->N = nblocks;         /* measured table size */
      43             : 
      44             :     /*
      45             :      * If we decide to reduce samplesize for tables that have less or not much
      46             :      * more than samplesize blocks, here is the place to do it.
      47             :      */
      48       14514 :     bs->n = samplesize;
      49       14514 :     bs->t = 0;                   /* blocks scanned so far */
      50       14514 :     bs->m = 0;                   /* blocks selected so far */
      51             : 
      52       14514 :     sampler_random_init_state(randseed, &bs->randstate);
      53             : 
      54       14514 :     return Min(bs->n, bs->N);
      55             : }
      56             : 
      57             : bool
      58      127058 : BlockSampler_HasMore(BlockSampler bs)
      59             : {
      60      127058 :     return (bs->t < bs->N) && (bs->m < bs->n);
      61             : }
      62             : 
      63             : BlockNumber
      64      112544 : BlockSampler_Next(BlockSampler bs)
      65             : {
      66      112544 :     BlockNumber K = bs->N - bs->t;    /* remaining blocks */
      67      112544 :     int         k = bs->n - bs->m;    /* blocks still to sample */
      68             :     double      p;              /* probability to skip block */
      69             :     double      V;              /* random */
      70             : 
      71             :     Assert(BlockSampler_HasMore(bs));   /* hence K > 0 and k > 0 */
      72             : 
      73      112544 :     if ((BlockNumber) k >= K)
      74             :     {
      75             :         /* need all the rest */
      76      112544 :         bs->m++;
      77      112544 :         return bs->t++;
      78             :     }
      79             : 
      80             :     /*----------
      81             :      * It is not obvious that this code matches Knuth's Algorithm S.
      82             :      * Knuth says to skip the current block with probability 1 - k/K.
      83             :      * If we are to skip, we should advance t (hence decrease K), and
      84             :      * repeat the same probabilistic test for the next block.  The naive
      85             :      * implementation thus requires a sampler_random_fract() call for each
      86             :      * block number.  But we can reduce this to one sampler_random_fract()
      87             :      * call per selected block, by noting that each time the while-test
      88             :      * succeeds, we can reinterpret V as a uniform random number in the range
      89             :      * 0 to p. Therefore, instead of choosing a new V, we just adjust p to be
      90             :      * the appropriate fraction of its former value, and our next loop
      91             :      * makes the appropriate probabilistic test.
      92             :      *
      93             :      * We have initially K > k > 0.  If the loop reduces K to equal k,
      94             :      * the next while-test must fail since p will become exactly zero
      95             :      * (we assume there will not be roundoff error in the division).
      96             :      * (Note: Knuth suggests a "<=" loop condition, but we use "<" just
      97             :      * to be doubly sure about roundoff error.)  Therefore K cannot become
      98             :      * less than k, which means that we cannot fail to select enough blocks.
      99             :      *----------
     100             :      */
     101           0 :     V = sampler_random_fract(&bs->randstate);
     102           0 :     p = 1.0 - (double) k / (double) K;
     103           0 :     while (V < p)
     104             :     {
     105             :         /* skip */
     106           0 :         bs->t++;
     107           0 :         K--;                    /* keep K == N - t */
     108             : 
     109             :         /* adjust p to be new cutoff point in reduced range */
     110           0 :         p *= 1.0 - (double) k / (double) K;
     111             :     }
     112             : 
     113             :     /* select */
     114           0 :     bs->m++;
     115           0 :     return bs->t++;
     116             : }
     117             : 
     118             : /*
     119             :  * These two routines embody Algorithm Z from "Random sampling with a
     120             :  * reservoir" by Jeffrey S. Vitter, in ACM Trans. Math. Softw. 11, 1
     121             :  * (Mar. 1985), Pages 37-57.  Vitter describes his algorithm in terms
     122             :  * of the count S of records to skip before processing another record.
     123             :  * It is computed primarily based on t, the number of records already read.
     124             :  * The only extra state needed between calls is W, a random state variable.
     125             :  *
     126             :  * reservoir_init_selection_state computes the initial W value.
     127             :  *
     128             :  * Given that we've already read t records (t >= n), reservoir_get_next_S
     129             :  * determines the number of records to skip before the next record is
     130             :  * processed.
     131             :  */
     132             : void
     133       14600 : reservoir_init_selection_state(ReservoirState rs, int n)
     134             : {
     135             :     /*
     136             :      * Reservoir sampling is not used anywhere where it would need to return
     137             :      * repeatable results so we can initialize it randomly.
     138             :      */
     139       14600 :     sampler_random_init_state(pg_prng_uint32(&pg_global_prng_state),
     140             :                               &rs->randstate);
     141             : 
     142             :     /* Initial value of W (for use when Algorithm Z is first applied) */
     143       14600 :     rs->W = exp(-log(sampler_random_fract(&rs->randstate)) / n);
     144       14600 : }
     145             : 
     146             : double
     147      114570 : reservoir_get_next_S(ReservoirState rs, double t, int n)
     148             : {
     149             :     double      S;
     150             : 
     151             :     /* The magic constant here is T from Vitter's paper */
     152      114570 :     if (t <= (22.0 * n))
     153             :     {
     154             :         /* Process records using Algorithm X until t is large enough */
     155             :         double      V,
     156             :                     quot;
     157             : 
     158      114570 :         V = sampler_random_fract(&rs->randstate);    /* Generate V */
     159      114570 :         S = 0;
     160      114570 :         t += 1;
     161             :         /* Note: "num" in Vitter's code is always equal to t - n */
     162      114570 :         quot = (t - (double) n) / t;
     163             :         /* Find min S satisfying (4.1) */
     164      250436 :         while (quot > V)
     165             :         {
     166      135866 :             S += 1;
     167      135866 :             t += 1;
     168      135866 :             quot *= (t - (double) n) / t;
     169             :         }
     170             :     }
     171             :     else
     172             :     {
     173             :         /* Now apply Algorithm Z */
     174           0 :         double      W = rs->W;
     175           0 :         double      term = t - (double) n + 1;
     176             : 
     177             :         for (;;)
     178           0 :         {
     179             :             double      numer,
     180             :                         numer_lim,
     181             :                         denom;
     182             :             double      U,
     183             :                         X,
     184             :                         lhs,
     185             :                         rhs,
     186             :                         y,
     187             :                         tmp;
     188             : 
     189             :             /* Generate U and X */
     190           0 :             U = sampler_random_fract(&rs->randstate);
     191           0 :             X = t * (W - 1.0);
     192           0 :             S = floor(X);       /* S is tentatively set to floor(X) */
     193             :             /* Test if U <= h(S)/cg(X) in the manner of (6.3) */
     194           0 :             tmp = (t + 1) / term;
     195           0 :             lhs = exp(log(((U * tmp * tmp) * (term + S)) / (t + X)) / n);
     196           0 :             rhs = (((t + X) / (term + S)) * term) / t;
     197           0 :             if (lhs <= rhs)
     198             :             {
     199           0 :                 W = rhs / lhs;
     200           0 :                 break;
     201             :             }
     202             :             /* Test if U <= f(S)/cg(X) */
     203           0 :             y = (((U * (t + 1)) / term) * (t + S + 1)) / (t + X);
     204           0 :             if ((double) n < S)
     205             :             {
     206           0 :                 denom = t;
     207           0 :                 numer_lim = term + S;
     208             :             }
     209             :             else
     210             :             {
     211           0 :                 denom = t - (double) n + S;
     212           0 :                 numer_lim = t + 1;
     213             :             }
     214           0 :             for (numer = t + S; numer >= numer_lim; numer -= 1)
     215             :             {
     216           0 :                 y *= numer / denom;
     217           0 :                 denom -= 1;
     218             :             }
     219           0 :             W = exp(-log(sampler_random_fract(&rs->randstate)) / n); /* Generate W in advance */
     220           0 :             if (exp(log(y) / n) <= (t + X) / t)
     221           0 :                 break;
     222             :         }
     223           0 :         rs->W = W;
     224             :     }
     225      114570 :     return S;
     226             : }
     227             : 
     228             : 
     229             : /*----------
     230             :  * Random number generator used by sampling
     231             :  *----------
     232             :  */
     233             : void
     234       29134 : sampler_random_init_state(uint32 seed, pg_prng_state *randstate)
     235             : {
     236       29134 :     pg_prng_seed(randstate, (uint64) seed);
     237       29134 : }
     238             : 
     239             : /* Select a random value R uniformly distributed in (0 - 1) */
     240             : double
     241      243726 : sampler_random_fract(pg_prng_state *randstate)
     242             : {
     243             :     double      res;
     244             : 
     245             :     /* pg_prng_double returns a value in [0.0 - 1.0), so we must reject 0.0 */
     246             :     do
     247             :     {
     248      243726 :         res = pg_prng_double(randstate);
     249      243726 :     } while (unlikely(res == 0.0));
     250      243726 :     return res;
     251             : }
     252             : 
     253             : 
     254             : /*
     255             :  * Backwards-compatible API for block sampling
     256             :  *
     257             :  * This code is now deprecated, but since it's still in use by many FDWs,
     258             :  * we should keep it for awhile at least.  The functionality is the same as
     259             :  * sampler_random_fract/reservoir_init_selection_state/reservoir_get_next_S,
     260             :  * except that a common random state is used across all callers.
     261             :  */
     262             : static ReservoirStateData oldrs;
     263             : static bool oldrs_initialized = false;
     264             : 
     265             : double
     266           0 : anl_random_fract(void)
     267             : {
     268             :     /* initialize if first time through */
     269           0 :     if (unlikely(!oldrs_initialized))
     270             :     {
     271           0 :         sampler_random_init_state(pg_prng_uint32(&pg_global_prng_state),
     272             :                                   &oldrs.randstate);
     273           0 :         oldrs_initialized = true;
     274             :     }
     275             : 
     276             :     /* and compute a random fraction */
     277           0 :     return sampler_random_fract(&oldrs.randstate);
     278             : }
     279             : 
     280             : double
     281           0 : anl_init_selection_state(int n)
     282             : {
     283             :     /* initialize if first time through */
     284           0 :     if (unlikely(!oldrs_initialized))
     285             :     {
     286           0 :         sampler_random_init_state(pg_prng_uint32(&pg_global_prng_state),
     287             :                                   &oldrs.randstate);
     288           0 :         oldrs_initialized = true;
     289             :     }
     290             : 
     291             :     /* Initial value of W (for use when Algorithm Z is first applied) */
     292           0 :     return exp(-log(sampler_random_fract(&oldrs.randstate)) / n);
     293             : }
     294             : 
     295             : double
     296           0 : anl_get_next_S(double t, int n, double *stateptr)
     297             : {
     298             :     double      result;
     299             : 
     300           0 :     oldrs.W = *stateptr;
     301           0 :     result = reservoir_get_next_S(&oldrs, t, n);
     302           0 :     *stateptr = oldrs.W;
     303           0 :     return result;
     304             : }

Generated by: LCOV version 1.14