LCOV - code coverage report
Current view: top level - src/backend/utils/misc - sampling.c (source / functions) Hit Total Coverage
Test: PostgreSQL 13devel Lines: 70 85 82.4 %
Date: 2020-02-27 19:06:28 Functions: 7 10 70.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * sampling.c
       4             :  *    Relation block sampling routines.
       5             :  *
       6             :  * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
       7             :  * Portions Copyright (c) 1994, Regents of the University of California
       8             :  *
       9             :  *
      10             :  * IDENTIFICATION
      11             :  *    src/backend/utils/misc/sampling.c
      12             :  *
      13             :  *-------------------------------------------------------------------------
      14             :  */
      15             : 
      16             : #include "postgres.h"
      17             : 
      18             : #include <math.h>
      19             : 
      20             : #include "utils/sampling.h"
      21             : 
      22             : 
      23             : /*
      24             :  * BlockSampler_Init -- prepare for random sampling of blocknumbers
      25             :  *
      26             :  * BlockSampler provides algorithm for block level sampling of a relation
      27             :  * as discussed on pgsql-hackers 2004-04-02 (subject "Large DB")
      28             :  * It selects a random sample of samplesize blocks out of
      29             :  * the nblocks blocks in the table. If the table has less than
      30             :  * samplesize blocks, all blocks are selected.
      31             :  *
      32             :  * Since we know the total number of blocks in advance, we can use the
      33             :  * straightforward Algorithm S from Knuth 3.4.2, rather than Vitter's
      34             :  * algorithm.
      35             :  *
      36             :  * Returns the number of blocks that BlockSampler_Next will return.
      37             :  */
      38             : BlockNumber
      39       31304 : BlockSampler_Init(BlockSampler bs, BlockNumber nblocks, int samplesize,
      40             :                   long randseed)
      41             : {
      42       31304 :     bs->N = nblocks;         /* measured table size */
      43             : 
      44             :     /*
      45             :      * If we decide to reduce samplesize for tables that have less or not much
      46             :      * more than samplesize blocks, here is the place to do it.
      47             :      */
      48       31304 :     bs->n = samplesize;
      49       31304 :     bs->t = 0;                   /* blocks scanned so far */
      50       31304 :     bs->m = 0;                   /* blocks selected so far */
      51             : 
      52       31304 :     sampler_random_init_state(randseed, bs->randstate);
      53             : 
      54       31304 :     return Min(bs->n, bs->N);
      55             : }
      56             : 
      57             : bool
      58      239964 : BlockSampler_HasMore(BlockSampler bs)
      59             : {
      60      239964 :     return (bs->t < bs->N) && (bs->m < bs->n);
      61             : }
      62             : 
      63             : BlockNumber
      64      208660 : BlockSampler_Next(BlockSampler bs)
      65             : {
      66      208660 :     BlockNumber K = bs->N - bs->t;    /* remaining blocks */
      67      208660 :     int         k = bs->n - bs->m;    /* blocks still to sample */
      68             :     double      p;              /* probability to skip block */
      69             :     double      V;              /* random */
      70             : 
      71             :     Assert(BlockSampler_HasMore(bs));   /* hence K > 0 and k > 0 */
      72             : 
      73      208660 :     if ((BlockNumber) k >= K)
      74             :     {
      75             :         /* need all the rest */
      76      206268 :         bs->m++;
      77      206268 :         return bs->t++;
      78             :     }
      79             : 
      80             :     /*----------
      81             :      * It is not obvious that this code matches Knuth's Algorithm S.
      82             :      * Knuth says to skip the current block with probability 1 - k/K.
      83             :      * If we are to skip, we should advance t (hence decrease K), and
      84             :      * repeat the same probabilistic test for the next block.  The naive
      85             :      * implementation thus requires a sampler_random_fract() call for each
      86             :      * block number.  But we can reduce this to one sampler_random_fract()
      87             :      * call per selected block, by noting that each time the while-test
      88             :      * succeeds, we can reinterpret V as a uniform random number in the range
      89             :      * 0 to p. Therefore, instead of choosing a new V, we just adjust p to be
      90             :      * the appropriate fraction of its former value, and our next loop
      91             :      * makes the appropriate probabilistic test.
      92             :      *
      93             :      * We have initially K > k > 0.  If the loop reduces K to equal k,
      94             :      * the next while-test must fail since p will become exactly zero
      95             :      * (we assume there will not be roundoff error in the division).
      96             :      * (Note: Knuth suggests a "<=" loop condition, but we use "<" just
      97             :      * to be doubly sure about roundoff error.)  Therefore K cannot become
      98             :      * less than k, which means that we cannot fail to select enough blocks.
      99             :      *----------
     100             :      */
     101        2392 :     V = sampler_random_fract(bs->randstate);
     102        2392 :     p = 1.0 - (double) k / (double) K;
     103        3258 :     while (V < p)
     104             :     {
     105             :         /* skip */
     106         866 :         bs->t++;
     107         866 :         K--;                    /* keep K == N - t */
     108             : 
     109             :         /* adjust p to be new cutoff point in reduced range */
     110         866 :         p *= 1.0 - (double) k / (double) K;
     111             :     }
     112             : 
     113             :     /* select */
     114        2392 :     bs->m++;
     115        2392 :     return bs->t++;
     116             : }
     117             : 
     118             : /*
     119             :  * These two routines embody Algorithm Z from "Random sampling with a
     120             :  * reservoir" by Jeffrey S. Vitter, in ACM Trans. Math. Softw. 11, 1
     121             :  * (Mar. 1985), Pages 37-57.  Vitter describes his algorithm in terms
     122             :  * of the count S of records to skip before processing another record.
     123             :  * It is computed primarily based on t, the number of records already read.
     124             :  * The only extra state needed between calls is W, a random state variable.
     125             :  *
     126             :  * reservoir_init_selection_state computes the initial W value.
     127             :  *
     128             :  * Given that we've already read t records (t >= n), reservoir_get_next_S
     129             :  * determines the number of records to skip before the next record is
     130             :  * processed.
     131             :  */
     132             : void
     133       31352 : reservoir_init_selection_state(ReservoirState rs, int n)
     134             : {
     135             :     /*
     136             :      * Reservoir sampling is not used anywhere where it would need to return
     137             :      * repeatable results so we can initialize it randomly.
     138             :      */
     139       31352 :     sampler_random_init_state(random(), rs->randstate);
     140             : 
     141             :     /* Initial value of W (for use when Algorithm Z is first applied) */
     142       31352 :     rs->W = exp(-log(sampler_random_fract(rs->randstate)) / n);
     143       31352 : }
     144             : 
     145             : double
     146      560826 : reservoir_get_next_S(ReservoirState rs, double t, int n)
     147             : {
     148             :     double      S;
     149             : 
     150             :     /* The magic constant here is T from Vitter's paper */
     151      560826 :     if (t <= (22.0 * n))
     152             :     {
     153             :         /* Process records using Algorithm X until t is large enough */
     154             :         double      V,
     155             :                     quot;
     156             : 
     157      553042 :         V = sampler_random_fract(rs->randstate); /* Generate V */
     158      553042 :         S = 0;
     159      553042 :         t += 1;
     160             :         /* Note: "num" in Vitter's code is always equal to t - n */
     161      553042 :         quot = (t - (double) n) / t;
     162             :         /* Find min S satisfying (4.1) */
     163     1530654 :         while (quot > V)
     164             :         {
     165      977612 :             S += 1;
     166      977612 :             t += 1;
     167      977612 :             quot *= (t - (double) n) / t;
     168             :         }
     169             :     }
     170             :     else
     171             :     {
     172             :         /* Now apply Algorithm Z */
     173        7784 :         double      W = rs->W;
     174        7784 :         double      term = t - (double) n + 1;
     175             : 
     176             :         for (;;)
     177         310 :         {
     178             :             double      numer,
     179             :                         numer_lim,
     180             :                         denom;
     181             :             double      U,
     182             :                         X,
     183             :                         lhs,
     184             :                         rhs,
     185             :                         y,
     186             :                         tmp;
     187             : 
     188             :             /* Generate U and X */
     189        8094 :             U = sampler_random_fract(rs->randstate);
     190        8094 :             X = t * (W - 1.0);
     191        8094 :             S = floor(X);       /* S is tentatively set to floor(X) */
     192             :             /* Test if U <= h(S)/cg(X) in the manner of (6.3) */
     193        8094 :             tmp = (t + 1) / term;
     194        8094 :             lhs = exp(log(((U * tmp * tmp) * (term + S)) / (t + X)) / n);
     195        8094 :             rhs = (((t + X) / (term + S)) * term) / t;
     196        8094 :             if (lhs <= rhs)
     197             :             {
     198        7676 :                 W = rhs / lhs;
     199        7676 :                 break;
     200             :             }
     201             :             /* Test if U <= f(S)/cg(X) */
     202         418 :             y = (((U * (t + 1)) / term) * (t + S + 1)) / (t + X);
     203         418 :             if ((double) n < S)
     204             :             {
     205           0 :                 denom = t;
     206           0 :                 numer_lim = term + S;
     207             :             }
     208             :             else
     209             :             {
     210         418 :                 denom = t - (double) n + S;
     211         418 :                 numer_lim = t + 1;
     212             :             }
     213       19536 :             for (numer = t + S; numer >= numer_lim; numer -= 1)
     214             :             {
     215       19118 :                 y *= numer / denom;
     216       19118 :                 denom -= 1;
     217             :             }
     218         418 :             W = exp(-log(sampler_random_fract(rs->randstate)) / n); /* Generate W in advance */
     219         418 :             if (exp(log(y) / n) <= (t + X) / t)
     220         108 :                 break;
     221             :         }
     222        7784 :         rs->W = W;
     223             :     }
     224      560826 :     return S;
     225             : }
     226             : 
     227             : 
     228             : /*----------
     229             :  * Random number generator used by sampling
     230             :  *----------
     231             :  */
     232             : void
     233       62676 : sampler_random_init_state(long seed, SamplerRandomState randstate)
     234             : {
     235       62676 :     randstate[0] = 0x330e;      /* same as pg_erand48, but could be anything */
     236       62676 :     randstate[1] = (unsigned short) seed;
     237       62676 :     randstate[2] = (unsigned short) (seed >> 16);
     238       62676 : }
     239             : 
     240             : /* Select a random value R uniformly distributed in (0 - 1) */
     241             : double
     242     1155842 : sampler_random_fract(SamplerRandomState randstate)
     243             : {
     244             :     double      res;
     245             : 
     246             :     /* pg_erand48 returns a value in [0.0 - 1.0), so we must reject 0 */
     247             :     do
     248             :     {
     249     1155842 :         res = pg_erand48(randstate);
     250     1155842 :     } while (res == 0.0);
     251     1155842 :     return res;
     252             : }
     253             : 
     254             : 
     255             : /*
     256             :  * Backwards-compatible API for block sampling
     257             :  *
     258             :  * This code is now deprecated, but since it's still in use by many FDWs,
     259             :  * we should keep it for awhile at least.  The functionality is the same as
     260             :  * sampler_random_fract/reservoir_init_selection_state/reservoir_get_next_S,
     261             :  * except that a common random state is used across all callers.
     262             :  */
     263             : static ReservoirStateData oldrs;
     264             : 
     265             : double
     266           0 : anl_random_fract(void)
     267             : {
     268             :     /* initialize if first time through */
     269           0 :     if (oldrs.randstate[0] == 0)
     270           0 :         sampler_random_init_state(random(), oldrs.randstate);
     271             : 
     272             :     /* and compute a random fraction */
     273           0 :     return sampler_random_fract(oldrs.randstate);
     274             : }
     275             : 
     276             : double
     277           0 : anl_init_selection_state(int n)
     278             : {
     279             :     /* initialize if first time through */
     280           0 :     if (oldrs.randstate[0] == 0)
     281           0 :         sampler_random_init_state(random(), oldrs.randstate);
     282             : 
     283             :     /* Initial value of W (for use when Algorithm Z is first applied) */
     284           0 :     return exp(-log(sampler_random_fract(oldrs.randstate)) / n);
     285             : }
     286             : 
     287             : double
     288           0 : anl_get_next_S(double t, int n, double *stateptr)
     289             : {
     290             :     double      result;
     291             : 
     292           0 :     oldrs.W = *stateptr;
     293           0 :     result = reservoir_get_next_S(&oldrs, t, n);
     294           0 :     *stateptr = oldrs.W;
     295           0 :     return result;
     296             : }

Generated by: LCOV version 1.13