LCOV - code coverage report
Current view: top level - src/backend/utils/misc - sampling.c (source / functions) Hit Total Coverage
Test: PostgreSQL 12beta2 Lines: 70 85 82.4 %
Date: 2019-06-19 16:07:09 Functions: 7 10 70.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * sampling.c
       4             :  *    Relation block sampling routines.
       5             :  *
       6             :  * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
       7             :  * Portions Copyright (c) 1994, Regents of the University of California
       8             :  *
       9             :  *
      10             :  * IDENTIFICATION
      11             :  *    src/backend/utils/misc/sampling.c
      12             :  *
      13             :  *-------------------------------------------------------------------------
      14             :  */
      15             : 
      16             : #include "postgres.h"
      17             : 
      18             : #include <math.h>
      19             : 
      20             : #include "utils/sampling.h"
      21             : 
      22             : 
      23             : /*
      24             :  * BlockSampler_Init -- prepare for random sampling of blocknumbers
      25             :  *
      26             :  * BlockSampler provides algorithm for block level sampling of a relation
      27             :  * as discussed on pgsql-hackers 2004-04-02 (subject "Large DB")
      28             :  * It selects a random sample of samplesize blocks out of
      29             :  * the nblocks blocks in the table. If the table has less than
      30             :  * samplesize blocks, all blocks are selected.
      31             :  *
      32             :  * Since we know the total number of blocks in advance, we can use the
      33             :  * straightforward Algorithm S from Knuth 3.4.2, rather than Vitter's
      34             :  * algorithm.
      35             :  */
      36             : void
      37       32284 : BlockSampler_Init(BlockSampler bs, BlockNumber nblocks, int samplesize,
      38             :                   long randseed)
      39             : {
      40       32284 :     bs->N = nblocks;         /* measured table size */
      41             : 
      42             :     /*
      43             :      * If we decide to reduce samplesize for tables that have less or not much
      44             :      * more than samplesize blocks, here is the place to do it.
      45             :      */
      46       32284 :     bs->n = samplesize;
      47       32284 :     bs->t = 0;                   /* blocks scanned so far */
      48       32284 :     bs->m = 0;                   /* blocks selected so far */
      49             : 
      50       32284 :     sampler_random_init_state(randseed, bs->randstate);
      51       32284 : }
      52             : 
      53             : bool
      54      238748 : BlockSampler_HasMore(BlockSampler bs)
      55             : {
      56      238748 :     return (bs->t < bs->N) && (bs->m < bs->n);
      57             : }
      58             : 
      59             : BlockNumber
      60      206464 : BlockSampler_Next(BlockSampler bs)
      61             : {
      62      206464 :     BlockNumber K = bs->N - bs->t;    /* remaining blocks */
      63      206464 :     int         k = bs->n - bs->m;    /* blocks still to sample */
      64             :     double      p;              /* probability to skip block */
      65             :     double      V;              /* random */
      66             : 
      67             :     Assert(BlockSampler_HasMore(bs));   /* hence K > 0 and k > 0 */
      68             : 
      69      206464 :     if ((BlockNumber) k >= K)
      70             :     {
      71             :         /* need all the rest */
      72      204110 :         bs->m++;
      73      204110 :         return bs->t++;
      74             :     }
      75             : 
      76             :     /*----------
      77             :      * It is not obvious that this code matches Knuth's Algorithm S.
      78             :      * Knuth says to skip the current block with probability 1 - k/K.
      79             :      * If we are to skip, we should advance t (hence decrease K), and
      80             :      * repeat the same probabilistic test for the next block.  The naive
      81             :      * implementation thus requires a sampler_random_fract() call for each
      82             :      * block number.  But we can reduce this to one sampler_random_fract()
      83             :      * call per selected block, by noting that each time the while-test
      84             :      * succeeds, we can reinterpret V as a uniform random number in the range
      85             :      * 0 to p. Therefore, instead of choosing a new V, we just adjust p to be
      86             :      * the appropriate fraction of its former value, and our next loop
      87             :      * makes the appropriate probabilistic test.
      88             :      *
      89             :      * We have initially K > k > 0.  If the loop reduces K to equal k,
      90             :      * the next while-test must fail since p will become exactly zero
      91             :      * (we assume there will not be roundoff error in the division).
      92             :      * (Note: Knuth suggests a "<=" loop condition, but we use "<" just
      93             :      * to be doubly sure about roundoff error.)  Therefore K cannot become
      94             :      * less than k, which means that we cannot fail to select enough blocks.
      95             :      *----------
      96             :      */
      97        2354 :     V = sampler_random_fract(bs->randstate);
      98        2354 :     p = 1.0 - (double) k / (double) K;
      99        5578 :     while (V < p)
     100             :     {
     101             :         /* skip */
     102         870 :         bs->t++;
     103         870 :         K--;                    /* keep K == N - t */
     104             : 
     105             :         /* adjust p to be new cutoff point in reduced range */
     106         870 :         p *= 1.0 - (double) k / (double) K;
     107             :     }
     108             : 
     109             :     /* select */
     110        2354 :     bs->m++;
     111        2354 :     return bs->t++;
     112             : }
     113             : 
     114             : /*
     115             :  * These two routines embody Algorithm Z from "Random sampling with a
     116             :  * reservoir" by Jeffrey S. Vitter, in ACM Trans. Math. Softw. 11, 1
     117             :  * (Mar. 1985), Pages 37-57.  Vitter describes his algorithm in terms
     118             :  * of the count S of records to skip before processing another record.
     119             :  * It is computed primarily based on t, the number of records already read.
     120             :  * The only extra state needed between calls is W, a random state variable.
     121             :  *
     122             :  * reservoir_init_selection_state computes the initial W value.
     123             :  *
     124             :  * Given that we've already read t records (t >= n), reservoir_get_next_S
     125             :  * determines the number of records to skip before the next record is
     126             :  * processed.
     127             :  */
     128             : void
     129       32332 : reservoir_init_selection_state(ReservoirState rs, int n)
     130             : {
     131             :     /*
     132             :      * Reservoir sampling is not used anywhere where it would need to return
     133             :      * repeatable results so we can initialize it randomly.
     134             :      */
     135       32332 :     sampler_random_init_state(random(), rs->randstate);
     136             : 
     137             :     /* Initial value of W (for use when Algorithm Z is first applied) */
     138       32332 :     rs->W = exp(-log(sampler_random_fract(rs->randstate)) / n);
     139       32332 : }
     140             : 
     141             : double
     142      544562 : reservoir_get_next_S(ReservoirState rs, double t, int n)
     143             : {
     144             :     double      S;
     145             : 
     146             :     /* The magic constant here is T from Vitter's paper */
     147      544562 :     if (t <= (22.0 * n))
     148             :     {
     149             :         /* Process records using Algorithm X until t is large enough */
     150             :         double      V,
     151             :                     quot;
     152             : 
     153      537140 :         V = sampler_random_fract(rs->randstate); /* Generate V */
     154      537140 :         S = 0;
     155      537140 :         t += 1;
     156             :         /* Note: "num" in Vitter's code is always equal to t - n */
     157      537140 :         quot = (t - (double) n) / t;
     158             :         /* Find min S satisfying (4.1) */
     159     2021618 :         while (quot > V)
     160             :         {
     161      947338 :             S += 1;
     162      947338 :             t += 1;
     163      947338 :             quot *= (t - (double) n) / t;
     164             :         }
     165             :     }
     166             :     else
     167             :     {
     168             :         /* Now apply Algorithm Z */
     169        7422 :         double      W = rs->W;
     170        7422 :         double      term = t - (double) n + 1;
     171             : 
     172             :         for (;;)
     173         266 :         {
     174             :             double      numer,
     175             :                         numer_lim,
     176             :                         denom;
     177             :             double      U,
     178             :                         X,
     179             :                         lhs,
     180             :                         rhs,
     181             :                         y,
     182             :                         tmp;
     183             : 
     184             :             /* Generate U and X */
     185        7688 :             U = sampler_random_fract(rs->randstate);
     186        7688 :             X = t * (W - 1.0);
     187        7688 :             S = floor(X);       /* S is tentatively set to floor(X) */
     188             :             /* Test if U <= h(S)/cg(X) in the manner of (6.3) */
     189        7688 :             tmp = (t + 1) / term;
     190        7688 :             lhs = exp(log(((U * tmp * tmp) * (term + S)) / (t + X)) / n);
     191        7688 :             rhs = (((t + X) / (term + S)) * term) / t;
     192        7688 :             if (lhs <= rhs)
     193             :             {
     194        7282 :                 W = rhs / lhs;
     195        7282 :                 break;
     196             :             }
     197             :             /* Test if U <= f(S)/cg(X) */
     198         406 :             y = (((U * (t + 1)) / term) * (t + S + 1)) / (t + X);
     199         406 :             if ((double) n < S)
     200             :             {
     201           0 :                 denom = t;
     202           0 :                 numer_lim = term + S;
     203             :             }
     204             :             else
     205             :             {
     206         406 :                 denom = t - (double) n + S;
     207         406 :                 numer_lim = t + 1;
     208             :             }
     209       17868 :             for (numer = t + S; numer >= numer_lim; numer -= 1)
     210             :             {
     211       17462 :                 y *= numer / denom;
     212       17462 :                 denom -= 1;
     213             :             }
     214         406 :             W = exp(-log(sampler_random_fract(rs->randstate)) / n); /* Generate W in advance */
     215         406 :             if (exp(log(y) / n) <= (t + X) / t)
     216         140 :                 break;
     217             :         }
     218        7422 :         rs->W = W;
     219             :     }
     220      544562 :     return S;
     221             : }
     222             : 
     223             : 
     224             : /*----------
     225             :  * Random number generator used by sampling
     226             :  *----------
     227             :  */
     228             : void
     229       64636 : sampler_random_init_state(long seed, SamplerRandomState randstate)
     230             : {
     231       64636 :     randstate[0] = 0x330e;      /* same as pg_erand48, but could be anything */
     232       64636 :     randstate[1] = (unsigned short) seed;
     233       64636 :     randstate[2] = (unsigned short) (seed >> 16);
     234       64636 : }
     235             : 
     236             : /* Select a random value R uniformly distributed in (0 - 1) */
     237             : double
     238     1124220 : sampler_random_fract(SamplerRandomState randstate)
     239             : {
     240             :     double      res;
     241             : 
     242             :     /* pg_erand48 returns a value in [0.0 - 1.0), so we must reject 0 */
     243             :     do
     244             :     {
     245     1124220 :         res = pg_erand48(randstate);
     246     1124220 :     } while (res == 0.0);
     247     1124220 :     return res;
     248             : }
     249             : 
     250             : 
     251             : /*
     252             :  * Backwards-compatible API for block sampling
     253             :  *
     254             :  * This code is now deprecated, but since it's still in use by many FDWs,
     255             :  * we should keep it for awhile at least.  The functionality is the same as
     256             :  * sampler_random_fract/reservoir_init_selection_state/reservoir_get_next_S,
     257             :  * except that a common random state is used across all callers.
     258             :  */
     259             : static ReservoirStateData oldrs;
     260             : 
     261             : double
     262           0 : anl_random_fract(void)
     263             : {
     264             :     /* initialize if first time through */
     265           0 :     if (oldrs.randstate[0] == 0)
     266           0 :         sampler_random_init_state(random(), oldrs.randstate);
     267             : 
     268             :     /* and compute a random fraction */
     269           0 :     return sampler_random_fract(oldrs.randstate);
     270             : }
     271             : 
     272             : double
     273           0 : anl_init_selection_state(int n)
     274             : {
     275             :     /* initialize if first time through */
     276           0 :     if (oldrs.randstate[0] == 0)
     277           0 :         sampler_random_init_state(random(), oldrs.randstate);
     278             : 
     279             :     /* Initial value of W (for use when Algorithm Z is first applied) */
     280           0 :     return exp(-log(sampler_random_fract(oldrs.randstate)) / n);
     281             : }
     282             : 
     283             : double
     284           0 : anl_get_next_S(double t, int n, double *stateptr)
     285             : {
     286             :     double      result;
     287             : 
     288           0 :     oldrs.W = *stateptr;
     289           0 :     result = reservoir_get_next_S(&oldrs, t, n);
     290           0 :     *stateptr = oldrs.W;
     291           0 :     return result;
     292             : }

Generated by: LCOV version 1.13