LCOV - code coverage report
Current view: top level - src/backend/access/tablesample - bernoulli.c (source / functions) Coverage Total Hit
Test: PostgreSQL 19devel Lines: 100.0 % 57 57
Test Date: 2026-02-17 17:20:33 Functions: 100.0 % 5 5
Legend: Lines:     hit not hit

            Line data    Source code
       1              : /*-------------------------------------------------------------------------
       2              :  *
       3              :  * bernoulli.c
       4              :  *    support routines for BERNOULLI tablesample method
       5              :  *
       6              :  * To ensure repeatability of samples, it is necessary that selection of a
       7              :  * given tuple be history-independent; otherwise syncscanning would break
       8              :  * repeatability, to say nothing of logically-irrelevant maintenance such
       9              :  * as physical extension or shortening of the relation.
      10              :  *
      11              :  * To achieve that, we proceed by hashing each candidate TID together with
      12              :  * the active seed, and then selecting it if the hash is less than the
      13              :  * cutoff value computed from the selection probability by BeginSampleScan.
      14              :  *
      15              :  *
      16              :  * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
      17              :  * Portions Copyright (c) 1994, Regents of the University of California
      18              :  *
      19              :  * IDENTIFICATION
      20              :  *    src/backend/access/tablesample/bernoulli.c
      21              :  *
      22              :  *-------------------------------------------------------------------------
      23              :  */
      24              : 
      25              : #include "postgres.h"
      26              : 
      27              : #include <math.h>
      28              : 
      29              : #include "access/tsmapi.h"
      30              : #include "catalog/pg_type.h"
      31              : #include "common/hashfn.h"
      32              : #include "optimizer/optimizer.h"
      33              : #include "utils/fmgrprotos.h"
      34              : 
      35              : 
      36              : /* Private state */
      37              : typedef struct
      38              : {
      39              :     uint64      cutoff;         /* select tuples with hash less than this */
      40              :     uint32      seed;           /* random seed */
      41              :     OffsetNumber lt;            /* last tuple returned from current block */
      42              : } BernoulliSamplerData;
      43              : 
      44              : 
      45              : static void bernoulli_samplescangetsamplesize(PlannerInfo *root,
      46              :                                               RelOptInfo *baserel,
      47              :                                               List *paramexprs,
      48              :                                               BlockNumber *pages,
      49              :                                               double *tuples);
      50              : static void bernoulli_initsamplescan(SampleScanState *node,
      51              :                                      int eflags);
      52              : static void bernoulli_beginsamplescan(SampleScanState *node,
      53              :                                       Datum *params,
      54              :                                       int nparams,
      55              :                                       uint32 seed);
      56              : static OffsetNumber bernoulli_nextsampletuple(SampleScanState *node,
      57              :                                               BlockNumber blockno,
      58              :                                               OffsetNumber maxoffset);
      59              : 
      60              : 
      61              : /*
      62              :  * Create a TsmRoutine descriptor for the BERNOULLI method.
      63              :  */
      64              : Datum
      65          228 : tsm_bernoulli_handler(PG_FUNCTION_ARGS)
      66              : {
      67          228 :     TsmRoutine *tsm = makeNode(TsmRoutine);
      68              : 
      69          228 :     tsm->parameterTypes = list_make1_oid(FLOAT4OID);
      70          228 :     tsm->repeatable_across_queries = true;
      71          228 :     tsm->repeatable_across_scans = true;
      72          228 :     tsm->SampleScanGetSampleSize = bernoulli_samplescangetsamplesize;
      73          228 :     tsm->InitSampleScan = bernoulli_initsamplescan;
      74          228 :     tsm->BeginSampleScan = bernoulli_beginsamplescan;
      75          228 :     tsm->NextSampleBlock = NULL;
      76          228 :     tsm->NextSampleTuple = bernoulli_nextsampletuple;
      77          228 :     tsm->EndSampleScan = NULL;
      78              : 
      79          228 :     PG_RETURN_POINTER(tsm);
      80              : }
      81              : 
      82              : /*
      83              :  * Sample size estimation.
      84              :  */
      85              : static void
      86           60 : bernoulli_samplescangetsamplesize(PlannerInfo *root,
      87              :                                   RelOptInfo *baserel,
      88              :                                   List *paramexprs,
      89              :                                   BlockNumber *pages,
      90              :                                   double *tuples)
      91              : {
      92              :     Node       *pctnode;
      93              :     float4      samplefract;
      94              : 
      95              :     /* Try to extract an estimate for the sample percentage */
      96           60 :     pctnode = (Node *) linitial(paramexprs);
      97           60 :     pctnode = estimate_expression_value(root, pctnode);
      98              : 
      99           60 :     if (IsA(pctnode, Const) &&
     100           51 :         !((Const *) pctnode)->constisnull)
     101              :     {
     102           51 :         samplefract = DatumGetFloat4(((Const *) pctnode)->constvalue);
     103           51 :         if (samplefract >= 0 && samplefract <= 100 && !isnan(samplefract))
     104           45 :             samplefract /= 100.0f;
     105              :         else
     106              :         {
     107              :             /* Default samplefract if the value is bogus */
     108            6 :             samplefract = 0.1f;
     109              :         }
     110              :     }
     111              :     else
     112              :     {
     113              :         /* Default samplefract if we didn't obtain a non-null Const */
     114            9 :         samplefract = 0.1f;
     115              :     }
     116              : 
     117              :     /* We'll visit all pages of the baserel */
     118           60 :     *pages = baserel->pages;
     119              : 
     120           60 :     *tuples = clamp_row_est(baserel->tuples * samplefract);
     121           60 : }
     122              : 
     123              : /*
     124              :  * Initialize during executor setup.
     125              :  */
     126              : static void
     127           60 : bernoulli_initsamplescan(SampleScanState *node, int eflags)
     128              : {
     129           60 :     node->tsm_state = palloc0(sizeof(BernoulliSamplerData));
     130           60 : }
     131              : 
     132              : /*
     133              :  * Examine parameters and prepare for a sample scan.
     134              :  */
     135              : static void
     136           45 : bernoulli_beginsamplescan(SampleScanState *node,
     137              :                           Datum *params,
     138              :                           int nparams,
     139              :                           uint32 seed)
     140              : {
     141           45 :     BernoulliSamplerData *sampler = (BernoulliSamplerData *) node->tsm_state;
     142           45 :     double      percent = DatumGetFloat4(params[0]);
     143              :     double      dcutoff;
     144              : 
     145           45 :     if (percent < 0 || percent > 100 || isnan(percent))
     146            6 :         ereport(ERROR,
     147              :                 (errcode(ERRCODE_INVALID_TABLESAMPLE_ARGUMENT),
     148              :                  errmsg("sample percentage must be between 0 and 100")));
     149              : 
     150              :     /*
     151              :      * The cutoff is sample probability times (PG_UINT32_MAX + 1); we have to
     152              :      * store that as a uint64, of course.  Note that this gives strictly
     153              :      * correct behavior at the limits of zero or one probability.
     154              :      */
     155           39 :     dcutoff = rint(((double) PG_UINT32_MAX + 1) * percent / 100);
     156           39 :     sampler->cutoff = (uint64) dcutoff;
     157           39 :     sampler->seed = seed;
     158           39 :     sampler->lt = InvalidOffsetNumber;
     159              : 
     160              :     /*
     161              :      * Use bulkread, since we're scanning all pages.  But pagemode visibility
     162              :      * checking is a win only at larger sampling fractions.  The 25% cutoff
     163              :      * here is based on very limited experimentation.
     164              :      */
     165           39 :     node->use_bulkread = true;
     166           39 :     node->use_pagemode = (percent >= 25);
     167           39 : }
     168              : 
     169              : /*
     170              :  * Select next sampled tuple in current block.
     171              :  *
     172              :  * It is OK here to return an offset without knowing if the tuple is visible
     173              :  * (or even exists).  The reason is that we do the coinflip for every tuple
     174              :  * offset in the table.  Since all tuples have the same probability of being
     175              :  * returned, it doesn't matter if we do extra coinflips for invisible tuples.
     176              :  *
     177              :  * When we reach end of the block, return InvalidOffsetNumber which tells
     178              :  * SampleScan to go to next block.
     179              :  */
     180              : static OffsetNumber
     181        64416 : bernoulli_nextsampletuple(SampleScanState *node,
     182              :                           BlockNumber blockno,
     183              :                           OffsetNumber maxoffset)
     184              : {
     185        64416 :     BernoulliSamplerData *sampler = (BernoulliSamplerData *) node->tsm_state;
     186        64416 :     OffsetNumber tupoffset = sampler->lt;
     187              :     uint32      hashinput[3];
     188              : 
     189              :     /* Advance to first/next tuple in block */
     190        64416 :     if (tupoffset == InvalidOffsetNumber)
     191         4194 :         tupoffset = FirstOffsetNumber;
     192              :     else
     193        60222 :         tupoffset++;
     194              : 
     195              :     /*
     196              :      * We compute the hash by applying hash_any to an array of 3 uint32's
     197              :      * containing the block, offset, and seed.  This is efficient to set up,
     198              :      * and with the current implementation of hash_any, it gives
     199              :      * machine-independent results, which is a nice property for regression
     200              :      * testing.
     201              :      *
     202              :      * These words in the hash input are the same throughout the block:
     203              :      */
     204        64416 :     hashinput[0] = blockno;
     205        64416 :     hashinput[2] = sampler->seed;
     206              : 
     207              :     /*
     208              :      * Loop over tuple offsets until finding suitable TID or reaching end of
     209              :      * block.
     210              :      */
     211       124518 :     for (; tupoffset <= maxoffset; tupoffset++)
     212              :     {
     213              :         uint32      hash;
     214              : 
     215       120324 :         hashinput[1] = tupoffset;
     216              : 
     217       120324 :         hash = DatumGetUInt32(hash_any((const unsigned char *) hashinput,
     218              :                                        (int) sizeof(hashinput)));
     219       120324 :         if (hash < sampler->cutoff)
     220        60222 :             break;
     221              :     }
     222              : 
     223        64416 :     if (tupoffset > maxoffset)
     224         4194 :         tupoffset = InvalidOffsetNumber;
     225              : 
     226        64416 :     sampler->lt = tupoffset;
     227              : 
     228        64416 :     return tupoffset;
     229              : }
        

Generated by: LCOV version 2.0-1