Line data Source code
1 : /*------------------------------------------------------------------------- 2 : * 3 : * system.c 4 : * support routines for SYSTEM tablesample method 5 : * 6 : * To ensure repeatability of samples, it is necessary that selection of a 7 : * given tuple be history-independent; otherwise syncscanning would break 8 : * repeatability, to say nothing of logically-irrelevant maintenance such 9 : * as physical extension or shortening of the relation. 10 : * 11 : * To achieve that, we proceed by hashing each candidate block number together 12 : * with the active seed, and then selecting it if the hash is less than the 13 : * cutoff value computed from the selection probability by BeginSampleScan. 14 : * 15 : * 16 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group 17 : * Portions Copyright (c) 1994, Regents of the University of California 18 : * 19 : * IDENTIFICATION 20 : * src/backend/access/tablesample/system.c 21 : * 22 : *------------------------------------------------------------------------- 23 : */ 24 : 25 : #include "postgres.h" 26 : 27 : #include <math.h> 28 : 29 : #include "access/tsmapi.h" 30 : #include "catalog/pg_type.h" 31 : #include "common/hashfn.h" 32 : #include "optimizer/optimizer.h" 33 : #include "utils/fmgrprotos.h" 34 : 35 : 36 : /* Private state */ 37 : typedef struct 38 : { 39 : uint64 cutoff; /* select blocks with hash less than this */ 40 : uint32 seed; /* random seed */ 41 : BlockNumber nextblock; /* next block to consider sampling */ 42 : OffsetNumber lt; /* last tuple returned from current block */ 43 : } SystemSamplerData; 44 : 45 : 46 : static void system_samplescangetsamplesize(PlannerInfo *root, 47 : RelOptInfo *baserel, 48 : List *paramexprs, 49 : BlockNumber *pages, 50 : double *tuples); 51 : static void system_initsamplescan(SampleScanState *node, 52 : int eflags); 53 : static void system_beginsamplescan(SampleScanState *node, 54 : Datum *params, 55 : int nparams, 56 : uint32 seed); 57 : static BlockNumber system_nextsampleblock(SampleScanState *node, BlockNumber nblocks); 58 : static OffsetNumber system_nextsampletuple(SampleScanState *node, 59 : BlockNumber blockno, 60 : OffsetNumber maxoffset); 61 : 62 : 63 : /* 64 : * Create a TsmRoutine descriptor for the SYSTEM method. 65 : */ 66 : Datum 67 610 : tsm_system_handler(PG_FUNCTION_ARGS) 68 : { 69 610 : TsmRoutine *tsm = makeNode(TsmRoutine); 70 : 71 610 : tsm->parameterTypes = list_make1_oid(FLOAT4OID); 72 610 : tsm->repeatable_across_queries = true; 73 610 : tsm->repeatable_across_scans = true; 74 610 : tsm->SampleScanGetSampleSize = system_samplescangetsamplesize; 75 610 : tsm->InitSampleScan = system_initsamplescan; 76 610 : tsm->BeginSampleScan = system_beginsamplescan; 77 610 : tsm->NextSampleBlock = system_nextsampleblock; 78 610 : tsm->NextSampleTuple = system_nextsampletuple; 79 610 : tsm->EndSampleScan = NULL; 80 : 81 610 : PG_RETURN_POINTER(tsm); 82 : } 83 : 84 : /* 85 : * Sample size estimation. 86 : */ 87 : static void 88 144 : system_samplescangetsamplesize(PlannerInfo *root, 89 : RelOptInfo *baserel, 90 : List *paramexprs, 91 : BlockNumber *pages, 92 : double *tuples) 93 : { 94 : Node *pctnode; 95 : float4 samplefract; 96 : 97 : /* Try to extract an estimate for the sample percentage */ 98 144 : pctnode = (Node *) linitial(paramexprs); 99 144 : pctnode = estimate_expression_value(root, pctnode); 100 : 101 144 : if (IsA(pctnode, Const) && 102 84 : !((Const *) pctnode)->constisnull) 103 : { 104 78 : samplefract = DatumGetFloat4(((Const *) pctnode)->constvalue); 105 78 : if (samplefract >= 0 && samplefract <= 100 && !isnan(samplefract)) 106 66 : samplefract /= 100.0f; 107 : else 108 : { 109 : /* Default samplefract if the value is bogus */ 110 12 : samplefract = 0.1f; 111 : } 112 : } 113 : else 114 : { 115 : /* Default samplefract if we didn't obtain a non-null Const */ 116 66 : samplefract = 0.1f; 117 : } 118 : 119 : /* We'll visit a sample of the pages ... */ 120 144 : *pages = clamp_row_est(baserel->pages * samplefract); 121 : 122 : /* ... and hopefully get a representative number of tuples from them */ 123 144 : *tuples = clamp_row_est(baserel->tuples * samplefract); 124 144 : } 125 : 126 : /* 127 : * Initialize during executor setup. 128 : */ 129 : static void 130 144 : system_initsamplescan(SampleScanState *node, int eflags) 131 : { 132 144 : node->tsm_state = palloc0(sizeof(SystemSamplerData)); 133 144 : } 134 : 135 : /* 136 : * Examine parameters and prepare for a sample scan. 137 : */ 138 : static void 139 84 : system_beginsamplescan(SampleScanState *node, 140 : Datum *params, 141 : int nparams, 142 : uint32 seed) 143 : { 144 84 : SystemSamplerData *sampler = (SystemSamplerData *) node->tsm_state; 145 84 : double percent = DatumGetFloat4(params[0]); 146 : double dcutoff; 147 : 148 84 : if (percent < 0 || percent > 100 || isnan(percent)) 149 12 : ereport(ERROR, 150 : (errcode(ERRCODE_INVALID_TABLESAMPLE_ARGUMENT), 151 : errmsg("sample percentage must be between 0 and 100"))); 152 : 153 : /* 154 : * The cutoff is sample probability times (PG_UINT32_MAX + 1); we have to 155 : * store that as a uint64, of course. Note that this gives strictly 156 : * correct behavior at the limits of zero or one probability. 157 : */ 158 72 : dcutoff = rint(((double) PG_UINT32_MAX + 1) * percent / 100); 159 72 : sampler->cutoff = (uint64) dcutoff; 160 72 : sampler->seed = seed; 161 72 : sampler->nextblock = 0; 162 72 : sampler->lt = InvalidOffsetNumber; 163 : 164 : /* 165 : * Bulkread buffer access strategy probably makes sense unless we're 166 : * scanning a very small fraction of the table. The 1% cutoff here is a 167 : * guess. We should use pagemode visibility checking, since we scan all 168 : * tuples on each selected page. 169 : */ 170 72 : node->use_bulkread = (percent >= 1); 171 72 : node->use_pagemode = true; 172 72 : } 173 : 174 : /* 175 : * Select next block to sample. 176 : */ 177 : static BlockNumber 178 4326 : system_nextsampleblock(SampleScanState *node, BlockNumber nblocks) 179 : { 180 4326 : SystemSamplerData *sampler = (SystemSamplerData *) node->tsm_state; 181 4326 : BlockNumber nextblock = sampler->nextblock; 182 : uint32 hashinput[2]; 183 : 184 : /* 185 : * We compute the hash by applying hash_any to an array of 2 uint32's 186 : * containing the block number and seed. This is efficient to set up, and 187 : * with the current implementation of hash_any, it gives 188 : * machine-independent results, which is a nice property for regression 189 : * testing. 190 : * 191 : * These words in the hash input are the same throughout the block: 192 : */ 193 4326 : hashinput[1] = sampler->seed; 194 : 195 : /* 196 : * Loop over block numbers until finding suitable block or reaching end of 197 : * relation. 198 : */ 199 8532 : for (; nextblock < nblocks; nextblock++) 200 : { 201 : uint32 hash; 202 : 203 8466 : hashinput[0] = nextblock; 204 : 205 8466 : hash = DatumGetUInt32(hash_any((const unsigned char *) hashinput, 206 : (int) sizeof(hashinput))); 207 8466 : if (hash < sampler->cutoff) 208 4260 : break; 209 : } 210 : 211 4326 : if (nextblock < nblocks) 212 : { 213 : /* Found a suitable block; remember where we should start next time */ 214 4260 : sampler->nextblock = nextblock + 1; 215 4260 : return nextblock; 216 : } 217 : 218 : /* Done, but let's reset nextblock to 0 for safety. */ 219 66 : sampler->nextblock = 0; 220 66 : return InvalidBlockNumber; 221 : } 222 : 223 : /* 224 : * Select next sampled tuple in current block. 225 : * 226 : * In block sampling, we just want to sample all the tuples in each selected 227 : * block. 228 : * 229 : * It is OK here to return an offset without knowing if the tuple is visible 230 : * (or even exists); nodeSamplescan.c will deal with that. 231 : * 232 : * When we reach end of the block, return InvalidOffsetNumber which tells 233 : * SampleScan to go to next block. 234 : */ 235 : static OffsetNumber 236 124578 : system_nextsampletuple(SampleScanState *node, 237 : BlockNumber blockno, 238 : OffsetNumber maxoffset) 239 : { 240 124578 : SystemSamplerData *sampler = (SystemSamplerData *) node->tsm_state; 241 124578 : OffsetNumber tupoffset = sampler->lt; 242 : 243 : /* Advance to next possible offset on page */ 244 124578 : if (tupoffset == InvalidOffsetNumber) 245 4260 : tupoffset = FirstOffsetNumber; 246 : else 247 120318 : tupoffset++; 248 : 249 : /* Done? */ 250 124578 : if (tupoffset > maxoffset) 251 4254 : tupoffset = InvalidOffsetNumber; 252 : 253 124578 : sampler->lt = tupoffset; 254 : 255 124578 : return tupoffset; 256 : }