Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * tsm_system_rows.c
4 : * support routines for SYSTEM_ROWS tablesample method
5 : *
6 : * The desire here is to produce a random sample with a given number of rows
7 : * (or the whole relation, if that is fewer rows). We use a block-sampling
8 : * approach. To ensure that the whole relation will be visited if necessary,
9 : * we start at a randomly chosen block and then advance with a stride that
10 : * is randomly chosen but is relatively prime to the relation's nblocks.
11 : *
12 : * Because of the dependence on nblocks, this method cannot be repeatable
13 : * across queries. (Even if the user hasn't explicitly changed the relation,
14 : * maintenance activities such as autovacuum might change nblocks.) However,
15 : * we can at least make it repeatable across scans, by determining the
16 : * sampling pattern only once on the first scan. This means that rescans
17 : * won't visit blocks added after the first scan, but that is fine since
18 : * such blocks shouldn't contain any visible tuples anyway.
19 : *
20 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
21 : * Portions Copyright (c) 1994, Regents of the University of California
22 : *
23 : * IDENTIFICATION
24 : * contrib/tsm_system_rows/tsm_system_rows.c
25 : *
26 : *-------------------------------------------------------------------------
27 : */
28 :
29 : #include "postgres.h"
30 :
31 : #include "access/tsmapi.h"
32 : #include "catalog/pg_type.h"
33 : #include "miscadmin.h"
34 : #include "optimizer/optimizer.h"
35 : #include "utils/sampling.h"
36 :
37 2 : PG_MODULE_MAGIC;
38 :
39 4 : PG_FUNCTION_INFO_V1(tsm_system_rows_handler);
40 :
41 :
42 : /* Private state */
43 : typedef struct
44 : {
45 : uint32 seed; /* random seed */
46 : int64 ntuples; /* number of tuples to return */
47 : OffsetNumber lt; /* last tuple returned from current block */
48 : BlockNumber doneblocks; /* number of already-scanned blocks */
49 : BlockNumber lb; /* last block visited */
50 : /* these three values are not changed during a rescan: */
51 : BlockNumber nblocks; /* number of blocks in relation */
52 : BlockNumber firstblock; /* first block to sample from */
53 : BlockNumber step; /* step size, or 0 if not set yet */
54 : } SystemRowsSamplerData;
55 :
56 : static void system_rows_samplescangetsamplesize(PlannerInfo *root,
57 : RelOptInfo *baserel,
58 : List *paramexprs,
59 : BlockNumber *pages,
60 : double *tuples);
61 : static void system_rows_initsamplescan(SampleScanState *node,
62 : int eflags);
63 : static void system_rows_beginsamplescan(SampleScanState *node,
64 : Datum *params,
65 : int nparams,
66 : uint32 seed);
67 : static BlockNumber system_rows_nextsampleblock(SampleScanState *node, BlockNumber nblocks);
68 : static OffsetNumber system_rows_nextsampletuple(SampleScanState *node,
69 : BlockNumber blockno,
70 : OffsetNumber maxoffset);
71 : static uint32 random_relative_prime(uint32 n, pg_prng_state *randstate);
72 :
73 :
74 : /*
75 : * Create a TsmRoutine descriptor for the SYSTEM_ROWS method.
76 : */
77 : Datum
78 80 : tsm_system_rows_handler(PG_FUNCTION_ARGS)
79 : {
80 80 : TsmRoutine *tsm = makeNode(TsmRoutine);
81 :
82 80 : tsm->parameterTypes = list_make1_oid(INT8OID);
83 :
84 : /* See notes at head of file */
85 80 : tsm->repeatable_across_queries = false;
86 80 : tsm->repeatable_across_scans = true;
87 :
88 80 : tsm->SampleScanGetSampleSize = system_rows_samplescangetsamplesize;
89 80 : tsm->InitSampleScan = system_rows_initsamplescan;
90 80 : tsm->BeginSampleScan = system_rows_beginsamplescan;
91 80 : tsm->NextSampleBlock = system_rows_nextsampleblock;
92 80 : tsm->NextSampleTuple = system_rows_nextsampletuple;
93 80 : tsm->EndSampleScan = NULL;
94 :
95 80 : PG_RETURN_POINTER(tsm);
96 : }
97 :
98 : /*
99 : * Sample size estimation.
100 : */
101 : static void
102 18 : system_rows_samplescangetsamplesize(PlannerInfo *root,
103 : RelOptInfo *baserel,
104 : List *paramexprs,
105 : BlockNumber *pages,
106 : double *tuples)
107 : {
108 : Node *limitnode;
109 : int64 ntuples;
110 : double npages;
111 :
112 : /* Try to extract an estimate for the limit rowcount */
113 18 : limitnode = (Node *) linitial(paramexprs);
114 18 : limitnode = estimate_expression_value(root, limitnode);
115 :
116 18 : if (IsA(limitnode, Const) &&
117 14 : !((Const *) limitnode)->constisnull)
118 : {
119 14 : ntuples = DatumGetInt64(((Const *) limitnode)->constvalue);
120 14 : if (ntuples < 0)
121 : {
122 : /* Default ntuples if the value is bogus */
123 4 : ntuples = 1000;
124 : }
125 : }
126 : else
127 : {
128 : /* Default ntuples if we didn't obtain a non-null Const */
129 4 : ntuples = 1000;
130 : }
131 :
132 : /* Clamp to the estimated relation size */
133 18 : if (ntuples > baserel->tuples)
134 10 : ntuples = (int64) baserel->tuples;
135 18 : ntuples = clamp_row_est(ntuples);
136 :
137 18 : if (baserel->tuples > 0 && baserel->pages > 0)
138 18 : {
139 : /* Estimate number of pages visited based on tuple density */
140 18 : double density = baserel->tuples / (double) baserel->pages;
141 :
142 18 : npages = ntuples / density;
143 : }
144 : else
145 : {
146 : /* For lack of data, assume one tuple per page */
147 0 : npages = ntuples;
148 : }
149 :
150 : /* Clamp to sane value */
151 18 : npages = clamp_row_est(Min((double) baserel->pages, npages));
152 :
153 18 : *pages = npages;
154 18 : *tuples = ntuples;
155 18 : }
156 :
157 : /*
158 : * Initialize during executor setup.
159 : */
160 : static void
161 18 : system_rows_initsamplescan(SampleScanState *node, int eflags)
162 : {
163 18 : node->tsm_state = palloc0(sizeof(SystemRowsSamplerData));
164 : /* Note the above leaves tsm_state->step equal to zero */
165 18 : }
166 :
167 : /*
168 : * Examine parameters and prepare for a sample scan.
169 : */
170 : static void
171 18 : system_rows_beginsamplescan(SampleScanState *node,
172 : Datum *params,
173 : int nparams,
174 : uint32 seed)
175 : {
176 18 : SystemRowsSamplerData *sampler = (SystemRowsSamplerData *) node->tsm_state;
177 18 : int64 ntuples = DatumGetInt64(params[0]);
178 :
179 18 : if (ntuples < 0)
180 2 : ereport(ERROR,
181 : (errcode(ERRCODE_INVALID_TABLESAMPLE_ARGUMENT),
182 : errmsg("sample size must not be negative")));
183 :
184 16 : sampler->seed = seed;
185 16 : sampler->ntuples = ntuples;
186 16 : sampler->lt = InvalidOffsetNumber;
187 16 : sampler->doneblocks = 0;
188 : /* lb will be initialized during first NextSampleBlock call */
189 : /* we intentionally do not change nblocks/firstblock/step here */
190 :
191 : /*
192 : * We *must* use pagemode visibility checking in this module, so force
193 : * that even though it's currently default.
194 : */
195 16 : node->use_pagemode = true;
196 16 : }
197 :
198 : /*
199 : * Select next block to sample.
200 : *
201 : * Uses linear probing algorithm for picking next block.
202 : */
203 : static BlockNumber
204 68 : system_rows_nextsampleblock(SampleScanState *node, BlockNumber nblocks)
205 : {
206 68 : SystemRowsSamplerData *sampler = (SystemRowsSamplerData *) node->tsm_state;
207 :
208 : /* First call within scan? */
209 68 : if (sampler->doneblocks == 0)
210 : {
211 : /* First scan within query? */
212 16 : if (sampler->step == 0)
213 : {
214 : /* Initialize now that we have scan descriptor */
215 : pg_prng_state randstate;
216 :
217 : /* If relation is empty, there's nothing to scan */
218 12 : if (nblocks == 0)
219 0 : return InvalidBlockNumber;
220 :
221 : /* We only need an RNG during this setup step */
222 12 : sampler_random_init_state(sampler->seed, &randstate);
223 :
224 : /* Compute nblocks/firstblock/step only once per query */
225 12 : sampler->nblocks = nblocks;
226 :
227 : /* Choose random starting block within the relation */
228 : /* (Actually this is the predecessor of the first block visited) */
229 12 : sampler->firstblock = sampler_random_fract(&randstate) *
230 12 : sampler->nblocks;
231 :
232 : /* Find relative prime as step size for linear probing */
233 12 : sampler->step = random_relative_prime(sampler->nblocks, &randstate);
234 : }
235 :
236 : /* Reinitialize lb */
237 16 : sampler->lb = sampler->firstblock;
238 : }
239 :
240 : /* If we've read all blocks or returned all needed tuples, we're done */
241 68 : if (++sampler->doneblocks > sampler->nblocks ||
242 64 : node->donetuples >= sampler->ntuples)
243 16 : return InvalidBlockNumber;
244 :
245 : /*
246 : * It's probably impossible for scan->rs_nblocks to decrease between scans
247 : * within a query; but just in case, loop until we select a block number
248 : * less than scan->rs_nblocks. We don't care if scan->rs_nblocks has
249 : * increased since the first scan.
250 : */
251 : do
252 : {
253 : /* Advance lb, using uint64 arithmetic to forestall overflow */
254 52 : sampler->lb = ((uint64) sampler->lb + sampler->step) % sampler->nblocks;
255 52 : } while (sampler->lb >= nblocks);
256 :
257 52 : return sampler->lb;
258 : }
259 :
260 : /*
261 : * Select next sampled tuple in current block.
262 : *
263 : * In block sampling, we just want to sample all the tuples in each selected
264 : * block.
265 : *
266 : * When we reach end of the block, return InvalidOffsetNumber which tells
267 : * SampleScan to go to next block.
268 : */
269 : static OffsetNumber
270 258 : system_rows_nextsampletuple(SampleScanState *node,
271 : BlockNumber blockno,
272 : OffsetNumber maxoffset)
273 : {
274 258 : SystemRowsSamplerData *sampler = (SystemRowsSamplerData *) node->tsm_state;
275 258 : OffsetNumber tupoffset = sampler->lt;
276 :
277 : /* Quit if we've returned all needed tuples */
278 258 : if (node->donetuples >= sampler->ntuples)
279 8 : return InvalidOffsetNumber;
280 :
281 : /* Advance to next possible offset on page */
282 250 : if (tupoffset == InvalidOffsetNumber)
283 52 : tupoffset = FirstOffsetNumber;
284 : else
285 198 : tupoffset++;
286 :
287 : /* Done? */
288 250 : if (tupoffset > maxoffset)
289 44 : tupoffset = InvalidOffsetNumber;
290 :
291 250 : sampler->lt = tupoffset;
292 :
293 250 : return tupoffset;
294 : }
295 :
296 : /*
297 : * Compute greatest common divisor of two uint32's.
298 : */
299 : static uint32
300 12 : gcd(uint32 a, uint32 b)
301 : {
302 : uint32 c;
303 :
304 34 : while (a != 0)
305 : {
306 22 : c = a;
307 22 : a = b % a;
308 22 : b = c;
309 : }
310 :
311 12 : return b;
312 : }
313 :
314 : /*
315 : * Pick a random value less than and relatively prime to n, if possible
316 : * (else return 1).
317 : */
318 : static uint32
319 12 : random_relative_prime(uint32 n, pg_prng_state *randstate)
320 : {
321 : uint32 r;
322 :
323 : /* Safety check to avoid infinite loop or zero result for small n. */
324 12 : if (n <= 1)
325 0 : return 1;
326 :
327 : /*
328 : * This should only take 2 or 3 iterations as the probability of 2 numbers
329 : * being relatively prime is ~61%; but just in case, we'll include a
330 : * CHECK_FOR_INTERRUPTS in the loop.
331 : */
332 : do
333 : {
334 14 : CHECK_FOR_INTERRUPTS();
335 14 : r = (uint32) (sampler_random_fract(randstate) * n);
336 14 : } while (r == 0 || gcd(r, n) > 1);
337 :
338 12 : return r;
339 : }
|