Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * tsm_system_time.c
4 : * support routines for SYSTEM_TIME tablesample method
5 : *
6 : * The desire here is to produce a random sample with as many rows as possible
7 : * in no more than the specified amount of time. We use a block-sampling
8 : * approach. To ensure that the whole relation will be visited if necessary,
9 : * we start at a randomly chosen block and then advance with a stride that
10 : * is randomly chosen but is relatively prime to the relation's nblocks.
11 : *
12 : * Because of the time dependence, this method is necessarily unrepeatable.
13 : * However, we do what we can to reduce surprising behavior by selecting
14 : * the sampling pattern just once per query, much as in tsm_system_rows.
15 : *
16 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
17 : * Portions Copyright (c) 1994, Regents of the University of California
18 : *
19 : * IDENTIFICATION
20 : * contrib/tsm_system_time/tsm_system_time.c
21 : *
22 : *-------------------------------------------------------------------------
23 : */
24 :
25 : #include "postgres.h"
26 :
27 : #include <math.h>
28 :
29 : #include "access/tsmapi.h"
30 : #include "catalog/pg_type.h"
31 : #include "miscadmin.h"
32 : #include "optimizer/optimizer.h"
33 : #include "utils/sampling.h"
34 : #include "utils/spccache.h"
35 :
36 2 : PG_MODULE_MAGIC_EXT(
37 : .name = "tsm_system_time",
38 : .version = PG_VERSION
39 : );
40 :
41 4 : PG_FUNCTION_INFO_V1(tsm_system_time_handler);
42 :
43 :
44 : /* Private state */
45 : typedef struct
46 : {
47 : uint32 seed; /* random seed */
48 : double millis; /* time limit for sampling */
49 : instr_time start_time; /* scan start time */
50 : OffsetNumber lt; /* last tuple returned from current block */
51 : BlockNumber doneblocks; /* number of already-scanned blocks */
52 : BlockNumber lb; /* last block visited */
53 : /* these three values are not changed during a rescan: */
54 : BlockNumber nblocks; /* number of blocks in relation */
55 : BlockNumber firstblock; /* first block to sample from */
56 : BlockNumber step; /* step size, or 0 if not set yet */
57 : } SystemTimeSamplerData;
58 :
59 : static void system_time_samplescangetsamplesize(PlannerInfo *root,
60 : RelOptInfo *baserel,
61 : List *paramexprs,
62 : BlockNumber *pages,
63 : double *tuples);
64 : static void system_time_initsamplescan(SampleScanState *node,
65 : int eflags);
66 : static void system_time_beginsamplescan(SampleScanState *node,
67 : Datum *params,
68 : int nparams,
69 : uint32 seed);
70 : static BlockNumber system_time_nextsampleblock(SampleScanState *node, BlockNumber nblocks);
71 : static OffsetNumber system_time_nextsampletuple(SampleScanState *node,
72 : BlockNumber blockno,
73 : OffsetNumber maxoffset);
74 : static uint32 random_relative_prime(uint32 n, pg_prng_state *randstate);
75 :
76 :
77 : /*
78 : * Create a TsmRoutine descriptor for the SYSTEM_TIME method.
79 : */
80 : Datum
81 82 : tsm_system_time_handler(PG_FUNCTION_ARGS)
82 : {
83 82 : TsmRoutine *tsm = makeNode(TsmRoutine);
84 :
85 82 : tsm->parameterTypes = list_make1_oid(FLOAT8OID);
86 :
87 : /* See notes at head of file */
88 82 : tsm->repeatable_across_queries = false;
89 82 : tsm->repeatable_across_scans = false;
90 :
91 82 : tsm->SampleScanGetSampleSize = system_time_samplescangetsamplesize;
92 82 : tsm->InitSampleScan = system_time_initsamplescan;
93 82 : tsm->BeginSampleScan = system_time_beginsamplescan;
94 82 : tsm->NextSampleBlock = system_time_nextsampleblock;
95 82 : tsm->NextSampleTuple = system_time_nextsampletuple;
96 82 : tsm->EndSampleScan = NULL;
97 :
98 82 : PG_RETURN_POINTER(tsm);
99 : }
100 :
101 : /*
102 : * Sample size estimation.
103 : */
104 : static void
105 18 : system_time_samplescangetsamplesize(PlannerInfo *root,
106 : RelOptInfo *baserel,
107 : List *paramexprs,
108 : BlockNumber *pages,
109 : double *tuples)
110 : {
111 : Node *limitnode;
112 : double millis;
113 : double spc_random_page_cost;
114 : double npages;
115 : double ntuples;
116 :
117 : /* Try to extract an estimate for the limit time spec */
118 18 : limitnode = (Node *) linitial(paramexprs);
119 18 : limitnode = estimate_expression_value(root, limitnode);
120 :
121 18 : if (IsA(limitnode, Const) &&
122 14 : !((Const *) limitnode)->constisnull)
123 : {
124 14 : millis = DatumGetFloat8(((Const *) limitnode)->constvalue);
125 14 : if (millis < 0 || isnan(millis))
126 : {
127 : /* Default millis if the value is bogus */
128 4 : millis = 1000;
129 : }
130 : }
131 : else
132 : {
133 : /* Default millis if we didn't obtain a non-null Const */
134 4 : millis = 1000;
135 : }
136 :
137 : /* Get the planner's idea of cost per page read */
138 18 : get_tablespace_page_costs(baserel->reltablespace,
139 : &spc_random_page_cost,
140 : NULL);
141 :
142 : /*
143 : * Estimate the number of pages we can read by assuming that the cost
144 : * figure is expressed in milliseconds. This is completely, unmistakably
145 : * bogus, but we have to do something to produce an estimate and there's
146 : * no better answer.
147 : */
148 18 : if (spc_random_page_cost > 0)
149 18 : npages = millis / spc_random_page_cost;
150 : else
151 0 : npages = millis; /* even more bogus, but whatcha gonna do? */
152 :
153 : /* Clamp to sane value */
154 18 : npages = clamp_row_est(Min((double) baserel->pages, npages));
155 :
156 18 : if (baserel->tuples > 0 && baserel->pages > 0)
157 18 : {
158 : /* Estimate number of tuples returned based on tuple density */
159 18 : double density = baserel->tuples / (double) baserel->pages;
160 :
161 18 : ntuples = npages * density;
162 : }
163 : else
164 : {
165 : /* For lack of data, assume one tuple per page */
166 0 : ntuples = npages;
167 : }
168 :
169 : /* Clamp to the estimated relation size */
170 18 : ntuples = clamp_row_est(Min(baserel->tuples, ntuples));
171 :
172 18 : *pages = npages;
173 18 : *tuples = ntuples;
174 18 : }
175 :
176 : /*
177 : * Initialize during executor setup.
178 : */
179 : static void
180 18 : system_time_initsamplescan(SampleScanState *node, int eflags)
181 : {
182 18 : node->tsm_state = palloc0(sizeof(SystemTimeSamplerData));
183 : /* Note the above leaves tsm_state->step equal to zero */
184 18 : }
185 :
186 : /*
187 : * Examine parameters and prepare for a sample scan.
188 : */
189 : static void
190 12 : system_time_beginsamplescan(SampleScanState *node,
191 : Datum *params,
192 : int nparams,
193 : uint32 seed)
194 : {
195 12 : SystemTimeSamplerData *sampler = (SystemTimeSamplerData *) node->tsm_state;
196 12 : double millis = DatumGetFloat8(params[0]);
197 :
198 12 : if (millis < 0 || isnan(millis))
199 2 : ereport(ERROR,
200 : (errcode(ERRCODE_INVALID_TABLESAMPLE_ARGUMENT),
201 : errmsg("sample collection time must not be negative")));
202 :
203 10 : sampler->seed = seed;
204 10 : sampler->millis = millis;
205 10 : sampler->lt = InvalidOffsetNumber;
206 10 : sampler->doneblocks = 0;
207 : /* start_time, lb will be initialized during first NextSampleBlock call */
208 : /* we intentionally do not change nblocks/firstblock/step here */
209 10 : }
210 :
211 : /*
212 : * Select next block to sample.
213 : *
214 : * Uses linear probing algorithm for picking next block.
215 : */
216 : static BlockNumber
217 52 : system_time_nextsampleblock(SampleScanState *node, BlockNumber nblocks)
218 : {
219 52 : SystemTimeSamplerData *sampler = (SystemTimeSamplerData *) node->tsm_state;
220 : instr_time cur_time;
221 :
222 : /* First call within scan? */
223 52 : if (sampler->doneblocks == 0)
224 : {
225 : /* First scan within query? */
226 10 : if (sampler->step == 0)
227 : {
228 : /* Initialize now that we have scan descriptor */
229 : pg_prng_state randstate;
230 :
231 : /* If relation is empty, there's nothing to scan */
232 8 : if (nblocks == 0)
233 0 : return InvalidBlockNumber;
234 :
235 : /* We only need an RNG during this setup step */
236 8 : sampler_random_init_state(sampler->seed, &randstate);
237 :
238 : /* Compute nblocks/firstblock/step only once per query */
239 8 : sampler->nblocks = nblocks;
240 :
241 : /* Choose random starting block within the relation */
242 : /* (Actually this is the predecessor of the first block visited) */
243 8 : sampler->firstblock = sampler_random_fract(&randstate) *
244 8 : sampler->nblocks;
245 :
246 : /* Find relative prime as step size for linear probing */
247 8 : sampler->step = random_relative_prime(sampler->nblocks, &randstate);
248 : }
249 :
250 : /* Reinitialize lb and start_time */
251 10 : sampler->lb = sampler->firstblock;
252 10 : INSTR_TIME_SET_CURRENT(sampler->start_time);
253 : }
254 :
255 : /* If we've read all blocks in relation, we're done */
256 52 : if (++sampler->doneblocks > sampler->nblocks)
257 6 : return InvalidBlockNumber;
258 :
259 : /* If we've used up all the allotted time, we're done */
260 46 : INSTR_TIME_SET_CURRENT(cur_time);
261 46 : INSTR_TIME_SUBTRACT(cur_time, sampler->start_time);
262 46 : if (INSTR_TIME_GET_MILLISEC(cur_time) >= sampler->millis)
263 4 : return InvalidBlockNumber;
264 :
265 : /*
266 : * It's probably impossible for scan->rs_nblocks to decrease between scans
267 : * within a query; but just in case, loop until we select a block number
268 : * less than scan->rs_nblocks. We don't care if scan->rs_nblocks has
269 : * increased since the first scan.
270 : */
271 : do
272 : {
273 : /* Advance lb, using uint64 arithmetic to forestall overflow */
274 42 : sampler->lb = ((uint64) sampler->lb + sampler->step) % sampler->nblocks;
275 42 : } while (sampler->lb >= nblocks);
276 :
277 42 : return sampler->lb;
278 : }
279 :
280 : /*
281 : * Select next sampled tuple in current block.
282 : *
283 : * In block sampling, we just want to sample all the tuples in each selected
284 : * block.
285 : *
286 : * When we reach end of the block, return InvalidOffsetNumber which tells
287 : * SampleScan to go to next block.
288 : */
289 : static OffsetNumber
290 228 : system_time_nextsampletuple(SampleScanState *node,
291 : BlockNumber blockno,
292 : OffsetNumber maxoffset)
293 : {
294 228 : SystemTimeSamplerData *sampler = (SystemTimeSamplerData *) node->tsm_state;
295 228 : OffsetNumber tupoffset = sampler->lt;
296 :
297 : /* Advance to next possible offset on page */
298 228 : if (tupoffset == InvalidOffsetNumber)
299 42 : tupoffset = FirstOffsetNumber;
300 : else
301 186 : tupoffset++;
302 :
303 : /* Done? */
304 228 : if (tupoffset > maxoffset)
305 42 : tupoffset = InvalidOffsetNumber;
306 :
307 228 : sampler->lt = tupoffset;
308 :
309 228 : return tupoffset;
310 : }
311 :
312 : /*
313 : * Compute greatest common divisor of two uint32's.
314 : */
315 : static uint32
316 8 : gcd(uint32 a, uint32 b)
317 : {
318 : uint32 c;
319 :
320 30 : while (a != 0)
321 : {
322 22 : c = a;
323 22 : a = b % a;
324 22 : b = c;
325 : }
326 :
327 8 : return b;
328 : }
329 :
330 : /*
331 : * Pick a random value less than and relatively prime to n, if possible
332 : * (else return 1).
333 : */
334 : static uint32
335 8 : random_relative_prime(uint32 n, pg_prng_state *randstate)
336 : {
337 : uint32 r;
338 :
339 : /* Safety check to avoid infinite loop or zero result for small n. */
340 8 : if (n <= 1)
341 0 : return 1;
342 :
343 : /*
344 : * This should only take 2 or 3 iterations as the probability of 2 numbers
345 : * being relatively prime is ~61%; but just in case, we'll include a
346 : * CHECK_FOR_INTERRUPTS in the loop.
347 : */
348 : do
349 : {
350 10 : CHECK_FOR_INTERRUPTS();
351 10 : r = (uint32) (sampler_random_fract(randstate) * n);
352 10 : } while (r == 0 || gcd(r, n) > 1);
353 :
354 8 : return r;
355 : }
|