Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * nodeTidrangescan.c
4 : * Routines to support TID range scans of relations
5 : *
6 : * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : *
10 : * IDENTIFICATION
11 : * src/backend/executor/nodeTidrangescan.c
12 : *
13 : *-------------------------------------------------------------------------
14 : */
15 : #include "postgres.h"
16 :
17 : #include "access/relscan.h"
18 : #include "access/sysattr.h"
19 : #include "access/tableam.h"
20 : #include "catalog/pg_operator.h"
21 : #include "executor/execParallel.h"
22 : #include "executor/executor.h"
23 : #include "executor/instrument.h"
24 : #include "executor/nodeTidrangescan.h"
25 : #include "nodes/nodeFuncs.h"
26 : #include "utils/rel.h"
27 :
28 :
/*
 * IsCTIDVar
 *		True if 'node' is a non-NULL Var whose varattno is the CTID system
 *		column (SelfItemPointerAttributeNumber).
 *
 * Only varattno is inspected: any Var appearing in this relation's scan qual
 * must belong to our own table.  (A parameterized scan that references some
 * other table's CTID will see that reference as a Param, not a Var, by the
 * time it gets here.)
 *
 * The argument is expanded more than once, so pass an expression without
 * side effects.
 */
#define IsCTIDVar(node) \
	((node) != NULL && IsA((node), Var) && \
	 ((Var *) (node))->varattno == SelfItemPointerAttributeNumber)
39 :
/* Which end of the TID range a TidOpExpr constrains */
typedef enum
{
	/* qual limits the highest TID to scan */
	TIDEXPR_UPPER_BOUND = 0,
	/* qual limits the lowest TID to scan */
	TIDEXPR_LOWER_BOUND = 1,
} TidExprType;
45 :
46 : /* Upper or lower range bound for scan */
47 : typedef struct TidOpExpr
48 : {
49 : TidExprType exprtype; /* type of op; lower or upper */
50 : ExprState *exprstate; /* ExprState for a TID-yielding subexpr */
51 : bool inclusive; /* whether op is inclusive */
52 : } TidOpExpr;
53 :
54 : /*
55 : * For the given 'expr', build and return an appropriate TidOpExpr taking into
56 : * account the expr's operator and operand order.
57 : */
58 : static TidOpExpr *
59 1446 : MakeTidOpExpr(OpExpr *expr, TidRangeScanState *tidstate)
60 : {
61 1446 : Node *arg1 = get_leftop((Expr *) expr);
62 1446 : Node *arg2 = get_rightop((Expr *) expr);
63 1446 : ExprState *exprstate = NULL;
64 1446 : bool invert = false;
65 : TidOpExpr *tidopexpr;
66 :
67 1446 : if (IsCTIDVar(arg1))
68 1422 : exprstate = ExecInitExpr((Expr *) arg2, &tidstate->ss.ps);
69 24 : else if (IsCTIDVar(arg2))
70 : {
71 24 : exprstate = ExecInitExpr((Expr *) arg1, &tidstate->ss.ps);
72 24 : invert = true;
73 : }
74 : else
75 0 : elog(ERROR, "could not identify CTID variable");
76 :
77 1446 : tidopexpr = palloc_object(TidOpExpr);
78 1446 : tidopexpr->inclusive = false; /* for now */
79 :
80 1446 : switch (expr->opno)
81 : {
82 27 : case TIDLessEqOperator:
83 27 : tidopexpr->inclusive = true;
84 : pg_fallthrough;
85 163 : case TIDLessOperator:
86 163 : tidopexpr->exprtype = invert ? TIDEXPR_LOWER_BOUND : TIDEXPR_UPPER_BOUND;
87 163 : break;
88 1191 : case TIDGreaterEqOperator:
89 1191 : tidopexpr->inclusive = true;
90 : pg_fallthrough;
91 1283 : case TIDGreaterOperator:
92 1283 : tidopexpr->exprtype = invert ? TIDEXPR_UPPER_BOUND : TIDEXPR_LOWER_BOUND;
93 1283 : break;
94 0 : default:
95 0 : elog(ERROR, "could not identify CTID operator");
96 : }
97 :
98 1446 : tidopexpr->exprstate = exprstate;
99 :
100 1446 : return tidopexpr;
101 : }
102 :
103 : /*
104 : * Extract the qual subexpressions that yield TIDs to search for,
105 : * and compile them into ExprStates if they're ordinary expressions.
106 : */
107 : static void
108 1397 : TidExprListCreate(TidRangeScanState *tidrangestate)
109 : {
110 1397 : TidRangeScan *node = (TidRangeScan *) tidrangestate->ss.ps.plan;
111 1397 : List *tidexprs = NIL;
112 : ListCell *l;
113 :
114 2843 : foreach(l, node->tidrangequals)
115 : {
116 1446 : OpExpr *opexpr = lfirst(l);
117 : TidOpExpr *tidopexpr;
118 :
119 1446 : if (!IsA(opexpr, OpExpr))
120 0 : elog(ERROR, "could not identify CTID expression");
121 :
122 1446 : tidopexpr = MakeTidOpExpr(opexpr, tidrangestate);
123 1446 : tidexprs = lappend(tidexprs, tidopexpr);
124 : }
125 :
126 1397 : tidrangestate->trss_tidexprs = tidexprs;
127 1397 : }
128 :
129 : /* ----------------------------------------------------------------
130 : * TidRangeEval
131 : *
132 : * Compute and set node's block and offset range to scan by evaluating
133 : * node->trss_tidexprs. Returns false if we detect the range cannot
134 : * contain any tuples. Returns true if it's possible for the range to
135 : * contain tuples. We don't bother validating that trss_mintid is less
136 : * than or equal to trss_maxtid, as the scan_set_tidrange() table AM
137 : * function will handle that.
138 : * ----------------------------------------------------------------
139 : */
140 : static bool
141 1379 : TidRangeEval(TidRangeScanState *node)
142 : {
143 1379 : ExprContext *econtext = node->ss.ps.ps_ExprContext;
144 : ItemPointerData lowerBound;
145 : ItemPointerData upperBound;
146 : ListCell *l;
147 :
148 : /*
149 : * Set the upper and lower bounds to the absolute limits of the range of
150 : * the ItemPointer type. Below we'll try to narrow this range on either
151 : * side by looking at the TidOpExprs.
152 : */
153 1379 : ItemPointerSet(&lowerBound, 0, 0);
154 1379 : ItemPointerSet(&upperBound, InvalidBlockNumber, PG_UINT16_MAX);
155 :
156 2789 : foreach(l, node->trss_tidexprs)
157 : {
158 1414 : TidOpExpr *tidopexpr = (TidOpExpr *) lfirst(l);
159 : ItemPointer itemptr;
160 : bool isNull;
161 :
162 : /* Evaluate this bound. */
163 : itemptr = (ItemPointer)
164 1414 : DatumGetPointer(ExecEvalExprSwitchContext(tidopexpr->exprstate,
165 : econtext,
166 : &isNull));
167 :
168 : /* If the bound is NULL, *nothing* matches the qual. */
169 1414 : if (isNull)
170 4 : return false;
171 :
172 1410 : if (tidopexpr->exprtype == TIDEXPR_LOWER_BOUND)
173 : {
174 : ItemPointerData lb;
175 :
176 1233 : ItemPointerCopy(itemptr, &lb);
177 :
178 : /*
179 : * Normalize non-inclusive ranges to become inclusive. The
180 : * resulting ItemPointer here may not be a valid item pointer.
181 : */
182 1233 : if (!tidopexpr->inclusive)
183 70 : ItemPointerInc(&lb);
184 :
185 : /* Check if we can narrow the range using this qual */
186 1233 : if (ItemPointerCompare(&lb, &lowerBound) > 0)
187 1233 : ItemPointerCopy(&lb, &lowerBound);
188 : }
189 :
190 177 : else if (tidopexpr->exprtype == TIDEXPR_UPPER_BOUND)
191 : {
192 : ItemPointerData ub;
193 :
194 177 : ItemPointerCopy(itemptr, &ub);
195 :
196 : /*
197 : * Normalize non-inclusive ranges to become inclusive. The
198 : * resulting ItemPointer here may not be a valid item pointer.
199 : */
200 177 : if (!tidopexpr->inclusive)
201 102 : ItemPointerDec(&ub);
202 :
203 : /* Check if we can narrow the range using this qual */
204 177 : if (ItemPointerCompare(&ub, &upperBound) < 0)
205 177 : ItemPointerCopy(&ub, &upperBound);
206 : }
207 : }
208 :
209 1375 : ItemPointerCopy(&lowerBound, &node->trss_mintid);
210 1375 : ItemPointerCopy(&upperBound, &node->trss_maxtid);
211 :
212 1375 : return true;
213 : }
214 :
215 : /* ----------------------------------------------------------------
216 : * TidRangeNext
217 : *
218 : * Retrieve a tuple from the TidRangeScan node's currentRelation
219 : * using the TIDs in the TidRangeScanState information.
220 : *
221 : * ----------------------------------------------------------------
222 : */
223 : static TupleTableSlot *
224 6516 : TidRangeNext(TidRangeScanState *node)
225 : {
226 : TableScanDesc scandesc;
227 : EState *estate;
228 : ScanDirection direction;
229 : TupleTableSlot *slot;
230 :
231 : /*
232 : * extract necessary information from TID scan node
233 : */
234 6516 : scandesc = node->ss.ss_currentScanDesc;
235 6516 : estate = node->ss.ps.state;
236 6516 : slot = node->ss.ss_ScanTupleSlot;
237 6516 : direction = estate->es_direction;
238 :
239 6516 : if (!node->trss_inScan)
240 : {
241 : /* First time through, compute TID range to scan */
242 1378 : if (!TidRangeEval(node))
243 4 : return NULL;
244 :
245 1374 : if (scandesc == NULL)
246 : {
247 1234 : uint32 flags = SO_NONE;
248 :
249 1234 : if (ScanRelIsReadOnly(&node->ss))
250 1230 : flags |= SO_HINT_REL_READ_ONLY;
251 :
252 1234 : if (estate->es_instrument & INSTRUMENT_IO)
253 0 : flags |= SO_SCAN_INSTRUMENT;
254 :
255 1234 : scandesc = table_beginscan_tidrange(node->ss.ss_currentRelation,
256 : estate->es_snapshot,
257 : &node->trss_mintid,
258 : &node->trss_maxtid,
259 : flags);
260 1234 : node->ss.ss_currentScanDesc = scandesc;
261 : }
262 : else
263 : {
264 : /* rescan with the updated TID range */
265 140 : table_rescan_tidrange(scandesc, &node->trss_mintid,
266 : &node->trss_maxtid);
267 : }
268 :
269 1374 : node->trss_inScan = true;
270 : }
271 :
272 : /* Fetch the next tuple. */
273 6512 : if (!table_scan_getnextslot_tidrange(scandesc, direction, slot))
274 : {
275 212 : node->trss_inScan = false;
276 212 : ExecClearTuple(slot);
277 : }
278 :
279 6502 : return slot;
280 : }
281 :
282 : /*
283 : * TidRangeRecheck -- access method routine to recheck a tuple in EvalPlanQual
284 : */
285 : static bool
286 1 : TidRangeRecheck(TidRangeScanState *node, TupleTableSlot *slot)
287 : {
288 1 : if (!TidRangeEval(node))
289 0 : return false;
290 :
291 : Assert(ItemPointerIsValid(&slot->tts_tid));
292 :
293 : /* Recheck the ctid is still within range */
294 2 : if (ItemPointerCompare(&slot->tts_tid, &node->trss_mintid) < 0 ||
295 1 : ItemPointerCompare(&slot->tts_tid, &node->trss_maxtid) > 0)
296 1 : return false;
297 :
298 0 : return true;
299 : }
300 :
301 : /* ----------------------------------------------------------------
302 : * ExecTidRangeScan(node)
303 : *
304 : * Scans the relation using tids and returns the next qualifying tuple.
305 : * We call the ExecScan() routine and pass it the appropriate
306 : * access method functions.
307 : *
308 : * Conditions:
309 : * -- the "cursor" maintained by the AMI is positioned at the tuple
310 : * returned previously.
311 : *
312 : * Initial States:
313 : * -- the relation indicated is opened for TID range scanning.
314 : * ----------------------------------------------------------------
315 : */
316 : static TupleTableSlot *
317 6517 : ExecTidRangeScan(PlanState *pstate)
318 : {
319 6517 : TidRangeScanState *node = castNode(TidRangeScanState, pstate);
320 :
321 6517 : return ExecScan(&node->ss,
322 : (ExecScanAccessMtd) TidRangeNext,
323 : (ExecScanRecheckMtd) TidRangeRecheck);
324 : }
325 :
326 : /* ----------------------------------------------------------------
327 : * ExecReScanTidRangeScan(node)
328 : * ----------------------------------------------------------------
329 : */
330 : void
331 64 : ExecReScanTidRangeScan(TidRangeScanState *node)
332 : {
333 : /* mark scan as not in progress, and tid range list as not computed yet */
334 64 : node->trss_inScan = false;
335 :
336 : /*
337 : * We must wait until TidRangeNext before calling table_rescan_tidrange.
338 : */
339 64 : ExecScanReScan(&node->ss);
340 64 : }
341 :
342 : /* ----------------------------------------------------------------
343 : * ExecEndTidRangeScan
344 : *
345 : * Releases any storage allocated through C routines.
346 : * Returns nothing.
347 : * ----------------------------------------------------------------
348 : */
349 : void
350 243 : ExecEndTidRangeScan(TidRangeScanState *node)
351 : {
352 243 : TableScanDesc scan = node->ss.ss_currentScanDesc;
353 :
354 : /* Collect IO stats for this process into shared instrumentation */
355 243 : if (node->trss_sinstrument != NULL && IsParallelWorker())
356 : {
357 : TidRangeScanInstrumentation *si;
358 :
359 : Assert(ParallelWorkerNumber < node->trss_sinstrument->num_workers);
360 0 : si = &node->trss_sinstrument->sinstrument[ParallelWorkerNumber];
361 :
362 0 : if (scan && scan->rs_instrument)
363 : {
364 0 : AccumulateIOStats(&si->stats.io, &scan->rs_instrument->io);
365 : }
366 : }
367 :
368 243 : if (scan != NULL)
369 160 : table_endscan(scan);
370 243 : }
371 :
372 : /* ----------------------------------------------------------------
373 : * ExecInitTidRangeScan
374 : *
375 : * Initializes the tid range scan's state information, creates
376 : * scan keys, and opens the scan relation.
377 : *
378 : * Parameters:
379 : * node: TidRangeScan node produced by the planner.
380 : * estate: the execution state initialized in InitPlan.
381 : * ----------------------------------------------------------------
382 : */
383 : TidRangeScanState *
384 1397 : ExecInitTidRangeScan(TidRangeScan *node, EState *estate, int eflags)
385 : {
386 : TidRangeScanState *tidrangestate;
387 : Relation currentRelation;
388 :
389 : /*
390 : * create state structure
391 : */
392 1397 : tidrangestate = makeNode(TidRangeScanState);
393 1397 : tidrangestate->ss.ps.plan = (Plan *) node;
394 1397 : tidrangestate->ss.ps.state = estate;
395 1397 : tidrangestate->ss.ps.ExecProcNode = ExecTidRangeScan;
396 :
397 : /*
398 : * Miscellaneous initialization
399 : *
400 : * create expression context for node
401 : */
402 1397 : ExecAssignExprContext(estate, &tidrangestate->ss.ps);
403 :
404 : /*
405 : * mark scan as not in progress, and TID range as not computed yet
406 : */
407 1397 : tidrangestate->trss_inScan = false;
408 :
409 : /*
410 : * open the scan relation
411 : */
412 1397 : currentRelation = ExecOpenScanRelation(estate, node->scan.scanrelid, eflags);
413 :
414 1397 : tidrangestate->ss.ss_currentRelation = currentRelation;
415 1397 : tidrangestate->ss.ss_currentScanDesc = NULL; /* no table scan here */
416 :
417 : /*
418 : * get the scan type from the relation descriptor.
419 : */
420 1397 : ExecInitScanTupleSlot(estate, &tidrangestate->ss,
421 : RelationGetDescr(currentRelation),
422 : table_slot_callbacks(currentRelation),
423 : TTS_FLAG_OBEYS_NOT_NULL_CONSTRAINTS);
424 :
425 : /*
426 : * Initialize result type and projection.
427 : */
428 1397 : ExecInitResultTypeTL(&tidrangestate->ss.ps);
429 1397 : ExecAssignScanProjectionInfo(&tidrangestate->ss);
430 :
431 : /*
432 : * initialize child expressions
433 : */
434 1397 : tidrangestate->ss.ps.qual =
435 1397 : ExecInitQual(node->scan.plan.qual, (PlanState *) tidrangestate);
436 :
437 1397 : TidExprListCreate(tidrangestate);
438 :
439 : /*
440 : * all done.
441 : */
442 1397 : return tidrangestate;
443 : }
444 :
445 : /* ----------------------------------------------------------------
446 : * Parallel Scan Support
447 : * ----------------------------------------------------------------
448 : */
449 :
450 : /* ----------------------------------------------------------------
451 : * ExecTidRangeScanEstimate
452 : *
453 : * Compute the amount of space we'll need in the parallel
454 : * query DSM, and inform pcxt->estimator about our needs.
455 : * ----------------------------------------------------------------
456 : */
457 : void
458 16 : ExecTidRangeScanEstimate(TidRangeScanState *node, ParallelContext *pcxt)
459 : {
460 16 : EState *estate = node->ss.ps.state;
461 :
462 16 : node->trss_pscanlen =
463 16 : table_parallelscan_estimate(node->ss.ss_currentRelation,
464 : estate->es_snapshot);
465 16 : shm_toc_estimate_chunk(&pcxt->estimator, node->trss_pscanlen);
466 16 : shm_toc_estimate_keys(&pcxt->estimator, 1);
467 16 : }
468 :
469 : /* ----------------------------------------------------------------
470 : * ExecTidRangeScanInitializeDSM
471 : *
472 : * Set up a parallel TID range scan descriptor.
473 : * ----------------------------------------------------------------
474 : */
475 : void
476 16 : ExecTidRangeScanInitializeDSM(TidRangeScanState *node, ParallelContext *pcxt)
477 : {
478 16 : EState *estate = node->ss.ps.state;
479 : ParallelTableScanDesc pscan;
480 16 : uint32 flags = SO_NONE;
481 :
482 16 : if (ScanRelIsReadOnly(&node->ss))
483 16 : flags |= SO_HINT_REL_READ_ONLY;
484 :
485 16 : if (estate->es_instrument & INSTRUMENT_IO)
486 0 : flags |= SO_SCAN_INSTRUMENT;
487 :
488 16 : pscan = shm_toc_allocate(pcxt->toc, node->trss_pscanlen);
489 16 : table_parallelscan_initialize(node->ss.ss_currentRelation,
490 : pscan,
491 : estate->es_snapshot);
492 16 : shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id, pscan);
493 16 : node->ss.ss_currentScanDesc =
494 16 : table_beginscan_parallel_tidrange(node->ss.ss_currentRelation,
495 : pscan, flags);
496 16 : }
497 :
498 : /* ----------------------------------------------------------------
499 : * ExecTidRangeScanReInitializeDSM
500 : *
501 : * Reset shared state before beginning a fresh scan.
502 : * ----------------------------------------------------------------
503 : */
504 : void
505 0 : ExecTidRangeScanReInitializeDSM(TidRangeScanState *node,
506 : ParallelContext *pcxt)
507 : {
508 : ParallelTableScanDesc pscan;
509 :
510 0 : pscan = node->ss.ss_currentScanDesc->rs_parallel;
511 0 : table_parallelscan_reinitialize(node->ss.ss_currentRelation, pscan);
512 0 : }
513 :
514 : /* ----------------------------------------------------------------
515 : * ExecTidRangeScanInitializeWorker
516 : *
517 : * Copy relevant information from TOC into planstate.
518 : * ----------------------------------------------------------------
519 : */
520 : void
521 64 : ExecTidRangeScanInitializeWorker(TidRangeScanState *node,
522 : ParallelWorkerContext *pwcxt)
523 : {
524 : ParallelTableScanDesc pscan;
525 64 : uint32 flags = SO_NONE;
526 :
527 64 : if (ScanRelIsReadOnly(&node->ss))
528 64 : flags |= SO_HINT_REL_READ_ONLY;
529 :
530 64 : if (node->ss.ps.state->es_instrument & INSTRUMENT_IO)
531 0 : flags |= SO_SCAN_INSTRUMENT;
532 :
533 64 : pscan = shm_toc_lookup(pwcxt->toc, node->ss.ps.plan->plan_node_id, false);
534 64 : node->ss.ss_currentScanDesc =
535 64 : table_beginscan_parallel_tidrange(node->ss.ss_currentRelation,
536 : pscan, flags);
537 64 : }
538 :
539 : /*
540 : * Compute the amount of space we'll need for the shared instrumentation and
541 : * inform pcxt->estimator.
542 : */
543 : void
544 16 : ExecTidRangeScanInstrumentEstimate(TidRangeScanState *node,
545 : ParallelContext *pcxt)
546 : {
547 16 : EState *estate = node->ss.ps.state;
548 : Size size;
549 :
550 16 : if ((estate->es_instrument & INSTRUMENT_IO) == 0 || pcxt->nworkers == 0)
551 16 : return;
552 :
553 0 : size = add_size(offsetof(SharedTidRangeScanInstrumentation, sinstrument),
554 0 : mul_size(pcxt->nworkers, sizeof(TidRangeScanInstrumentation)));
555 :
556 0 : shm_toc_estimate_chunk(&pcxt->estimator, size);
557 0 : shm_toc_estimate_keys(&pcxt->estimator, 1);
558 : }
559 :
560 : /*
561 : * Set up parallel scan instrumentation.
562 : */
563 : void
564 16 : ExecTidRangeScanInstrumentInitDSM(TidRangeScanState *node,
565 : ParallelContext *pcxt)
566 : {
567 16 : EState *estate = node->ss.ps.state;
568 : SharedTidRangeScanInstrumentation *sinstrument;
569 : Size size;
570 :
571 16 : if ((estate->es_instrument & INSTRUMENT_IO) == 0 || pcxt->nworkers == 0)
572 16 : return;
573 :
574 0 : size = add_size(offsetof(SharedTidRangeScanInstrumentation, sinstrument),
575 0 : mul_size(pcxt->nworkers, sizeof(TidRangeScanInstrumentation)));
576 0 : sinstrument = shm_toc_allocate(pcxt->toc, size);
577 0 : memset(sinstrument, 0, size);
578 0 : sinstrument->num_workers = pcxt->nworkers;
579 0 : shm_toc_insert(pcxt->toc,
580 0 : node->ss.ps.plan->plan_node_id +
581 : PARALLEL_KEY_SCAN_INSTRUMENT_OFFSET,
582 : sinstrument);
583 0 : node->trss_sinstrument = sinstrument;
584 : }
585 :
586 : /*
587 : * Look up and save the location of the shared instrumentation.
588 : */
589 : void
590 64 : ExecTidRangeScanInstrumentInitWorker(TidRangeScanState *node,
591 : ParallelWorkerContext *pwcxt)
592 : {
593 64 : EState *estate = node->ss.ps.state;
594 :
595 64 : if ((estate->es_instrument & INSTRUMENT_IO) == 0)
596 64 : return;
597 :
598 0 : node->trss_sinstrument = shm_toc_lookup(pwcxt->toc,
599 0 : node->ss.ps.plan->plan_node_id +
600 : PARALLEL_KEY_SCAN_INSTRUMENT_OFFSET,
601 : false);
602 : }
603 :
604 : /*
605 : * Transfer scan instrumentation from DSM to private memory.
606 : */
607 : void
608 0 : ExecTidRangeScanRetrieveInstrumentation(TidRangeScanState *node)
609 : {
610 0 : SharedTidRangeScanInstrumentation *sinstrument = node->trss_sinstrument;
611 : Size size;
612 :
613 0 : if (sinstrument == NULL)
614 0 : return;
615 :
616 0 : size = offsetof(SharedTidRangeScanInstrumentation, sinstrument)
617 0 : + sinstrument->num_workers * sizeof(TidRangeScanInstrumentation);
618 :
619 0 : node->trss_sinstrument = palloc(size);
620 0 : memcpy(node->trss_sinstrument, sinstrument, size);
621 : }
|