Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * nodeBitmapHeapscan.c
4 : * Routines to support bitmapped scans of relations
5 : *
6 : * NOTE: it is critical that this plan type only be used with MVCC-compliant
7 : * snapshots (ie, regular snapshots, not SnapshotAny or one of the other
8 : * special snapshots). The reason is that since index and heap scans are
9 : * decoupled, there can be no assurance that the index tuple prompting a
10 : * visit to a particular heap TID still exists when the visit is made.
11 : * Therefore the tuple might not exist anymore either (which is OK because
12 : * heap_fetch will cope) --- but worse, the tuple slot could have been
13 : * re-used for a newer tuple. With an MVCC snapshot the newer tuple is
14 : * certain to fail the time qual and so it will not be mistakenly returned,
15 : * but with anything else we might return a tuple that doesn't meet the
16 : * required index qual conditions.
17 : *
18 : *
19 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
20 : * Portions Copyright (c) 1994, Regents of the University of California
21 : *
22 : *
23 : * IDENTIFICATION
24 : * src/backend/executor/nodeBitmapHeapscan.c
25 : *
26 : *-------------------------------------------------------------------------
27 : */
28 : /*
29 : * INTERFACE ROUTINES
30 : * ExecBitmapHeapScan scans a relation using bitmap info
31 : * ExecBitmapHeapNext workhorse for above
32 : * ExecInitBitmapHeapScan creates and initializes state info.
33 : * ExecReScanBitmapHeapScan prepares to rescan the plan.
34 : * ExecEndBitmapHeapScan releases all storage.
35 : */
36 : #include "postgres.h"
37 :
38 : #include <math.h>
39 :
40 : #include "access/relscan.h"
41 : #include "access/tableam.h"
42 : #include "access/visibilitymap.h"
43 : #include "executor/executor.h"
44 : #include "executor/nodeBitmapHeapscan.h"
45 : #include "miscadmin.h"
46 : #include "pgstat.h"
47 : #include "storage/bufmgr.h"
48 : #include "utils/rel.h"
49 : #include "utils/spccache.h"
50 :
51 : static void BitmapTableScanSetup(BitmapHeapScanState *node);
52 : static TupleTableSlot *BitmapHeapNext(BitmapHeapScanState *node);
53 : static inline void BitmapDoneInitializingSharedState(ParallelBitmapHeapState *pstate);
54 : static bool BitmapShouldInitializeSharedState(ParallelBitmapHeapState *pstate);
55 :
56 :
57 : /*
58 : * Do the underlying index scan, build the bitmap, set up the parallel state
59 : * needed for parallel workers to iterate through the bitmap, and set up the
60 : * underlying table scan descriptor.
61 : */
62 : static void
63 25222 : BitmapTableScanSetup(BitmapHeapScanState *node)
64 : {
65 25222 : TBMIterator tbmiterator = {0};
66 25222 : ParallelBitmapHeapState *pstate = node->pstate;
67 25222 : dsa_area *dsa = node->ss.ps.state->es_query_dsa;
68 :
69 25222 : if (!pstate)
70 : {
71 24880 : node->tbm = (TIDBitmap *) MultiExecProcNode(outerPlanState(node));
72 :
73 24880 : if (!node->tbm || !IsA(node->tbm, TIDBitmap))
74 0 : elog(ERROR, "unrecognized result from subplan");
75 : }
76 342 : else if (BitmapShouldInitializeSharedState(pstate))
77 : {
78 : /*
79 : * The leader will immediately come out of the function, but others
80 : * will be blocked until leader populates the TBM and wakes them up.
81 : */
82 72 : node->tbm = (TIDBitmap *) MultiExecProcNode(outerPlanState(node));
83 72 : if (!node->tbm || !IsA(node->tbm, TIDBitmap))
84 0 : elog(ERROR, "unrecognized result from subplan");
85 :
86 : /*
87 : * Prepare to iterate over the TBM. This will return the dsa_pointer
88 : * of the iterator state which will be used by multiple processes to
89 : * iterate jointly.
90 : */
91 72 : pstate->tbmiterator = tbm_prepare_shared_iterate(node->tbm);
92 :
93 : /* We have initialized the shared state so wake up others. */
94 72 : BitmapDoneInitializingSharedState(pstate);
95 : }
96 :
97 25222 : tbmiterator = tbm_begin_iterate(node->tbm, dsa,
98 : pstate ?
99 : pstate->tbmiterator :
100 : InvalidDsaPointer);
101 :
102 : /*
103 : * If this is the first scan of the underlying table, create the table
104 : * scan descriptor and begin the scan.
105 : */
106 25222 : if (!node->ss.ss_currentScanDesc)
107 : {
108 20998 : bool need_tuples = false;
109 :
110 : /*
111 : * We can potentially skip fetching heap pages if we do not need any
112 : * columns of the table, either for checking non-indexable quals or
113 : * for returning data. This test is a bit simplistic, as it checks
114 : * the stronger condition that there's no qual or return tlist at all.
115 : * But in most cases it's probably not worth working harder than that.
116 : */
117 39704 : need_tuples = (node->ss.ps.plan->qual != NIL ||
118 18706 : node->ss.ps.plan->targetlist != NIL);
119 :
120 20998 : node->ss.ss_currentScanDesc =
121 20998 : table_beginscan_bm(node->ss.ss_currentRelation,
122 20998 : node->ss.ps.state->es_snapshot,
123 : 0,
124 : NULL,
125 : need_tuples);
126 : }
127 :
128 25222 : node->ss.ss_currentScanDesc->st.rs_tbmiterator = tbmiterator;
129 25222 : node->initialized = true;
130 25222 : }
131 :
132 : /* ----------------------------------------------------------------
133 : * BitmapHeapNext
134 : *
135 : * Retrieve next tuple from the BitmapHeapScan node's currentRelation
136 : * ----------------------------------------------------------------
137 : */
138 : static TupleTableSlot *
139 5892754 : BitmapHeapNext(BitmapHeapScanState *node)
140 : {
141 5892754 : ExprContext *econtext = node->ss.ps.ps_ExprContext;
142 5892754 : TupleTableSlot *slot = node->ss.ss_ScanTupleSlot;
143 :
144 : /*
145 : * If we haven't yet performed the underlying index scan, do it, and begin
146 : * the iteration over the bitmap.
147 : */
148 5892754 : if (!node->initialized)
149 25222 : BitmapTableScanSetup(node);
150 :
151 6650774 : while (table_scan_bitmap_next_tuple(node->ss.ss_currentScanDesc,
152 : slot, &node->recheck,
153 : &node->stats.lossy_pages,
154 : &node->stats.exact_pages))
155 : {
156 : /*
157 : * Continuing in previously obtained page.
158 : */
159 6625958 : CHECK_FOR_INTERRUPTS();
160 :
161 : /*
162 : * If we are using lossy info, we have to recheck the qual conditions
163 : * at every tuple.
164 : */
165 6625958 : if (node->recheck)
166 : {
167 3295332 : econtext->ecxt_scantuple = slot;
168 3295332 : if (!ExecQualAndReset(node->bitmapqualorig, econtext))
169 : {
170 : /* Fails recheck, so drop it and loop back for another */
171 758020 : InstrCountFiltered2(node, 1);
172 758020 : ExecClearTuple(slot);
173 758020 : continue;
174 : }
175 : }
176 :
177 : /* OK to return this tuple */
178 5867938 : return slot;
179 : }
180 :
181 : /*
182 : * if we get here it means we are at the end of the scan..
183 : */
184 24810 : return ExecClearTuple(slot);
185 : }
186 :
187 : /*
188 : * BitmapDoneInitializingSharedState - Shared state is initialized
189 : *
190 : * By this time the leader has already populated the TBM and initialized the
191 : * shared state so wake up other processes.
192 : */
193 : static inline void
194 72 : BitmapDoneInitializingSharedState(ParallelBitmapHeapState *pstate)
195 : {
196 72 : SpinLockAcquire(&pstate->mutex);
197 72 : pstate->state = BM_FINISHED;
198 72 : SpinLockRelease(&pstate->mutex);
199 72 : ConditionVariableBroadcast(&pstate->cv);
200 72 : }
201 :
202 : /*
203 : * BitmapHeapRecheck -- access method routine to recheck a tuple in EvalPlanQual
204 : */
205 : static bool
206 0 : BitmapHeapRecheck(BitmapHeapScanState *node, TupleTableSlot *slot)
207 : {
208 : ExprContext *econtext;
209 :
210 : /*
211 : * extract necessary information from index scan node
212 : */
213 0 : econtext = node->ss.ps.ps_ExprContext;
214 :
215 : /* Does the tuple meet the original qual conditions? */
216 0 : econtext->ecxt_scantuple = slot;
217 0 : return ExecQualAndReset(node->bitmapqualorig, econtext);
218 : }
219 :
220 : /* ----------------------------------------------------------------
221 : * ExecBitmapHeapScan(node)
222 : * ----------------------------------------------------------------
223 : */
224 : static TupleTableSlot *
225 5598512 : ExecBitmapHeapScan(PlanState *pstate)
226 : {
227 5598512 : BitmapHeapScanState *node = castNode(BitmapHeapScanState, pstate);
228 :
229 5598512 : return ExecScan(&node->ss,
230 : (ExecScanAccessMtd) BitmapHeapNext,
231 : (ExecScanRecheckMtd) BitmapHeapRecheck);
232 : }
233 :
234 : /* ----------------------------------------------------------------
235 : * ExecReScanBitmapHeapScan(node)
236 : * ----------------------------------------------------------------
237 : */
238 : void
239 10578 : ExecReScanBitmapHeapScan(BitmapHeapScanState *node)
240 : {
241 10578 : PlanState *outerPlan = outerPlanState(node);
242 :
243 10578 : TableScanDesc scan = node->ss.ss_currentScanDesc;
244 :
245 10578 : if (scan)
246 : {
247 : /*
248 : * End iteration on iterators saved in scan descriptor if they have
249 : * not already been cleaned up.
250 : */
251 4230 : if (!tbm_exhausted(&scan->st.rs_tbmiterator))
252 4224 : tbm_end_iterate(&scan->st.rs_tbmiterator);
253 :
254 : /* rescan to release any page pin */
255 4230 : table_rescan(node->ss.ss_currentScanDesc, NULL);
256 : }
257 :
258 : /* release bitmaps and buffers if any */
259 10578 : if (node->tbm)
260 4224 : tbm_free(node->tbm);
261 10578 : node->tbm = NULL;
262 10578 : node->initialized = false;
263 10578 : node->recheck = true;
264 :
265 10578 : ExecScanReScan(&node->ss);
266 :
267 : /*
268 : * if chgParam of subnode is not null then plan will be re-scanned by
269 : * first ExecProcNode.
270 : */
271 10578 : if (outerPlan->chgParam == NULL)
272 298 : ExecReScan(outerPlan);
273 10578 : }
274 :
275 : /* ----------------------------------------------------------------
276 : * ExecEndBitmapHeapScan
277 : * ----------------------------------------------------------------
278 : */
279 : void
280 25822 : ExecEndBitmapHeapScan(BitmapHeapScanState *node)
281 : {
282 : TableScanDesc scanDesc;
283 :
284 : /*
285 : * When ending a parallel worker, copy the statistics gathered by the
286 : * worker back into shared memory so that it can be picked up by the main
287 : * process to report in EXPLAIN ANALYZE.
288 : */
289 25822 : if (node->sinstrument != NULL && IsParallelWorker())
290 : {
291 : BitmapHeapScanInstrumentation *si;
292 :
293 : Assert(ParallelWorkerNumber <= node->sinstrument->num_workers);
294 0 : si = &node->sinstrument->sinstrument[ParallelWorkerNumber];
295 :
296 : /*
297 : * Here we accumulate the stats rather than performing memcpy on
298 : * node->stats into si. When a Gather/GatherMerge node finishes it
299 : * will perform planner shutdown on the workers. On rescan it will
300 : * spin up new workers which will have a new BitmapHeapScanState and
301 : * zeroed stats.
302 : */
303 0 : si->exact_pages += node->stats.exact_pages;
304 0 : si->lossy_pages += node->stats.lossy_pages;
305 : }
306 :
307 : /*
308 : * extract information from the node
309 : */
310 25822 : scanDesc = node->ss.ss_currentScanDesc;
311 :
312 : /*
313 : * close down subplans
314 : */
315 25822 : ExecEndNode(outerPlanState(node));
316 :
317 25822 : if (scanDesc)
318 : {
319 : /*
320 : * End iteration on iterators saved in scan descriptor if they have
321 : * not already been cleaned up.
322 : */
323 20896 : if (!tbm_exhausted(&scanDesc->st.rs_tbmiterator))
324 20896 : tbm_end_iterate(&scanDesc->st.rs_tbmiterator);
325 :
326 : /*
327 : * close table scan
328 : */
329 20896 : table_endscan(scanDesc);
330 : }
331 :
332 : /*
333 : * release bitmaps and buffers if any
334 : */
335 25822 : if (node->tbm)
336 20626 : tbm_free(node->tbm);
337 25822 : }
338 :
339 : /* ----------------------------------------------------------------
340 : * ExecInitBitmapHeapScan
341 : *
342 : * Initializes the scan's state information.
343 : * ----------------------------------------------------------------
344 : */
345 : BitmapHeapScanState *
346 25924 : ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags)
347 : {
348 : BitmapHeapScanState *scanstate;
349 : Relation currentRelation;
350 :
351 : /* check for unsupported flags */
352 : Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)));
353 :
354 : /*
355 : * Assert caller didn't ask for an unsafe snapshot --- see comments at
356 : * head of file.
357 : */
358 : Assert(IsMVCCSnapshot(estate->es_snapshot));
359 :
360 : /*
361 : * create state structure
362 : */
363 25924 : scanstate = makeNode(BitmapHeapScanState);
364 25924 : scanstate->ss.ps.plan = (Plan *) node;
365 25924 : scanstate->ss.ps.state = estate;
366 25924 : scanstate->ss.ps.ExecProcNode = ExecBitmapHeapScan;
367 :
368 25924 : scanstate->tbm = NULL;
369 :
370 : /* Zero the statistics counters */
371 25924 : memset(&scanstate->stats, 0, sizeof(BitmapHeapScanInstrumentation));
372 :
373 25924 : scanstate->initialized = false;
374 25924 : scanstate->pstate = NULL;
375 25924 : scanstate->recheck = true;
376 :
377 : /*
378 : * Miscellaneous initialization
379 : *
380 : * create expression context for node
381 : */
382 25924 : ExecAssignExprContext(estate, &scanstate->ss.ps);
383 :
384 : /*
385 : * open the scan relation
386 : */
387 25924 : currentRelation = ExecOpenScanRelation(estate, node->scan.scanrelid, eflags);
388 :
389 : /*
390 : * initialize child nodes
391 : */
392 25924 : outerPlanState(scanstate) = ExecInitNode(outerPlan(node), estate, eflags);
393 :
394 : /*
395 : * get the scan type from the relation descriptor.
396 : */
397 25924 : ExecInitScanTupleSlot(estate, &scanstate->ss,
398 : RelationGetDescr(currentRelation),
399 : table_slot_callbacks(currentRelation));
400 :
401 : /*
402 : * Initialize result type and projection.
403 : */
404 25924 : ExecInitResultTypeTL(&scanstate->ss.ps);
405 25924 : ExecAssignScanProjectionInfo(&scanstate->ss);
406 :
407 : /*
408 : * initialize child expressions
409 : */
410 25924 : scanstate->ss.ps.qual =
411 25924 : ExecInitQual(node->scan.plan.qual, (PlanState *) scanstate);
412 25924 : scanstate->bitmapqualorig =
413 25924 : ExecInitQual(node->bitmapqualorig, (PlanState *) scanstate);
414 :
415 25924 : scanstate->ss.ss_currentRelation = currentRelation;
416 :
417 : /*
418 : * all done.
419 : */
420 25924 : return scanstate;
421 : }
422 :
423 : /*----------------
424 : * BitmapShouldInitializeSharedState
425 : *
426 : * The first process to come here and see the state to the BM_INITIAL
427 : * will become the leader for the parallel bitmap scan and will be
428 : * responsible for populating the TIDBitmap. The other processes will
429 : * be blocked by the condition variable until the leader wakes them up.
430 : * ---------------
431 : */
432 : static bool
433 342 : BitmapShouldInitializeSharedState(ParallelBitmapHeapState *pstate)
434 : {
435 : SharedBitmapState state;
436 :
437 : while (1)
438 : {
439 342 : SpinLockAcquire(&pstate->mutex);
440 342 : state = pstate->state;
441 342 : if (pstate->state == BM_INITIAL)
442 72 : pstate->state = BM_INPROGRESS;
443 342 : SpinLockRelease(&pstate->mutex);
444 :
445 : /* Exit if bitmap is done, or if we're the leader. */
446 342 : if (state != BM_INPROGRESS)
447 342 : break;
448 :
449 : /* Wait for the leader to wake us up. */
450 0 : ConditionVariableSleep(&pstate->cv, WAIT_EVENT_PARALLEL_BITMAP_SCAN);
451 : }
452 :
453 342 : ConditionVariableCancelSleep();
454 :
455 342 : return (state == BM_INITIAL);
456 : }
457 :
458 : /* ----------------------------------------------------------------
459 : * ExecBitmapHeapEstimate
460 : *
461 : * Compute the amount of space we'll need in the parallel
462 : * query DSM, and inform pcxt->estimator about our needs.
463 : * ----------------------------------------------------------------
464 : */
465 : void
466 18 : ExecBitmapHeapEstimate(BitmapHeapScanState *node,
467 : ParallelContext *pcxt)
468 : {
469 : Size size;
470 :
471 18 : size = MAXALIGN(sizeof(ParallelBitmapHeapState));
472 :
473 : /* account for instrumentation, if required */
474 18 : if (node->ss.ps.instrument && pcxt->nworkers > 0)
475 : {
476 0 : size = add_size(size, offsetof(SharedBitmapHeapInstrumentation, sinstrument));
477 0 : size = add_size(size, mul_size(pcxt->nworkers, sizeof(BitmapHeapScanInstrumentation)));
478 : }
479 :
480 18 : shm_toc_estimate_chunk(&pcxt->estimator, size);
481 18 : shm_toc_estimate_keys(&pcxt->estimator, 1);
482 18 : }
483 :
484 : /* ----------------------------------------------------------------
485 : * ExecBitmapHeapInitializeDSM
486 : *
487 : * Set up a parallel bitmap heap scan descriptor.
488 : * ----------------------------------------------------------------
489 : */
490 : void
491 18 : ExecBitmapHeapInitializeDSM(BitmapHeapScanState *node,
492 : ParallelContext *pcxt)
493 : {
494 : ParallelBitmapHeapState *pstate;
495 18 : SharedBitmapHeapInstrumentation *sinstrument = NULL;
496 18 : dsa_area *dsa = node->ss.ps.state->es_query_dsa;
497 : char *ptr;
498 : Size size;
499 :
500 : /* If there's no DSA, there are no workers; initialize nothing. */
501 18 : if (dsa == NULL)
502 0 : return;
503 :
504 18 : size = MAXALIGN(sizeof(ParallelBitmapHeapState));
505 18 : if (node->ss.ps.instrument && pcxt->nworkers > 0)
506 : {
507 0 : size = add_size(size, offsetof(SharedBitmapHeapInstrumentation, sinstrument));
508 0 : size = add_size(size, mul_size(pcxt->nworkers, sizeof(BitmapHeapScanInstrumentation)));
509 : }
510 :
511 18 : ptr = shm_toc_allocate(pcxt->toc, size);
512 18 : pstate = (ParallelBitmapHeapState *) ptr;
513 18 : ptr += MAXALIGN(sizeof(ParallelBitmapHeapState));
514 18 : if (node->ss.ps.instrument && pcxt->nworkers > 0)
515 0 : sinstrument = (SharedBitmapHeapInstrumentation *) ptr;
516 :
517 18 : pstate->tbmiterator = 0;
518 :
519 : /* Initialize the mutex */
520 18 : SpinLockInit(&pstate->mutex);
521 18 : pstate->state = BM_INITIAL;
522 :
523 18 : ConditionVariableInit(&pstate->cv);
524 :
525 18 : if (sinstrument)
526 : {
527 0 : sinstrument->num_workers = pcxt->nworkers;
528 :
529 : /* ensure any unfilled slots will contain zeroes */
530 0 : memset(sinstrument->sinstrument, 0,
531 0 : pcxt->nworkers * sizeof(BitmapHeapScanInstrumentation));
532 : }
533 :
534 18 : shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id, pstate);
535 18 : node->pstate = pstate;
536 18 : node->sinstrument = sinstrument;
537 : }
538 :
539 : /* ----------------------------------------------------------------
540 : * ExecBitmapHeapReInitializeDSM
541 : *
542 : * Reset shared state before beginning a fresh scan.
543 : * ----------------------------------------------------------------
544 : */
545 : void
546 54 : ExecBitmapHeapReInitializeDSM(BitmapHeapScanState *node,
547 : ParallelContext *pcxt)
548 : {
549 54 : ParallelBitmapHeapState *pstate = node->pstate;
550 54 : dsa_area *dsa = node->ss.ps.state->es_query_dsa;
551 :
552 : /* If there's no DSA, there are no workers; do nothing. */
553 54 : if (dsa == NULL)
554 0 : return;
555 :
556 54 : pstate->state = BM_INITIAL;
557 :
558 54 : if (DsaPointerIsValid(pstate->tbmiterator))
559 54 : tbm_free_shared_area(dsa, pstate->tbmiterator);
560 :
561 54 : pstate->tbmiterator = InvalidDsaPointer;
562 : }
563 :
564 : /* ----------------------------------------------------------------
565 : * ExecBitmapHeapInitializeWorker
566 : *
567 : * Copy relevant information from TOC into planstate.
568 : * ----------------------------------------------------------------
569 : */
570 : void
571 270 : ExecBitmapHeapInitializeWorker(BitmapHeapScanState *node,
572 : ParallelWorkerContext *pwcxt)
573 : {
574 : char *ptr;
575 :
576 : Assert(node->ss.ps.state->es_query_dsa != NULL);
577 :
578 270 : ptr = shm_toc_lookup(pwcxt->toc, node->ss.ps.plan->plan_node_id, false);
579 :
580 270 : node->pstate = (ParallelBitmapHeapState *) ptr;
581 270 : ptr += MAXALIGN(sizeof(ParallelBitmapHeapState));
582 :
583 270 : if (node->ss.ps.instrument)
584 0 : node->sinstrument = (SharedBitmapHeapInstrumentation *) ptr;
585 270 : }
586 :
587 : /* ----------------------------------------------------------------
588 : * ExecBitmapHeapRetrieveInstrumentation
589 : *
590 : * Transfer bitmap heap scan statistics from DSM to private memory.
591 : * ----------------------------------------------------------------
592 : */
593 : void
594 0 : ExecBitmapHeapRetrieveInstrumentation(BitmapHeapScanState *node)
595 : {
596 0 : SharedBitmapHeapInstrumentation *sinstrument = node->sinstrument;
597 : Size size;
598 :
599 0 : if (sinstrument == NULL)
600 0 : return;
601 :
602 0 : size = offsetof(SharedBitmapHeapInstrumentation, sinstrument)
603 0 : + sinstrument->num_workers * sizeof(BitmapHeapScanInstrumentation);
604 :
605 0 : node->sinstrument = palloc(size);
606 0 : memcpy(node->sinstrument, sinstrument, size);
607 : }
|