Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * nodeBitmapHeapscan.c
4 : * Routines to support bitmapped scans of relations
5 : *
6 : * NOTE: it is critical that this plan type only be used with MVCC-compliant
7 : * snapshots (ie, regular snapshots, not SnapshotAny or one of the other
8 : * special snapshots). The reason is that since index and heap scans are
9 : * decoupled, there can be no assurance that the index tuple prompting a
10 : * visit to a particular heap TID still exists when the visit is made.
11 : * Therefore the tuple might not exist anymore either (which is OK because
12 : * heap_fetch will cope) --- but worse, the tuple slot could have been
13 : * re-used for a newer tuple. With an MVCC snapshot the newer tuple is
14 : * certain to fail the time qual and so it will not be mistakenly returned,
15 : * but with anything else we might return a tuple that doesn't meet the
16 : * required index qual conditions.
17 : *
18 : *
19 : * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
20 : * Portions Copyright (c) 1994, Regents of the University of California
21 : *
22 : *
23 : * IDENTIFICATION
24 : * src/backend/executor/nodeBitmapHeapscan.c
25 : *
26 : *-------------------------------------------------------------------------
27 : */
28 : /*
29 : * INTERFACE ROUTINES
30 : * ExecBitmapHeapScan scans a relation using bitmap info
31 : * ExecBitmapHeapNext workhorse for above
32 : * ExecInitBitmapHeapScan creates and initializes state info.
33 : * ExecReScanBitmapHeapScan prepares to rescan the plan.
34 : * ExecEndBitmapHeapScan releases all storage.
35 : */
36 : #include "postgres.h"
37 :
38 : #include "access/relscan.h"
39 : #include "access/tableam.h"
40 : #include "access/visibilitymap.h"
41 : #include "executor/executor.h"
42 : #include "executor/instrument.h"
43 : #include "executor/nodeBitmapHeapscan.h"
44 : #include "miscadmin.h"
45 : #include "pgstat.h"
46 : #include "storage/bufmgr.h"
47 : #include "storage/condition_variable.h"
48 : #include "utils/dsa.h"
49 : #include "utils/rel.h"
50 : #include "utils/spccache.h"
51 : #include "utils/wait_event.h"
52 :
53 : static void BitmapTableScanSetup(BitmapHeapScanState *node);
54 : static TupleTableSlot *BitmapHeapNext(BitmapHeapScanState *node);
55 : static inline void BitmapDoneInitializingSharedState(ParallelBitmapHeapState *pstate);
56 : static bool BitmapShouldInitializeSharedState(ParallelBitmapHeapState *pstate);
57 :
58 :
59 : /* ----------------
60 : * SharedBitmapState information
61 : *
62 : * BM_INITIAL TIDBitmap creation is not yet started, so first worker
63 : * to see this state will set the state to BM_INPROGRESS
64 : * and that process will be responsible for creating
65 : * TIDBitmap.
66 : * BM_INPROGRESS TIDBitmap creation is in progress; workers need to
67 : * sleep until it's finished.
68 : * BM_FINISHED TIDBitmap creation is done, so now all workers can
69 : * proceed to iterate over TIDBitmap.
70 : * ----------------
71 : */
72 : typedef enum
73 : {
74 : BM_INITIAL,
75 : BM_INPROGRESS,
76 : BM_FINISHED,
77 : } SharedBitmapState;
78 :
79 : /* ----------------
80 : * ParallelBitmapHeapState information
81 : * tbmiterator iterator for scanning current pages
82 : * mutex mutual exclusion for state
83 : * state current state of the TIDBitmap
84 : * cv conditional wait variable
85 : * ----------------
86 : */
87 : typedef struct ParallelBitmapHeapState
88 : {
89 : dsa_pointer tbmiterator;
90 : slock_t mutex;
91 : SharedBitmapState state;
92 : ConditionVariable cv;
93 : } ParallelBitmapHeapState;
94 :
95 :
96 : /*
97 : * Do the underlying index scan, build the bitmap, set up the parallel state
98 : * needed for parallel workers to iterate through the bitmap, and set up the
99 : * underlying table scan descriptor.
100 : */
101 : static void
102 15650 : BitmapTableScanSetup(BitmapHeapScanState *node)
103 : {
104 15650 : TBMIterator tbmiterator = {0};
105 15650 : ParallelBitmapHeapState *pstate = node->pstate;
106 15650 : dsa_area *dsa = node->ss.ps.state->es_query_dsa;
107 :
108 15650 : if (!pstate)
109 : {
110 15423 : node->tbm = (TIDBitmap *) MultiExecProcNode(outerPlanState(node));
111 :
112 15423 : if (!node->tbm || !IsA(node->tbm, TIDBitmap))
113 0 : elog(ERROR, "unrecognized result from subplan");
114 : }
115 227 : else if (BitmapShouldInitializeSharedState(pstate))
116 : {
117 : /*
118 : * The leader will immediately come out of the function, but others
119 : * will be blocked until leader populates the TBM and wakes them up.
120 : */
121 48 : node->tbm = (TIDBitmap *) MultiExecProcNode(outerPlanState(node));
122 48 : if (!node->tbm || !IsA(node->tbm, TIDBitmap))
123 0 : elog(ERROR, "unrecognized result from subplan");
124 :
125 : /*
126 : * Prepare to iterate over the TBM. This will return the dsa_pointer
127 : * of the iterator state which will be used by multiple processes to
128 : * iterate jointly.
129 : */
130 48 : pstate->tbmiterator = tbm_prepare_shared_iterate(node->tbm);
131 :
132 : /* We have initialized the shared state so wake up others. */
133 48 : BitmapDoneInitializingSharedState(pstate);
134 : }
135 :
136 15650 : tbmiterator = tbm_begin_iterate(node->tbm, dsa,
137 : pstate ?
138 : pstate->tbmiterator :
139 : InvalidDsaPointer);
140 :
141 : /*
142 : * If this is the first scan of the underlying table, create the table
143 : * scan descriptor and begin the scan.
144 : */
145 15650 : if (!node->ss.ss_currentScanDesc)
146 : {
147 13360 : node->ss.ss_currentScanDesc =
148 13360 : table_beginscan_bm(node->ss.ss_currentRelation,
149 13360 : node->ss.ps.state->es_snapshot,
150 : 0,
151 : NULL);
152 : }
153 :
154 15650 : node->ss.ss_currentScanDesc->st.rs_tbmiterator = tbmiterator;
155 15650 : node->initialized = true;
156 15650 : }
157 :
158 : /* ----------------------------------------------------------------
159 : * BitmapHeapNext
160 : *
161 : * Retrieve next tuple from the BitmapHeapScan node's currentRelation
162 : * ----------------------------------------------------------------
163 : */
164 : static TupleTableSlot *
165 3665440 : BitmapHeapNext(BitmapHeapScanState *node)
166 : {
167 3665440 : ExprContext *econtext = node->ss.ps.ps_ExprContext;
168 3665440 : TupleTableSlot *slot = node->ss.ss_ScanTupleSlot;
169 :
170 : /*
171 : * If we haven't yet performed the underlying index scan, do it, and begin
172 : * the iteration over the bitmap.
173 : */
174 3665440 : if (!node->initialized)
175 15650 : BitmapTableScanSetup(node);
176 :
177 4165975 : while (table_scan_bitmap_next_tuple(node->ss.ss_currentScanDesc,
178 : slot, &node->recheck,
179 : &node->stats.lossy_pages,
180 : &node->stats.exact_pages))
181 : {
182 : /*
183 : * Continuing in previously obtained page.
184 : */
185 4150638 : CHECK_FOR_INTERRUPTS();
186 :
187 : /*
188 : * If we are using lossy info, we have to recheck the qual conditions
189 : * at every tuple.
190 : */
191 4150638 : if (node->recheck)
192 : {
193 1889119 : econtext->ecxt_scantuple = slot;
194 1889119 : if (!ExecQualAndReset(node->bitmapqualorig, econtext))
195 : {
196 : /* Fails recheck, so drop it and loop back for another */
197 500535 : InstrCountFiltered2(node, 1);
198 500535 : ExecClearTuple(slot);
199 500535 : continue;
200 : }
201 : }
202 :
203 : /* OK to return this tuple */
204 3650103 : return slot;
205 : }
206 :
207 : /*
208 : * if we get here it means we are at the end of the scan..
209 : */
210 15334 : return ExecClearTuple(slot);
211 : }
212 :
213 : /*
214 : * BitmapDoneInitializingSharedState - Shared state is initialized
215 : *
216 : * By this time the leader has already populated the TBM and initialized the
217 : * shared state so wake up other processes.
218 : */
219 : static inline void
220 48 : BitmapDoneInitializingSharedState(ParallelBitmapHeapState *pstate)
221 : {
222 48 : SpinLockAcquire(&pstate->mutex);
223 48 : pstate->state = BM_FINISHED;
224 48 : SpinLockRelease(&pstate->mutex);
225 48 : ConditionVariableBroadcast(&pstate->cv);
226 48 : }
227 :
228 : /*
229 : * BitmapHeapRecheck -- access method routine to recheck a tuple in EvalPlanQual
230 : */
231 : static bool
232 0 : BitmapHeapRecheck(BitmapHeapScanState *node, TupleTableSlot *slot)
233 : {
234 : ExprContext *econtext;
235 :
236 : /*
237 : * extract necessary information from index scan node
238 : */
239 0 : econtext = node->ss.ps.ps_ExprContext;
240 :
241 : /* Does the tuple meet the original qual conditions? */
242 0 : econtext->ecxt_scantuple = slot;
243 0 : return ExecQualAndReset(node->bitmapqualorig, econtext);
244 : }
245 :
246 : /* ----------------------------------------------------------------
247 : * ExecBitmapHeapScan(node)
248 : * ----------------------------------------------------------------
249 : */
250 : static TupleTableSlot *
251 3437693 : ExecBitmapHeapScan(PlanState *pstate)
252 : {
253 3437693 : BitmapHeapScanState *node = castNode(BitmapHeapScanState, pstate);
254 :
255 3437693 : return ExecScan(&node->ss,
256 : (ExecScanAccessMtd) BitmapHeapNext,
257 : (ExecScanRecheckMtd) BitmapHeapRecheck);
258 : }
259 :
260 : /* ----------------------------------------------------------------
261 : * ExecReScanBitmapHeapScan(node)
262 : * ----------------------------------------------------------------
263 : */
264 : void
265 2921 : ExecReScanBitmapHeapScan(BitmapHeapScanState *node)
266 : {
267 2921 : PlanState *outerPlan = outerPlanState(node);
268 :
269 2921 : TableScanDesc scan = node->ss.ss_currentScanDesc;
270 :
271 2921 : if (scan)
272 : {
273 : /*
274 : * End iteration on iterators saved in scan descriptor if they have
275 : * not already been cleaned up.
276 : */
277 2294 : if (!tbm_exhausted(&scan->st.rs_tbmiterator))
278 2290 : tbm_end_iterate(&scan->st.rs_tbmiterator);
279 :
280 : /* rescan to release any page pin */
281 2294 : table_rescan(node->ss.ss_currentScanDesc, NULL);
282 : }
283 :
284 : /* release bitmaps and buffers if any */
285 2921 : if (node->tbm)
286 2290 : tbm_free(node->tbm);
287 2921 : node->tbm = NULL;
288 2921 : node->initialized = false;
289 2921 : node->recheck = true;
290 :
291 2921 : ExecScanReScan(&node->ss);
292 :
293 : /*
294 : * if chgParam of subnode is not null then plan will be re-scanned by
295 : * first ExecProcNode.
296 : */
297 2921 : if (outerPlan->chgParam == NULL)
298 200 : ExecReScan(outerPlan);
299 2921 : }
300 :
301 : /* ----------------------------------------------------------------
302 : * ExecEndBitmapHeapScan
303 : * ----------------------------------------------------------------
304 : */
305 : void
306 16304 : ExecEndBitmapHeapScan(BitmapHeapScanState *node)
307 : {
308 : TableScanDesc scanDesc;
309 :
310 : /*
311 : * When ending a parallel worker, copy the statistics gathered by the
312 : * worker back into shared memory so that it can be picked up by the main
313 : * process to report in EXPLAIN ANALYZE.
314 : */
315 16304 : if (node->sinstrument != NULL && IsParallelWorker())
316 : {
317 : BitmapHeapScanInstrumentation *si;
318 :
319 : Assert(ParallelWorkerNumber < node->sinstrument->num_workers);
320 0 : si = &node->sinstrument->sinstrument[ParallelWorkerNumber];
321 :
322 : /*
323 : * Here we accumulate the stats rather than performing memcpy on
324 : * node->stats into si. When a Gather/GatherMerge node finishes it
325 : * will perform planner shutdown on the workers. On rescan it will
326 : * spin up new workers which will have a new BitmapHeapScanState and
327 : * zeroed stats.
328 : */
329 0 : si->exact_pages += node->stats.exact_pages;
330 0 : si->lossy_pages += node->stats.lossy_pages;
331 : }
332 :
333 : /*
334 : * extract information from the node
335 : */
336 16304 : scanDesc = node->ss.ss_currentScanDesc;
337 :
338 : /*
339 : * close down subplans
340 : */
341 16304 : ExecEndNode(outerPlanState(node));
342 :
343 16304 : if (scanDesc)
344 : {
345 : /*
346 : * End iteration on iterators saved in scan descriptor if they have
347 : * not already been cleaned up.
348 : */
349 13279 : if (!tbm_exhausted(&scanDesc->st.rs_tbmiterator))
350 13279 : tbm_end_iterate(&scanDesc->st.rs_tbmiterator);
351 :
352 : /*
353 : * close table scan
354 : */
355 13279 : table_endscan(scanDesc);
356 : }
357 :
358 : /*
359 : * release bitmaps and buffers if any
360 : */
361 16304 : if (node->tbm)
362 13100 : tbm_free(node->tbm);
363 16304 : }
364 :
365 : /* ----------------------------------------------------------------
366 : * ExecInitBitmapHeapScan
367 : *
368 : * Initializes the scan's state information.
369 : * ----------------------------------------------------------------
370 : */
371 : BitmapHeapScanState *
372 16385 : ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags)
373 : {
374 : BitmapHeapScanState *scanstate;
375 : Relation currentRelation;
376 :
377 : /* check for unsupported flags */
378 : Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)));
379 :
380 : /*
381 : * Assert caller didn't ask for an unsafe snapshot --- see comments at
382 : * head of file.
383 : */
384 : Assert(IsMVCCSnapshot(estate->es_snapshot));
385 :
386 : /*
387 : * create state structure
388 : */
389 16385 : scanstate = makeNode(BitmapHeapScanState);
390 16385 : scanstate->ss.ps.plan = (Plan *) node;
391 16385 : scanstate->ss.ps.state = estate;
392 16385 : scanstate->ss.ps.ExecProcNode = ExecBitmapHeapScan;
393 :
394 16385 : scanstate->tbm = NULL;
395 :
396 : /* Zero the statistics counters */
397 16385 : memset(&scanstate->stats, 0, sizeof(BitmapHeapScanInstrumentation));
398 :
399 16385 : scanstate->initialized = false;
400 16385 : scanstate->pstate = NULL;
401 16385 : scanstate->recheck = true;
402 :
403 : /*
404 : * Miscellaneous initialization
405 : *
406 : * create expression context for node
407 : */
408 16385 : ExecAssignExprContext(estate, &scanstate->ss.ps);
409 :
410 : /*
411 : * open the scan relation
412 : */
413 16385 : currentRelation = ExecOpenScanRelation(estate, node->scan.scanrelid, eflags);
414 :
415 : /*
416 : * initialize child nodes
417 : */
418 16385 : outerPlanState(scanstate) = ExecInitNode(outerPlan(node), estate, eflags);
419 :
420 : /*
421 : * get the scan type from the relation descriptor.
422 : */
423 16385 : ExecInitScanTupleSlot(estate, &scanstate->ss,
424 : RelationGetDescr(currentRelation),
425 : table_slot_callbacks(currentRelation),
426 : TTS_FLAG_OBEYS_NOT_NULL_CONSTRAINTS);
427 :
428 : /*
429 : * Initialize result type and projection.
430 : */
431 16385 : ExecInitResultTypeTL(&scanstate->ss.ps);
432 16385 : ExecAssignScanProjectionInfo(&scanstate->ss);
433 :
434 : /*
435 : * initialize child expressions
436 : */
437 16385 : scanstate->ss.ps.qual =
438 16385 : ExecInitQual(node->scan.plan.qual, (PlanState *) scanstate);
439 16385 : scanstate->bitmapqualorig =
440 16385 : ExecInitQual(node->bitmapqualorig, (PlanState *) scanstate);
441 :
442 16385 : scanstate->ss.ss_currentRelation = currentRelation;
443 :
444 : /*
445 : * all done.
446 : */
447 16385 : return scanstate;
448 : }
449 :
450 : /*----------------
451 : * BitmapShouldInitializeSharedState
452 : *
453 : * The first process to come here and see the state to the BM_INITIAL
454 : * will become the leader for the parallel bitmap scan and will be
455 : * responsible for populating the TIDBitmap. The other processes will
456 : * be blocked by the condition variable until the leader wakes them up.
457 : * ---------------
458 : */
459 : static bool
460 227 : BitmapShouldInitializeSharedState(ParallelBitmapHeapState *pstate)
461 : {
462 : SharedBitmapState state;
463 :
464 : while (1)
465 : {
466 235 : SpinLockAcquire(&pstate->mutex);
467 235 : state = pstate->state;
468 235 : if (pstate->state == BM_INITIAL)
469 48 : pstate->state = BM_INPROGRESS;
470 235 : SpinLockRelease(&pstate->mutex);
471 :
472 : /* Exit if bitmap is done, or if we're the leader. */
473 235 : if (state != BM_INPROGRESS)
474 227 : break;
475 :
476 : /* Wait for the leader to wake us up. */
477 8 : ConditionVariableSleep(&pstate->cv, WAIT_EVENT_PARALLEL_BITMAP_SCAN);
478 : }
479 :
480 227 : ConditionVariableCancelSleep();
481 :
482 227 : return (state == BM_INITIAL);
483 : }
484 :
485 : /* ----------------------------------------------------------------
486 : * ExecBitmapHeapEstimate
487 : *
488 : * Compute the amount of space we'll need in the parallel
489 : * query DSM, and inform pcxt->estimator about our needs.
490 : * ----------------------------------------------------------------
491 : */
492 : void
493 12 : ExecBitmapHeapEstimate(BitmapHeapScanState *node,
494 : ParallelContext *pcxt)
495 : {
496 : Size size;
497 :
498 12 : size = MAXALIGN(sizeof(ParallelBitmapHeapState));
499 :
500 : /* account for instrumentation, if required */
501 12 : if (node->ss.ps.instrument && pcxt->nworkers > 0)
502 : {
503 0 : size = add_size(size, offsetof(SharedBitmapHeapInstrumentation, sinstrument));
504 0 : size = add_size(size, mul_size(pcxt->nworkers, sizeof(BitmapHeapScanInstrumentation)));
505 : }
506 :
507 12 : shm_toc_estimate_chunk(&pcxt->estimator, size);
508 12 : shm_toc_estimate_keys(&pcxt->estimator, 1);
509 12 : }
510 :
511 : /* ----------------------------------------------------------------
512 : * ExecBitmapHeapInitializeDSM
513 : *
514 : * Set up a parallel bitmap heap scan descriptor.
515 : * ----------------------------------------------------------------
516 : */
517 : void
518 12 : ExecBitmapHeapInitializeDSM(BitmapHeapScanState *node,
519 : ParallelContext *pcxt)
520 : {
521 : ParallelBitmapHeapState *pstate;
522 12 : SharedBitmapHeapInstrumentation *sinstrument = NULL;
523 12 : dsa_area *dsa = node->ss.ps.state->es_query_dsa;
524 : char *ptr;
525 : Size size;
526 :
527 : /* If there's no DSA, there are no workers; initialize nothing. */
528 12 : if (dsa == NULL)
529 0 : return;
530 :
531 12 : size = MAXALIGN(sizeof(ParallelBitmapHeapState));
532 12 : if (node->ss.ps.instrument && pcxt->nworkers > 0)
533 : {
534 0 : size = add_size(size, offsetof(SharedBitmapHeapInstrumentation, sinstrument));
535 0 : size = add_size(size, mul_size(pcxt->nworkers, sizeof(BitmapHeapScanInstrumentation)));
536 : }
537 :
538 12 : ptr = shm_toc_allocate(pcxt->toc, size);
539 12 : pstate = (ParallelBitmapHeapState *) ptr;
540 12 : ptr += MAXALIGN(sizeof(ParallelBitmapHeapState));
541 12 : if (node->ss.ps.instrument && pcxt->nworkers > 0)
542 0 : sinstrument = (SharedBitmapHeapInstrumentation *) ptr;
543 :
544 12 : pstate->tbmiterator = 0;
545 :
546 : /* Initialize the mutex */
547 12 : SpinLockInit(&pstate->mutex);
548 12 : pstate->state = BM_INITIAL;
549 :
550 12 : ConditionVariableInit(&pstate->cv);
551 :
552 12 : if (sinstrument)
553 : {
554 0 : sinstrument->num_workers = pcxt->nworkers;
555 :
556 : /* ensure any unfilled slots will contain zeroes */
557 0 : memset(sinstrument->sinstrument, 0,
558 0 : pcxt->nworkers * sizeof(BitmapHeapScanInstrumentation));
559 : }
560 :
561 12 : shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id, pstate);
562 12 : node->pstate = pstate;
563 12 : node->sinstrument = sinstrument;
564 : }
565 :
566 : /* ----------------------------------------------------------------
567 : * ExecBitmapHeapReInitializeDSM
568 : *
569 : * Reset shared state before beginning a fresh scan.
570 : * ----------------------------------------------------------------
571 : */
572 : void
573 36 : ExecBitmapHeapReInitializeDSM(BitmapHeapScanState *node,
574 : ParallelContext *pcxt)
575 : {
576 36 : ParallelBitmapHeapState *pstate = node->pstate;
577 36 : dsa_area *dsa = node->ss.ps.state->es_query_dsa;
578 :
579 : /* If there's no DSA, there are no workers; do nothing. */
580 36 : if (dsa == NULL)
581 0 : return;
582 :
583 36 : pstate->state = BM_INITIAL;
584 :
585 36 : if (DsaPointerIsValid(pstate->tbmiterator))
586 36 : tbm_free_shared_area(dsa, pstate->tbmiterator);
587 :
588 36 : pstate->tbmiterator = InvalidDsaPointer;
589 : }
590 :
591 : /* ----------------------------------------------------------------
592 : * ExecBitmapHeapInitializeWorker
593 : *
594 : * Copy relevant information from TOC into planstate.
595 : * ----------------------------------------------------------------
596 : */
597 : void
598 179 : ExecBitmapHeapInitializeWorker(BitmapHeapScanState *node,
599 : ParallelWorkerContext *pwcxt)
600 : {
601 : char *ptr;
602 :
603 : Assert(node->ss.ps.state->es_query_dsa != NULL);
604 :
605 179 : ptr = shm_toc_lookup(pwcxt->toc, node->ss.ps.plan->plan_node_id, false);
606 :
607 179 : node->pstate = (ParallelBitmapHeapState *) ptr;
608 179 : ptr += MAXALIGN(sizeof(ParallelBitmapHeapState));
609 :
610 179 : if (node->ss.ps.instrument)
611 0 : node->sinstrument = (SharedBitmapHeapInstrumentation *) ptr;
612 179 : }
613 :
614 : /* ----------------------------------------------------------------
615 : * ExecBitmapHeapRetrieveInstrumentation
616 : *
617 : * Transfer bitmap heap scan statistics from DSM to private memory.
618 : * ----------------------------------------------------------------
619 : */
620 : void
621 0 : ExecBitmapHeapRetrieveInstrumentation(BitmapHeapScanState *node)
622 : {
623 0 : SharedBitmapHeapInstrumentation *sinstrument = node->sinstrument;
624 : Size size;
625 :
626 0 : if (sinstrument == NULL)
627 0 : return;
628 :
629 0 : size = offsetof(SharedBitmapHeapInstrumentation, sinstrument)
630 0 : + sinstrument->num_workers * sizeof(BitmapHeapScanInstrumentation);
631 :
632 0 : node->sinstrument = palloc(size);
633 0 : memcpy(node->sinstrument, sinstrument, size);
634 : }
|