Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * nbtree.c
4 : * Implementation of Lehman and Yao's btree management algorithm for
5 : * Postgres.
6 : *
7 : * NOTES
8 : * This file contains only the public interface routines.
9 : *
10 : *
11 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
12 : * Portions Copyright (c) 1994, Regents of the University of California
13 : *
14 : * IDENTIFICATION
15 : * src/backend/access/nbtree/nbtree.c
16 : *
17 : *-------------------------------------------------------------------------
18 : */
19 : #include "postgres.h"
20 :
21 : #include "access/nbtree.h"
22 : #include "access/relscan.h"
23 : #include "commands/progress.h"
24 : #include "commands/vacuum.h"
25 : #include "nodes/execnodes.h"
26 : #include "pgstat.h"
27 : #include "storage/bulk_write.h"
28 : #include "storage/condition_variable.h"
29 : #include "storage/indexfsm.h"
30 : #include "storage/ipc.h"
31 : #include "storage/lmgr.h"
32 : #include "utils/fmgrprotos.h"
33 : #include "utils/index_selfuncs.h"
34 : #include "utils/memutils.h"
35 :
36 :
37 : /*
38 : * BTPARALLEL_NOT_INITIALIZED indicates that the scan has not started.
39 : *
40 : * BTPARALLEL_NEED_PRIMSCAN indicates that some process must now seize the
41 : * scan to advance it via another call to _bt_first.
42 : *
43 : * BTPARALLEL_ADVANCING indicates that some process is advancing the scan to
44 : * a new page; others must wait.
45 : *
46 : * BTPARALLEL_IDLE indicates that no backend is currently advancing the scan
47 : * to a new page; some process can start doing that.
48 : *
49 : * BTPARALLEL_DONE indicates that the scan is complete (including error exit).
50 : */
51 : typedef enum
52 : {
53 : BTPARALLEL_NOT_INITIALIZED,
54 : BTPARALLEL_NEED_PRIMSCAN,
55 : BTPARALLEL_ADVANCING,
56 : BTPARALLEL_IDLE,
57 : BTPARALLEL_DONE,
58 : } BTPS_State;
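/*
 * Illustrative summary (a rough sketch, not an exhaustive specification) of
 * how the functions further down in this file move between these states:
 *
 *   NOT_INITIALIZED / IDLE / NEED_PRIMSCAN --_bt_parallel_seize()--> ADVANCING
 *   ADVANCING --_bt_parallel_release()--> IDLE
 *   IDLE --_bt_parallel_primscan_schedule()--> NEED_PRIMSCAN
 *   any state other than NEED_PRIMSCAN --_bt_parallel_done()--> DONE
 *
 * DONE stays in effect until btparallelrescan() resets the shared state back
 * to NOT_INITIALIZED.
 */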
59 :
60 : /*
61 : * BTParallelScanDescData contains btree specific shared information required
62 : * for parallel scan.
63 : */
64 : typedef struct BTParallelScanDescData
65 : {
66 : BlockNumber btps_nextScanPage; /* next page to be scanned */
67 : BlockNumber btps_lastCurrPage; /* page whose sibling link was copied into
68 : * btps_nextScanPage */
69 : BTPS_State btps_pageStatus; /* indicates whether next page is
70 : * available for scan. see above for
71 : * possible states of parallel scan. */
72 : slock_t btps_mutex; /* protects above variables, btps_arrElems */
73 : ConditionVariable btps_cv; /* used to synchronize parallel scan */
74 :
75 : /*
76 : * btps_arrElems is used when scans need to schedule another primitive
77 : * index scan. Holds BTArrayKeyInfo.cur_elem offsets for scan keys.
78 : */
79 : int btps_arrElems[FLEXIBLE_ARRAY_MEMBER];
80 : } BTParallelScanDescData;
81 :
82 : typedef struct BTParallelScanDescData *BTParallelScanDesc;
83 :
84 :
85 : static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
86 : IndexBulkDeleteCallback callback, void *callback_state,
87 : BTCycleId cycleid);
88 : static void btvacuumpage(BTVacState *vstate, BlockNumber scanblkno);
89 : static BTVacuumPosting btreevacuumposting(BTVacState *vstate,
90 : IndexTuple posting,
91 : OffsetNumber updatedoffset,
92 : int *nremaining);
93 :
94 :
95 : /*
96 : * Btree handler function: return IndexAmRoutine with access method parameters
97 : * and callbacks.
98 : */
99 : Datum
100 2785704 : bthandler(PG_FUNCTION_ARGS)
101 : {
102 2785704 : IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);
103 :
104 2785704 : amroutine->amstrategies = BTMaxStrategyNumber;
105 2785704 : amroutine->amsupport = BTNProcs;
106 2785704 : amroutine->amoptsprocnum = BTOPTIONS_PROC;
107 2785704 : amroutine->amcanorder = true;
108 2785704 : amroutine->amcanorderbyop = false;
109 2785704 : amroutine->amcanbackward = true;
110 2785704 : amroutine->amcanunique = true;
111 2785704 : amroutine->amcanmulticol = true;
112 2785704 : amroutine->amoptionalkey = true;
113 2785704 : amroutine->amsearcharray = true;
114 2785704 : amroutine->amsearchnulls = true;
115 2785704 : amroutine->amstorage = false;
116 2785704 : amroutine->amclusterable = true;
117 2785704 : amroutine->ampredlocks = true;
118 2785704 : amroutine->amcanparallel = true;
119 2785704 : amroutine->amcanbuildparallel = true;
120 2785704 : amroutine->amcaninclude = true;
121 2785704 : amroutine->amusemaintenanceworkmem = false;
122 2785704 : amroutine->amsummarizing = false;
123 2785704 : amroutine->amparallelvacuumoptions =
124 : VACUUM_OPTION_PARALLEL_BULKDEL | VACUUM_OPTION_PARALLEL_COND_CLEANUP;
125 2785704 : amroutine->amkeytype = InvalidOid;
126 :
127 2785704 : amroutine->ambuild = btbuild;
128 2785704 : amroutine->ambuildempty = btbuildempty;
129 2785704 : amroutine->aminsert = btinsert;
130 2785704 : amroutine->aminsertcleanup = NULL;
131 2785704 : amroutine->ambulkdelete = btbulkdelete;
132 2785704 : amroutine->amvacuumcleanup = btvacuumcleanup;
133 2785704 : amroutine->amcanreturn = btcanreturn;
134 2785704 : amroutine->amcostestimate = btcostestimate;
135 2785704 : amroutine->amgettreeheight = btgettreeheight;
136 2785704 : amroutine->amoptions = btoptions;
137 2785704 : amroutine->amproperty = btproperty;
138 2785704 : amroutine->ambuildphasename = btbuildphasename;
139 2785704 : amroutine->amvalidate = btvalidate;
140 2785704 : amroutine->amadjustmembers = btadjustmembers;
141 2785704 : amroutine->ambeginscan = btbeginscan;
142 2785704 : amroutine->amrescan = btrescan;
143 2785704 : amroutine->amgettuple = btgettuple;
144 2785704 : amroutine->amgetbitmap = btgetbitmap;
145 2785704 : amroutine->amendscan = btendscan;
146 2785704 : amroutine->ammarkpos = btmarkpos;
147 2785704 : amroutine->amrestrpos = btrestrpos;
148 2785704 : amroutine->amestimateparallelscan = btestimateparallelscan;
149 2785704 : amroutine->aminitparallelscan = btinitparallelscan;
150 2785704 : amroutine->amparallelrescan = btparallelrescan;
151 :
152 2785704 : PG_RETURN_POINTER(amroutine);
153 : }
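/*
 * Illustrative note: these callbacks are normally reached by resolving
 * pg_am.amhandler and calling the handler through the fmgr, typically via
 * GetIndexAmRoutine() in access/index/amapi.c, and then dispatching through
 * the returned struct.  Roughly (variable names here are placeholders, and
 * the real call sites differ in detail):
 *
 *     IndexAmRoutine *amroutine = GetIndexAmRoutine(amhandler_oid);
 *
 *     IndexBuildResult *result = amroutine->ambuild(heapRel, indexRel,
 *                                                   indexInfo);
 */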
154 :
155 : /*
156 : * btbuildempty() -- build an empty btree index in the initialization fork
157 : */
158 : void
159 148 : btbuildempty(Relation index)
160 : {
161 148 : bool allequalimage = _bt_allequalimage(index, false);
162 : BulkWriteState *bulkstate;
163 : BulkWriteBuffer metabuf;
164 :
165 148 : bulkstate = smgr_bulk_start_rel(index, INIT_FORKNUM);
166 :
167 : /* Construct metapage. */
168 148 : metabuf = smgr_bulk_get_buf(bulkstate);
169 148 : _bt_initmetapage((Page) metabuf, P_NONE, 0, allequalimage);
170 148 : smgr_bulk_write(bulkstate, BTREE_METAPAGE, metabuf, true);
171 :
172 148 : smgr_bulk_finish(bulkstate);
173 148 : }
174 :
175 : /*
176 : * btinsert() -- insert an index tuple into a btree.
177 : *
178 : * Descend the tree recursively, find the appropriate location for our
179 : * new tuple, and put it there.
180 : */
181 : bool
182 6759632 : btinsert(Relation rel, Datum *values, bool *isnull,
183 : ItemPointer ht_ctid, Relation heapRel,
184 : IndexUniqueCheck checkUnique,
185 : bool indexUnchanged,
186 : IndexInfo *indexInfo)
187 : {
188 : bool result;
189 : IndexTuple itup;
190 :
191 : /* generate an index tuple */
192 6759632 : itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
193 6759632 : itup->t_tid = *ht_ctid;
194 :
195 6759632 : result = _bt_doinsert(rel, itup, checkUnique, indexUnchanged, heapRel);
196 :
197 6759130 : pfree(itup);
198 :
199 6759130 : return result;
200 : }
201 :
202 : /*
203 : * btgettuple() -- Get the next tuple in the scan.
204 : */
205 : bool
206 29977502 : btgettuple(IndexScanDesc scan, ScanDirection dir)
207 : {
208 29977502 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
209 : bool res;
210 :
211 : /* btree indexes are never lossy */
212 29977502 : scan->xs_recheck = false;
213 :
214 : /* Each loop iteration performs another primitive index scan */
215 : do
216 : {
217 : /*
218 : * If we've already initialized this scan, we can just advance it in
219 : * the appropriate direction. If we haven't done so yet, we call
220 : * _bt_first() to get the first item in the scan.
221 : */
222 29977666 : if (!BTScanPosIsValid(so->currPos))
223 13206002 : res = _bt_first(scan, dir);
224 : else
225 : {
226 : /*
227 : * Check to see if we should kill the previously-fetched tuple.
228 : */
229 16771664 : if (scan->kill_prior_tuple)
230 : {
231 : /*
232 : * Yes, remember it for later. (We'll deal with all such
233 : * tuples at once right before leaving the index page.) The
234 : * test for numKilled overrun is not just paranoia: if the
235 : * caller reverses direction in the indexscan then the same
236 : * item might get entered multiple times. It's not worth
237 : * trying to optimize that, so we don't detect it, but instead
238 : * just forget any excess entries.
239 : */
240 446464 : if (so->killedItems == NULL)
241 157582 : so->killedItems = (int *)
242 157582 : palloc(MaxTIDsPerBTreePage * sizeof(int));
243 446464 : if (so->numKilled < MaxTIDsPerBTreePage)
244 446464 : so->killedItems[so->numKilled++] = so->currPos.itemIndex;
245 : }
246 :
247 : /*
248 : * Now continue the scan.
249 : */
250 16771664 : res = _bt_next(scan, dir);
251 : }
252 :
253 : /* If we have a tuple, return it ... */
254 29977666 : if (res)
255 24006836 : break;
256 : /* ... otherwise see if we need another primitive index scan */
257 5970830 : } while (so->numArrayKeys && _bt_start_prim_scan(scan, dir));
258 :
259 29977502 : return res;
260 : }
261 :
262 : /*
263 : * btgetbitmap() -- gets all matching tuples, and adds them to a bitmap
264 : */
265 : int64
266 12096 : btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
267 : {
268 12096 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
269 12096 : int64 ntids = 0;
270 : ItemPointer heapTid;
271 :
272 : /* Each loop iteration performs another primitive index scan */
273 : do
274 : {
275 : /* Fetch the first page & tuple */
276 12578 : if (_bt_first(scan, ForwardScanDirection))
277 : {
278 : /* Save tuple ID, and continue scanning */
279 8814 : heapTid = &scan->xs_heaptid;
280 8814 : tbm_add_tuples(tbm, heapTid, 1, false);
281 8814 : ntids++;
282 :
283 : for (;;)
284 : {
285 : /*
286 : * Advance to next tuple within page. This is the same as the
287 : * easy case in _bt_next().
288 : */
289 1870688 : if (++so->currPos.itemIndex > so->currPos.lastItem)
290 : {
291 : /* let _bt_next do the heavy lifting */
292 13688 : if (!_bt_next(scan, ForwardScanDirection))
293 8814 : break;
294 : }
295 :
296 : /* Save tuple ID, and continue scanning */
297 1861874 : heapTid = &so->currPos.items[so->currPos.itemIndex].heapTid;
298 1861874 : tbm_add_tuples(tbm, heapTid, 1, false);
299 1861874 : ntids++;
300 : }
301 : }
302 : /* Now see if we need another primitive index scan */
303 12578 : } while (so->numArrayKeys && _bt_start_prim_scan(scan, ForwardScanDirection));
304 :
305 12096 : return ntids;
306 : }
307 :
308 : /*
309 : * btbeginscan() -- start a scan on a btree index
310 : */
311 : IndexScanDesc
312 12767672 : btbeginscan(Relation rel, int nkeys, int norderbys)
313 : {
314 : IndexScanDesc scan;
315 : BTScanOpaque so;
316 :
317 : /* no order by operators allowed */
318 : Assert(norderbys == 0);
319 :
320 : /* get the scan */
321 12767672 : scan = RelationGetIndexScan(rel, nkeys, norderbys);
322 :
323 : /* allocate private workspace */
324 12767672 : so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData));
325 12767672 : BTScanPosInvalidate(so->currPos);
326 12767672 : BTScanPosInvalidate(so->markPos);
327 12767672 : if (scan->numberOfKeys > 0)
328 12755412 : so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
329 : else
330 12260 : so->keyData = NULL;
331 :
332 12767672 : so->needPrimScan = false;
333 12767672 : so->scanBehind = false;
334 12767672 : so->oppositeDirCheck = false;
335 12767672 : so->arrayKeys = NULL;
336 12767672 : so->orderProcs = NULL;
337 12767672 : so->arrayContext = NULL;
338 :
339 12767672 : so->killedItems = NULL; /* until needed */
340 12767672 : so->numKilled = 0;
341 :
342 : /*
343 : * We don't know yet whether the scan will be index-only, so we do not
344 : * allocate the tuple workspace arrays until btrescan. However, we set up
345 : * scan->xs_itupdesc whether we'll need it or not, since that's so cheap.
346 : */
347 12767672 : so->currTuples = so->markTuples = NULL;
348 :
349 12767672 : scan->xs_itupdesc = RelationGetDescr(rel);
350 :
351 12767672 : scan->opaque = so;
352 :
353 12767672 : return scan;
354 : }
355 :
356 : /*
357 : * btrescan() -- rescan an index relation
358 : */
359 : void
360 13219030 : btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
361 : ScanKey orderbys, int norderbys)
362 : {
363 13219030 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
364 :
365 : /* we aren't holding any read locks, but gotta drop the pins */
366 13219030 : if (BTScanPosIsValid(so->currPos))
367 : {
368 : /* Before leaving current page, deal with any killed items */
369 68140 : if (so->numKilled > 0)
370 608 : _bt_killitems(scan);
371 68140 : BTScanPosUnpinIfPinned(so->currPos);
372 68140 : BTScanPosInvalidate(so->currPos);
373 : }
374 :
375 13219030 : so->markItemIndex = -1;
376 13219030 : so->needPrimScan = false;
377 13219030 : so->scanBehind = false;
378 13219030 : so->oppositeDirCheck = false;
379 13219030 : BTScanPosUnpinIfPinned(so->markPos);
380 13219030 : BTScanPosInvalidate(so->markPos);
381 :
382 : /*
383 : * Allocate tuple workspace arrays, if needed for an index-only scan and
384 : * not already done in a previous rescan call. To save on palloc
385 : * overhead, both workspaces are allocated as one palloc block; only this
386 : * function and btendscan know that.
387 : *
388 : * NOTE: this data structure also makes it safe to return data from a
389 : * "name" column, even though btree name_ops uses an underlying storage
390 : * datatype of cstring. The risk there is that "name" is supposed to be
391 : * padded to NAMEDATALEN, but the actual index tuple is probably shorter.
392 : * However, since we only return data out of tuples sitting in the
393 : * currTuples array, a fetch of NAMEDATALEN bytes can at worst pull some
394 : * data out of the markTuples array --- running off the end of memory for
395 : * a SIGSEGV is not possible. Yeah, this is ugly as sin, but it beats
396 : * adding special-case treatment for name_ops elsewhere.
397 : */
398 13219030 : if (scan->xs_want_itup && so->currTuples == NULL)
399 : {
400 115098 : so->currTuples = (char *) palloc(BLCKSZ * 2);
401 115098 : so->markTuples = so->currTuples + BLCKSZ;
402 : }
403 :
404 : /*
405 : * Reset the scan keys
406 : */
407 13219030 : if (scankey && scan->numberOfKeys > 0)
408 13206644 : memcpy(scan->keyData, scankey, scan->numberOfKeys * sizeof(ScanKeyData));
409 13219030 : so->numberOfKeys = 0; /* until _bt_preprocess_keys sets it */
410 13219030 : so->numArrayKeys = 0; /* ditto */
411 13219030 : }
412 :
413 : /*
414 : * btendscan() -- close down a scan
415 : */
416 : void
417 12766226 : btendscan(IndexScanDesc scan)
418 : {
419 12766226 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
420 :
421 : /* we aren't holding any read locks, but gotta drop the pins */
422 12766226 : if (BTScanPosIsValid(so->currPos))
423 : {
424 : /* Before leaving current page, deal with any killed items */
425 7166016 : if (so->numKilled > 0)
426 81520 : _bt_killitems(scan);
427 7166016 : BTScanPosUnpinIfPinned(so->currPos);
428 : }
429 :
430 12766226 : so->markItemIndex = -1;
431 12766226 : BTScanPosUnpinIfPinned(so->markPos);
432 :
433 : /* No need to invalidate positions, the RAM is about to be freed. */
434 :
435 : /* Release storage */
436 12766226 : if (so->keyData != NULL)
437 12753996 : pfree(so->keyData);
438 : /* so->arrayKeys and so->orderProcs are in arrayContext */
439 12766226 : if (so->arrayContext != NULL)
440 1110 : MemoryContextDelete(so->arrayContext);
441 12766226 : if (so->killedItems != NULL)
442 157522 : pfree(so->killedItems);
443 12766226 : if (so->currTuples != NULL)
444 115054 : pfree(so->currTuples);
445 : /* so->markTuples should not be pfree'd, see btrescan */
446 12766226 : pfree(so);
447 12766226 : }
448 :
449 : /*
450 : * btmarkpos() -- save current scan position
451 : */
452 : void
453 130042 : btmarkpos(IndexScanDesc scan)
454 : {
455 130042 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
456 :
457 : /* There may be an old mark with a pin (but no lock). */
458 130042 : BTScanPosUnpinIfPinned(so->markPos);
459 :
460 : /*
461 : * Just record the current itemIndex. If we later step to next page
462 : * before releasing the marked position, _bt_steppage makes a full copy of
463 : * the currPos struct in markPos. If (as often happens) the mark is moved
464 : * before we leave the page, we don't have to do that work.
465 : */
466 130042 : if (BTScanPosIsValid(so->currPos))
467 130042 : so->markItemIndex = so->currPos.itemIndex;
468 : else
469 : {
470 0 : BTScanPosInvalidate(so->markPos);
471 0 : so->markItemIndex = -1;
472 : }
473 130042 : }
474 :
475 : /*
476 : * btrestrpos() -- restore scan to last saved position
477 : */
478 : void
479 54018 : btrestrpos(IndexScanDesc scan)
480 : {
481 54018 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
482 :
483 54018 : if (so->markItemIndex >= 0)
484 : {
485 : /*
486 : * The scan has never moved to a new page since the last mark. Just
487 : * restore the itemIndex.
488 : *
489 : * NB: In this case we can't count on anything in so->markPos to be
490 : * accurate.
491 : */
492 53910 : so->currPos.itemIndex = so->markItemIndex;
493 : }
494 : else
495 : {
496 : /*
497 : * The scan moved to a new page after last mark or restore, and we are
498 : * now restoring to the marked page. We aren't holding any read
499 : * locks, but if we're still holding the pin for the current position,
500 : * we must drop it.
501 : */
502 108 : if (BTScanPosIsValid(so->currPos))
503 : {
504 : /* Before leaving current page, deal with any killed items */
505 108 : if (so->numKilled > 0)
506 0 : _bt_killitems(scan);
507 108 : BTScanPosUnpinIfPinned(so->currPos);
508 : }
509 :
510 108 : if (BTScanPosIsValid(so->markPos))
511 : {
512 : /* bump pin on mark buffer for assignment to current buffer */
513 108 : if (BTScanPosIsPinned(so->markPos))
514 0 : IncrBufferRefCount(so->markPos.buf);
515 108 : memcpy(&so->currPos, &so->markPos,
516 : offsetof(BTScanPosData, items[1]) +
517 108 : so->markPos.lastItem * sizeof(BTScanPosItem));
518 108 : if (so->currTuples)
519 0 : memcpy(so->currTuples, so->markTuples,
520 0 : so->markPos.nextTupleOffset);
521 : /* Reset the scan's array keys (see _bt_steppage for why) */
522 108 : if (so->numArrayKeys)
523 : {
524 0 : _bt_start_array_keys(scan, so->currPos.dir);
525 0 : so->needPrimScan = false;
526 : }
527 : }
528 : else
529 0 : BTScanPosInvalidate(so->currPos);
530 : }
531 54018 : }
532 :
533 : /*
534 : * btestimateparallelscan -- estimate storage for BTParallelScanDescData
535 : */
536 : Size
537 64 : btestimateparallelscan(int nkeys, int norderbys)
538 : {
539 : /* Pessimistically assume all input scankeys will be output with arrays */
540 64 : return offsetof(BTParallelScanDescData, btps_arrElems) + sizeof(int) * nkeys;
541 : }
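/*
 * Worked example of the estimate above (illustrative, assuming 4-byte int):
 * with nkeys = 2 the reservation is
 * offsetof(BTParallelScanDescData, btps_arrElems) + 8 bytes, i.e. one
 * btps_arrElems slot per input scan key, whether or not the key turns out to
 * use an array.
 */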
542 :
543 : /*
544 : * btinitparallelscan -- initialize BTParallelScanDesc for parallel btree scan
545 : */
546 : void
547 64 : btinitparallelscan(void *target)
548 : {
549 64 : BTParallelScanDesc bt_target = (BTParallelScanDesc) target;
550 :
551 64 : SpinLockInit(&bt_target->btps_mutex);
552 64 : bt_target->btps_nextScanPage = InvalidBlockNumber;
553 64 : bt_target->btps_lastCurrPage = InvalidBlockNumber;
554 64 : bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
555 64 : ConditionVariableInit(&bt_target->btps_cv);
556 64 : }
557 :
558 : /*
559 : * btparallelrescan() -- reset parallel scan
560 : */
561 : void
562 24 : btparallelrescan(IndexScanDesc scan)
563 : {
564 : BTParallelScanDesc btscan;
565 24 : ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
566 :
567 : Assert(parallel_scan);
568 :
569 24 : btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
570 : parallel_scan->ps_offset);
571 :
572 : /*
573 : * In theory, we don't need to acquire the spinlock here, because there
574 : * shouldn't be any other workers running at this point, but we do so for
575 : * consistency.
576 : */
577 24 : SpinLockAcquire(&btscan->btps_mutex);
578 24 : btscan->btps_nextScanPage = InvalidBlockNumber;
579 24 : btscan->btps_lastCurrPage = InvalidBlockNumber;
580 24 : btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
581 24 : SpinLockRelease(&btscan->btps_mutex);
582 24 : }
583 :
584 : /*
585 : * _bt_parallel_seize() -- Begin the process of advancing the scan to a new
586 : * page. Other scans must wait until we call _bt_parallel_release()
587 : * or _bt_parallel_done().
588 : *
589 : * The return value is true if we successfully seized the scan and false
590 : * if we did not. The latter case occurs when no pages remain, or when
591 : * another primitive index scan is scheduled that caller's backend cannot
592 : * start just yet (only backends that call from _bt_first are capable of
593 : * starting primitive index scans, which they indicate by passing first=true).
594 : *
595 : * If the return value is true, *next_scan_page returns the next page of the
596 : * scan, and *last_curr_page returns the page that *next_scan_page came from.
597 : * An invalid *next_scan_page means the scan hasn't yet started, or that
598 : * caller needs to start the next primitive index scan (if it's the latter
599 : * case we'll set so.needPrimScan).
600 : *
601 : * Callers should ignore the value of *next_scan_page and *last_curr_page if
602 : * the return value is false.
603 : */
604 : bool
605 1658 : _bt_parallel_seize(IndexScanDesc scan, BlockNumber *next_scan_page,
606 : BlockNumber *last_curr_page, bool first)
607 : {
608 1658 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
609 1658 : bool exit_loop = false,
610 1658 : status = true,
611 1658 : endscan = false;
612 1658 : ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
613 : BTParallelScanDesc btscan;
614 :
615 1658 : *next_scan_page = InvalidBlockNumber;
616 1658 : *last_curr_page = InvalidBlockNumber;
617 :
618 : /*
619 : * Reset so->currPos, and initialize moreLeft/moreRight such that the next
620 : * call to _bt_readnextpage treats this backend similarly to a serial
621 : * backend that steps from *last_curr_page to *next_scan_page (unless this
622 : * backend's so->currPos is initialized by _bt_readfirstpage before then).
623 : */
624 1658 : BTScanPosInvalidate(so->currPos);
625 1658 : so->currPos.moreLeft = so->currPos.moreRight = true;
626 :
627 1658 : if (first)
628 : {
629 : /*
630 : * Initialize array related state when called from _bt_first, assuming
631 : * that this will be the first primitive index scan for the scan
632 : */
633 446 : so->needPrimScan = false;
634 446 : so->scanBehind = false;
635 446 : so->oppositeDirCheck = false;
636 : }
637 : else
638 : {
639 : /*
640 : * Don't attempt to seize the scan when it requires another primitive
641 : * index scan, since caller's backend cannot start it right now
642 : */
643 1212 : if (so->needPrimScan)
644 0 : return false;
645 : }
646 :
647 1658 : btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
648 : parallel_scan->ps_offset);
649 :
650 : while (1)
651 : {
652 1658 : SpinLockAcquire(&btscan->btps_mutex);
653 :
654 1658 : if (btscan->btps_pageStatus == BTPARALLEL_DONE)
655 : {
656 : /* We're done with this parallel index scan */
657 308 : status = false;
658 : }
659 1350 : else if (btscan->btps_pageStatus == BTPARALLEL_IDLE &&
660 1226 : btscan->btps_nextScanPage == P_NONE)
661 : {
662 : /* End this parallel index scan */
663 14 : status = false;
664 14 : endscan = true;
665 : }
666 1336 : else if (btscan->btps_pageStatus == BTPARALLEL_NEED_PRIMSCAN)
667 : {
668 : Assert(so->numArrayKeys);
669 :
670 36 : if (first)
671 : {
672 : /* Can start scheduled primitive scan right away, so do so */
673 36 : btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
674 72 : for (int i = 0; i < so->numArrayKeys; i++)
675 : {
676 36 : BTArrayKeyInfo *array = &so->arrayKeys[i];
677 36 : ScanKey skey = &so->keyData[array->scan_key];
678 :
679 36 : array->cur_elem = btscan->btps_arrElems[i];
680 36 : skey->sk_argument = array->elem_values[array->cur_elem];
681 : }
682 36 : exit_loop = true;
683 : }
684 : else
685 : {
686 : /*
687 : * Don't attempt to seize the scan when it requires another
688 : * primitive index scan, since caller's backend cannot start
689 : * it right now
690 : */
691 0 : status = false;
692 : }
693 :
694 : /*
695 : * Either way, update backend local state to indicate that a
696 : * pending primitive scan is required
697 : */
698 36 : so->needPrimScan = true;
699 36 : so->scanBehind = false;
700 36 : so->oppositeDirCheck = false;
701 : }
702 1300 : else if (btscan->btps_pageStatus != BTPARALLEL_ADVANCING)
703 : {
704 : /*
705 : * We have successfully seized control of the scan for the purpose
706 : * of advancing it to a new page!
707 : */
708 1300 : btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
709 : Assert(btscan->btps_nextScanPage != P_NONE);
710 1300 : *next_scan_page = btscan->btps_nextScanPage;
711 1300 : *last_curr_page = btscan->btps_lastCurrPage;
712 1300 : exit_loop = true;
713 : }
714 1658 : SpinLockRelease(&btscan->btps_mutex);
715 1658 : if (exit_loop || !status)
716 : break;
717 0 : ConditionVariableSleep(&btscan->btps_cv, WAIT_EVENT_BTREE_PAGE);
718 : }
719 1658 : ConditionVariableCancelSleep();
720 :
721 : /* When the scan has reached the rightmost (or leftmost) page, end it */
722 1658 : if (endscan)
723 14 : _bt_parallel_done(scan);
724 :
725 1658 : return status;
726 : }
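/*
 * Illustrative caller-side pattern for the seize/release protocol.  The real
 * callers are _bt_first() and _bt_readnextpage() in nbtsearch.c and differ in
 * detail; sibling_page below is a placeholder for the just-read page's
 * sibling link in the scan direction:
 *
 *     if (!_bt_parallel_seize(scan, &next_scan_page, &last_curr_page, first))
 *         give up on advancing (scan done, or another backend must start
 *         the next primitive index scan);
 *     read the page at next_scan_page and note its sibling link;
 *     _bt_parallel_release(scan, sibling_page, next_scan_page);
 *
 * _bt_parallel_done() is used instead once the scan as a whole has no more
 * pages or matches to return.
 */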
727 :
728 : /*
729 : * _bt_parallel_release() -- Complete the process of advancing the scan to a
730 : * new page. We now have the new value btps_nextScanPage; another backend
731 : * can now begin advancing the scan.
732 : *
733 : * Callers whose scan uses array keys must save their curr_page argument so
734 : * that it can be passed to _bt_parallel_primscan_schedule, should caller
735 : * determine that another primitive index scan is required.
736 : *
737 : * If caller's next_scan_page is P_NONE, the scan has reached the index's
738 : * rightmost/leftmost page. This is treated as reaching the end of the scan
739 : * within _bt_parallel_seize.
740 : *
741 : * Note: unlike the serial case, parallel scans don't need to remember both
742 : * sibling links. next_scan_page is whichever link is next given the scan's
743 : * direction. That's all we'll ever need, since the direction of a parallel
744 : * scan can never change.
745 : */
746 : void
747 1336 : _bt_parallel_release(IndexScanDesc scan, BlockNumber next_scan_page,
748 : BlockNumber curr_page)
749 : {
750 1336 : ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
751 : BTParallelScanDesc btscan;
752 :
753 : Assert(BlockNumberIsValid(next_scan_page));
754 :
755 1336 : btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
756 : parallel_scan->ps_offset);
757 :
758 1336 : SpinLockAcquire(&btscan->btps_mutex);
759 1336 : btscan->btps_nextScanPage = next_scan_page;
760 1336 : btscan->btps_lastCurrPage = curr_page;
761 1336 : btscan->btps_pageStatus = BTPARALLEL_IDLE;
762 1336 : SpinLockRelease(&btscan->btps_mutex);
763 1336 : ConditionVariableSignal(&btscan->btps_cv);
764 1336 : }
765 :
766 : /*
767 : * _bt_parallel_done() -- Mark the parallel scan as complete.
768 : *
769 : * When there are no pages left to scan, this function should be called to
770 : * notify other workers. Otherwise, they might wait forever for the scan to
771 : * advance to the next page.
772 : */
773 : void
774 5983130 : _bt_parallel_done(IndexScanDesc scan)
775 : {
776 5983130 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
777 5983130 : ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
778 : BTParallelScanDesc btscan;
779 5983130 : bool status_changed = false;
780 :
781 : Assert(!BTScanPosIsValid(so->currPos));
782 :
783 : /* Do nothing, for non-parallel scans */
784 5983130 : if (parallel_scan == NULL)
785 5982968 : return;
786 :
787 : /*
788 : * Should not mark parallel scan done when there's still a pending
789 : * primitive index scan
790 : */
791 162 : if (so->needPrimScan)
792 36 : return;
793 :
794 126 : btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
795 : parallel_scan->ps_offset);
796 :
797 : /*
798 : * Mark the parallel scan as done, unless some other process did so
799 : * already
800 : */
801 126 : SpinLockAcquire(&btscan->btps_mutex);
802 : Assert(btscan->btps_pageStatus != BTPARALLEL_NEED_PRIMSCAN);
803 126 : if (btscan->btps_pageStatus != BTPARALLEL_DONE)
804 : {
805 88 : btscan->btps_pageStatus = BTPARALLEL_DONE;
806 88 : status_changed = true;
807 : }
808 126 : SpinLockRelease(&btscan->btps_mutex);
809 :
810 : /* wake up all the workers associated with this parallel scan */
811 126 : if (status_changed)
812 88 : ConditionVariableBroadcast(&btscan->btps_cv);
813 : }
814 :
815 : /*
816 : * _bt_parallel_primscan_schedule() -- Schedule another primitive index scan.
817 : *
818 : * Caller passes the curr_page most recently passed to _bt_parallel_release
819 : * by its backend. Caller successfully schedules the next primitive index scan
820 : * if the shared parallel state hasn't been seized since caller's backend last
821 : * advanced the scan.
822 : */
823 : void
824 36 : _bt_parallel_primscan_schedule(IndexScanDesc scan, BlockNumber curr_page)
825 : {
826 36 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
827 36 : ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
828 : BTParallelScanDesc btscan;
829 :
830 : Assert(so->numArrayKeys);
831 :
832 36 : btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
833 : parallel_scan->ps_offset);
834 :
835 36 : SpinLockAcquire(&btscan->btps_mutex);
836 36 : if (btscan->btps_lastCurrPage == curr_page &&
837 36 : btscan->btps_pageStatus == BTPARALLEL_IDLE)
838 : {
839 36 : btscan->btps_nextScanPage = InvalidBlockNumber;
840 36 : btscan->btps_lastCurrPage = InvalidBlockNumber;
841 36 : btscan->btps_pageStatus = BTPARALLEL_NEED_PRIMSCAN;
842 :
843 : /* Serialize scan's current array keys */
844 72 : for (int i = 0; i < so->numArrayKeys; i++)
845 : {
846 36 : BTArrayKeyInfo *array = &so->arrayKeys[i];
847 :
848 36 : btscan->btps_arrElems[i] = array->cur_elem;
849 : }
850 : }
851 36 : SpinLockRelease(&btscan->btps_mutex);
852 36 : }
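/*
 * Illustrative note on the round trip: the cur_elem offsets stored in
 * btps_arrElems here are read back by _bt_parallel_seize() when it is called
 * with first=true, which copies each offset into the corresponding
 * BTArrayKeyInfo and scan key before the next primitive index scan starts.
 */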
853 :
854 : /*
855 : * Bulk deletion of all index entries pointing to a set of heap tuples.
856 : * The set of target tuples is specified via a callback routine that tells
857 : * whether any given heap tuple (identified by ItemPointer) is being deleted.
858 : *
859 : * Result: a palloc'd struct containing statistical info for VACUUM displays.
860 : */
861 : IndexBulkDeleteResult *
862 2554 : btbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
863 : IndexBulkDeleteCallback callback, void *callback_state)
864 : {
865 2554 : Relation rel = info->index;
866 : BTCycleId cycleid;
867 :
868 : /* allocate stats if first time through, else re-use existing struct */
869 2554 : if (stats == NULL)
870 2554 : stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
871 :
872 : /* Establish the vacuum cycle ID to use for this scan */
873 : /* The ENSURE stuff ensures we clean up shared memory on failure */
874 2554 : PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
875 : {
876 2554 : cycleid = _bt_start_vacuum(rel);
877 :
878 2554 : btvacuumscan(info, stats, callback, callback_state, cycleid);
879 : }
880 2554 : PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
881 2554 : _bt_end_vacuum(rel);
882 :
883 2554 : return stats;
884 : }
885 :
886 : /*
887 : * Post-VACUUM cleanup.
888 : *
889 : * Result: a palloc'd struct containing statistical info for VACUUM displays.
890 : */
891 : IndexBulkDeleteResult *
892 156640 : btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
893 : {
894 : BlockNumber num_delpages;
895 :
896 : /* No-op in ANALYZE ONLY mode */
897 156640 : if (info->analyze_only)
898 16226 : return stats;
899 :
900 : /*
901 : * If btbulkdelete was called, we need not do anything (we just maintain
902 : * the information used within _bt_vacuum_needs_cleanup() by calling
903 : * _bt_set_cleanup_info() below).
904 : *
905 : * If btbulkdelete was _not_ called, then we have a choice to make: we
906 : * must decide whether or not a btvacuumscan() call is needed now (i.e.
907 : * whether the ongoing VACUUM operation can entirely avoid a physical scan
908 : * of the index). A call to _bt_vacuum_needs_cleanup() decides it for us
909 : * now.
910 : */
911 140414 : if (stats == NULL)
912 : {
913 : /* Check if VACUUM operation can entirely avoid btvacuumscan() call */
914 138470 : if (!_bt_vacuum_needs_cleanup(info->index))
915 138458 : return NULL;
916 :
917 : /*
918 : * Since we aren't going to actually delete any leaf items, there's no
919 : * need to go through all the vacuum-cycle-ID pushups here.
920 : *
921 : * Posting list tuples are a source of inaccuracy for cleanup-only
922 : * scans. btvacuumscan() will assume that the number of index tuples
923 : * from each page can be used as num_index_tuples, even though
924 : * num_index_tuples is supposed to represent the number of TIDs in the
925 : * index. This naive approach can underestimate the number of tuples
926 : * in the index significantly.
927 : *
928 : * We handle the problem by making num_index_tuples an estimate in
929 : * cleanup-only case.
930 : */
931 12 : stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
932 12 : btvacuumscan(info, stats, NULL, NULL, 0);
933 12 : stats->estimated_count = true;
934 : }
935 :
936 : /*
937 : * Maintain num_delpages value in metapage for _bt_vacuum_needs_cleanup().
938 : *
939 : * num_delpages is the number of deleted pages now in the index that were
940 : * not safe to place in the FSM to be recycled just yet. num_delpages is
941 : * greater than 0 only when _bt_pagedel() actually deleted pages during
942 : * our call to btvacuumscan(). Even then, _bt_pendingfsm_finalize() must
943 : * have failed to place any newly deleted pages in the FSM just moments
944 : * ago. (Actually, there are edge cases where recycling of the current
945 : * VACUUM's newly deleted pages does not even become safe by the time the
946 : * next VACUUM comes around. See nbtree/README.)
947 : */
948 : Assert(stats->pages_deleted >= stats->pages_free);
949 1956 : num_delpages = stats->pages_deleted - stats->pages_free;
950 1956 : _bt_set_cleanup_info(info->index, num_delpages);
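	/*
	 * For example (illustrative numbers): if this VACUUM's btvacuumscan()
	 * left pages_deleted = 10 and pages_free = 7, then num_delpages = 3
	 * deleted pages could not be recycled yet, and that count is what the
	 * metapage remembers for the next _bt_vacuum_needs_cleanup() check.
	 */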
951 :
952 : /*
953 : * It's quite possible for us to be fooled by concurrent page splits into
954 : * double-counting some index tuples, so disbelieve any total that exceeds
955 : * the underlying heap's count ... if we know that accurately. Otherwise
956 : * this might just make matters worse.
957 : */
958 1956 : if (!info->estimated_count)
959 : {
960 1906 : if (stats->num_index_tuples > info->num_heap_tuples)
961 4 : stats->num_index_tuples = info->num_heap_tuples;
962 : }
963 :
964 1956 : return stats;
965 : }
966 :
967 : /*
968 : * btvacuumscan --- scan the index for VACUUMing purposes
969 : *
970 : * This combines the functions of looking for leaf tuples that are deletable
971 : * according to the vacuum callback, looking for empty pages that can be
972 : * deleted, and looking for old deleted pages that can be recycled. Both
973 : * btbulkdelete and btvacuumcleanup invoke this (the latter only if no
974 : * btbulkdelete call occurred and _bt_vacuum_needs_cleanup returned true).
975 : *
976 : * The caller is responsible for initially allocating/zeroing a stats struct
977 : * and for obtaining a vacuum cycle ID if necessary.
978 : */
979 : static void
980 2566 : btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
981 : IndexBulkDeleteCallback callback, void *callback_state,
982 : BTCycleId cycleid)
983 : {
984 2566 : Relation rel = info->index;
985 : BTVacState vstate;
986 : BlockNumber num_pages;
987 : BlockNumber scanblkno;
988 : bool needLock;
989 :
990 : /*
991 : * Reset fields that track information about the entire index now. This
992 : * avoids double-counting in the case where a single VACUUM command
993 : * requires multiple scans of the index.
994 : *
995 : * Avoid resetting the tuples_removed and pages_newly_deleted fields here,
996 : * since they track information about the VACUUM command, and so must last
997 : * across each call to btvacuumscan().
998 : *
999 : * (Note that pages_free is treated as state about the whole index, not
1000 : * the current VACUUM. This is appropriate because RecordFreeIndexPage()
1001 : * calls are idempotent, and get repeated for the same deleted pages in
1002 : * some scenarios. The point for us is to track the number of recyclable
1003 : * pages in the index at the end of the VACUUM command.)
1004 : */
1005 2566 : stats->num_pages = 0;
1006 2566 : stats->num_index_tuples = 0;
1007 2566 : stats->pages_deleted = 0;
1008 2566 : stats->pages_free = 0;
1009 :
1010 : /* Set up info to pass down to btvacuumpage */
1011 2566 : vstate.info = info;
1012 2566 : vstate.stats = stats;
1013 2566 : vstate.callback = callback;
1014 2566 : vstate.callback_state = callback_state;
1015 2566 : vstate.cycleid = cycleid;
1016 :
1017 : /* Create a temporary memory context to run _bt_pagedel in */
1018 2566 : vstate.pagedelcontext = AllocSetContextCreate(CurrentMemoryContext,
1019 : "_bt_pagedel",
1020 : ALLOCSET_DEFAULT_SIZES);
1021 :
1022 : /* Initialize vstate fields used by _bt_pendingfsm_finalize */
1023 2566 : vstate.bufsize = 0;
1024 2566 : vstate.maxbufsize = 0;
1025 2566 : vstate.pendingpages = NULL;
1026 2566 : vstate.npendingpages = 0;
1027 : /* Consider applying _bt_pendingfsm_finalize optimization */
1028 2566 : _bt_pendingfsm_init(rel, &vstate, (callback == NULL));
1029 :
1030 : /*
1031 : * The outer loop iterates over all index pages except the metapage, in
1032 : * physical order (we hope the kernel will cooperate in providing
1033 : * read-ahead for speed). It is critical that we visit all leaf pages,
1034 : * including ones added after we start the scan, else we might fail to
1035 : * delete some deletable tuples. Hence, we must repeatedly check the
1036 : * relation length. We must acquire the relation-extension lock while
1037 : * doing so to avoid a race condition: if someone else is extending the
1038 : * relation, there is a window where bufmgr/smgr have created a new
1039 : * all-zero page but it hasn't yet been write-locked by _bt_getbuf(). If
1040 : * we manage to scan such a page here, we'll improperly assume it can be
1041 : * recycled. Taking the lock synchronizes things enough to prevent a
1042 : * problem: either num_pages won't include the new page, or _bt_getbuf
1043 : * already has write lock on the buffer and it will be fully initialized
1044 : * before we can examine it. Also, we need not worry if a page is added
1045 : * immediately after we look; the page splitting code already has
1046 : * write-lock on the left page before it adds a right page, so we must
1047 : * already have processed any tuples due to be moved into such a page.
1048 : *
1049 : * XXX: Now that new pages are locked with RBM_ZERO_AND_LOCK, I don't
1050 : * think the use of the extension lock is still required.
1051 : *
1052 : * We can skip locking for new or temp relations, however, since no one
1053 : * else could be accessing them.
1054 : */
1055 2566 : needLock = !RELATION_IS_LOCAL(rel);
1056 :
1057 2566 : scanblkno = BTREE_METAPAGE + 1;
1058 : for (;;)
1059 : {
1060 : /* Get the current relation length */
1061 4864 : if (needLock)
1062 4860 : LockRelationForExtension(rel, ExclusiveLock);
1063 4864 : num_pages = RelationGetNumberOfBlocks(rel);
1064 4864 : if (needLock)
1065 4860 : UnlockRelationForExtension(rel, ExclusiveLock);
1066 :
1067 4864 : if (info->report_progress)
1068 952 : pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_TOTAL,
1069 : num_pages);
1070 :
1071 : /* Quit if we've scanned the whole relation */
1072 4864 : if (scanblkno >= num_pages)
1073 2566 : break;
1074 : /* Iterate over pages, then loop back to recheck length */
1075 25402 : for (; scanblkno < num_pages; scanblkno++)
1076 : {
1077 23104 : btvacuumpage(&vstate, scanblkno);
1078 23104 : if (info->report_progress)
1079 984 : pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE,
1080 : scanblkno);
1081 : }
1082 : }
1083 :
1084 : /* Set statistics num_pages field to final size of index */
1085 2566 : stats->num_pages = num_pages;
1086 :
1087 2566 : MemoryContextDelete(vstate.pagedelcontext);
1088 :
1089 : /*
1090 : * If there were any calls to _bt_pagedel() during scan of the index then
1091 : * see if any of the resulting pages can be placed in the FSM now. When
1092 : * it's not safe we'll have to leave it up to a future VACUUM operation.
1093 : *
1094 : * Finally, if we placed any pages in the FSM (either just now or during
1095 : * the scan), forcibly update the upper-level FSM pages to ensure that
1096 : * searchers can find them.
1097 : */
1098 2566 : _bt_pendingfsm_finalize(rel, &vstate);
1099 2566 : if (stats->pages_free > 0)
1100 30 : IndexFreeSpaceMapVacuum(rel);
1101 2566 : }
1102 :
1103 : /*
1104 : * btvacuumpage --- VACUUM one page
1105 : *
1106 : * This processes a single page for btvacuumscan(). In some cases we must
1107 : * backtrack to re-examine and VACUUM pages that were the scanblkno during
1108 : * a previous call here. This is how we handle page splits (that happened
1109 : * after our cycleid was acquired) whose right half page happened to reuse
1110 : * a block that we might have processed at some point before it was
1111 : * recycled (i.e. before the page split).
1112 : */
1113 : static void
1114 23104 : btvacuumpage(BTVacState *vstate, BlockNumber scanblkno)
1115 : {
1116 23104 : IndexVacuumInfo *info = vstate->info;
1117 23104 : IndexBulkDeleteResult *stats = vstate->stats;
1118 23104 : IndexBulkDeleteCallback callback = vstate->callback;
1119 23104 : void *callback_state = vstate->callback_state;
1120 23104 : Relation rel = info->index;
1121 23104 : Relation heaprel = info->heaprel;
1122 : bool attempt_pagedel;
1123 : BlockNumber blkno,
1124 : backtrack_to;
1125 : Buffer buf;
1126 : Page page;
1127 : BTPageOpaque opaque;
1128 :
1129 23104 : blkno = scanblkno;
1130 :
1131 23104 : backtrack:
1132 :
1133 23104 : attempt_pagedel = false;
1134 23104 : backtrack_to = P_NONE;
1135 :
1136 : /* call vacuum_delay_point while not holding any buffer lock */
1137 23104 : vacuum_delay_point();
1138 :
1139 : /*
1140 : * We can't use _bt_getbuf() here because it always applies
1141 : * _bt_checkpage(), which will barf on an all-zero page. We want to
1142 : * recycle all-zero pages, not fail. Also, we want to use a nondefault
1143 : * buffer access strategy.
1144 : */
1145 23104 : buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
1146 : info->strategy);
1147 23104 : _bt_lockbuf(rel, buf, BT_READ);
1148 23104 : page = BufferGetPage(buf);
1149 23104 : opaque = NULL;
1150 23104 : if (!PageIsNew(page))
1151 : {
1152 23104 : _bt_checkpage(rel, buf);
1153 23104 : opaque = BTPageGetOpaque(page);
1154 : }
1155 :
1156 : Assert(blkno <= scanblkno);
1157 23104 : if (blkno != scanblkno)
1158 : {
1159 : /*
1160 : * We're backtracking.
1161 : *
1162 : * We followed a right link to a sibling leaf page (a page that
1163 : * happens to be from a block located before scanblkno). The only
1164 : * case we want to do anything with is a live leaf page having the
1165 : * current vacuum cycle ID.
1166 : *
1167 : * The page had better be in a state that's consistent with what we
1168 : * expect. Check for conditions that imply corruption in passing. It
1169 : * can't be half-dead because only an interrupted VACUUM process can
1170 : * leave pages in that state, so we'd definitely have dealt with it
1171 : * back when the page was the scanblkno page (half-dead pages are
1172 : * always marked fully deleted by _bt_pagedel(), barring corruption).
1173 : */
1174 0 : if (!opaque || !P_ISLEAF(opaque) || P_ISHALFDEAD(opaque))
1175 : {
1176 : Assert(false);
1177 0 : ereport(LOG,
1178 : (errcode(ERRCODE_INDEX_CORRUPTED),
1179 : errmsg_internal("right sibling %u of scanblkno %u unexpectedly in an inconsistent state in index \"%s\"",
1180 : blkno, scanblkno, RelationGetRelationName(rel))));
1181 0 : _bt_relbuf(rel, buf);
1182 0 : return;
1183 : }
1184 :
1185 : /*
1186 : * We may have already processed the page in an earlier call, when the
1187 : * page was scanblkno. This happens when the leaf page split occurred
1188 : * after the scan began, but before the right sibling page became the
1189 : * scanblkno.
1190 : *
1191 : * Page may also have been deleted by current btvacuumpage() call,
1192 : * since _bt_pagedel() sometimes deletes the right sibling page of
1193 : * scanblkno in passing (it does so after we decided where to
1194 : * backtrack to). We don't need to process this page as a deleted
1195 : * page a second time now (in fact, it would be wrong to count it as a
1196 : * deleted page in the bulk delete statistics a second time).
1197 : */
1198 0 : if (opaque->btpo_cycleid != vstate->cycleid || P_ISDELETED(opaque))
1199 : {
1200 : /* Done with current scanblkno (and all lower split pages) */
1201 0 : _bt_relbuf(rel, buf);
1202 0 : return;
1203 : }
1204 : }
1205 :
1206 23104 : if (!opaque || BTPageIsRecyclable(page, heaprel))
1207 : {
1208 : /* Okay to recycle this page (which could be leaf or internal) */
1209 1162 : RecordFreeIndexPage(rel, blkno);
1210 1162 : stats->pages_deleted++;
1211 1162 : stats->pages_free++;
1212 : }
1213 21942 : else if (P_ISDELETED(opaque))
1214 : {
1215 : /*
1216 : * Already deleted page (which could be leaf or internal). Can't
1217 : * recycle yet.
1218 : */
1219 158 : stats->pages_deleted++;
1220 : }
1221 21784 : else if (P_ISHALFDEAD(opaque))
1222 : {
1223 : /* Half-dead leaf page (from interrupted VACUUM) -- finish deleting */
1224 0 : attempt_pagedel = true;
1225 :
1226 : /*
1227 : * _bt_pagedel() will increment both pages_newly_deleted and
1228 : * pages_deleted stats in all cases (barring corruption)
1229 : */
1230 : }
1231 21784 : else if (P_ISLEAF(opaque))
1232 : {
1233 : OffsetNumber deletable[MaxIndexTuplesPerPage];
1234 : int ndeletable;
1235 : BTVacuumPosting updatable[MaxIndexTuplesPerPage];
1236 : int nupdatable;
1237 : OffsetNumber offnum,
1238 : minoff,
1239 : maxoff;
1240 : int nhtidsdead,
1241 : nhtidslive;
1242 :
1243 : /*
1244 : * Trade in the initial read lock for a full cleanup lock on this
1245 : * page. We must get such a lock on every leaf page over the course
1246 : * of the vacuum scan, whether or not it actually contains any
1247 : * deletable tuples --- see nbtree/README.
1248 : */
1249 20522 : _bt_upgradelockbufcleanup(rel, buf);
1250 :
1251 : /*
1252 : * Check whether we need to backtrack to earlier pages. What we are
1253 : * concerned about is a page split that happened since we started the
1254 : * vacuum scan. If the split moved tuples on the right half of the
1255 : * split (i.e. the tuples that sort high) to a block that we already
1256 : * passed over, then we might have missed the tuples. We need to
1257 : * backtrack now. (Must do this before possibly clearing btpo_cycleid
1258 : * or deleting scanblkno page below!)
1259 : */
1260 20522 : if (vstate->cycleid != 0 &&
1261 20394 : opaque->btpo_cycleid == vstate->cycleid &&
1262 0 : !(opaque->btpo_flags & BTP_SPLIT_END) &&
1263 0 : !P_RIGHTMOST(opaque) &&
1264 0 : opaque->btpo_next < scanblkno)
1265 0 : backtrack_to = opaque->btpo_next;
1266 :
1267 20522 : ndeletable = 0;
1268 20522 : nupdatable = 0;
1269 20522 : minoff = P_FIRSTDATAKEY(opaque);
1270 20522 : maxoff = PageGetMaxOffsetNumber(page);
1271 20522 : nhtidsdead = 0;
1272 20522 : nhtidslive = 0;
1273 20522 : if (callback)
1274 : {
1275 : /* btbulkdelete callback tells us what to delete (or update) */
1276 4119564 : for (offnum = minoff;
1277 : offnum <= maxoff;
1278 4099170 : offnum = OffsetNumberNext(offnum))
1279 : {
1280 : IndexTuple itup;
1281 :
1282 4099170 : itup = (IndexTuple) PageGetItem(page,
1283 : PageGetItemId(page, offnum));
1284 :
1285 : Assert(!BTreeTupleIsPivot(itup));
1286 4099170 : if (!BTreeTupleIsPosting(itup))
1287 : {
1288 : /* Regular tuple, standard table TID representation */
1289 3986546 : if (callback(&itup->t_tid, callback_state))
1290 : {
1291 1582910 : deletable[ndeletable++] = offnum;
1292 1582910 : nhtidsdead++;
1293 : }
1294 : else
1295 2403636 : nhtidslive++;
1296 : }
1297 : else
1298 : {
1299 : BTVacuumPosting vacposting;
1300 : int nremaining;
1301 :
1302 : /* Posting list tuple */
1303 112624 : vacposting = btreevacuumposting(vstate, itup, offnum,
1304 : &nremaining);
1305 112624 : if (vacposting == NULL)
1306 : {
1307 : /*
1308 : * All table TIDs from the posting tuple remain, so no
1309 : * delete or update required
1310 : */
1311 : Assert(nremaining == BTreeTupleGetNPosting(itup));
1312 : }
1313 68998 : else if (nremaining > 0)
1314 : {
1315 :
1316 : /*
1317 : * Store metadata about posting list tuple in
1318 : * updatable array for entire page. Existing tuple
1319 : * will be updated during the later call to
1320 : * _bt_delitems_vacuum().
1321 : */
1322 : Assert(nremaining < BTreeTupleGetNPosting(itup));
1323 28640 : updatable[nupdatable++] = vacposting;
1324 28640 : nhtidsdead += BTreeTupleGetNPosting(itup) - nremaining;
1325 : }
1326 : else
1327 : {
1328 : /*
1329 : * All table TIDs from the posting list must be
1330 : * deleted. We'll delete the index tuple completely
1331 : * (no update required).
1332 : */
1333 : Assert(nremaining == 0);
1334 40358 : deletable[ndeletable++] = offnum;
1335 40358 : nhtidsdead += BTreeTupleGetNPosting(itup);
1336 40358 : pfree(vacposting);
1337 : }
1338 :
1339 112624 : nhtidslive += nremaining;
1340 : }
1341 : }
1342 : }
1343 :
1344 : /*
1345 : * Apply any needed deletes or updates. We issue just one
1346 : * _bt_delitems_vacuum() call per page, so as to minimize WAL traffic.
1347 : */
1348 20522 : if (ndeletable > 0 || nupdatable > 0)
1349 : {
1350 : Assert(nhtidsdead >= ndeletable + nupdatable);
1351 13274 : _bt_delitems_vacuum(rel, buf, deletable, ndeletable, updatable,
1352 : nupdatable);
1353 :
1354 13274 : stats->tuples_removed += nhtidsdead;
1355 : /* must recompute maxoff */
1356 13274 : maxoff = PageGetMaxOffsetNumber(page);
1357 :
1358 : /* can't leak memory here */
1359 41914 : for (int i = 0; i < nupdatable; i++)
1360 28640 : pfree(updatable[i]);
1361 : }
1362 : else
1363 : {
1364 : /*
1365 : * If the leaf page has been split during this vacuum cycle, it
1366 : * seems worth expending a write to clear btpo_cycleid even if we
1367 : * don't have any deletions to do. (If we do, _bt_delitems_vacuum
1368 : * takes care of this.) This ensures we won't process the page
1369 : * again.
1370 : *
1371 : * We treat this like a hint-bit update because there's no need to
1372 : * WAL-log it.
1373 : */
1374 : Assert(nhtidsdead == 0);
1375 7248 : if (vstate->cycleid != 0 &&
1376 7120 : opaque->btpo_cycleid == vstate->cycleid)
1377 : {
1378 0 : opaque->btpo_cycleid = 0;
1379 0 : MarkBufferDirtyHint(buf, true);
1380 : }
1381 : }
1382 :
1383 : /*
1384 : * If the leaf page is now empty, try to delete it; else count the
1385 : * live tuples (live table TIDs in posting lists are counted as
1386 : * separate live tuples). We don't delete when backtracking, though,
1387 : * since that would require teaching _bt_pagedel() about backtracking
1388 : * (doesn't seem worth adding more complexity to deal with that).
1389 : *
1390 : * We don't count the number of live TIDs during cleanup-only calls to
1391 : * btvacuumscan (i.e. when callback is not set). We count the number
1392 : * of index tuples directly instead. This avoids the expense of
1393 : * directly examining all of the tuples on each page. VACUUM will
1394 : * treat num_index_tuples as an estimate in cleanup-only case, so it
1395 : * doesn't matter that this underestimates num_index_tuples
1396 : * significantly in some cases.
1397 : */
1398 20522 : if (minoff > maxoff)
1399 5690 : attempt_pagedel = (blkno == scanblkno);
1400 14832 : else if (callback)
1401 14712 : stats->num_index_tuples += nhtidslive;
1402 : else
1403 120 : stats->num_index_tuples += maxoff - minoff + 1;
1404 :
1405 : Assert(!attempt_pagedel || nhtidslive == 0);
1406 : }
1407 :
1408 23104 : if (attempt_pagedel)
1409 : {
1410 : MemoryContext oldcontext;
1411 :
1412 : /* Run pagedel in a temp context to avoid memory leakage */
1413 5690 : MemoryContextReset(vstate->pagedelcontext);
1414 5690 : oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext);
1415 :
1416 : /*
1417 : * _bt_pagedel maintains the bulk delete stats on our behalf;
1418 : * pages_newly_deleted and pages_deleted are likely to be incremented
1419 : * during call
1420 : */
1421 : Assert(blkno == scanblkno);
1422 5690 : _bt_pagedel(rel, buf, vstate);
1423 :
1424 5690 : MemoryContextSwitchTo(oldcontext);
1425 : /* pagedel released buffer, so we shouldn't */
1426 : }
1427 : else
1428 17414 : _bt_relbuf(rel, buf);
1429 :
1430 23104 : if (backtrack_to != P_NONE)
1431 : {
1432 0 : blkno = backtrack_to;
1433 0 : goto backtrack;
1434 : }
1435 : }
1436 :
1437 : /*
1438 : * btreevacuumposting --- determine TIDs still needed in posting list
1439 : *
1440 : * Returns metadata describing how to build replacement tuple without the TIDs
1441 : * that VACUUM needs to delete. Returned value is NULL in the common case
1442 : * where no changes are needed to caller's posting list tuple (we avoid
1443 : * allocating memory here as an optimization).
1444 : *
1445 : * The number of TIDs that should remain in the posting list tuple is set for
1446 : * caller in *nremaining.
1447 : */
1448 : static BTVacuumPosting
1449 112624 : btreevacuumposting(BTVacState *vstate, IndexTuple posting,
1450 : OffsetNumber updatedoffset, int *nremaining)
1451 : {
1452 112624 : int live = 0;
1453 112624 : int nitem = BTreeTupleGetNPosting(posting);
1454 112624 : ItemPointer items = BTreeTupleGetPosting(posting);
1455 112624 : BTVacuumPosting vacposting = NULL;
1456 :
1457 661362 : for (int i = 0; i < nitem; i++)
1458 : {
1459 548738 : if (!vstate->callback(items + i, vstate->callback_state))
1460 : {
1461 : /* Live table TID */
1462 292562 : live++;
1463 : }
1464 256176 : else if (vacposting == NULL)
1465 : {
1466 : /*
1467 : * First dead table TID encountered.
1468 : *
1469 : * It's now clear that we need to delete one or more dead table
1470 : * TIDs, so start maintaining metadata describing how to update
1471 : * existing posting list tuple.
1472 : */
1473 68998 : vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) +
1474 : nitem * sizeof(uint16));
1475 :
1476 68998 : vacposting->itup = posting;
1477 68998 : vacposting->updatedoffset = updatedoffset;
1478 68998 : vacposting->ndeletedtids = 0;
1479 68998 : vacposting->deletetids[vacposting->ndeletedtids++] = i;
1480 : }
1481 : else
1482 : {
1483 : /* Second or subsequent dead table TID */
1484 187178 : vacposting->deletetids[vacposting->ndeletedtids++] = i;
1485 : }
1486 : }
1487 :
1488 112624 : *nremaining = live;
1489 112624 : return vacposting;
1490 : }
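/*
 * Worked example for btreevacuumposting (illustrative): given a posting list
 * tuple with 5 TIDs where the callback reports the TIDs at indexes 1 and 3
 * as dead, the function returns a BTVacuumPosting with ndeletedtids = 2 and
 * deletetids[] = {1, 3}, and sets *nremaining = 3.  If no TID were dead it
 * would return NULL and set *nremaining = 5.
 */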
1491 :
1492 : /*
1493 : * btcanreturn() -- Check whether btree indexes support index-only scans.
1494 : *
1495 : * btrees always do, so this is trivial.
1496 : */
1497 : bool
1498 959506 : btcanreturn(Relation index, int attno)
1499 : {
1500 959506 : return true;
1501 : }
1502 :
1503 : /*
1504 : * btgettreeheight() -- Compute tree height for use by btcostestimate().
1505 : */
1506 : int
1507 614174 : btgettreeheight(Relation rel)
1508 : {
1509 614174 : return _bt_getrootheight(rel);
1510 : }