Line   Hit count   Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * nbtree.c
4 : * Implementation of Lehman and Yao's btree management algorithm for
5 : * Postgres.
6 : *
7 : * NOTES
8 : * This file contains only the public interface routines.
9 : *
10 : *
11 : * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
12 : * Portions Copyright (c) 1994, Regents of the University of California
13 : *
14 : * IDENTIFICATION
15 : * src/backend/access/nbtree/nbtree.c
16 : *
17 : *-------------------------------------------------------------------------
18 : */
19 : #include "postgres.h"
20 :
21 : #include "access/nbtree.h"
22 : #include "access/relscan.h"
23 : #include "commands/progress.h"
24 : #include "commands/vacuum.h"
25 : #include "nodes/execnodes.h"
26 : #include "pgstat.h"
27 : #include "storage/bulk_write.h"
28 : #include "storage/condition_variable.h"
29 : #include "storage/indexfsm.h"
30 : #include "storage/ipc.h"
31 : #include "storage/lmgr.h"
32 : #include "utils/fmgrprotos.h"
33 : #include "utils/index_selfuncs.h"
34 : #include "utils/memutils.h"
35 :
36 :
37 : /*
38 : * BTPARALLEL_NOT_INITIALIZED indicates that the scan has not started.
39 : *
40 : * BTPARALLEL_NEED_PRIMSCAN indicates that some process must now seize the
41 : * scan to advance it via another call to _bt_first.
42 : *
43 : * BTPARALLEL_ADVANCING indicates that some process is advancing the scan to
44 : * a new page; others must wait.
45 : *
46 : * BTPARALLEL_IDLE indicates that no backend is currently advancing the scan
47 : * to a new page; some process can start doing that.
48 : *
49 : * BTPARALLEL_DONE indicates that the scan is complete (including error exit).
50 : */
51 : typedef enum
52 : {
53 : BTPARALLEL_NOT_INITIALIZED,
54 : BTPARALLEL_NEED_PRIMSCAN,
55 : BTPARALLEL_ADVANCING,
56 : BTPARALLEL_IDLE,
57 : BTPARALLEL_DONE,
58 : } BTPS_State;
59 :
60 : /*
61 : * BTParallelScanDescData contains btree-specific shared information required
62 : * for parallel scan.
63 : */
64 : typedef struct BTParallelScanDescData
65 : {
66 : BlockNumber btps_nextScanPage; /* next page to be scanned */
67 : BlockNumber btps_lastCurrPage; /* page whose sibling link was copied into
68 : * btps_nextScanPage */
69 : BTPS_State btps_pageStatus; /* indicates whether next page is
70 : * available for scan. see above for
71 : * possible states of parallel scan. */
72 : slock_t btps_mutex; /* protects above variables, btps_arrElems */
73 : ConditionVariable btps_cv; /* used to synchronize parallel scan */
74 :
75 : /*
76 : * btps_arrElems is used when scans need to schedule another primitive
77 : * index scan. Holds BTArrayKeyInfo.cur_elem offsets for scan keys.
78 : */
79 : int btps_arrElems[FLEXIBLE_ARRAY_MEMBER];
80 : } BTParallelScanDescData;
81 :
82 : typedef struct BTParallelScanDescData *BTParallelScanDesc;
83 :
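The shared state above is only ever manipulated under btps_mutex, through the _bt_parallel_* routines later in this file. As a hedged illustration of the caller-side protocol (the real logic lives in _bt_readfirstpage/_bt_readnextpage; read_leaf_page, page_right_sibling and process_matches below are hypothetical placeholders), a worker's page-advancement loop amounts to roughly the following sketch.

/*
 * Hedged sketch of the seize/release protocol -- not code from this file.
 * read_leaf_page(), page_right_sibling() and process_matches() are
 * hypothetical placeholders.
 */
static void
parallel_worker_page_loop(IndexScanDesc scan)
{
    BlockNumber next_scan_page,
                last_curr_page;

    /* false return: scan finished, or a new primitive scan must be primed */
    while (_bt_parallel_seize(scan, &next_scan_page, &last_curr_page, false))
    {
        Buffer      buf = read_leaf_page(scan, next_scan_page); /* hypothetical */

        /*
         * Publish the sibling link right away (BTPARALLEL_ADVANCING ->
         * BTPARALLEL_IDLE) so that other workers can keep advancing while
         * this backend is still extracting matches from the page it holds.
         */
        _bt_parallel_release(scan, page_right_sibling(buf), next_scan_page);

        process_matches(scan, buf);     /* hypothetical */
    }
}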
84 :
85 : static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
86 : IndexBulkDeleteCallback callback, void *callback_state,
87 : BTCycleId cycleid);
88 : static void btvacuumpage(BTVacState *vstate, BlockNumber scanblkno);
89 : static BTVacuumPosting btreevacuumposting(BTVacState *vstate,
90 : IndexTuple posting,
91 : OffsetNumber updatedoffset,
92 : int *nremaining);
93 :
94 :
95 : /*
96 : * Btree handler function: return IndexAmRoutine with access method parameters
97 : * and callbacks.
98 : */
99 : Datum
100 2855304 : bthandler(PG_FUNCTION_ARGS)
101 : {
102 2855304 : IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);
103 :
104 2855304 : amroutine->amstrategies = BTMaxStrategyNumber;
105 2855304 : amroutine->amsupport = BTNProcs;
106 2855304 : amroutine->amoptsprocnum = BTOPTIONS_PROC;
107 2855304 : amroutine->amcanorder = true;
108 2855304 : amroutine->amcanorderbyop = false;
109 2855304 : amroutine->amcanbackward = true;
110 2855304 : amroutine->amcanunique = true;
111 2855304 : amroutine->amcanmulticol = true;
112 2855304 : amroutine->amoptionalkey = true;
113 2855304 : amroutine->amsearcharray = true;
114 2855304 : amroutine->amsearchnulls = true;
115 2855304 : amroutine->amstorage = false;
116 2855304 : amroutine->amclusterable = true;
117 2855304 : amroutine->ampredlocks = true;
118 2855304 : amroutine->amcanparallel = true;
119 2855304 : amroutine->amcanbuildparallel = true;
120 2855304 : amroutine->amcaninclude = true;
121 2855304 : amroutine->amusemaintenanceworkmem = false;
122 2855304 : amroutine->amsummarizing = false;
123 2855304 : amroutine->amparallelvacuumoptions =
124 : VACUUM_OPTION_PARALLEL_BULKDEL | VACUUM_OPTION_PARALLEL_COND_CLEANUP;
125 2855304 : amroutine->amkeytype = InvalidOid;
126 :
127 2855304 : amroutine->ambuild = btbuild;
128 2855304 : amroutine->ambuildempty = btbuildempty;
129 2855304 : amroutine->aminsert = btinsert;
130 2855304 : amroutine->aminsertcleanup = NULL;
131 2855304 : amroutine->ambulkdelete = btbulkdelete;
132 2855304 : amroutine->amvacuumcleanup = btvacuumcleanup;
133 2855304 : amroutine->amcanreturn = btcanreturn;
134 2855304 : amroutine->amcostestimate = btcostestimate;
135 2855304 : amroutine->amgettreeheight = btgettreeheight;
136 2855304 : amroutine->amoptions = btoptions;
137 2855304 : amroutine->amproperty = btproperty;
138 2855304 : amroutine->ambuildphasename = btbuildphasename;
139 2855304 : amroutine->amvalidate = btvalidate;
140 2855304 : amroutine->amadjustmembers = btadjustmembers;
141 2855304 : amroutine->ambeginscan = btbeginscan;
142 2855304 : amroutine->amrescan = btrescan;
143 2855304 : amroutine->amgettuple = btgettuple;
144 2855304 : amroutine->amgetbitmap = btgetbitmap;
145 2855304 : amroutine->amendscan = btendscan;
146 2855304 : amroutine->ammarkpos = btmarkpos;
147 2855304 : amroutine->amrestrpos = btrestrpos;
148 2855304 : amroutine->amestimateparallelscan = btestimateparallelscan;
149 2855304 : amroutine->aminitparallelscan = btinitparallelscan;
150 2855304 : amroutine->amparallelrescan = btparallelrescan;
151 :
152 2855304 : PG_RETURN_POINTER(amroutine);
153 : }
154 :
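For context, nothing calls bthandler() directly: the function OID recorded in pg_am.amhandler is invoked through the fmgr, and the returned node is then used for all AM dispatching. A hedged sketch of that lookup, modelled on (but not copied from) GetIndexAmRoutine() in amapi.c; it assumes fmgr.h and access/amapi.h are available in addition to the includes above.

/*
 * Hedged sketch of how an amhandler such as bthandler() gets invoked.
 */
static IndexAmRoutine *
lookup_am_routine(Oid amhandler)
{
    Datum       datum = OidFunctionCall0(amhandler);    /* calls bthandler() */
    IndexAmRoutine *routine = (IndexAmRoutine *) DatumGetPointer(datum);

    if (routine == NULL || !IsA(routine, IndexAmRoutine))
        elog(ERROR, "handler %u did not return an IndexAmRoutine", amhandler);

    /* callers then dispatch through routine->ambeginscan, aminsert, ... */
    return routine;
}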
155 : /*
156 : * btbuildempty() -- build an empty btree index in the initialization fork
157 : */
158 : void
159 142 : btbuildempty(Relation index)
160 : {
161 142 : bool allequalimage = _bt_allequalimage(index, false);
162 : BulkWriteState *bulkstate;
163 : BulkWriteBuffer metabuf;
164 :
165 142 : bulkstate = smgr_bulk_start_rel(index, INIT_FORKNUM);
166 :
167 : /* Construct metapage. */
168 142 : metabuf = smgr_bulk_get_buf(bulkstate);
169 142 : _bt_initmetapage((Page) metabuf, P_NONE, 0, allequalimage);
170 142 : smgr_bulk_write(bulkstate, BTREE_METAPAGE, metabuf, true);
171 :
172 142 : smgr_bulk_finish(bulkstate);
173 142 : }
174 :
175 : /*
176 : * btinsert() -- insert an index tuple into a btree.
177 : *
178 : * Descend the tree recursively, find the appropriate location for our
179 : * new tuple, and put it there.
180 : */
181 : bool
182 6732522 : btinsert(Relation rel, Datum *values, bool *isnull,
183 : ItemPointer ht_ctid, Relation heapRel,
184 : IndexUniqueCheck checkUnique,
185 : bool indexUnchanged,
186 : IndexInfo *indexInfo)
187 : {
188 : bool result;
189 : IndexTuple itup;
190 :
191 : /* generate an index tuple */
192 6732522 : itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
193 6732522 : itup->t_tid = *ht_ctid;
194 :
195 6732522 : result = _bt_doinsert(rel, itup, checkUnique, indexUnchanged, heapRel);
196 :
197 6732020 : pfree(itup);
198 :
199 6732020 : return result;
200 : }
201 :
202 : /*
203 : * btgettuple() -- Get the next tuple in the scan.
204 : */
205 : bool
206 29885394 : btgettuple(IndexScanDesc scan, ScanDirection dir)
207 : {
208 29885394 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
209 : bool res;
210 :
211 : /* btree indexes are never lossy */
212 29885394 : scan->xs_recheck = false;
213 :
214 : /* Each loop iteration performs another primitive index scan */
215 : do
216 : {
217 : /*
218 : * If we've already initialized this scan, we can just advance it in
219 : * the appropriate direction. If we haven't done so yet, we call
220 : * _bt_first() to get the first item in the scan.
221 : */
222 29885558 : if (!BTScanPosIsValid(so->currPos))
223 13086500 : res = _bt_first(scan, dir);
224 : else
225 : {
226 : /*
227 : * Check to see if we should kill the previously-fetched tuple.
228 : */
229 16799058 : if (scan->kill_prior_tuple)
230 : {
231 : /*
232 : * Yes, remember it for later. (We'll deal with all such
233 : * tuples at once right before leaving the index page.) The
234 : * test for numKilled overrun is not just paranoia: if the
235 : * caller reverses direction in the indexscan then the same
236 : * item might get entered multiple times. It's not worth
237 : * trying to optimize that, so we don't detect it, but instead
238 : * just forget any excess entries.
239 : */
240 496440 : if (so->killedItems == NULL)
241 157306 : so->killedItems = (int *)
242 157306 : palloc(MaxTIDsPerBTreePage * sizeof(int));
243 496440 : if (so->numKilled < MaxTIDsPerBTreePage)
244 496440 : so->killedItems[so->numKilled++] = so->currPos.itemIndex;
245 : }
246 :
247 : /*
248 : * Now continue the scan.
249 : */
250 16799058 : res = _bt_next(scan, dir);
251 : }
252 :
253 : /* If we have a tuple, return it ... */
254 29885558 : if (res)
255 23927006 : break;
256 : /* ... otherwise see if we need another primitive index scan */
257 5958552 : } while (so->numArrayKeys && _bt_start_prim_scan(scan, dir));
258 :
259 29885394 : return res;
260 : }
261 :
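btgettuple() is likewise only reached through the generic index AM wrappers. Below is a hedged usage sketch, assuming a btree on a single int4 column, heap/index Relations and a Snapshot already opened by the caller, and the usual access/genam.h, access/stratnum.h and utils/fmgroids.h includes; wrapper argument lists shown here follow recent releases and could differ slightly in this tree.

/*
 * Hedged usage sketch: reaching btgettuple() through the generic AM API.
 */
static void
scan_btree_equal_int4(Relation heap, Relation index, Snapshot snap, int32 key)
{
    ScanKeyData skey;
    IndexScanDesc scan;
    ItemPointer tid;

    ScanKeyInit(&skey,
                1,                      /* first index column */
                BTEqualStrategyNumber,  /* "=" */
                F_INT4EQ,               /* int4 equality proc */
                Int32GetDatum(key));

    scan = index_beginscan(heap, index, snap, 1, 0);
    index_rescan(scan, &skey, 1, NULL, 0);

    /* each iteration lands in btgettuple() via amroutine->amgettuple */
    while ((tid = index_getnext_tid(scan, ForwardScanDirection)) != NULL)
    {
        /* ... look up *tid in the heap, check visibility, etc. ... */
    }

    index_endscan(scan);
}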
262 : /*
263 : * btgetbitmap() -- gets all matching tuples, and adds them to a bitmap
264 : */
265 : int64
266 12154 : btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
267 : {
268 12154 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
269 12154 : int64 ntids = 0;
270 : ItemPointer heapTid;
271 :
272 : /* Each loop iteration performs another primitive index scan */
273 : do
274 : {
275 : /* Fetch the first page & tuple */
276 12590 : if (_bt_first(scan, ForwardScanDirection))
277 : {
278 : /* Save tuple ID, and continue scanning */
279 8836 : heapTid = &scan->xs_heaptid;
280 8836 : tbm_add_tuples(tbm, heapTid, 1, false);
281 8836 : ntids++;
282 :
283 : for (;;)
284 : {
285 : /*
286 : * Advance to next tuple within page. This is the same as the
287 : * easy case in _bt_next().
288 : */
289 1864250 : if (++so->currPos.itemIndex > so->currPos.lastItem)
290 : {
291 : /* let _bt_next do the heavy lifting */
292 13702 : if (!_bt_next(scan, ForwardScanDirection))
293 8836 : break;
294 : }
295 :
296 : /* Save tuple ID, and continue scanning */
297 1855414 : heapTid = &so->currPos.items[so->currPos.itemIndex].heapTid;
298 1855414 : tbm_add_tuples(tbm, heapTid, 1, false);
299 1855414 : ntids++;
300 : }
301 : }
302 : /* Now see if we need another primitive index scan */
303 12590 : } while (so->numArrayKeys && _bt_start_prim_scan(scan, ForwardScanDirection));
304 :
305 12154 : return ntids;
306 : }
307 :
308 : /*
309 : * btbeginscan() -- start a scan on a btree index
310 : */
311 : IndexScanDesc
312 12649948 : btbeginscan(Relation rel, int nkeys, int norderbys)
313 : {
314 : IndexScanDesc scan;
315 : BTScanOpaque so;
316 :
317 : /* no order by operators allowed */
318 : Assert(norderbys == 0);
319 :
320 : /* get the scan */
321 12649948 : scan = RelationGetIndexScan(rel, nkeys, norderbys);
322 :
323 : /* allocate private workspace */
324 12649948 : so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData));
325 12649948 : BTScanPosInvalidate(so->currPos);
326 12649948 : BTScanPosInvalidate(so->markPos);
327 12649948 : if (scan->numberOfKeys > 0)
328 12638074 : so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
329 : else
330 11874 : so->keyData = NULL;
331 :
332 12649948 : so->needPrimScan = false;
333 12649948 : so->scanBehind = false;
334 12649948 : so->oppositeDirCheck = false;
335 12649948 : so->arrayKeys = NULL;
336 12649948 : so->orderProcs = NULL;
337 12649948 : so->arrayContext = NULL;
338 :
339 12649948 : so->killedItems = NULL; /* until needed */
340 12649948 : so->numKilled = 0;
341 :
342 : /*
343 : * We don't know yet whether the scan will be index-only, so we do not
344 : * allocate the tuple workspace arrays until btrescan. However, we set up
345 : * scan->xs_itupdesc whether we'll need it or not, since that's so cheap.
346 : */
347 12649948 : so->currTuples = so->markTuples = NULL;
348 :
349 12649948 : scan->xs_itupdesc = RelationGetDescr(rel);
350 :
351 12649948 : scan->opaque = so;
352 :
353 12649948 : return scan;
354 : }
355 :
356 : /*
357 : * btrescan() -- rescan an index relation
358 : */
359 : void
360 13099558 : btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
361 : ScanKey orderbys, int norderbys)
362 : {
363 13099558 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
364 :
365 : /* we aren't holding any read locks, but gotta drop the pins */
366 13099558 : if (BTScanPosIsValid(so->currPos))
367 : {
368 : /* Before leaving current page, deal with any killed items */
369 69242 : if (so->numKilled > 0)
370 814 : _bt_killitems(scan);
371 69242 : BTScanPosUnpinIfPinned(so->currPos);
372 69242 : BTScanPosInvalidate(so->currPos);
373 : }
374 :
375 13099558 : so->markItemIndex = -1;
376 13099558 : so->needPrimScan = false;
377 13099558 : so->scanBehind = false;
378 13099558 : so->oppositeDirCheck = false;
379 13099558 : BTScanPosUnpinIfPinned(so->markPos);
380 13099558 : BTScanPosInvalidate(so->markPos);
381 :
382 : /*
383 : * Allocate tuple workspace arrays, if needed for an index-only scan and
384 : * not already done in a previous rescan call. To save on palloc
385 : * overhead, both workspaces are allocated as one palloc block; only this
386 : * function and btendscan know that.
387 : *
388 : * NOTE: this data structure also makes it safe to return data from a
389 : * "name" column, even though btree name_ops uses an underlying storage
390 : * datatype of cstring. The risk there is that "name" is supposed to be
391 : * padded to NAMEDATALEN, but the actual index tuple is probably shorter.
392 : * However, since we only return data out of tuples sitting in the
393 : * currTuples array, a fetch of NAMEDATALEN bytes can at worst pull some
394 : * data out of the markTuples array --- running off the end of memory for
395 : * a SIGSEGV is not possible. Yeah, this is ugly as sin, but it beats
396 : * adding special-case treatment for name_ops elsewhere.
397 : */
398 13099558 : if (scan->xs_want_itup && so->currTuples == NULL)
399 : {
400 113032 : so->currTuples = (char *) palloc(BLCKSZ * 2);
401 113032 : so->markTuples = so->currTuples + BLCKSZ;
402 : }
403 :
404 : /*
405 : * Reset the scan keys
406 : */
407 13099558 : if (scankey && scan->numberOfKeys > 0)
408 13087582 : memcpy(scan->keyData, scankey, scan->numberOfKeys * sizeof(ScanKeyData));
409 13099558 : so->numberOfKeys = 0; /* until _bt_preprocess_keys sets it */
410 13099558 : so->numArrayKeys = 0; /* ditto */
411 13099558 : }
412 :
413 : /*
414 : * btendscan() -- close down a scan
415 : */
416 : void
417 12648538 : btendscan(IndexScanDesc scan)
418 : {
419 12648538 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
420 :
421 : /* we aren't holding any read locks, but gotta drop the pins */
422 12648538 : if (BTScanPosIsValid(so->currPos))
423 : {
424 : /* Before leaving current page, deal with any killed items */
425 7057720 : if (so->numKilled > 0)
426 81336 : _bt_killitems(scan);
427 7057720 : BTScanPosUnpinIfPinned(so->currPos);
428 : }
429 :
430 12648538 : so->markItemIndex = -1;
431 12648538 : BTScanPosUnpinIfPinned(so->markPos);
432 :
433 : /* No need to invalidate positions, the RAM is about to be freed. */
434 :
435 : /* Release storage */
436 12648538 : if (so->keyData != NULL)
437 12636706 : pfree(so->keyData);
438 : /* so->arrayKeys and so->orderProcs are in arrayContext */
439 12648538 : if (so->arrayContext != NULL)
440 1014 : MemoryContextDelete(so->arrayContext);
441 12648538 : if (so->killedItems != NULL)
442 157256 : pfree(so->killedItems);
443 12648538 : if (so->currTuples != NULL)
444 112976 : pfree(so->currTuples);
445 : /* so->markTuples should not be pfree'd, see btrescan */
446 12648538 : pfree(so);
447 12648538 : }
448 :
449 : /*
450 : * btmarkpos() -- save current scan position
451 : */
452 : void
453 130042 : btmarkpos(IndexScanDesc scan)
454 : {
455 130042 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
456 :
457 : /* There may be an old mark with a pin (but no lock). */
458 130042 : BTScanPosUnpinIfPinned(so->markPos);
459 :
460 : /*
461 : * Just record the current itemIndex. If we later step to next page
462 : * before releasing the marked position, _bt_steppage makes a full copy of
463 : * the currPos struct in markPos. If (as often happens) the mark is moved
464 : * before we leave the page, we don't have to do that work.
465 : */
466 130042 : if (BTScanPosIsValid(so->currPos))
467 130042 : so->markItemIndex = so->currPos.itemIndex;
468 : else
469 : {
470 0 : BTScanPosInvalidate(so->markPos);
471 0 : so->markItemIndex = -1;
472 : }
473 130042 : }
474 :
475 : /*
476 : * btrestrpos() -- restore scan to last saved position
477 : */
478 : void
479 54018 : btrestrpos(IndexScanDesc scan)
480 : {
481 54018 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
482 :
483 54018 : if (so->markItemIndex >= 0)
484 : {
485 : /*
486 : * The scan has never moved to a new page since the last mark. Just
487 : * restore the itemIndex.
488 : *
489 : * NB: In this case we can't count on anything in so->markPos to be
490 : * accurate.
491 : */
492 53910 : so->currPos.itemIndex = so->markItemIndex;
493 : }
494 : else
495 : {
496 : /*
497 : * The scan moved to a new page after last mark or restore, and we are
498 : * now restoring to the marked page. We aren't holding any read
499 : * locks, but if we're still holding the pin for the current position,
500 : * we must drop it.
501 : */
502 108 : if (BTScanPosIsValid(so->currPos))
503 : {
504 : /* Before leaving current page, deal with any killed items */
505 108 : if (so->numKilled > 0)
506 0 : _bt_killitems(scan);
507 108 : BTScanPosUnpinIfPinned(so->currPos);
508 : }
509 :
510 108 : if (BTScanPosIsValid(so->markPos))
511 : {
512 : /* bump pin on mark buffer for assignment to current buffer */
513 108 : if (BTScanPosIsPinned(so->markPos))
514 0 : IncrBufferRefCount(so->markPos.buf);
515 108 : memcpy(&so->currPos, &so->markPos,
516 : offsetof(BTScanPosData, items[1]) +
517 108 : so->markPos.lastItem * sizeof(BTScanPosItem));
518 108 : if (so->currTuples)
519 0 : memcpy(so->currTuples, so->markTuples,
520 0 : so->markPos.nextTupleOffset);
521 : /* Reset the scan's array keys (see _bt_steppage for why) */
522 108 : if (so->numArrayKeys)
523 : {
524 0 : _bt_start_array_keys(scan, so->currPos.dir);
525 0 : so->needPrimScan = false;
526 : }
527 : }
528 : else
529 0 : BTScanPosInvalidate(so->currPos);
530 : }
531 54018 : }
532 :
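Mark/restore exists chiefly for merge joins, which reach btmarkpos()/btrestrpos() through index_markpos()/index_restrpos() (declared in access/genam.h). A hedged sketch of that calling pattern follows; keys_still_match() is a hypothetical placeholder for the join's own duplicate-run test.

/*
 * Hedged sketch of the mark/restore pattern a merge-join-style caller uses.
 */
static void
rescan_run_of_duplicates(IndexScanDesc scan)
{
    /* remember the first tuple of a run of equal keys */
    index_markpos(scan);                /* dispatches to btmarkpos() */

    /* walk forward to the end of the run */
    while (index_getnext_tid(scan, ForwardScanDirection) != NULL)
    {
        if (!keys_still_match(scan))    /* hypothetical */
            break;
    }

    /* jump back so the run can be re-read against the next outer tuple */
    index_restrpos(scan);               /* dispatches to btrestrpos() */
}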
533 : /*
534 : * btestimateparallelscan -- estimate storage for BTParallelScanDescData
535 : */
536 : Size
537 58 : btestimateparallelscan(int nkeys, int norderbys)
538 : {
539 : /* Pessimistically assume all input scankeys will be output with arrays */
540 58 : return offsetof(BTParallelScanDescData, btps_arrElems) + sizeof(int) * nkeys;
541 : }
542 :
543 : /*
544 : * btinitparallelscan -- initialize BTParallelScanDesc for parallel btree scan
545 : */
546 : void
547 58 : btinitparallelscan(void *target)
548 : {
549 58 : BTParallelScanDesc bt_target = (BTParallelScanDesc) target;
550 :
551 58 : SpinLockInit(&bt_target->btps_mutex);
552 58 : bt_target->btps_nextScanPage = InvalidBlockNumber;
553 58 : bt_target->btps_lastCurrPage = InvalidBlockNumber;
554 58 : bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
555 58 : ConditionVariableInit(&bt_target->btps_cv);
556 58 : }
557 :
558 : /*
559 : * btparallelrescan() -- reset parallel scan
560 : */
561 : void
562 24 : btparallelrescan(IndexScanDesc scan)
563 : {
564 : BTParallelScanDesc btscan;
565 24 : ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
566 :
567 : Assert(parallel_scan);
568 :
569 24 : btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
570 : parallel_scan->ps_offset);
571 :
572 : /*
573 : * In theory, we don't need to acquire the spinlock here, because there
574 : * shouldn't be any other workers running at this point, but we do so for
575 : * consistency.
576 : */
577 24 : SpinLockAcquire(&btscan->btps_mutex);
578 24 : btscan->btps_nextScanPage = InvalidBlockNumber;
579 24 : btscan->btps_lastCurrPage = InvalidBlockNumber;
580 24 : btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
581 24 : SpinLockRelease(&btscan->btps_mutex);
582 24 : }
583 :
584 : /*
585 : * _bt_parallel_seize() -- Begin the process of advancing the scan to a new
586 : * page. Other scans must wait until we call _bt_parallel_release()
587 : * or _bt_parallel_done().
588 : *
589 : * The return value is true if we successfully seized the scan and false
590 : * if we did not. The latter case occurs when no pages remain, or when
591 : * another primitive index scan is scheduled that caller's backend cannot
592 : * start just yet (only backends that call from _bt_first are capable of
593 : * starting primitive index scans, which they indicate by passing first=true).
594 : *
595 : * If the return value is true, *next_scan_page returns the next page of the
596 : * scan, and *last_curr_page returns the page that *next_scan_page came from.
597 : * An invalid *next_scan_page means the scan hasn't yet started, or that
598 : * caller needs to start the next primitive index scan (if it's the latter
599 : * case we'll set so->needPrimScan).
600 : *
601 : * Callers should ignore the value of *next_scan_page and *last_curr_page if
602 : * the return value is false.
603 : */
604 : bool
605 1646 : _bt_parallel_seize(IndexScanDesc scan, BlockNumber *next_scan_page,
606 : BlockNumber *last_curr_page, bool first)
607 : {
608 1646 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
609 1646 : bool exit_loop = false,
610 1646 : status = true,
611 1646 : endscan = false;
612 1646 : ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
613 : BTParallelScanDesc btscan;
614 :
615 1646 : *next_scan_page = InvalidBlockNumber;
616 1646 : *last_curr_page = InvalidBlockNumber;
617 :
618 : /*
619 : * Reset so->currPos, and initialize moreLeft/moreRight such that the next
620 : * call to _bt_readnextpage treats this backend similarly to a serial
621 : * backend that steps from *last_curr_page to *next_scan_page (unless this
622 : * backend's so->currPos is initialized by _bt_readfirstpage before then).
623 : */
624 1646 : BTScanPosInvalidate(so->currPos);
625 1646 : so->currPos.moreLeft = so->currPos.moreRight = true;
626 :
627 1646 : if (first)
628 : {
629 : /*
630 : * Initialize array related state when called from _bt_first, assuming
631 : * that this will be the first primitive index scan for the scan
632 : */
633 434 : so->needPrimScan = false;
634 434 : so->scanBehind = false;
635 434 : so->oppositeDirCheck = false;
636 : }
637 : else
638 : {
639 : /*
640 : * Don't attempt to seize the scan when it requires another primitive
641 : * index scan, since caller's backend cannot start it right now
642 : */
643 1212 : if (so->needPrimScan)
644 0 : return false;
645 : }
646 :
647 1646 : btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
648 : parallel_scan->ps_offset);
649 :
650 : while (1)
651 : {
652 1646 : SpinLockAcquire(&btscan->btps_mutex);
653 :
654 1646 : if (btscan->btps_pageStatus == BTPARALLEL_DONE)
655 : {
656 : /* We're done with this parallel index scan */
657 312 : status = false;
658 : }
659 1334 : else if (btscan->btps_pageStatus == BTPARALLEL_IDLE &&
660 1216 : btscan->btps_nextScanPage == P_NONE)
661 : {
662 : /* End this parallel index scan */
663 4 : status = false;
664 4 : endscan = true;
665 : }
666 1330 : else if (btscan->btps_pageStatus == BTPARALLEL_NEED_PRIMSCAN)
667 : {
668 : Assert(so->numArrayKeys);
669 :
670 36 : if (first)
671 : {
672 : /* Can start scheduled primitive scan right away, so do so */
673 36 : btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
674 72 : for (int i = 0; i < so->numArrayKeys; i++)
675 : {
676 36 : BTArrayKeyInfo *array = &so->arrayKeys[i];
677 36 : ScanKey skey = &so->keyData[array->scan_key];
678 :
679 36 : array->cur_elem = btscan->btps_arrElems[i];
680 36 : skey->sk_argument = array->elem_values[array->cur_elem];
681 : }
682 36 : exit_loop = true;
683 : }
684 : else
685 : {
686 : /*
687 : * Don't attempt to seize the scan when it requires another
688 : * primitive index scan, since caller's backend cannot start
689 : * it right now
690 : */
691 0 : status = false;
692 : }
693 :
694 : /*
695 : * Either way, update backend local state to indicate that a
696 : * pending primitive scan is required
697 : */
698 36 : so->needPrimScan = true;
699 36 : so->scanBehind = false;
700 36 : so->oppositeDirCheck = false;
701 : }
702 1294 : else if (btscan->btps_pageStatus != BTPARALLEL_ADVANCING)
703 : {
704 : /*
705 : * We have successfully seized control of the scan for the purpose
706 : * of advancing it to a new page!
707 : */
708 1294 : btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
709 : Assert(btscan->btps_nextScanPage != P_NONE);
710 1294 : *next_scan_page = btscan->btps_nextScanPage;
711 1294 : *last_curr_page = btscan->btps_lastCurrPage;
712 1294 : exit_loop = true;
713 : }
714 1646 : SpinLockRelease(&btscan->btps_mutex);
715 1646 : if (exit_loop || !status)
716 : break;
717 0 : ConditionVariableSleep(&btscan->btps_cv, WAIT_EVENT_BTREE_PAGE);
718 : }
719 1646 : ConditionVariableCancelSleep();
720 :
721 : /* When the scan has reached the rightmost (or leftmost) page, end it */
722 1646 : if (endscan)
723 4 : _bt_parallel_done(scan);
724 :
725 1646 : return status;
726 : }
727 :
728 : /*
729 : * _bt_parallel_release() -- Complete the process of advancing the scan to a
730 : * new page. We now have the new value btps_nextScanPage; another backend
731 : * can now begin advancing the scan.
732 : *
733 : * Callers whose scan uses array keys must save their curr_page argument so
734 : * that it can be passed to _bt_parallel_primscan_schedule, should caller
735 : * determine that another primitive index scan is required.
736 : *
737 : * If caller's next_scan_page is P_NONE, the scan has reached the index's
738 : * rightmost/leftmost page. This is treated as reaching the end of the scan
739 : * within _bt_parallel_seize.
740 : *
741 : * Note: unlike the serial case, parallel scans don't need to remember both
742 : * sibling links. next_scan_page is whichever link is next given the scan's
743 : * direction. That's all we'll ever need, since the direction of a parallel
744 : * scan can never change.
745 : */
746 : void
747 1330 : _bt_parallel_release(IndexScanDesc scan, BlockNumber next_scan_page,
748 : BlockNumber curr_page)
749 : {
750 1330 : ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
751 : BTParallelScanDesc btscan;
752 :
753 : Assert(BlockNumberIsValid(next_scan_page));
754 :
755 1330 : btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
756 : parallel_scan->ps_offset);
757 :
758 1330 : SpinLockAcquire(&btscan->btps_mutex);
759 1330 : btscan->btps_nextScanPage = next_scan_page;
760 1330 : btscan->btps_lastCurrPage = curr_page;
761 1330 : btscan->btps_pageStatus = BTPARALLEL_IDLE;
762 1330 : SpinLockRelease(&btscan->btps_mutex);
763 1330 : ConditionVariableSignal(&btscan->btps_cv);
764 1330 : }
765 :
766 : /*
767 : * _bt_parallel_done() -- Mark the parallel scan as complete.
768 : *
769 : * When there are no pages left to scan, this function should be called to
770 : * notify other workers. Otherwise, they might wait forever for the scan to
771 : * advance to the next page.
772 : */
773 : void
774 5970860 : _bt_parallel_done(IndexScanDesc scan)
775 : {
776 5970860 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
777 5970860 : ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
778 : BTParallelScanDesc btscan;
779 5970860 : bool status_changed = false;
780 :
781 : Assert(!BTScanPosIsValid(so->currPos));
782 :
783 : /* Do nothing, for non-parallel scans */
784 5970860 : if (parallel_scan == NULL)
785 5970708 : return;
786 :
787 : /*
788 : * Should not mark parallel scan done when there's still a pending
789 : * primitive index scan
790 : */
791 152 : if (so->needPrimScan)
792 36 : return;
793 :
794 116 : btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
795 : parallel_scan->ps_offset);
796 :
797 : /*
798 : * Mark the parallel scan as done, unless some other process did so
799 : * already
800 : */
801 116 : SpinLockAcquire(&btscan->btps_mutex);
802 : Assert(btscan->btps_pageStatus != BTPARALLEL_NEED_PRIMSCAN);
803 116 : if (btscan->btps_pageStatus != BTPARALLEL_DONE)
804 : {
805 82 : btscan->btps_pageStatus = BTPARALLEL_DONE;
806 82 : status_changed = true;
807 : }
808 116 : SpinLockRelease(&btscan->btps_mutex);
809 :
810 : /* wake up all the workers associated with this parallel scan */
811 116 : if (status_changed)
812 82 : ConditionVariableBroadcast(&btscan->btps_cv);
813 : }
814 :
815 : /*
816 : * _bt_parallel_primscan_schedule() -- Schedule another primitive index scan.
817 : *
818 : * Caller passes the curr_page most recently passed to _bt_parallel_release
819 : * by its backend. Caller successfully schedules the next primitive index scan
820 : * if the shared parallel state hasn't been seized since caller's backend last
821 : * advanced the scan.
822 : */
823 : void
824 36 : _bt_parallel_primscan_schedule(IndexScanDesc scan, BlockNumber curr_page)
825 : {
826 36 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
827 36 : ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
828 : BTParallelScanDesc btscan;
829 :
830 : Assert(so->numArrayKeys);
831 :
832 36 : btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
833 : parallel_scan->ps_offset);
834 :
835 36 : SpinLockAcquire(&btscan->btps_mutex);
836 36 : if (btscan->btps_lastCurrPage == curr_page &&
837 36 : btscan->btps_pageStatus == BTPARALLEL_IDLE)
838 : {
839 36 : btscan->btps_nextScanPage = InvalidBlockNumber;
840 36 : btscan->btps_lastCurrPage = InvalidBlockNumber;
841 36 : btscan->btps_pageStatus = BTPARALLEL_NEED_PRIMSCAN;
842 :
843 : /* Serialize scan's current array keys */
844 72 : for (int i = 0; i < so->numArrayKeys; i++)
845 : {
846 36 : BTArrayKeyInfo *array = &so->arrayKeys[i];
847 :
848 36 : btscan->btps_arrElems[i] = array->cur_elem;
849 : }
850 : }
851 36 : SpinLockRelease(&btscan->btps_mutex);
852 36 : }
853 :
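Putting the last three routines together: a backend that has exhausted its current primitive scan either schedules the next one (passing the curr_page it last released, so the request is silently ignored if some other backend has since advanced the scan) or marks the whole parallel scan done. A deliberately simplified, hedged sketch; advance_array_keys_for_next_primscan() is a hypothetical stand-in for the array-key machinery in nbtutils.c.

/*
 * Hedged, simplified sketch of scheduling the next primitive index scan.
 */
static void
schedule_next_primscan(IndexScanDesc scan, BlockNumber curr_page)
{
    if (advance_array_keys_for_next_primscan(scan))     /* hypothetical */
        _bt_parallel_primscan_schedule(scan, curr_page);
    else
        _bt_parallel_done(scan);    /* no further array keys: scan is over */
}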
854 : /*
855 : * Bulk deletion of all index entries pointing to a set of heap tuples.
856 : * The set of target tuples is specified via a callback routine that tells
857 : * whether any given heap tuple (identified by ItemPointer) is being deleted.
858 : *
859 : * Result: a palloc'd struct containing statistical info for VACUUM displays.
860 : */
861 : IndexBulkDeleteResult *
862 2434 : btbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
863 : IndexBulkDeleteCallback callback, void *callback_state)
864 : {
865 2434 : Relation rel = info->index;
866 : BTCycleId cycleid;
867 :
868 : /* allocate stats if first time through, else re-use existing struct */
869 2434 : if (stats == NULL)
870 2434 : stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
871 :
872 : /* Establish the vacuum cycle ID to use for this scan */
873 : /* The ENSURE stuff ensures we clean up shared memory on failure */
874 2434 : PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
875 : {
876 2434 : cycleid = _bt_start_vacuum(rel);
877 :
878 2434 : btvacuumscan(info, stats, callback, callback_state, cycleid);
879 : }
880 2434 : PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
881 2434 : _bt_end_vacuum(rel);
882 :
883 2434 : return stats;
884 : }
885 :
886 : /*
887 : * Post-VACUUM cleanup.
888 : *
889 : * Result: a palloc'd struct containing statistical info for VACUUM displays.
890 : */
891 : IndexBulkDeleteResult *
892 137410 : btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
893 : {
894 : BlockNumber num_delpages;
895 :
896 : /* No-op in ANALYZE ONLY mode */
897 137410 : if (info->analyze_only)
898 16104 : return stats;
899 :
900 : /*
901 : * If btbulkdelete was called, we need not do anything (we just maintain
902 : * the information used within _bt_vacuum_needs_cleanup() by calling
903 : * _bt_set_cleanup_info() below).
904 : *
905 : * If btbulkdelete was _not_ called, then we have a choice to make: we
906 : * must decide whether or not a btvacuumscan() call is needed now (i.e.
907 : * whether the ongoing VACUUM operation can entirely avoid a physical scan
908 : * of the index). A call to _bt_vacuum_needs_cleanup() decides it for us
909 : * now.
910 : */
911 121306 : if (stats == NULL)
912 : {
913 : /* Check if VACUUM operation can entirely avoid btvacuumscan() call */
914 119454 : if (!_bt_vacuum_needs_cleanup(info->index))
915 119444 : return NULL;
916 :
917 : /*
918 : * Since we aren't going to actually delete any leaf items, there's no
919 : * need to go through all the vacuum-cycle-ID pushups here.
920 : *
921 : * Posting list tuples are a source of inaccuracy for cleanup-only
922 : * scans. btvacuumscan() will assume that the number of index tuples
923 : * from each page can be used as num_index_tuples, even though
924 : * num_index_tuples is supposed to represent the number of TIDs in the
925 : * index. This naive approach can underestimate the number of tuples
926 : * in the index significantly.
927 : *
928 : * We handle the problem by making num_index_tuples an estimate in
929 : * cleanup-only case.
930 : */
931 10 : stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
932 10 : btvacuumscan(info, stats, NULL, NULL, 0);
933 10 : stats->estimated_count = true;
934 : }
935 :
936 : /*
937 : * Maintain num_delpages value in metapage for _bt_vacuum_needs_cleanup().
938 : *
939 : * num_delpages is the number of deleted pages now in the index that were
940 : * not safe to place in the FSM to be recycled just yet. num_delpages is
941 : * greater than 0 only when _bt_pagedel() actually deleted pages during
942 : * our call to btvacuumscan(). Even then, _bt_pendingfsm_finalize() must
943 : * have failed to place any newly deleted pages in the FSM just moments
944 : * ago. (Actually, there are edge cases where recycling of the current
945 : * VACUUM's newly deleted pages does not even become safe by the time the
946 : * next VACUUM comes around. See nbtree/README.)
947 : */
948 : Assert(stats->pages_deleted >= stats->pages_free);
949 1862 : num_delpages = stats->pages_deleted - stats->pages_free;
950 1862 : _bt_set_cleanup_info(info->index, num_delpages);
951 :
952 : /*
953 : * It's quite possible for us to be fooled by concurrent page splits into
954 : * double-counting some index tuples, so disbelieve any total that exceeds
955 : * the underlying heap's count ... if we know that accurately. Otherwise
956 : * this might just make matters worse.
957 : */
958 1862 : if (!info->estimated_count)
959 : {
960 1818 : if (stats->num_index_tuples > info->num_heap_tuples)
961 6 : stats->num_index_tuples = info->num_heap_tuples;
962 : }
963 :
964 1862 : return stats;
965 : }
966 :
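Both VACUUM entry points above are reached from table-vacuum code via the generic wrappers index_bulk_delete() and index_vacuum_cleanup() (declared in access/genam.h). A hedged sketch of that call sequence; dead_items_contains() is a hypothetical stand-in for the real dead-TID store lookup.

/*
 * Hedged sketch of how VACUUM drives btbulkdelete()/btvacuumcleanup()
 * through the generic index AM wrappers.
 */
static bool
dead_tid_callback(ItemPointer itemptr, void *state)
{
    return dead_items_contains(state, itemptr);     /* hypothetical */
}

static void
vacuum_one_btree_index(IndexVacuumInfo *ivinfo, void *dead_items)
{
    IndexBulkDeleteResult *istat = NULL;

    /* phase 1: delete index entries whose heap TIDs are known dead */
    istat = index_bulk_delete(ivinfo, istat, dead_tid_callback, dead_items);

    /* phase 2: cleanup pass; may decide to do nothing at all */
    istat = index_vacuum_cleanup(ivinfo, istat);

    if (istat)
        pfree(istat);
}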
967 : /*
968 : * btvacuumscan --- scan the index for VACUUMing purposes
969 : *
970 : * This combines the functions of looking for leaf tuples that are deletable
971 : * according to the vacuum callback, looking for empty pages that can be
972 : * deleted, and looking for old deleted pages that can be recycled. Both
973 : * btbulkdelete and btvacuumcleanup invoke this (the latter only if no
974 : * btbulkdelete call occurred and _bt_vacuum_needs_cleanup returned true).
975 : *
976 : * The caller is responsible for initially allocating/zeroing a stats struct
977 : * and for obtaining a vacuum cycle ID if necessary.
978 : */
979 : static void
980 2444 : btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
981 : IndexBulkDeleteCallback callback, void *callback_state,
982 : BTCycleId cycleid)
983 : {
984 2444 : Relation rel = info->index;
985 : BTVacState vstate;
986 : BlockNumber num_pages;
987 : BlockNumber scanblkno;
988 : bool needLock;
989 :
990 : /*
991 : * Reset fields that track information about the entire index now. This
992 : * avoids double-counting in the case where a single VACUUM command
993 : * requires multiple scans of the index.
994 : *
995 : * Avoid resetting the tuples_removed and pages_newly_deleted fields here,
996 : * since they track information about the VACUUM command, and so must last
997 : * across each call to btvacuumscan().
998 : *
999 : * (Note that pages_free is treated as state about the whole index, not
1000 : * the current VACUUM. This is appropriate because RecordFreeIndexPage()
1001 : * calls are idempotent, and get repeated for the same deleted pages in
1002 : * some scenarios. The point for us is to track the number of recyclable
1003 : * pages in the index at the end of the VACUUM command.)
1004 : */
1005 2444 : stats->num_pages = 0;
1006 2444 : stats->num_index_tuples = 0;
1007 2444 : stats->pages_deleted = 0;
1008 2444 : stats->pages_free = 0;
1009 :
1010 : /* Set up info to pass down to btvacuumpage */
1011 2444 : vstate.info = info;
1012 2444 : vstate.stats = stats;
1013 2444 : vstate.callback = callback;
1014 2444 : vstate.callback_state = callback_state;
1015 2444 : vstate.cycleid = cycleid;
1016 :
1017 : /* Create a temporary memory context to run _bt_pagedel in */
1018 2444 : vstate.pagedelcontext = AllocSetContextCreate(CurrentMemoryContext,
1019 : "_bt_pagedel",
1020 : ALLOCSET_DEFAULT_SIZES);
1021 :
1022 : /* Initialize vstate fields used by _bt_pendingfsm_finalize */
1023 2444 : vstate.bufsize = 0;
1024 2444 : vstate.maxbufsize = 0;
1025 2444 : vstate.pendingpages = NULL;
1026 2444 : vstate.npendingpages = 0;
1027 : /* Consider applying _bt_pendingfsm_finalize optimization */
1028 2444 : _bt_pendingfsm_init(rel, &vstate, (callback == NULL));
1029 :
1030 : /*
1031 : * The outer loop iterates over all index pages except the metapage, in
1032 : * physical order (we hope the kernel will cooperate in providing
1033 : * read-ahead for speed). It is critical that we visit all leaf pages,
1034 : * including ones added after we start the scan, else we might fail to
1035 : * delete some deletable tuples. Hence, we must repeatedly check the
1036 : * relation length. We must acquire the relation-extension lock while
1037 : * doing so to avoid a race condition: if someone else is extending the
1038 : * relation, there is a window where bufmgr/smgr have created a new
1039 : * all-zero page but it hasn't yet been write-locked by _bt_getbuf(). If
1040 : * we manage to scan such a page here, we'll improperly assume it can be
1041 : * recycled. Taking the lock synchronizes things enough to prevent a
1042 : * problem: either num_pages won't include the new page, or _bt_getbuf
1043 : * already has write lock on the buffer and it will be fully initialized
1044 : * before we can examine it. Also, we need not worry if a page is added
1045 : * immediately after we look; the page splitting code already has
1046 : * write-lock on the left page before it adds a right page, so we must
1047 : * already have processed any tuples due to be moved into such a page.
1048 : *
1049 : * XXX: Now that new pages are locked with RBM_ZERO_AND_LOCK, I don't
1050 : * think the use of the extension lock is still required.
1051 : *
1052 : * We can skip locking for new or temp relations, however, since no one
1053 : * else could be accessing them.
1054 : */
1055 2444 : needLock = !RELATION_IS_LOCAL(rel);
1056 :
1057 2444 : scanblkno = BTREE_METAPAGE + 1;
1058 : for (;;)
1059 : {
1060 : /* Get the current relation length */
1061 4620 : if (needLock)
1062 4616 : LockRelationForExtension(rel, ExclusiveLock);
1063 4620 : num_pages = RelationGetNumberOfBlocks(rel);
1064 4620 : if (needLock)
1065 4616 : UnlockRelationForExtension(rel, ExclusiveLock);
1066 :
1067 4620 : if (info->report_progress)
1068 896 : pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_TOTAL,
1069 : num_pages);
1070 :
1071 : /* Quit if we've scanned the whole relation */
1072 4620 : if (scanblkno >= num_pages)
1073 2444 : break;
1074 : /* Iterate over pages, then loop back to recheck length */
1075 22208 : for (; scanblkno < num_pages; scanblkno++)
1076 : {
1077 20032 : btvacuumpage(&vstate, scanblkno);
1078 20032 : if (info->report_progress)
1079 404 : pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE,
1080 : scanblkno);
1081 : }
1082 : }
1083 :
1084 : /* Set statistics num_pages field to final size of index */
1085 2444 : stats->num_pages = num_pages;
1086 :
1087 2444 : MemoryContextDelete(vstate.pagedelcontext);
1088 :
1089 : /*
1090 : * If there were any calls to _bt_pagedel() during scan of the index then
1091 : * see if any of the resulting pages can be placed in the FSM now. When
1092 : * it's not safe we'll have to leave it up to a future VACUUM operation.
1093 : *
1094 : * Finally, if we placed any pages in the FSM (either just now or during
1095 : * the scan), forcibly update the upper-level FSM pages to ensure that
1096 : * searchers can find them.
1097 : */
1098 2444 : _bt_pendingfsm_finalize(rel, &vstate);
1099 2444 : if (stats->pages_free > 0)
1100 20 : IndexFreeSpaceMapVacuum(rel);
1101 2444 : }
1102 :
1103 : /*
1104 : * btvacuumpage --- VACUUM one page
1105 : *
1106 : * This processes a single page for btvacuumscan(). In some cases we must
1107 : * backtrack to re-examine and VACUUM pages that were the scanblkno during
1108 : * a previous call here. This is how we handle page splits (that happened
1109 : * after our cycleid was acquired) whose right half page happened to reuse
1110 : * a block that we might have processed at some point before it was
1111 : * recycled (i.e. before the page split).
1112 : */
1113 : static void
1114 20032 : btvacuumpage(BTVacState *vstate, BlockNumber scanblkno)
1115 : {
1116 20032 : IndexVacuumInfo *info = vstate->info;
1117 20032 : IndexBulkDeleteResult *stats = vstate->stats;
1118 20032 : IndexBulkDeleteCallback callback = vstate->callback;
1119 20032 : void *callback_state = vstate->callback_state;
1120 20032 : Relation rel = info->index;
1121 20032 : Relation heaprel = info->heaprel;
1122 : bool attempt_pagedel;
1123 : BlockNumber blkno,
1124 : backtrack_to;
1125 : Buffer buf;
1126 : Page page;
1127 : BTPageOpaque opaque;
1128 :
1129 20032 : blkno = scanblkno;
1130 :
1131 20032 : backtrack:
1132 :
1133 20032 : attempt_pagedel = false;
1134 20032 : backtrack_to = P_NONE;
1135 :
1136 : /* call vacuum_delay_point while not holding any buffer lock */
1137 20032 : vacuum_delay_point();
1138 :
1139 : /*
1140 : * We can't use _bt_getbuf() here because it always applies
1141 : * _bt_checkpage(), which will barf on an all-zero page. We want to
1142 : * recycle all-zero pages, not fail. Also, we want to use a nondefault
1143 : * buffer access strategy.
1144 : */
1145 20032 : buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
1146 : info->strategy);
1147 20032 : _bt_lockbuf(rel, buf, BT_READ);
1148 20032 : page = BufferGetPage(buf);
1149 20032 : opaque = NULL;
1150 20032 : if (!PageIsNew(page))
1151 : {
1152 20032 : _bt_checkpage(rel, buf);
1153 20032 : opaque = BTPageGetOpaque(page);
1154 : }
1155 :
1156 : Assert(blkno <= scanblkno);
1157 20032 : if (blkno != scanblkno)
1158 : {
1159 : /*
1160 : * We're backtracking.
1161 : *
1162 : * We followed a right link to a sibling leaf page (a page that
1163 : * happens to be from a block located before scanblkno). The only
1164 : * case we want to do anything with is a live leaf page having the
1165 : * current vacuum cycle ID.
1166 : *
1167 : * The page had better be in a state that's consistent with what we
1168 : * expect. Check for conditions that imply corruption in passing. It
1169 : * can't be half-dead because only an interrupted VACUUM process can
1170 : * leave pages in that state, so we'd definitely have dealt with it
1171 : * back when the page was the scanblkno page (half-dead pages are
1172 : * always marked fully deleted by _bt_pagedel(), barring corruption).
1173 : */
1174 0 : if (!opaque || !P_ISLEAF(opaque) || P_ISHALFDEAD(opaque))
1175 : {
1176 : Assert(false);
1177 0 : ereport(LOG,
1178 : (errcode(ERRCODE_INDEX_CORRUPTED),
1179 : errmsg_internal("right sibling %u of scanblkno %u unexpectedly in an inconsistent state in index \"%s\"",
1180 : blkno, scanblkno, RelationGetRelationName(rel))));
1181 0 : _bt_relbuf(rel, buf);
1182 0 : return;
1183 : }
1184 :
1185 : /*
1186 : * We may have already processed the page in an earlier call, when the
1187 : * page was scanblkno. This happens when the leaf page split occurred
1188 : * after the scan began, but before the right sibling page became the
1189 : * scanblkno.
1190 : *
1191 : * Page may also have been deleted by current btvacuumpage() call,
1192 : * since _bt_pagedel() sometimes deletes the right sibling page of
1193 : * scanblkno in passing (it does so after we decided where to
1194 : * backtrack to). We don't need to process this page as a deleted
1195 : * page a second time now (in fact, it would be wrong to count it as a
1196 : * deleted page in the bulk delete statistics a second time).
1197 : */
1198 0 : if (opaque->btpo_cycleid != vstate->cycleid || P_ISDELETED(opaque))
1199 : {
1200 : /* Done with current scanblkno (and all lower split pages) */
1201 0 : _bt_relbuf(rel, buf);
1202 0 : return;
1203 : }
1204 : }
1205 :
1206 20032 : if (!opaque || BTPageIsRecyclable(page, heaprel))
1207 : {
1208 : /* Okay to recycle this page (which could be leaf or internal) */
1209 120 : RecordFreeIndexPage(rel, blkno);
1210 120 : stats->pages_deleted++;
1211 120 : stats->pages_free++;
1212 : }
1213 19912 : else if (P_ISDELETED(opaque))
1214 : {
1215 : /*
1216 : * Already deleted page (which could be leaf or internal). Can't
1217 : * recycle yet.
1218 : */
1219 66 : stats->pages_deleted++;
1220 : }
1221 19846 : else if (P_ISHALFDEAD(opaque))
1222 : {
1223 : /* Half-dead leaf page (from interrupted VACUUM) -- finish deleting */
1224 0 : attempt_pagedel = true;
1225 :
1226 : /*
1227 : * _bt_pagedel() will increment both pages_newly_deleted and
1228 : * pages_deleted stats in all cases (barring corruption)
1229 : */
1230 : }
1231 19846 : else if (P_ISLEAF(opaque))
1232 : {
1233 : OffsetNumber deletable[MaxIndexTuplesPerPage];
1234 : int ndeletable;
1235 : BTVacuumPosting updatable[MaxIndexTuplesPerPage];
1236 : int nupdatable;
1237 : OffsetNumber offnum,
1238 : minoff,
1239 : maxoff;
1240 : int nhtidsdead,
1241 : nhtidslive;
1242 :
1243 : /*
1244 : * Trade in the initial read lock for a full cleanup lock on this
1245 : * page. We must get such a lock on every leaf page over the course
1246 : * of the vacuum scan, whether or not it actually contains any
1247 : * deletable tuples --- see nbtree/README.
1248 : */
1249 18524 : _bt_upgradelockbufcleanup(rel, buf);
1250 :
1251 : /*
1252 : * Check whether we need to backtrack to earlier pages. What we are
1253 : * concerned about is a page split that happened since we started the
1254 : * vacuum scan. If the split moved tuples on the right half of the
1255 : * split (i.e. the tuples that sort high) to a block that we already
1256 : * passed over, then we might have missed the tuples. We need to
1257 : * backtrack now. (Must do this before possibly clearing btpo_cycleid
1258 : * or deleting scanblkno page below!)
1259 : */
1260 18524 : if (vstate->cycleid != 0 &&
1261 18364 : opaque->btpo_cycleid == vstate->cycleid &&
1262 0 : !(opaque->btpo_flags & BTP_SPLIT_END) &&
1263 0 : !P_RIGHTMOST(opaque) &&
1264 0 : opaque->btpo_next < scanblkno)
1265 0 : backtrack_to = opaque->btpo_next;
1266 :
1267 18524 : ndeletable = 0;
1268 18524 : nupdatable = 0;
1269 18524 : minoff = P_FIRSTDATAKEY(opaque);
1270 18524 : maxoff = PageGetMaxOffsetNumber(page);
1271 18524 : nhtidsdead = 0;
1272 18524 : nhtidslive = 0;
1273 18524 : if (callback)
1274 : {
1275 : /* btbulkdelete callback tells us what to delete (or update) */
1276 3472012 : for (offnum = minoff;
1277 : offnum <= maxoff;
1278 3453648 : offnum = OffsetNumberNext(offnum))
1279 : {
1280 : IndexTuple itup;
1281 :
1282 3453648 : itup = (IndexTuple) PageGetItem(page,
1283 : PageGetItemId(page, offnum));
1284 :
1285 : Assert(!BTreeTupleIsPivot(itup));
1286 3453648 : if (!BTreeTupleIsPosting(itup))
1287 : {
1288 : /* Regular tuple, standard table TID representation */
1289 3343618 : if (callback(&itup->t_tid, callback_state))
1290 : {
1291 1477856 : deletable[ndeletable++] = offnum;
1292 1477856 : nhtidsdead++;
1293 : }
1294 : else
1295 1865762 : nhtidslive++;
1296 : }
1297 : else
1298 : {
1299 : BTVacuumPosting vacposting;
1300 : int nremaining;
1301 :
1302 : /* Posting list tuple */
1303 110030 : vacposting = btreevacuumposting(vstate, itup, offnum,
1304 : &nremaining);
1305 110030 : if (vacposting == NULL)
1306 : {
1307 : /*
1308 : * All table TIDs from the posting tuple remain, so no
1309 : * delete or update required
1310 : */
1311 : Assert(nremaining == BTreeTupleGetNPosting(itup));
1312 : }
1313 65740 : else if (nremaining > 0)
1314 : {
1315 :
1316 : /*
1317 : * Store metadata about posting list tuple in
1318 : * updatable array for entire page. Existing tuple
1319 : * will be updated during the later call to
1320 : * _bt_delitems_vacuum().
1321 : */
1322 : Assert(nremaining < BTreeTupleGetNPosting(itup));
1323 27756 : updatable[nupdatable++] = vacposting;
1324 27756 : nhtidsdead += BTreeTupleGetNPosting(itup) - nremaining;
1325 : }
1326 : else
1327 : {
1328 : /*
1329 : * All table TIDs from the posting list must be
1330 : * deleted. We'll delete the index tuple completely
1331 : * (no update required).
1332 : */
1333 : Assert(nremaining == 0);
1334 37984 : deletable[ndeletable++] = offnum;
1335 37984 : nhtidsdead += BTreeTupleGetNPosting(itup);
1336 37984 : pfree(vacposting);
1337 : }
1338 :
1339 110030 : nhtidslive += nremaining;
1340 : }
1341 : }
1342 : }
1343 :
1344 : /*
1345 : * Apply any needed deletes or updates. We issue just one
1346 : * _bt_delitems_vacuum() call per page, so as to minimize WAL traffic.
1347 : */
1348 18524 : if (ndeletable > 0 || nupdatable > 0)
1349 : {
1350 : Assert(nhtidsdead >= ndeletable + nupdatable);
1351 12756 : _bt_delitems_vacuum(rel, buf, deletable, ndeletable, updatable,
1352 : nupdatable);
1353 :
1354 12756 : stats->tuples_removed += nhtidsdead;
1355 : /* must recompute maxoff */
1356 12756 : maxoff = PageGetMaxOffsetNumber(page);
1357 :
1358 : /* can't leak memory here */
1359 40512 : for (int i = 0; i < nupdatable; i++)
1360 27756 : pfree(updatable[i]);
1361 : }
1362 : else
1363 : {
1364 : /*
1365 : * If the leaf page has been split during this vacuum cycle, it
1366 : * seems worth expending a write to clear btpo_cycleid even if we
1367 : * don't have any deletions to do. (If we do, _bt_delitems_vacuum
1368 : * takes care of this.) This ensures we won't process the page
1369 : * again.
1370 : *
1371 : * We treat this like a hint-bit update because there's no need to
1372 : * WAL-log it.
1373 : */
1374 : Assert(nhtidsdead == 0);
1375 5768 : if (vstate->cycleid != 0 &&
1376 5608 : opaque->btpo_cycleid == vstate->cycleid)
1377 : {
1378 0 : opaque->btpo_cycleid = 0;
1379 0 : MarkBufferDirtyHint(buf, true);
1380 : }
1381 : }
1382 :
1383 : /*
1384 : * If the leaf page is now empty, try to delete it; else count the
1385 : * live tuples (live table TIDs in posting lists are counted as
1386 : * separate live tuples). We don't delete when backtracking, though,
1387 : * since that would require teaching _bt_pagedel() about backtracking
1388 : * (doesn't seem worth adding more complexity to deal with that).
1389 : *
1390 : * We don't count the number of live TIDs during cleanup-only calls to
1391 : * btvacuumscan (i.e. when callback is not set). We count the number
1392 : * of index tuples directly instead. This avoids the expense of
1393 : * directly examining all of the tuples on each page. VACUUM will
1394 : * treat num_index_tuples as an estimate in cleanup-only case, so it
1395 : * doesn't matter that this underestimates num_index_tuples
1396 : * significantly in some cases.
1397 : */
1398 18524 : if (minoff > maxoff)
1399 5566 : attempt_pagedel = (blkno == scanblkno);
1400 12958 : else if (callback)
1401 12806 : stats->num_index_tuples += nhtidslive;
1402 : else
1403 152 : stats->num_index_tuples += maxoff - minoff + 1;
1404 :
1405 : Assert(!attempt_pagedel || nhtidslive == 0);
1406 : }
1407 :
1408 20032 : if (attempt_pagedel)
1409 : {
1410 : MemoryContext oldcontext;
1411 :
1412 : /* Run pagedel in a temp context to avoid memory leakage */
1413 5566 : MemoryContextReset(vstate->pagedelcontext);
1414 5566 : oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext);
1415 :
1416 : /*
1417 : * _bt_pagedel maintains the bulk delete stats on our behalf;
1418 : * pages_newly_deleted and pages_deleted are likely to be incremented
1419 : * during call
1420 : */
1421 : Assert(blkno == scanblkno);
1422 5566 : _bt_pagedel(rel, buf, vstate);
1423 :
1424 5566 : MemoryContextSwitchTo(oldcontext);
1425 : /* pagedel released buffer, so we shouldn't */
1426 : }
1427 : else
1428 14466 : _bt_relbuf(rel, buf);
1429 :
1430 20032 : if (backtrack_to != P_NONE)
1431 : {
1432 0 : blkno = backtrack_to;
1433 0 : goto backtrack;
1434 : }
1435 : }
1436 :
1437 : /*
1438 : * btreevacuumposting --- determine TIDs still needed in posting list
1439 : *
1440 : * Returns metadata describing how to build replacement tuple without the TIDs
1441 : * that VACUUM needs to delete. Returned value is NULL in the common case
1442 : * where no changes are needed to caller's posting list tuple (we avoid
1443 : * allocating memory here as an optimization).
1444 : *
1445 : * The number of TIDs that should remain in the posting list tuple is set for
1446 : * caller in *nremaining.
1447 : */
1448 : static BTVacuumPosting
1449 110030 : btreevacuumposting(BTVacState *vstate, IndexTuple posting,
1450 : OffsetNumber updatedoffset, int *nremaining)
1451 : {
1452 110030 : int live = 0;
1453 110030 : int nitem = BTreeTupleGetNPosting(posting);
1454 110030 : ItemPointer items = BTreeTupleGetPosting(posting);
1455 110030 : BTVacuumPosting vacposting = NULL;
1456 :
1457 637482 : for (int i = 0; i < nitem; i++)
1458 : {
1459 527452 : if (!vstate->callback(items + i, vstate->callback_state))
1460 : {
1461 : /* Live table TID */
1462 285278 : live++;
1463 : }
1464 242174 : else if (vacposting == NULL)
1465 : {
1466 : /*
1467 : * First dead table TID encountered.
1468 : *
1469 : * It's now clear that we need to delete one or more dead table
1470 : * TIDs, so start maintaining metadata describing how to update
1471 : * existing posting list tuple.
1472 : */
1473 65740 : vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) +
1474 : nitem * sizeof(uint16));
1475 :
1476 65740 : vacposting->itup = posting;
1477 65740 : vacposting->updatedoffset = updatedoffset;
1478 65740 : vacposting->ndeletedtids = 0;
1479 65740 : vacposting->deletetids[vacposting->ndeletedtids++] = i;
1480 : }
1481 : else
1482 : {
1483 : /* Second or subsequent dead table TID */
1484 176434 : vacposting->deletetids[vacposting->ndeletedtids++] = i;
1485 : }
1486 : }
1487 :
1488 110030 : *nremaining = live;
1489 110030 : return vacposting;
1490 : }
1491 :
1492 : /*
1493 : * btcanreturn() -- Check whether btree indexes support index-only scans.
1494 : *
1495 : * btrees always do, so this is trivial.
1496 : */
1497 : bool
1498 959028 : btcanreturn(Relation index, int attno)
1499 : {
1500 959028 : return true;
1501 : }
1502 :
1503 : /*
1504 : * btgettreeheight() -- Compute tree height for use by btcostestimate().
1505 : */
1506 : int
1507 612650 : btgettreeheight(Relation rel)
1508 : {
1509 612650 : return _bt_getrootheight(rel);
1510 : }
|