Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * nbtree.c
4 : * Implementation of Lehman and Yao's btree management algorithm for
5 : * Postgres.
6 : *
7 : * NOTES
8 : * This file contains only the public interface routines.
9 : *
10 : *
11 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
12 : * Portions Copyright (c) 1994, Regents of the University of California
13 : *
14 : * IDENTIFICATION
15 : * src/backend/access/nbtree/nbtree.c
16 : *
17 : *-------------------------------------------------------------------------
18 : */
19 : #include "postgres.h"
20 :
21 : #include "access/nbtree.h"
22 : #include "access/relscan.h"
23 : #include "access/stratnum.h"
24 : #include "commands/progress.h"
25 : #include "commands/vacuum.h"
26 : #include "nodes/execnodes.h"
27 : #include "pgstat.h"
28 : #include "storage/bulk_write.h"
29 : #include "storage/condition_variable.h"
30 : #include "storage/indexfsm.h"
31 : #include "storage/ipc.h"
32 : #include "storage/lmgr.h"
33 : #include "storage/read_stream.h"
34 : #include "utils/fmgrprotos.h"
35 : #include "utils/index_selfuncs.h"
36 : #include "utils/memutils.h"
37 :
38 :
39 : /*
40 : * BTPARALLEL_NOT_INITIALIZED indicates that the scan has not started.
41 : *
42 : * BTPARALLEL_NEED_PRIMSCAN indicates that some process must now seize the
43 : * scan to advance it via another call to _bt_first.
44 : *
45 : * BTPARALLEL_ADVANCING indicates that some process is advancing the scan to
46 : * a new page; others must wait.
47 : *
48 : * BTPARALLEL_IDLE indicates that no backend is currently advancing the scan
49 : * to a new page; some process can start doing that.
50 : *
51 : * BTPARALLEL_DONE indicates that the scan is complete (including error exit).
52 : */
53 : typedef enum
54 : {
55 : BTPARALLEL_NOT_INITIALIZED,
56 : BTPARALLEL_NEED_PRIMSCAN,
57 : BTPARALLEL_ADVANCING,
58 : BTPARALLEL_IDLE,
59 : BTPARALLEL_DONE,
60 : } BTPS_State;
61 :
62 : /*
63 : * BTParallelScanDescData contains btree-specific shared information required
64 : * for a parallel scan.
65 : */
66 : typedef struct BTParallelScanDescData
67 : {
68 : BlockNumber btps_nextScanPage; /* next page to be scanned */
69 : BlockNumber btps_lastCurrPage; /* page whose sibling link was copied into
70 : * btps_nextScanPage */
71 : BTPS_State btps_pageStatus; /* indicates whether the next page is
72 : * available for scanning. See above
73 : * for the possible parallel scan states. */
74 : LWLock btps_lock; /* protects shared parallel state */
75 : ConditionVariable btps_cv; /* used to synchronize parallel scan */
76 :
77 : /*
78 : * btps_arrElems is used when scans need to schedule another primitive
79 : * index scan. Holds BTArrayKeyInfo.cur_elem offsets for scan keys.
80 : */
81 : int btps_arrElems[FLEXIBLE_ARRAY_MEMBER];
82 : } BTParallelScanDescData;
83 :
84 : typedef struct BTParallelScanDescData *BTParallelScanDesc;
85 :
86 :
87 : static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
88 : IndexBulkDeleteCallback callback, void *callback_state,
89 : BTCycleId cycleid);
90 : static BlockNumber btvacuumpage(BTVacState *vstate, Buffer buf);
91 : static BTVacuumPosting btreevacuumposting(BTVacState *vstate,
92 : IndexTuple posting,
93 : OffsetNumber updatedoffset,
94 : int *nremaining);
95 :
96 :
97 : /*
98 : * Btree handler function: return IndexAmRoutine with access method parameters
99 : * and callbacks.
100 : */
101 : Datum
102 3604844 : bthandler(PG_FUNCTION_ARGS)
103 : {
104 3604844 : IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);
105 :
106 3604844 : amroutine->amstrategies = BTMaxStrategyNumber;
107 3604844 : amroutine->amsupport = BTNProcs;
108 3604844 : amroutine->amoptsprocnum = BTOPTIONS_PROC;
109 3604844 : amroutine->amcanorder = true;
110 3604844 : amroutine->amcanorderbyop = false;
111 3604844 : amroutine->amcanhash = false;
112 3604844 : amroutine->amconsistentequality = true;
113 3604844 : amroutine->amconsistentordering = true;
114 3604844 : amroutine->amcanbackward = true;
115 3604844 : amroutine->amcanunique = true;
116 3604844 : amroutine->amcanmulticol = true;
117 3604844 : amroutine->amoptionalkey = true;
118 3604844 : amroutine->amsearcharray = true;
119 3604844 : amroutine->amsearchnulls = true;
120 3604844 : amroutine->amstorage = false;
121 3604844 : amroutine->amclusterable = true;
122 3604844 : amroutine->ampredlocks = true;
123 3604844 : amroutine->amcanparallel = true;
124 3604844 : amroutine->amcanbuildparallel = true;
125 3604844 : amroutine->amcaninclude = true;
126 3604844 : amroutine->amusemaintenanceworkmem = false;
127 3604844 : amroutine->amsummarizing = false;
128 3604844 : amroutine->amparallelvacuumoptions =
129 : VACUUM_OPTION_PARALLEL_BULKDEL | VACUUM_OPTION_PARALLEL_COND_CLEANUP;
130 3604844 : amroutine->amkeytype = InvalidOid;
131 :
132 3604844 : amroutine->ambuild = btbuild;
133 3604844 : amroutine->ambuildempty = btbuildempty;
134 3604844 : amroutine->aminsert = btinsert;
135 3604844 : amroutine->aminsertcleanup = NULL;
136 3604844 : amroutine->ambulkdelete = btbulkdelete;
137 3604844 : amroutine->amvacuumcleanup = btvacuumcleanup;
138 3604844 : amroutine->amcanreturn = btcanreturn;
139 3604844 : amroutine->amcostestimate = btcostestimate;
140 3604844 : amroutine->amgettreeheight = btgettreeheight;
141 3604844 : amroutine->amoptions = btoptions;
142 3604844 : amroutine->amproperty = btproperty;
143 3604844 : amroutine->ambuildphasename = btbuildphasename;
144 3604844 : amroutine->amvalidate = btvalidate;
145 3604844 : amroutine->amadjustmembers = btadjustmembers;
146 3604844 : amroutine->ambeginscan = btbeginscan;
147 3604844 : amroutine->amrescan = btrescan;
148 3604844 : amroutine->amgettuple = btgettuple;
149 3604844 : amroutine->amgetbitmap = btgetbitmap;
150 3604844 : amroutine->amendscan = btendscan;
151 3604844 : amroutine->ammarkpos = btmarkpos;
152 3604844 : amroutine->amrestrpos = btrestrpos;
153 3604844 : amroutine->amestimateparallelscan = btestimateparallelscan;
154 3604844 : amroutine->aminitparallelscan = btinitparallelscan;
155 3604844 : amroutine->amparallelrescan = btparallelrescan;
156 3604844 : amroutine->amtranslatestrategy = bttranslatestrategy;
157 3604844 : amroutine->amtranslatecmptype = bttranslatecmptype;
158 :
159 3604844 : PG_RETURN_POINTER(amroutine);
160 : }
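/*
 * Informal illustration (not taken from this file): a handler of this shape
 * is what the SQL level binds an index access method to.  Built-in btree is
 * registered directly in pg_am, but an equivalent out-of-tree AM would be
 * wired up with DDL along the lines of
 *
 *     CREATE ACCESS METHOD myam TYPE INDEX HANDLER myam_handler;
 *
 * where "myam" and "myam_handler" are hypothetical names.
 */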
161 :
162 : /*
163 : * btbuildempty() -- build an empty btree index in the initialization fork
164 : */
165 : void
166 148 : btbuildempty(Relation index)
167 : {
168 148 : bool allequalimage = _bt_allequalimage(index, false);
169 : BulkWriteState *bulkstate;
170 : BulkWriteBuffer metabuf;
171 :
172 148 : bulkstate = smgr_bulk_start_rel(index, INIT_FORKNUM);
173 :
174 : /* Construct metapage. */
175 148 : metabuf = smgr_bulk_get_buf(bulkstate);
176 148 : _bt_initmetapage((Page) metabuf, P_NONE, 0, allequalimage);
177 148 : smgr_bulk_write(bulkstate, BTREE_METAPAGE, metabuf, true);
178 :
179 148 : smgr_bulk_finish(bulkstate);
180 148 : }
181 :
182 : /*
183 : * btinsert() -- insert an index tuple into a btree.
184 : *
185 : * Descend the tree recursively, find the appropriate location for our
186 : * new tuple, and put it there.
187 : */
188 : bool
189 6956528 : btinsert(Relation rel, Datum *values, bool *isnull,
190 : ItemPointer ht_ctid, Relation heapRel,
191 : IndexUniqueCheck checkUnique,
192 : bool indexUnchanged,
193 : IndexInfo *indexInfo)
194 : {
195 : bool result;
196 : IndexTuple itup;
197 :
198 : /* generate an index tuple */
199 6956528 : itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
200 6956528 : itup->t_tid = *ht_ctid;
201 :
202 6956528 : result = _bt_doinsert(rel, itup, checkUnique, indexUnchanged, heapRel);
203 :
204 6956018 : pfree(itup);
205 :
206 6956018 : return result;
207 : }
208 :
209 : /*
210 : * btgettuple() -- Get the next tuple in the scan.
211 : */
212 : bool
213 32553850 : btgettuple(IndexScanDesc scan, ScanDirection dir)
214 : {
215 32553850 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
216 : bool res;
217 :
218 : /* btree indexes are never lossy */
219 32553850 : scan->xs_recheck = false;
220 :
221 : /* Each loop iteration performs another primitive index scan */
222 : do
223 : {
224 : /*
225 : * If we've already initialized this scan, we can just advance it in
226 : * the appropriate direction. If we haven't done so yet, we call
227 : * _bt_first() to get the first item in the scan.
228 : */
229 32570694 : if (!BTScanPosIsValid(so->currPos))
230 14438808 : res = _bt_first(scan, dir);
231 : else
232 : {
233 : /*
234 : * Check to see if we should kill the previously-fetched tuple.
235 : */
236 18131886 : if (scan->kill_prior_tuple)
237 : {
238 : /*
239 : * Yes, remember it for later. (We'll deal with all such
240 : * tuples at once right before leaving the index page.) The
241 : * test for numKilled overrun is not just paranoia: if the
242 : * caller reverses direction in the indexscan then the same
243 : * item might get entered multiple times. It's not worth
244 : * trying to optimize that, so we don't detect it, but instead
245 : * just forget any excess entries.
246 : */
247 486316 : if (so->killedItems == NULL)
248 163824 : so->killedItems = (int *)
249 163824 : palloc(MaxTIDsPerBTreePage * sizeof(int));
250 486316 : if (so->numKilled < MaxTIDsPerBTreePage)
251 486316 : so->killedItems[so->numKilled++] = so->currPos.itemIndex;
252 : }
253 :
254 : /*
255 : * Now continue the scan.
256 : */
257 18131886 : res = _bt_next(scan, dir);
258 : }
259 :
260 : /* If we have a tuple, return it ... */
261 32570694 : if (res)
262 26052274 : break;
263 : /* ... otherwise see if we need another primitive index scan */
264 6518420 : } while (so->numArrayKeys && _bt_start_prim_scan(scan, dir));
265 :
266 32553850 : return res;
267 : }
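/*
 * Informal note: so->numArrayKeys is nonzero for scans with array keys,
 * e.g. a qual like "WHERE x = ANY (ARRAY[1, 5, 9])".  Such a scan may need
 * several "primitive" index descents, roughly one per array element (or per
 * group of elements); the do-while loop above keeps calling _bt_first until
 * _bt_start_prim_scan reports that no further primitive scan is needed.
 */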
268 :
269 : /*
270 : * btgetbitmap() -- gets all matching tuples, and adds them to a bitmap
271 : */
272 : int64
273 18394 : btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
274 : {
275 18394 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
276 18394 : int64 ntids = 0;
277 : ItemPointer heapTid;
278 :
279 : /* Each loop iteration performs another primitive index scan */
280 : do
281 : {
282 : /* Fetch the first page & tuple */
283 18790 : if (_bt_first(scan, ForwardScanDirection))
284 : {
285 : /* Save tuple ID, and continue scanning */
286 9466 : heapTid = &scan->xs_heaptid;
287 9466 : tbm_add_tuples(tbm, heapTid, 1, false);
288 9466 : ntids++;
289 :
290 : for (;;)
291 : {
292 : /*
293 : * Advance to next tuple within page. This is the same as the
294 : * easy case in _bt_next().
295 : */
296 1934098 : if (++so->currPos.itemIndex > so->currPos.lastItem)
297 : {
298 : /* let _bt_next do the heavy lifting */
299 14530 : if (!_bt_next(scan, ForwardScanDirection))
300 9466 : break;
301 : }
302 :
303 : /* Save tuple ID, and continue scanning */
304 1924632 : heapTid = &so->currPos.items[so->currPos.itemIndex].heapTid;
305 1924632 : tbm_add_tuples(tbm, heapTid, 1, false);
306 1924632 : ntids++;
307 : }
308 : }
309 : /* Now see if we need another primitive index scan */
310 18790 : } while (so->numArrayKeys && _bt_start_prim_scan(scan, ForwardScanDirection));
311 :
312 18394 : return ntids;
313 : }
314 :
315 : /*
316 : * btbeginscan() -- start a scan on a btree index
317 : */
318 : IndexScanDesc
319 13782512 : btbeginscan(Relation rel, int nkeys, int norderbys)
320 : {
321 : IndexScanDesc scan;
322 : BTScanOpaque so;
323 :
324 : /* no order by operators allowed */
325 : Assert(norderbys == 0);
326 :
327 : /* get the scan */
328 13782512 : scan = RelationGetIndexScan(rel, nkeys, norderbys);
329 :
330 : /* allocate private workspace */
331 13782512 : so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData));
332 13782512 : BTScanPosInvalidate(so->currPos);
333 13782512 : BTScanPosInvalidate(so->markPos);
334 13782512 : if (scan->numberOfKeys > 0)
335 13769666 : so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
336 : else
337 12846 : so->keyData = NULL;
338 :
339 13782512 : so->needPrimScan = false;
340 13782512 : so->scanBehind = false;
341 13782512 : so->oppositeDirCheck = false;
342 13782512 : so->arrayKeys = NULL;
343 13782512 : so->orderProcs = NULL;
344 13782512 : so->arrayContext = NULL;
345 :
346 13782512 : so->killedItems = NULL; /* until needed */
347 13782512 : so->numKilled = 0;
348 :
349 : /*
350 : * We don't know yet whether the scan will be index-only, so we do not
351 : * allocate the tuple workspace arrays until btrescan. However, we set up
352 : * scan->xs_itupdesc whether we'll need it or not, since that's so cheap.
353 : */
354 13782512 : so->currTuples = so->markTuples = NULL;
355 :
356 13782512 : scan->xs_itupdesc = RelationGetDescr(rel);
357 :
358 13782512 : scan->opaque = so;
359 :
360 13782512 : return scan;
361 : }
362 :
363 : /*
364 : * btrescan() -- rescan an index relation
365 : */
366 : void
367 14441592 : btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
368 : ScanKey orderbys, int norderbys)
369 : {
370 14441592 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
371 :
372 : /* we aren't holding any read locks, but gotta drop the pins */
373 14441592 : if (BTScanPosIsValid(so->currPos))
374 : {
375 : /* Before leaving current page, deal with any killed items */
376 76558 : if (so->numKilled > 0)
377 808 : _bt_killitems(scan);
378 76558 : BTScanPosUnpinIfPinned(so->currPos);
379 76558 : BTScanPosInvalidate(so->currPos);
380 : }
381 :
382 14441592 : so->markItemIndex = -1;
383 14441592 : so->needPrimScan = false;
384 14441592 : so->scanBehind = false;
385 14441592 : so->oppositeDirCheck = false;
386 14441592 : BTScanPosUnpinIfPinned(so->markPos);
387 14441592 : BTScanPosInvalidate(so->markPos);
388 :
389 : /*
390 : * Allocate tuple workspace arrays, if needed for an index-only scan and
391 : * not already done in a previous rescan call. To save on palloc
392 : * overhead, both workspaces are allocated as one palloc block; only this
393 : * function and btendscan know that.
394 : *
395 : * NOTE: this data structure also makes it safe to return data from a
396 : * "name" column, even though btree name_ops uses an underlying storage
397 : * datatype of cstring. The risk there is that "name" is supposed to be
398 : * padded to NAMEDATALEN, but the actual index tuple is probably shorter.
399 : * However, since we only return data out of tuples sitting in the
400 : * currTuples array, a fetch of NAMEDATALEN bytes can at worst pull some
401 : * data out of the markTuples array --- running off the end of memory for
402 : * a SIGSEGV is not possible. Yeah, this is ugly as sin, but it beats
403 : * adding special-case treatment for name_ops elsewhere.
404 : */
405 14441592 : if (scan->xs_want_itup && so->currTuples == NULL)
406 : {
407 130446 : so->currTuples = (char *) palloc(BLCKSZ * 2);
408 130446 : so->markTuples = so->currTuples + BLCKSZ;
409 : }
410 :
411 : /*
412 : * Reset the scan keys
413 : */
414 14441592 : if (scankey && scan->numberOfKeys > 0)
415 14428552 : memcpy(scan->keyData, scankey, scan->numberOfKeys * sizeof(ScanKeyData));
416 14441592 : so->numberOfKeys = 0; /* until _bt_preprocess_keys sets it */
417 14441592 : so->numArrayKeys = 0; /* ditto */
418 14441592 : }
419 :
420 : /*
421 : * btendscan() -- close down a scan
422 : */
423 : void
424 13780988 : btendscan(IndexScanDesc scan)
425 : {
426 13780988 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
427 :
428 : /* we aren't holding any read locks, but gotta drop the pins */
429 13780988 : if (BTScanPosIsValid(so->currPos))
430 : {
431 : /* Before leaving current page, deal with any killed items */
432 7842742 : if (so->numKilled > 0)
433 85770 : _bt_killitems(scan);
434 7842742 : BTScanPosUnpinIfPinned(so->currPos);
435 : }
436 :
437 13780988 : so->markItemIndex = -1;
438 13780988 : BTScanPosUnpinIfPinned(so->markPos);
439 :
440 : /* No need to invalidate positions, the RAM is about to be freed. */
441 :
442 : /* Release storage */
443 13780988 : if (so->keyData != NULL)
444 13768172 : pfree(so->keyData);
445 : /* so->arrayKeys and so->orderProcs are in arrayContext */
446 13780988 : if (so->arrayContext != NULL)
447 1186 : MemoryContextDelete(so->arrayContext);
448 13780988 : if (so->killedItems != NULL)
449 163764 : pfree(so->killedItems);
450 13780988 : if (so->currTuples != NULL)
451 130402 : pfree(so->currTuples);
452 : /* so->markTuples should not be pfree'd, see btrescan */
453 13780988 : pfree(so);
454 13780988 : }
455 :
456 : /*
457 : * btmarkpos() -- save current scan position
458 : */
459 : void
460 130074 : btmarkpos(IndexScanDesc scan)
461 : {
462 130074 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
463 :
464 : /* There may be an old mark with a pin (but no lock). */
465 130074 : BTScanPosUnpinIfPinned(so->markPos);
466 :
467 : /*
468 : * Just record the current itemIndex. If we later step to next page
469 : * before releasing the marked position, _bt_steppage makes a full copy of
470 : * the currPos struct in markPos. If (as often happens) the mark is moved
471 : * before we leave the page, we don't have to do that work.
472 : */
473 130074 : if (BTScanPosIsValid(so->currPos))
474 130074 : so->markItemIndex = so->currPos.itemIndex;
475 : else
476 : {
477 0 : BTScanPosInvalidate(so->markPos);
478 0 : so->markItemIndex = -1;
479 : }
480 130074 : }
481 :
482 : /*
483 : * btrestrpos() -- restore scan to last saved position
484 : */
485 : void
486 54018 : btrestrpos(IndexScanDesc scan)
487 : {
488 54018 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
489 :
490 54018 : if (so->markItemIndex >= 0)
491 : {
492 : /*
493 : * The scan has never moved to a new page since the last mark. Just
494 : * restore the itemIndex.
495 : *
496 : * NB: In this case we can't count on anything in so->markPos to be
497 : * accurate.
498 : */
499 53910 : so->currPos.itemIndex = so->markItemIndex;
500 : }
501 : else
502 : {
503 : /*
504 : * The scan moved to a new page after last mark or restore, and we are
505 : * now restoring to the marked page. We aren't holding any read
506 : * locks, but if we're still holding the pin for the current position,
507 : * we must drop it.
508 : */
509 108 : if (BTScanPosIsValid(so->currPos))
510 : {
511 : /* Before leaving current page, deal with any killed items */
512 108 : if (so->numKilled > 0)
513 0 : _bt_killitems(scan);
514 108 : BTScanPosUnpinIfPinned(so->currPos);
515 : }
516 :
517 108 : if (BTScanPosIsValid(so->markPos))
518 : {
519 : /* bump pin on mark buffer for assignment to current buffer */
520 108 : if (BTScanPosIsPinned(so->markPos))
521 0 : IncrBufferRefCount(so->markPos.buf);
522 108 : memcpy(&so->currPos, &so->markPos,
523 : offsetof(BTScanPosData, items[1]) +
524 108 : so->markPos.lastItem * sizeof(BTScanPosItem));
525 108 : if (so->currTuples)
526 0 : memcpy(so->currTuples, so->markTuples,
527 0 : so->markPos.nextTupleOffset);
528 : /* Reset the scan's array keys (see _bt_steppage for why) */
529 108 : if (so->numArrayKeys)
530 : {
531 0 : _bt_start_array_keys(scan, so->currPos.dir);
532 0 : so->needPrimScan = false;
533 : }
534 : }
535 : else
536 0 : BTScanPosInvalidate(so->currPos);
537 : }
538 54018 : }
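/*
 * Informal note: btmarkpos/btrestrpos provide the mark/restore support that
 * executor nodes such as merge join rely on to re-read part of their inner
 * input.  A mark that is restored before the scan leaves the page only
 * needs the cheap markItemIndex path above; otherwise the full markPos copy
 * made by _bt_steppage is used.
 */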
539 :
540 : /*
541 : * btestimateparallelscan -- estimate storage for BTParallelScanDescData
542 : */
543 : Size
544 64 : btestimateparallelscan(int nkeys, int norderbys)
545 : {
546 : /* Pessimistically assume all input scankeys will be output with arrays */
547 64 : return offsetof(BTParallelScanDescData, btps_arrElems) + sizeof(int) * nkeys;
548 : }
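/*
 * Worked example (illustrative numbers only): for nkeys = 2 this reserves
 * the fixed part of BTParallelScanDescData plus two trailing btps_arrElems
 * slots, one per scan key.  Space is reserved for every scan key because,
 * as noted above, we pessimistically assume each of them could end up with
 * an array that _bt_parallel_primscan_schedule must serialize.
 */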
549 :
550 : /*
551 : * btinitparallelscan -- initialize BTParallelScanDesc for parallel btree scan
552 : */
553 : void
554 64 : btinitparallelscan(void *target)
555 : {
556 64 : BTParallelScanDesc bt_target = (BTParallelScanDesc) target;
557 :
558 64 : LWLockInitialize(&bt_target->btps_lock,
559 : LWTRANCHE_PARALLEL_BTREE_SCAN);
560 64 : bt_target->btps_nextScanPage = InvalidBlockNumber;
561 64 : bt_target->btps_lastCurrPage = InvalidBlockNumber;
562 64 : bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
563 64 : ConditionVariableInit(&bt_target->btps_cv);
564 64 : }
565 :
566 : /*
567 : * btparallelrescan() -- reset parallel scan
568 : */
569 : void
570 24 : btparallelrescan(IndexScanDesc scan)
571 : {
572 : BTParallelScanDesc btscan;
573 24 : ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
574 :
575 : Assert(parallel_scan);
576 :
577 24 : btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
578 : parallel_scan->ps_offset_am);
579 :
580 : /*
581 : * In theory, we don't need to acquire the LWLock here, because there
582 : * shouldn't be any other workers running at this point, but we do so for
583 : * consistency.
584 : */
585 24 : LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
586 24 : btscan->btps_nextScanPage = InvalidBlockNumber;
587 24 : btscan->btps_lastCurrPage = InvalidBlockNumber;
588 24 : btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
589 24 : LWLockRelease(&btscan->btps_lock);
590 24 : }
591 :
592 : /*
593 : * _bt_parallel_seize() -- Begin the process of advancing the scan to a new
594 : * page. Other scans must wait until we call _bt_parallel_release()
595 : * or _bt_parallel_done().
596 : *
597 : * The return value is true if we successfully seized the scan and false
598 : * if we did not. The latter case occurs when no pages remain, or when
599 : * another primitive index scan is scheduled that caller's backend cannot
600 : * start just yet (only backends that call from _bt_first are capable of
601 : * starting primitive index scans, which they indicate by passing first=true).
602 : *
603 : * If the return value is true, *next_scan_page returns the next page of the
604 : * scan, and *last_curr_page returns the page that *next_scan_page came from.
605 : * An invalid *next_scan_page means the scan hasn't yet started, or that
606 : * caller needs to start the next primitive index scan (if it's the latter
607 : * case we'll set so.needPrimScan).
608 : *
609 : * Callers should ignore the value of *next_scan_page and *last_curr_page if
610 : * the return value is false.
611 : */
612 : bool
613 1658 : _bt_parallel_seize(IndexScanDesc scan, BlockNumber *next_scan_page,
614 : BlockNumber *last_curr_page, bool first)
615 : {
616 1658 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
617 1658 : bool exit_loop = false,
618 1658 : status = true,
619 1658 : endscan = false;
620 1658 : ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
621 : BTParallelScanDesc btscan;
622 :
623 1658 : *next_scan_page = InvalidBlockNumber;
624 1658 : *last_curr_page = InvalidBlockNumber;
625 :
626 : /*
627 : * Reset so->currPos, and initialize moreLeft/moreRight such that the next
628 : * call to _bt_readnextpage treats this backend similarly to a serial
629 : * backend that steps from *last_curr_page to *next_scan_page (unless this
630 : * backend's so->currPos is initialized by _bt_readfirstpage before then).
631 : */
632 1658 : BTScanPosInvalidate(so->currPos);
633 1658 : so->currPos.moreLeft = so->currPos.moreRight = true;
634 :
635 1658 : if (first)
636 : {
637 : /*
638 : * Initialize array related state when called from _bt_first, assuming
639 : * that this will be the first primitive index scan for the scan
640 : */
641 446 : so->needPrimScan = false;
642 446 : so->scanBehind = false;
643 446 : so->oppositeDirCheck = false;
644 : }
645 : else
646 : {
647 : /*
648 : * Don't attempt to seize the scan when it requires another primitive
649 : * index scan, since caller's backend cannot start it right now
650 : */
651 1212 : if (so->needPrimScan)
652 0 : return false;
653 : }
654 :
655 1658 : btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
656 : parallel_scan->ps_offset_am);
657 :
658 : while (1)
659 : {
660 1662 : LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
661 :
662 1662 : if (btscan->btps_pageStatus == BTPARALLEL_DONE)
663 : {
664 : /* We're done with this parallel index scan */
665 312 : status = false;
666 : }
667 1350 : else if (btscan->btps_pageStatus == BTPARALLEL_IDLE &&
668 1222 : btscan->btps_nextScanPage == P_NONE)
669 : {
670 : /* End this parallel index scan */
671 10 : status = false;
672 10 : endscan = true;
673 : }
674 1340 : else if (btscan->btps_pageStatus == BTPARALLEL_NEED_PRIMSCAN)
675 : {
676 : Assert(so->numArrayKeys);
677 :
678 36 : if (first)
679 : {
680 : /* Can start scheduled primitive scan right away, so do so */
681 36 : btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
682 72 : for (int i = 0; i < so->numArrayKeys; i++)
683 : {
684 36 : BTArrayKeyInfo *array = &so->arrayKeys[i];
685 36 : ScanKey skey = &so->keyData[array->scan_key];
686 :
687 36 : array->cur_elem = btscan->btps_arrElems[i];
688 36 : skey->sk_argument = array->elem_values[array->cur_elem];
689 : }
690 36 : exit_loop = true;
691 : }
692 : else
693 : {
694 : /*
695 : * Don't attempt to seize the scan when it requires another
696 : * primitive index scan, since caller's backend cannot start
697 : * it right now
698 : */
699 0 : status = false;
700 : }
701 :
702 : /*
703 : * Either way, update backend local state to indicate that a
704 : * pending primitive scan is required
705 : */
706 36 : so->needPrimScan = true;
707 36 : so->scanBehind = false;
708 36 : so->oppositeDirCheck = false;
709 : }
710 1304 : else if (btscan->btps_pageStatus != BTPARALLEL_ADVANCING)
711 : {
712 : /*
713 : * We have successfully seized control of the scan for the purpose
714 : * of advancing it to a new page!
715 : */
716 1300 : btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
717 : Assert(btscan->btps_nextScanPage != P_NONE);
718 1300 : *next_scan_page = btscan->btps_nextScanPage;
719 1300 : *last_curr_page = btscan->btps_lastCurrPage;
720 1300 : exit_loop = true;
721 : }
722 1662 : LWLockRelease(&btscan->btps_lock);
723 1662 : if (exit_loop || !status)
724 : break;
725 4 : ConditionVariableSleep(&btscan->btps_cv, WAIT_EVENT_BTREE_PAGE);
726 : }
727 1658 : ConditionVariableCancelSleep();
728 :
729 : /* When the scan has reached the rightmost (or leftmost) page, end it */
730 1658 : if (endscan)
731 10 : _bt_parallel_done(scan);
732 :
733 1658 : return status;
734 : }
735 :
736 : /*
737 : * _bt_parallel_release() -- Complete the process of advancing the scan to a
738 : * new page. We now have the new value btps_nextScanPage; another backend
739 : * can now begin advancing the scan.
740 : *
741 : * Callers whose scan uses array keys must save their curr_page argument so
742 : * that it can be passed to _bt_parallel_primscan_schedule, should caller
743 : * determine that another primitive index scan is required.
744 : *
745 : * If caller's next_scan_page is P_NONE, the scan has reached the index's
746 : * rightmost/leftmost page. This is treated as reaching the end of the scan
747 : * within _bt_parallel_seize.
748 : *
749 : * Note: unlike the serial case, parallel scans don't need to remember both
750 : * sibling links. next_scan_page is whichever link is next given the scan's
751 : * direction. That's all we'll ever need, since the direction of a parallel
752 : * scan can never change.
753 : */
754 : void
755 1336 : _bt_parallel_release(IndexScanDesc scan, BlockNumber next_scan_page,
756 : BlockNumber curr_page)
757 : {
758 1336 : ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
759 : BTParallelScanDesc btscan;
760 :
761 : Assert(BlockNumberIsValid(next_scan_page));
762 :
763 1336 : btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
764 : parallel_scan->ps_offset_am);
765 :
766 1336 : LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
767 1336 : btscan->btps_nextScanPage = next_scan_page;
768 1336 : btscan->btps_lastCurrPage = curr_page;
769 1336 : btscan->btps_pageStatus = BTPARALLEL_IDLE;
770 1336 : LWLockRelease(&btscan->btps_lock);
771 1336 : ConditionVariableSignal(&btscan->btps_cv);
772 1336 : }
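/*
 * Informal sketch of the handshake from one worker's point of view (names
 * and control flow simplified; the real callers live in nbtsearch.c):
 *
 *     while (_bt_parallel_seize(scan, &next_scan_page, &last_curr_page, ...))
 *     {
 *         ... read the page at next_scan_page and note its sibling link ...
 *         _bt_parallel_release(scan, sibling_page, next_scan_page);
 *         ... return matching items from the page just read ...
 *     }
 *
 * with _bt_parallel_done() called once no pages remain, so that any waiting
 * workers are woken up rather than sleeping on btps_cv forever.
 */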
773 :
774 : /*
775 : * _bt_parallel_done() -- Mark the parallel scan as complete.
776 : *
777 : * When there are no pages left to scan, this function should be called to
778 : * notify other workers. Otherwise, they might wait forever for the scan to
779 : * advance to the next page.
780 : */
781 : void
782 6536928 : _bt_parallel_done(IndexScanDesc scan)
783 : {
784 6536928 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
785 6536928 : ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
786 : BTParallelScanDesc btscan;
787 6536928 : bool status_changed = false;
788 :
789 : Assert(!BTScanPosIsValid(so->currPos));
790 :
791 : /* Do nothing, for non-parallel scans */
792 6536928 : if (parallel_scan == NULL)
793 6536770 : return;
794 :
795 : /*
796 : * Should not mark parallel scan done when there's still a pending
797 : * primitive index scan
798 : */
799 158 : if (so->needPrimScan)
800 36 : return;
801 :
802 122 : btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
803 : parallel_scan->ps_offset_am);
804 :
805 : /*
806 : * Mark the parallel scan as done, unless some other process did so
807 : * already
808 : */
809 122 : LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
810 : Assert(btscan->btps_pageStatus != BTPARALLEL_NEED_PRIMSCAN);
811 122 : if (btscan->btps_pageStatus != BTPARALLEL_DONE)
812 : {
813 88 : btscan->btps_pageStatus = BTPARALLEL_DONE;
814 88 : status_changed = true;
815 : }
816 122 : LWLockRelease(&btscan->btps_lock);
817 :
818 : /* wake up all the workers associated with this parallel scan */
819 122 : if (status_changed)
820 88 : ConditionVariableBroadcast(&btscan->btps_cv);
821 : }
822 :
823 : /*
824 : * _bt_parallel_primscan_schedule() -- Schedule another primitive index scan.
825 : *
826 : * Caller passes the curr_page most recently passed to _bt_parallel_release
827 : * by its backend. Caller successfully schedules the next primitive index scan
828 : * if the shared parallel state hasn't been seized since caller's backend last
829 : * advanced the scan.
830 : */
831 : void
832 36 : _bt_parallel_primscan_schedule(IndexScanDesc scan, BlockNumber curr_page)
833 : {
834 36 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
835 36 : ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
836 : BTParallelScanDesc btscan;
837 :
838 : Assert(so->numArrayKeys);
839 :
840 36 : btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
841 : parallel_scan->ps_offset_am);
842 :
843 36 : LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
844 36 : if (btscan->btps_lastCurrPage == curr_page &&
845 36 : btscan->btps_pageStatus == BTPARALLEL_IDLE)
846 : {
847 36 : btscan->btps_nextScanPage = InvalidBlockNumber;
848 36 : btscan->btps_lastCurrPage = InvalidBlockNumber;
849 36 : btscan->btps_pageStatus = BTPARALLEL_NEED_PRIMSCAN;
850 :
851 : /* Serialize scan's current array keys */
852 72 : for (int i = 0; i < so->numArrayKeys; i++)
853 : {
854 36 : BTArrayKeyInfo *array = &so->arrayKeys[i];
855 :
856 36 : btscan->btps_arrElems[i] = array->cur_elem;
857 : }
858 : }
859 36 : LWLockRelease(&btscan->btps_lock);
860 36 : }
861 :
862 : /*
863 : * Bulk deletion of all index entries pointing to a set of heap tuples.
864 : * The set of target tuples is specified via a callback routine that tells
865 : * whether any given heap tuple (identified by ItemPointer) is being deleted.
866 : *
867 : * Result: a palloc'd struct containing statistical info for VACUUM displays.
868 : */
869 : IndexBulkDeleteResult *
870 2656 : btbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
871 : IndexBulkDeleteCallback callback, void *callback_state)
872 : {
873 2656 : Relation rel = info->index;
874 : BTCycleId cycleid;
875 :
876 : /* allocate stats if first time through, else re-use existing struct */
877 2656 : if (stats == NULL)
878 2656 : stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
879 :
880 : /* Establish the vacuum cycle ID to use for this scan */
881 : /* The ENSURE stuff ensures we clean up shared memory on failure */
882 2656 : PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
883 : {
884 2656 : cycleid = _bt_start_vacuum(rel);
885 :
886 2656 : btvacuumscan(info, stats, callback, callback_state, cycleid);
887 : }
888 2656 : PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
889 2656 : _bt_end_vacuum(rel);
890 :
891 2656 : return stats;
892 : }
893 :
894 : /*
895 : * Post-VACUUM cleanup.
896 : *
897 : * Result: a palloc'd struct containing statistical info for VACUUM displays.
898 : */
899 : IndexBulkDeleteResult *
900 166438 : btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
901 : {
902 : BlockNumber num_delpages;
903 :
904 : /* No-op in ANALYZE ONLY mode */
905 166438 : if (info->analyze_only)
906 16302 : return stats;
907 :
908 : /*
909 : * If btbulkdelete was called, we need not do anything (we just maintain
910 : * the information used within _bt_vacuum_needs_cleanup() by calling
911 : * _bt_set_cleanup_info() below).
912 : *
913 : * If btbulkdelete was _not_ called, then we have a choice to make: we
914 : * must decide whether or not a btvacuumscan() call is needed now (i.e.
915 : * whether the ongoing VACUUM operation can entirely avoid a physical scan
916 : * of the index). A call to _bt_vacuum_needs_cleanup() decides it for us
917 : * now.
918 : */
919 150136 : if (stats == NULL)
920 : {
921 : /* Check if VACUUM operation can entirely avoid btvacuumscan() call */
922 148060 : if (!_bt_vacuum_needs_cleanup(info->index))
923 148046 : return NULL;
924 :
925 : /*
926 : * Since we aren't going to actually delete any leaf items, there's no
927 : * need to go through all the vacuum-cycle-ID pushups here.
928 : *
929 : * Posting list tuples are a source of inaccuracy for cleanup-only
930 : * scans. btvacuumscan() will assume that the number of index tuples
931 : * from each page can be used as num_index_tuples, even though
932 : * num_index_tuples is supposed to represent the number of TIDs in the
933 : * index. This naive approach can underestimate the number of tuples
934 : * in the index significantly.
935 : *
936 : * We handle the problem by making num_index_tuples an estimate in
937 : * cleanup-only case.
938 : */
939 14 : stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
940 14 : btvacuumscan(info, stats, NULL, NULL, 0);
941 14 : stats->estimated_count = true;
942 : }
943 :
944 : /*
945 : * Maintain num_delpages value in metapage for _bt_vacuum_needs_cleanup().
946 : *
947 : * num_delpages is the number of deleted pages now in the index that were
948 : * not safe to place in the FSM to be recycled just yet. num_delpages is
949 : * greater than 0 only when _bt_pagedel() actually deleted pages during
950 : * our call to btvacuumscan(). Even then, _bt_pendingfsm_finalize() must
951 : * have failed to place any newly deleted pages in the FSM just moments
952 : * ago. (Actually, there are edge cases where recycling of the current
953 : * VACUUM's newly deleted pages does not even become safe by the time the
954 : * next VACUUM comes around. See nbtree/README.)
955 : */
956 : Assert(stats->pages_deleted >= stats->pages_free);
957 2090 : num_delpages = stats->pages_deleted - stats->pages_free;
958 2090 : _bt_set_cleanup_info(info->index, num_delpages);
959 :
960 : /*
961 : * It's quite possible for us to be fooled by concurrent page splits into
962 : * double-counting some index tuples, so disbelieve any total that exceeds
963 : * the underlying heap's count ... if we know that accurately. Otherwise
964 : * this might just make matters worse.
965 : */
966 2090 : if (!info->estimated_count)
967 : {
968 2036 : if (stats->num_index_tuples > info->num_heap_tuples)
969 24 : stats->num_index_tuples = info->num_heap_tuples;
970 : }
971 :
972 2090 : return stats;
973 : }
974 :
975 : /*
976 : * btvacuumscan --- scan the index for VACUUMing purposes
977 : *
978 : * This combines the functions of looking for leaf tuples that are deletable
979 : * according to the vacuum callback, looking for empty pages that can be
980 : * deleted, and looking for old deleted pages that can be recycled. Both
981 : * btbulkdelete and btvacuumcleanup invoke this (the latter only if no
982 : * btbulkdelete call occurred and _bt_vacuum_needs_cleanup returned true).
983 : *
984 : * The caller is responsible for initially allocating/zeroing a stats struct
985 : * and for obtaining a vacuum cycle ID if necessary.
986 : */
987 : static void
988 2670 : btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
989 : IndexBulkDeleteCallback callback, void *callback_state,
990 : BTCycleId cycleid)
991 : {
992 2670 : Relation rel = info->index;
993 : BTVacState vstate;
994 : BlockNumber num_pages;
995 : bool needLock;
996 : BlockRangeReadStreamPrivate p;
997 2670 : ReadStream *stream = NULL;
998 :
999 : /*
1000 : * Reset fields that track information about the entire index now. This
1001 : * avoids double-counting in the case where a single VACUUM command
1002 : * requires multiple scans of the index.
1003 : *
1004 : * Avoid resetting the tuples_removed and pages_newly_deleted fields here,
1005 : * since they track information about the VACUUM command, and so must last
1006 : * across each call to btvacuumscan().
1007 : *
1008 : * (Note that pages_free is treated as state about the whole index, not
1009 : * the current VACUUM. This is appropriate because RecordFreeIndexPage()
1010 : * calls are idempotent, and get repeated for the same deleted pages in
1011 : * some scenarios. The point for us is to track the number of recyclable
1012 : * pages in the index at the end of the VACUUM command.)
1013 : */
1014 2670 : stats->num_pages = 0;
1015 2670 : stats->num_index_tuples = 0;
1016 2670 : stats->pages_deleted = 0;
1017 2670 : stats->pages_free = 0;
1018 :
1019 : /* Set up info to pass down to btvacuumpage */
1020 2670 : vstate.info = info;
1021 2670 : vstate.stats = stats;
1022 2670 : vstate.callback = callback;
1023 2670 : vstate.callback_state = callback_state;
1024 2670 : vstate.cycleid = cycleid;
1025 :
1026 : /* Create a temporary memory context to run _bt_pagedel in */
1027 2670 : vstate.pagedelcontext = AllocSetContextCreate(CurrentMemoryContext,
1028 : "_bt_pagedel",
1029 : ALLOCSET_DEFAULT_SIZES);
1030 :
1031 : /* Initialize vstate fields used by _bt_pendingfsm_finalize */
1032 2670 : vstate.bufsize = 0;
1033 2670 : vstate.maxbufsize = 0;
1034 2670 : vstate.pendingpages = NULL;
1035 2670 : vstate.npendingpages = 0;
1036 : /* Consider applying _bt_pendingfsm_finalize optimization */
1037 2670 : _bt_pendingfsm_init(rel, &vstate, (callback == NULL));
1038 :
1039 : /*
1040 : * The outer loop iterates over all index pages except the metapage, in
1041 : * physical order (we hope the kernel will cooperate in providing
1042 : * read-ahead for speed). It is critical that we visit all leaf pages,
1043 : * including ones added after we start the scan, else we might fail to
1044 : * delete some deletable tuples. Hence, we must repeatedly check the
1045 : * relation length. We must acquire the relation-extension lock while
1046 : * doing so to avoid a race condition: if someone else is extending the
1047 : * relation, there is a window where bufmgr/smgr have created a new
1048 : * all-zero page but it hasn't yet been write-locked by _bt_getbuf(). If
1049 : * we manage to scan such a page here, we'll improperly assume it can be
1050 : * recycled. Taking the lock synchronizes things enough to prevent a
1051 : * problem: either num_pages won't include the new page, or _bt_getbuf
1052 : * already has write lock on the buffer and it will be fully initialized
1053 : * before we can examine it. Also, we need not worry if a page is added
1054 : * immediately after we look; the page splitting code already has
1055 : * write-lock on the left page before it adds a right page, so we must
1056 : * already have processed any tuples due to be moved into such a page.
1057 : *
1058 : * XXX: Now that new pages are locked with RBM_ZERO_AND_LOCK, I don't
1059 : * think the use of the extension lock is still required.
1060 : *
1061 : * We can skip locking for new or temp relations, however, since no one
1062 : * else could be accessing them.
1063 : */
1064 2670 : needLock = !RELATION_IS_LOCAL(rel);
1065 :
1066 2670 : p.current_blocknum = BTREE_METAPAGE + 1;
1067 :
1068 : /*
1069 : * It is safe to use batchmode as block_range_read_stream_cb takes no
1070 : * locks.
1071 : */
1072 2670 : stream = read_stream_begin_relation(READ_STREAM_FULL |
1073 : READ_STREAM_USE_BATCHING,
1074 : info->strategy,
1075 : rel,
1076 : MAIN_FORKNUM,
1077 : block_range_read_stream_cb,
1078 : &p,
1079 : 0);
1080 : for (;;)
1081 : {
1082 : /* Get the current relation length */
1083 5066 : if (needLock)
1084 5062 : LockRelationForExtension(rel, ExclusiveLock);
1085 5066 : num_pages = RelationGetNumberOfBlocks(rel);
1086 5066 : if (needLock)
1087 5062 : UnlockRelationForExtension(rel, ExclusiveLock);
1088 :
1089 5066 : if (info->report_progress)
1090 886 : pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_TOTAL,
1091 : num_pages);
1092 :
1093 : /* Quit if we've scanned the whole relation */
1094 5066 : if (p.current_blocknum >= num_pages)
1095 2670 : break;
1096 :
1097 2396 : p.last_exclusive = num_pages;
1098 :
1099 : /* Iterate over pages, then loop back to recheck relation length */
1100 : while (true)
1101 22440 : {
1102 : BlockNumber current_block;
1103 : Buffer buf;
1104 :
1105 : /* call vacuum_delay_point while not holding any buffer lock */
1106 24836 : vacuum_delay_point(false);
1107 :
1108 24836 : buf = read_stream_next_buffer(stream, NULL);
1109 :
1110 24836 : if (!BufferIsValid(buf))
1111 2396 : break;
1112 :
1113 22440 : current_block = btvacuumpage(&vstate, buf);
1114 :
1115 22440 : if (info->report_progress)
1116 948 : pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE,
1117 : current_block);
1118 : }
1119 :
1120 : Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer);
1121 :
1122 : /*
1123 : * We have to reset the read stream to use it again. After returning
1124 : * InvalidBuffer, the read stream API won't invoke our callback again
1125 : * until the stream has been reset.
1126 : */
1127 2396 : read_stream_reset(stream);
1128 : }
1129 :
1130 2670 : read_stream_end(stream);
1131 :
1132 : /* Set statistics num_pages field to final size of index */
1133 2670 : stats->num_pages = num_pages;
1134 :
1135 2670 : MemoryContextDelete(vstate.pagedelcontext);
1136 :
1137 : /*
1138 : * If there were any calls to _bt_pagedel() during scan of the index then
1139 : * see if any of the resulting pages can be placed in the FSM now. When
1140 : * it's not safe we'll have to leave it up to a future VACUUM operation.
1141 : *
1142 : * Finally, if we placed any pages in the FSM (either just now or during
1143 : * the scan), forcibly update the upper-level FSM pages to ensure that
1144 : * searchers can find them.
1145 : */
1146 2670 : _bt_pendingfsm_finalize(rel, &vstate);
1147 2670 : if (stats->pages_free > 0)
1148 28 : IndexFreeSpaceMapVacuum(rel);
1149 2670 : }
1150 :
1151 : /*
1152 : * btvacuumpage --- VACUUM one page
1153 : *
1154 : * This processes a single page for btvacuumscan(). In some cases we must
1155 : * backtrack to re-examine and VACUUM pages that were on buf's page during
1156 : * a previous call here. This is how we handle page splits (that happened
1157 : * after our cycleid was acquired) whose right half page happened to reuse
1158 : * a block that we might have processed at some point before it was
1159 : * recycled (i.e. before the page split).
1160 : *
1161 : * Returns BlockNumber of a scanned page (not backtracked).
1162 : */
1163 : static BlockNumber
1164 22440 : btvacuumpage(BTVacState *vstate, Buffer buf)
1165 : {
1166 22440 : IndexVacuumInfo *info = vstate->info;
1167 22440 : IndexBulkDeleteResult *stats = vstate->stats;
1168 22440 : IndexBulkDeleteCallback callback = vstate->callback;
1169 22440 : void *callback_state = vstate->callback_state;
1170 22440 : Relation rel = info->index;
1171 22440 : Relation heaprel = info->heaprel;
1172 : bool attempt_pagedel;
1173 : BlockNumber blkno,
1174 : backtrack_to;
1175 22440 : BlockNumber scanblkno = BufferGetBlockNumber(buf);
1176 : Page page;
1177 : BTPageOpaque opaque;
1178 :
1179 22440 : blkno = scanblkno;
1180 :
1181 22440 : backtrack:
1182 :
1183 22440 : attempt_pagedel = false;
1184 22440 : backtrack_to = P_NONE;
1185 :
1186 22440 : _bt_lockbuf(rel, buf, BT_READ);
1187 22440 : page = BufferGetPage(buf);
1188 22440 : opaque = NULL;
1189 22440 : if (!PageIsNew(page))
1190 : {
1191 22440 : _bt_checkpage(rel, buf);
1192 22440 : opaque = BTPageGetOpaque(page);
1193 : }
1194 :
1195 : Assert(blkno <= scanblkno);
1196 22440 : if (blkno != scanblkno)
1197 : {
1198 : /*
1199 : * We're backtracking.
1200 : *
1201 : * We followed a right link to a sibling leaf page (a page that
1202 : * happens to be from a block located before scanblkno). The only
1203 : * case we want to do anything with is a live leaf page having the
1204 : * current vacuum cycle ID.
1205 : *
1206 : * The page had better be in a state that's consistent with what we
1207 : * expect. Check for conditions that imply corruption in passing. It
1208 : * can't be half-dead because only an interrupted VACUUM process can
1209 : * leave pages in that state, so we'd definitely have dealt with it
1210 : * back when the page was the scanblkno page (half-dead pages are
1211 : * always marked fully deleted by _bt_pagedel(), barring corruption).
1212 : */
1213 0 : if (!opaque || !P_ISLEAF(opaque) || P_ISHALFDEAD(opaque))
1214 : {
1215 : Assert(false);
1216 0 : ereport(LOG,
1217 : (errcode(ERRCODE_INDEX_CORRUPTED),
1218 : errmsg_internal("right sibling %u of scanblkno %u unexpectedly in an inconsistent state in index \"%s\"",
1219 : blkno, scanblkno, RelationGetRelationName(rel))));
1220 0 : _bt_relbuf(rel, buf);
1221 0 : return scanblkno;
1222 : }
1223 :
1224 : /*
1225 : * We may have already processed the page in an earlier call, when the
1226 : * page was scanblkno. This happens when the leaf page split occurred
1227 : * after the scan began, but before the right sibling page became the
1228 : * scanblkno.
1229 : *
1230 : * Page may also have been deleted by current btvacuumpage() call,
1231 : * since _bt_pagedel() sometimes deletes the right sibling page of
1232 : * scanblkno in passing (it does so after we decided where to
1233 : * backtrack to). We don't need to process this page as a deleted
1234 : * page a second time now (in fact, it would be wrong to count it as a
1235 : * deleted page in the bulk delete statistics a second time).
1236 : */
1237 0 : if (opaque->btpo_cycleid != vstate->cycleid || P_ISDELETED(opaque))
1238 : {
1239 : /* Done with current scanblkno (and all lower split pages) */
1240 0 : _bt_relbuf(rel, buf);
1241 0 : return scanblkno;
1242 : }
1243 : }
1244 :
1245 22440 : if (!opaque || BTPageIsRecyclable(page, heaprel))
1246 : {
1247 : /* Okay to recycle this page (which could be leaf or internal) */
1248 1132 : RecordFreeIndexPage(rel, blkno);
1249 1132 : stats->pages_deleted++;
1250 1132 : stats->pages_free++;
1251 : }
1252 21308 : else if (P_ISDELETED(opaque))
1253 : {
1254 : /*
1255 : * Already deleted page (which could be leaf or internal). Can't
1256 : * recycle yet.
1257 : */
1258 238 : stats->pages_deleted++;
1259 : }
1260 21070 : else if (P_ISHALFDEAD(opaque))
1261 : {
1262 : /* Half-dead leaf page (from interrupted VACUUM) -- finish deleting */
1263 0 : attempt_pagedel = true;
1264 :
1265 : /*
1266 : * _bt_pagedel() will increment both pages_newly_deleted and
1267 : * pages_deleted stats in all cases (barring corruption)
1268 : */
1269 : }
1270 21070 : else if (P_ISLEAF(opaque))
1271 : {
1272 : OffsetNumber deletable[MaxIndexTuplesPerPage];
1273 : int ndeletable;
1274 : BTVacuumPosting updatable[MaxIndexTuplesPerPage];
1275 : int nupdatable;
1276 : OffsetNumber offnum,
1277 : minoff,
1278 : maxoff;
1279 : int nhtidsdead,
1280 : nhtidslive;
1281 :
1282 : /*
1283 : * Trade in the initial read lock for a full cleanup lock on this
1284 : * page. We must get such a lock on every leaf page over the course
1285 : * of the vacuum scan, whether or not it actually contains any
1286 : * deletable tuples --- see nbtree/README.
1287 : */
1288 19682 : _bt_upgradelockbufcleanup(rel, buf);
1289 :
1290 : /*
1291 : * Check whether we need to backtrack to earlier pages. What we are
1292 : * concerned about is a page split that happened since we started the
1293 : * vacuum scan. If the split moved tuples on the right half of the
1294 : * split (i.e. the tuples that sort high) to a block that we already
1295 : * passed over, then we might have missed the tuples. We need to
1296 : * backtrack now. (Must do this before possibly clearing btpo_cycleid
1297 : * or deleting scanblkno page below!)
1298 : */
1299 19682 : if (vstate->cycleid != 0 &&
1300 19538 : opaque->btpo_cycleid == vstate->cycleid &&
1301 0 : !(opaque->btpo_flags & BTP_SPLIT_END) &&
1302 0 : !P_RIGHTMOST(opaque) &&
1303 0 : opaque->btpo_next < scanblkno)
1304 0 : backtrack_to = opaque->btpo_next;
1305 :
1306 19682 : ndeletable = 0;
1307 19682 : nupdatable = 0;
1308 19682 : minoff = P_FIRSTDATAKEY(opaque);
1309 19682 : maxoff = PageGetMaxOffsetNumber(page);
1310 19682 : nhtidsdead = 0;
1311 19682 : nhtidslive = 0;
1312 19682 : if (callback)
1313 : {
1314 : /* btbulkdelete callback tells us what to delete (or update) */
1315 3867686 : for (offnum = minoff;
1316 : offnum <= maxoff;
1317 3848148 : offnum = OffsetNumberNext(offnum))
1318 : {
1319 : IndexTuple itup;
1320 :
1321 3848148 : itup = (IndexTuple) PageGetItem(page,
1322 3848148 : PageGetItemId(page, offnum));
1323 :
1324 : Assert(!BTreeTupleIsPivot(itup));
1325 3848148 : if (!BTreeTupleIsPosting(itup))
1326 : {
1327 : /* Regular tuple, standard table TID representation */
1328 3737476 : if (callback(&itup->t_tid, callback_state))
1329 : {
1330 1380708 : deletable[ndeletable++] = offnum;
1331 1380708 : nhtidsdead++;
1332 : }
1333 : else
1334 2356768 : nhtidslive++;
1335 : }
1336 : else
1337 : {
1338 : BTVacuumPosting vacposting;
1339 : int nremaining;
1340 :
1341 : /* Posting list tuple */
1342 110672 : vacposting = btreevacuumposting(vstate, itup, offnum,
1343 : &nremaining);
1344 110672 : if (vacposting == NULL)
1345 : {
1346 : /*
1347 : * All table TIDs from the posting tuple remain, so no
1348 : * delete or update required
1349 : */
1350 : Assert(nremaining == BTreeTupleGetNPosting(itup));
1351 : }
1352 68330 : else if (nremaining > 0)
1353 : {
1354 :
1355 : /*
1356 : * Store metadata about posting list tuple in
1357 : * updatable array for entire page. Existing tuple
1358 : * will be updated during the later call to
1359 : * _bt_delitems_vacuum().
1360 : */
1361 : Assert(nremaining < BTreeTupleGetNPosting(itup));
1362 34440 : updatable[nupdatable++] = vacposting;
1363 34440 : nhtidsdead += BTreeTupleGetNPosting(itup) - nremaining;
1364 : }
1365 : else
1366 : {
1367 : /*
1368 : * All table TIDs from the posting list must be
1369 : * deleted. We'll delete the index tuple completely
1370 : * (no update required).
1371 : */
1372 : Assert(nremaining == 0);
1373 33890 : deletable[ndeletable++] = offnum;
1374 33890 : nhtidsdead += BTreeTupleGetNPosting(itup);
1375 33890 : pfree(vacposting);
1376 : }
1377 :
1378 110672 : nhtidslive += nremaining;
1379 : }
1380 : }
1381 : }
1382 :
1383 : /*
1384 : * Apply any needed deletes or updates. We issue just one
1385 : * _bt_delitems_vacuum() call per page, so as to minimize WAL traffic.
1386 : */
1387 19682 : if (ndeletable > 0 || nupdatable > 0)
1388 : {
1389 : Assert(nhtidsdead >= ndeletable + nupdatable);
1390 12408 : _bt_delitems_vacuum(rel, buf, deletable, ndeletable, updatable,
1391 : nupdatable);
1392 :
1393 12408 : stats->tuples_removed += nhtidsdead;
1394 : /* must recompute maxoff */
1395 12408 : maxoff = PageGetMaxOffsetNumber(page);
1396 :
1397 : /* can't leak memory here */
1398 46848 : for (int i = 0; i < nupdatable; i++)
1399 34440 : pfree(updatable[i]);
1400 : }
1401 : else
1402 : {
1403 : /*
1404 : * If the leaf page has been split during this vacuum cycle, it
1405 : * seems worth expending a write to clear btpo_cycleid even if we
1406 : * don't have any deletions to do. (If we do, _bt_delitems_vacuum
1407 : * takes care of this.) This ensures we won't process the page
1408 : * again.
1409 : *
1410 : * We treat this like a hint-bit update because there's no need to
1411 : * WAL-log it.
1412 : */
1413 : Assert(nhtidsdead == 0);
1414 7274 : if (vstate->cycleid != 0 &&
1415 7130 : opaque->btpo_cycleid == vstate->cycleid)
1416 : {
1417 0 : opaque->btpo_cycleid = 0;
1418 0 : MarkBufferDirtyHint(buf, true);
1419 : }
1420 : }
1421 :
1422 : /*
1423 : * If the leaf page is now empty, try to delete it; else count the
1424 : * live tuples (live table TIDs in posting lists are counted as
1425 : * separate live tuples). We don't delete when backtracking, though,
1426 : * since that would require teaching _bt_pagedel() about backtracking
1427 : * (doesn't seem worth adding more complexity to deal with that).
1428 : *
1429 : * We don't count the number of live TIDs during cleanup-only calls to
1430 : * btvacuumscan (i.e. when callback is not set). We count the number
1431 : * of index tuples directly instead. This avoids the expense of
1432 : * directly examining all of the tuples on each page. VACUUM will
1433 : * treat num_index_tuples as an estimate in cleanup-only case, so it
1434 : * doesn't matter that this underestimates num_index_tuples
1435 : * significantly in some cases.
1436 : */
1437 19682 : if (minoff > maxoff)
1438 5622 : attempt_pagedel = (blkno == scanblkno);
1439 14060 : else if (callback)
1440 13924 : stats->num_index_tuples += nhtidslive;
1441 : else
1442 136 : stats->num_index_tuples += maxoff - minoff + 1;
1443 :
1444 : Assert(!attempt_pagedel || nhtidslive == 0);
1445 : }
1446 :
1447 22440 : if (attempt_pagedel)
1448 : {
1449 : MemoryContext oldcontext;
1450 :
1451 : /* Run pagedel in a temp context to avoid memory leakage */
1452 5622 : MemoryContextReset(vstate->pagedelcontext);
1453 5622 : oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext);
1454 :
1455 : /*
1456 : * _bt_pagedel maintains the bulk delete stats on our behalf;
1457 : * pages_newly_deleted and pages_deleted are likely to be incremented
1458 : * during call
1459 : */
1460 : Assert(blkno == scanblkno);
1461 5622 : _bt_pagedel(rel, buf, vstate);
1462 :
1463 5622 : MemoryContextSwitchTo(oldcontext);
1464 : /* pagedel released buffer, so we shouldn't */
1465 : }
1466 : else
1467 16818 : _bt_relbuf(rel, buf);
1468 :
1469 22440 : if (backtrack_to != P_NONE)
1470 : {
1471 0 : blkno = backtrack_to;
1472 :
1473 : /* check for vacuum delay while not holding any buffer lock */
1474 0 : vacuum_delay_point(false);
1475 :
1476 : /*
1477 : * We can't use _bt_getbuf() here because it always applies
1478 : * _bt_checkpage(), which will barf on an all-zero page. We want to
1479 : * recycle all-zero pages, not fail. Also, we want to use a
1480 : * nondefault buffer access strategy.
1481 : */
1482 0 : buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
1483 : info->strategy);
1484 0 : goto backtrack;
1485 : }
1486 :
1487 22440 : return scanblkno;
1488 : }
1489 :
1490 : /*
1491 : * btreevacuumposting --- determine TIDs still needed in posting list
1492 : *
1493 : * Returns metadata describing how to build replacement tuple without the TIDs
1494 : * that VACUUM needs to delete. Returned value is NULL in the common case
1495 : * where no changes are needed to caller's posting list tuple (we avoid
1496 : * allocating memory here as an optimization).
1497 : *
1498 : * The number of TIDs that should remain in the posting list tuple is set for
1499 : * caller in *nremaining.
1500 : */
1501 : static BTVacuumPosting
1502 110672 : btreevacuumposting(BTVacState *vstate, IndexTuple posting,
1503 : OffsetNumber updatedoffset, int *nremaining)
1504 : {
1505 110672 : int live = 0;
1506 110672 : int nitem = BTreeTupleGetNPosting(posting);
1507 110672 : ItemPointer items = BTreeTupleGetPosting(posting);
1508 110672 : BTVacuumPosting vacposting = NULL;
1509 :
1510 632590 : for (int i = 0; i < nitem; i++)
1511 : {
1512 521918 : if (!vstate->callback(items + i, vstate->callback_state))
1513 : {
1514 : /* Live table TID */
1515 285632 : live++;
1516 : }
1517 236286 : else if (vacposting == NULL)
1518 : {
1519 : /*
1520 : * First dead table TID encountered.
1521 : *
1522 : * It's now clear that we need to delete one or more dead table
1523 : * TIDs, so start maintaining metadata describing how to update
1524 : * existing posting list tuple.
1525 : */
1526 68330 : vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) +
1527 : nitem * sizeof(uint16));
1528 :
1529 68330 : vacposting->itup = posting;
1530 68330 : vacposting->updatedoffset = updatedoffset;
1531 68330 : vacposting->ndeletedtids = 0;
1532 68330 : vacposting->deletetids[vacposting->ndeletedtids++] = i;
1533 : }
1534 : else
1535 : {
1536 : /* Second or subsequent dead table TID */
1537 167956 : vacposting->deletetids[vacposting->ndeletedtids++] = i;
1538 : }
1539 : }
1540 :
1541 110672 : *nremaining = live;
1542 110672 : return vacposting;
1543 : }
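/*
 * Illustrative example (hypothetical TIDs): for a posting list tuple with
 * four TIDs where the callback reports the second and fourth as dead, we
 * return a BTVacuumPosting with ndeletedtids = 2 and deletetids[] = {1, 3},
 * and set *nremaining = 2; the caller later builds a replacement tuple
 * containing only the two live TIDs.
 */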
1544 :
1545 : /*
1546 : * btcanreturn() -- Check whether btree indexes support index-only scans.
1547 : *
1548 : * btrees always do, so this is trivial.
1549 : */
1550 : bool
1551 1034542 : btcanreturn(Relation index, int attno)
1552 : {
1553 1034542 : return true;
1554 : }
1555 :
1556 : /*
1557 : * btgettreeheight() -- Compute tree height for use by btcostestimate().
1558 : */
1559 : int
1560 661784 : btgettreeheight(Relation rel)
1561 : {
1562 661784 : return _bt_getrootheight(rel);
1563 : }
1564 :
1565 : CompareType
1566 0 : bttranslatestrategy(StrategyNumber strategy, Oid opfamily)
1567 : {
1568 0 : switch (strategy)
1569 : {
1570 0 : case BTLessStrategyNumber:
1571 0 : return COMPARE_LT;
1572 0 : case BTLessEqualStrategyNumber:
1573 0 : return COMPARE_LE;
1574 0 : case BTEqualStrategyNumber:
1575 0 : return COMPARE_EQ;
1576 0 : case BTGreaterEqualStrategyNumber:
1577 0 : return COMPARE_GE;
1578 0 : case BTGreaterStrategyNumber:
1579 0 : return COMPARE_GT;
1580 0 : default:
1581 0 : return COMPARE_INVALID;
1582 : }
1583 : }
1584 :
1585 : StrategyNumber
1586 323808 : bttranslatecmptype(CompareType cmptype, Oid opfamily)
1587 : {
1588 323808 : switch (cmptype)
1589 : {
1590 0 : case COMPARE_LT:
1591 0 : return BTLessStrategyNumber;
1592 48 : case COMPARE_LE:
1593 48 : return BTLessEqualStrategyNumber;
1594 323712 : case COMPARE_EQ:
1595 323712 : return BTEqualStrategyNumber;
1596 24 : case COMPARE_GE:
1597 24 : return BTGreaterEqualStrategyNumber;
1598 24 : case COMPARE_GT:
1599 24 : return BTGreaterStrategyNumber;
1600 0 : default:
1601 0 : return InvalidStrategy;
1602 : }
1603 : }