1 : /*-------------------------------------------------------------------------
2 : *
3 : * nbtree.c
4 : * Implementation of Lehman and Yao's btree management algorithm for
5 : * Postgres.
6 : *
7 : * NOTES
8 : * This file contains only the public interface routines.
9 : *
10 : *
11 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
12 : * Portions Copyright (c) 1994, Regents of the University of California
13 : *
14 : * IDENTIFICATION
15 : * src/backend/access/nbtree/nbtree.c
16 : *
17 : *-------------------------------------------------------------------------
18 : */
19 : #include "postgres.h"
20 :
21 : #include "access/nbtree.h"
22 : #include "access/relscan.h"
23 : #include "access/stratnum.h"
24 : #include "commands/progress.h"
25 : #include "commands/vacuum.h"
26 : #include "nodes/execnodes.h"
27 : #include "pgstat.h"
28 : #include "storage/bulk_write.h"
29 : #include "storage/condition_variable.h"
30 : #include "storage/indexfsm.h"
31 : #include "storage/ipc.h"
32 : #include "storage/lmgr.h"
33 : #include "utils/fmgrprotos.h"
34 : #include "utils/index_selfuncs.h"
35 : #include "utils/memutils.h"
36 :
37 :
38 : /*
39 : * BTPARALLEL_NOT_INITIALIZED indicates that the scan has not started.
40 : *
41 : * BTPARALLEL_NEED_PRIMSCAN indicates that some process must now seize the
42 : * scan to advance it via another call to _bt_first.
43 : *
44 : * BTPARALLEL_ADVANCING indicates that some process is advancing the scan to
45 : * a new page; others must wait.
46 : *
47 : * BTPARALLEL_IDLE indicates that no backend is currently advancing the scan
48 : * to a new page; some process can start doing that.
49 : *
50 : * BTPARALLEL_DONE indicates that the scan is complete (including error exit).
51 : */
52 : typedef enum
53 : {
54 : BTPARALLEL_NOT_INITIALIZED,
55 : BTPARALLEL_NEED_PRIMSCAN,
56 : BTPARALLEL_ADVANCING,
57 : BTPARALLEL_IDLE,
58 : BTPARALLEL_DONE,
59 : } BTPS_State;
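/*
 * Illustrative summary (a sketch, not upstream text) of the transitions the
 * parallel-scan routines below implement for these states:
 *
 *   NOT_INITIALIZED --_bt_parallel_seize-------------> ADVANCING
 *   ADVANCING -------_bt_parallel_release------------> IDLE
 *   IDLE ------------_bt_parallel_seize--------------> ADVANCING      (next page known)
 *   IDLE ------------_bt_parallel_primscan_schedule--> NEED_PRIMSCAN
 *   NEED_PRIMSCAN ---_bt_parallel_seize (first=true)-> ADVANCING
 *   any of the above except NEED_PRIMSCAN --_bt_parallel_done--> DONE
 */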
60 :
61 : /*
62 : * BTParallelScanDescData contains btree specific shared information required
63 : * for parallel scan.
64 : */
65 : typedef struct BTParallelScanDescData
66 : {
67 : BlockNumber btps_nextScanPage; /* next page to be scanned */
68 : BlockNumber btps_lastCurrPage; /* page whose sibling link was copied into
69 : * btps_nextScanPage */
70 : BTPS_State btps_pageStatus; /* indicates whether next page is
71 : * available for scan. see above for
72 : * possible states of parallel scan. */
73 : slock_t btps_mutex; /* protects above variables, btps_arrElems */
74 : ConditionVariable btps_cv; /* used to synchronize parallel scan */
75 :
76 : /*
77 : * btps_arrElems is used when scans need to schedule another primitive
78 : * index scan. Holds BTArrayKeyInfo.cur_elem offsets for scan keys.
79 : */
80 : int btps_arrElems[FLEXIBLE_ARRAY_MEMBER];
81 : } BTParallelScanDescData;
82 :
83 : typedef struct BTParallelScanDescData *BTParallelScanDesc;
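/*
 * Illustrative sketch: every backend locates the shared state by applying
 * the AM-specific offset stored in the generic parallel scan descriptor.
 * The helper below is hypothetical (nbtree.c simply open-codes the same
 * expression in btparallelrescan, _bt_parallel_seize, and friends):
 */
static inline BTParallelScanDesc
bt_parallel_scan_desc(IndexScanDesc scan)
{
	ParallelIndexScanDesc parallel_scan = scan->parallel_scan;

	return (BTParallelScanDesc) OffsetToPointer(parallel_scan,
												parallel_scan->ps_offset);
}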
84 :
85 :
86 : static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
87 : IndexBulkDeleteCallback callback, void *callback_state,
88 : BTCycleId cycleid);
89 : static void btvacuumpage(BTVacState *vstate, BlockNumber scanblkno);
90 : static BTVacuumPosting btreevacuumposting(BTVacState *vstate,
91 : IndexTuple posting,
92 : OffsetNumber updatedoffset,
93 : int *nremaining);
94 :
95 :
96 : /*
97 : * Btree handler function: return IndexAmRoutine with access method parameters
98 : * and callbacks.
99 : */
100 : Datum
101 3140706 : bthandler(PG_FUNCTION_ARGS)
102 : {
103 3140706 : IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);
104 :
105 3140706 : amroutine->amstrategies = BTMaxStrategyNumber;
106 3140706 : amroutine->amsupport = BTNProcs;
107 3140706 : amroutine->amoptsprocnum = BTOPTIONS_PROC;
108 3140706 : amroutine->amcanorder = true;
109 3140706 : amroutine->amcanorderbyop = false;
110 3140706 : amroutine->amcanbackward = true;
111 3140706 : amroutine->amcanunique = true;
112 3140706 : amroutine->amcanmulticol = true;
113 3140706 : amroutine->amoptionalkey = true;
114 3140706 : amroutine->amsearcharray = true;
115 3140706 : amroutine->amsearchnulls = true;
116 3140706 : amroutine->amstorage = false;
117 3140706 : amroutine->amclusterable = true;
118 3140706 : amroutine->ampredlocks = true;
119 3140706 : amroutine->amcanparallel = true;
120 3140706 : amroutine->amcanbuildparallel = true;
121 3140706 : amroutine->amcaninclude = true;
122 3140706 : amroutine->amusemaintenanceworkmem = false;
123 3140706 : amroutine->amsummarizing = false;
124 3140706 : amroutine->amparallelvacuumoptions =
125 : VACUUM_OPTION_PARALLEL_BULKDEL | VACUUM_OPTION_PARALLEL_COND_CLEANUP;
126 3140706 : amroutine->amkeytype = InvalidOid;
127 :
128 3140706 : amroutine->ambuild = btbuild;
129 3140706 : amroutine->ambuildempty = btbuildempty;
130 3140706 : amroutine->aminsert = btinsert;
131 3140706 : amroutine->aminsertcleanup = NULL;
132 3140706 : amroutine->ambulkdelete = btbulkdelete;
133 3140706 : amroutine->amvacuumcleanup = btvacuumcleanup;
134 3140706 : amroutine->amcanreturn = btcanreturn;
135 3140706 : amroutine->amcostestimate = btcostestimate;
136 3140706 : amroutine->amgettreeheight = btgettreeheight;
137 3140706 : amroutine->amoptions = btoptions;
138 3140706 : amroutine->amproperty = btproperty;
139 3140706 : amroutine->ambuildphasename = btbuildphasename;
140 3140706 : amroutine->amvalidate = btvalidate;
141 3140706 : amroutine->amadjustmembers = btadjustmembers;
142 3140706 : amroutine->ambeginscan = btbeginscan;
143 3140706 : amroutine->amrescan = btrescan;
144 3140706 : amroutine->amgettuple = btgettuple;
145 3140706 : amroutine->amgetbitmap = btgetbitmap;
146 3140706 : amroutine->amendscan = btendscan;
147 3140706 : amroutine->ammarkpos = btmarkpos;
148 3140706 : amroutine->amrestrpos = btrestrpos;
149 3140706 : amroutine->amestimateparallelscan = btestimateparallelscan;
150 3140706 : amroutine->aminitparallelscan = btinitparallelscan;
151 3140706 : amroutine->amparallelrescan = btparallelrescan;
152 3140706 : amroutine->amtranslatestrategy = bttranslatestrategy;
153 3140706 : amroutine->amtranslatecmptype = bttranslatecmptype;
154 :
155 3140706 : PG_RETURN_POINTER(amroutine);
156 : }
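/*
 * Illustrative note (not upstream text): nothing calls bthandler() directly.
 * The core code looks up pg_am.amhandler for the "btree" AM and invokes it
 * through the fmgr (see GetIndexAmRoutine() in access/index/amapi.c), then
 * reaches every entry point in this file through the returned callback
 * pointers; for example, btroutine->aminsert ends up calling btinsert().
 */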
157 :
158 : /*
159 : * btbuildempty() -- build an empty btree index in the initialization fork
160 : */
161 : void
162 148 : btbuildempty(Relation index)
163 : {
164 148 : bool allequalimage = _bt_allequalimage(index, false);
165 : BulkWriteState *bulkstate;
166 : BulkWriteBuffer metabuf;
167 :
168 148 : bulkstate = smgr_bulk_start_rel(index, INIT_FORKNUM);
169 :
170 : /* Construct metapage. */
171 148 : metabuf = smgr_bulk_get_buf(bulkstate);
172 148 : _bt_initmetapage((Page) metabuf, P_NONE, 0, allequalimage);
173 148 : smgr_bulk_write(bulkstate, BTREE_METAPAGE, metabuf, true);
174 :
175 148 : smgr_bulk_finish(bulkstate);
176 148 : }
177 :
178 : /*
179 : * btinsert() -- insert an index tuple into a btree.
180 : *
181 : * Descend the tree recursively, find the appropriate location for our
182 : * new tuple, and put it there.
183 : */
184 : bool
185 6889916 : btinsert(Relation rel, Datum *values, bool *isnull,
186 : ItemPointer ht_ctid, Relation heapRel,
187 : IndexUniqueCheck checkUnique,
188 : bool indexUnchanged,
189 : IndexInfo *indexInfo)
190 : {
191 : bool result;
192 : IndexTuple itup;
193 :
194 : /* generate an index tuple */
195 6889916 : itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
196 6889916 : itup->t_tid = *ht_ctid;
197 :
198 6889916 : result = _bt_doinsert(rel, itup, checkUnique, indexUnchanged, heapRel);
199 :
200 6889406 : pfree(itup);
201 :
202 6889406 : return result;
203 : }
204 :
205 : /*
206 : * btgettuple() -- Get the next tuple in the scan.
207 : */
208 : bool
209 31657498 : btgettuple(IndexScanDesc scan, ScanDirection dir)
210 : {
211 31657498 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
212 : bool res;
213 :
214 : /* btree indexes are never lossy */
215 31657498 : scan->xs_recheck = false;
216 :
217 : /* Each loop iteration performs another primitive index scan */
218 : do
219 : {
220 : /*
221 : * If we've already initialized this scan, we can just advance it in
222 : * the appropriate direction. If we haven't done so yet, we call
223 : * _bt_first() to get the first item in the scan.
224 : */
225 31674384 : if (!BTScanPosIsValid(so->currPos))
226 13934726 : res = _bt_first(scan, dir);
227 : else
228 : {
229 : /*
230 : * Check to see if we should kill the previously-fetched tuple.
231 : */
232 17739658 : if (scan->kill_prior_tuple)
233 : {
234 : /*
235 : * Yes, remember it for later. (We'll deal with all such
236 : * tuples at once right before leaving the index page.) The
237 : * test for numKilled overrun is not just paranoia: if the
238 : * caller reverses direction in the indexscan then the same
239 : * item might get entered multiple times. It's not worth
240 : * trying to optimize that, so we don't detect it, but instead
241 : * just forget any excess entries.
242 : */
243 447014 : if (so->killedItems == NULL)
244 161156 : so->killedItems = (int *)
245 161156 : palloc(MaxTIDsPerBTreePage * sizeof(int));
246 447014 : if (so->numKilled < MaxTIDsPerBTreePage)
247 447014 : so->killedItems[so->numKilled++] = so->currPos.itemIndex;
248 : }
249 :
250 : /*
251 : * Now continue the scan.
252 : */
253 17739658 : res = _bt_next(scan, dir);
254 : }
255 :
256 : /* If we have a tuple, return it ... */
257 31674384 : if (res)
258 25335606 : break;
259 : /* ... otherwise see if we need another primitive index scan */
260 6338778 : } while (so->numArrayKeys && _bt_start_prim_scan(scan, dir));
261 :
262 31657498 : return res;
263 : }
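/*
 * Illustrative example (a sketch of behavior, not upstream text): for a qual
 * such as "WHERE indkey = ANY (ARRAY[1, 7, 42])" the scan has array keys, so
 * so->numArrayKeys is set and _bt_start_prim_scan decides whether another
 * primitive index scan (another _bt_first descent) is needed to reach the
 * next array element, or whether the pages already read covered it.  A scan
 * without array keys simply ends once _bt_first or _bt_next returns false.
 */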
264 :
265 : /*
266 : * btgetbitmap() -- gets all matching tuples, and adds them to a bitmap
267 : */
268 : int64
269 18010 : btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
270 : {
271 18010 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
272 18010 : int64 ntids = 0;
273 : ItemPointer heapTid;
274 :
275 : /* Each loop iteration performs another primitive index scan */
276 : do
277 : {
278 : /* Fetch the first page & tuple */
279 18406 : if (_bt_first(scan, ForwardScanDirection))
280 : {
281 : /* Save tuple ID, and continue scanning */
282 9282 : heapTid = &scan->xs_heaptid;
283 9282 : tbm_add_tuples(tbm, heapTid, 1, false);
284 9282 : ntids++;
285 :
286 : for (;;)
287 : {
288 : /*
289 : * Advance to next tuple within page. This is the same as the
290 : * easy case in _bt_next().
291 : */
292 1894002 : if (++so->currPos.itemIndex > so->currPos.lastItem)
293 : {
294 : /* let _bt_next do the heavy lifting */
295 14242 : if (!_bt_next(scan, ForwardScanDirection))
296 9282 : break;
297 : }
298 :
299 : /* Save tuple ID, and continue scanning */
300 1884720 : heapTid = &so->currPos.items[so->currPos.itemIndex].heapTid;
301 1884720 : tbm_add_tuples(tbm, heapTid, 1, false);
302 1884720 : ntids++;
303 : }
304 : }
305 : /* Now see if we need another primitive index scan */
306 18406 : } while (so->numArrayKeys && _bt_start_prim_scan(scan, ForwardScanDirection));
307 :
308 18010 : return ntids;
309 : }
310 :
311 : /*
312 : * btbeginscan() -- start a scan on a btree index
313 : */
314 : IndexScanDesc
315 13289124 : btbeginscan(Relation rel, int nkeys, int norderbys)
316 : {
317 : IndexScanDesc scan;
318 : BTScanOpaque so;
319 :
320 : /* no order by operators allowed */
321 : Assert(norderbys == 0);
322 :
323 : /* get the scan */
324 13289124 : scan = RelationGetIndexScan(rel, nkeys, norderbys);
325 :
326 : /* allocate private workspace */
327 13289124 : so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData));
328 13289124 : BTScanPosInvalidate(so->currPos);
329 13289124 : BTScanPosInvalidate(so->markPos);
330 13289124 : if (scan->numberOfKeys > 0)
331 13276574 : so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
332 : else
333 12550 : so->keyData = NULL;
334 :
335 13289124 : so->needPrimScan = false;
336 13289124 : so->scanBehind = false;
337 13289124 : so->oppositeDirCheck = false;
338 13289124 : so->arrayKeys = NULL;
339 13289124 : so->orderProcs = NULL;
340 13289124 : so->arrayContext = NULL;
341 :
342 13289124 : so->killedItems = NULL; /* until needed */
343 13289124 : so->numKilled = 0;
344 :
345 : /*
346 : * We don't know yet whether the scan will be index-only, so we do not
347 : * allocate the tuple workspace arrays until btrescan. However, we set up
348 : * scan->xs_itupdesc whether we'll need it or not, since that's so cheap.
349 : */
350 13289124 : so->currTuples = so->markTuples = NULL;
351 :
352 13289124 : scan->xs_itupdesc = RelationGetDescr(rel);
353 :
354 13289124 : scan->opaque = so;
355 :
356 13289124 : return scan;
357 : }
358 :
359 : /*
360 : * btrescan() -- rescan an index relation
361 : */
362 : void
363 13936962 : btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
364 : ScanKey orderbys, int norderbys)
365 : {
366 13936962 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
367 :
368 : /* we aren't holding any read locks, but gotta drop the pins */
369 13936962 : if (BTScanPosIsValid(so->currPos))
370 : {
371 : /* Before leaving current page, deal with any killed items */
372 86558 : if (so->numKilled > 0)
373 904 : _bt_killitems(scan);
374 86558 : BTScanPosUnpinIfPinned(so->currPos);
375 86558 : BTScanPosInvalidate(so->currPos);
376 : }
377 :
378 13936962 : so->markItemIndex = -1;
379 13936962 : so->needPrimScan = false;
380 13936962 : so->scanBehind = false;
381 13936962 : so->oppositeDirCheck = false;
382 13936962 : BTScanPosUnpinIfPinned(so->markPos);
383 13936962 : BTScanPosInvalidate(so->markPos);
384 :
385 : /*
386 : * Allocate tuple workspace arrays, if needed for an index-only scan and
387 : * not already done in a previous rescan call. To save on palloc
388 : * overhead, both workspaces are allocated as one palloc block; only this
389 : * function and btendscan know that.
390 : *
391 : * NOTE: this data structure also makes it safe to return data from a
392 : * "name" column, even though btree name_ops uses an underlying storage
393 : * datatype of cstring. The risk there is that "name" is supposed to be
394 : * padded to NAMEDATALEN, but the actual index tuple is probably shorter.
395 : * However, since we only return data out of tuples sitting in the
396 : * currTuples array, a fetch of NAMEDATALEN bytes can at worst pull some
397 : * data out of the markTuples array --- running off the end of memory for
398 : * a SIGSEGV is not possible. Yeah, this is ugly as sin, but it beats
399 : * adding special-case treatment for name_ops elsewhere.
400 : */
401 13936962 : if (scan->xs_want_itup && so->currTuples == NULL)
402 : {
403 235436 : so->currTuples = (char *) palloc(BLCKSZ * 2);
404 235436 : so->markTuples = so->currTuples + BLCKSZ;
405 : }
406 :
407 : /*
408 : * Reset the scan keys
409 : */
410 13936962 : if (scankey && scan->numberOfKeys > 0)
411 13924278 : memcpy(scan->keyData, scankey, scan->numberOfKeys * sizeof(ScanKeyData));
412 13936962 : so->numberOfKeys = 0; /* until _bt_preprocess_keys sets it */
413 13936962 : so->numArrayKeys = 0; /* ditto */
414 13936962 : }
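/*
 * Illustrative layout of the single allocation made above for index-only
 * scans (this is also why btendscan frees only so->currTuples):
 *
 *   so->currTuples                         so->markTuples
 *   |<-------------- BLCKSZ ------------->|<-------------- BLCKSZ ------------->|
 *    copies of tuples in so->currPos        copies of tuples in so->markPos
 */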
415 :
416 : /*
417 : * btendscan() -- close down a scan
418 : */
419 : void
420 13287642 : btendscan(IndexScanDesc scan)
421 : {
422 13287642 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
423 :
424 : /* we aren't holding any read locks, but gotta drop the pins */
425 13287642 : if (BTScanPosIsValid(so->currPos))
426 : {
427 : /* Before leaving current page, deal with any killed items */
428 7508338 : if (so->numKilled > 0)
429 84514 : _bt_killitems(scan);
430 7508338 : BTScanPosUnpinIfPinned(so->currPos);
431 : }
432 :
433 13287642 : so->markItemIndex = -1;
434 13287642 : BTScanPosUnpinIfPinned(so->markPos);
435 :
436 : /* No need to invalidate positions, the RAM is about to be freed. */
437 :
438 : /* Release storage */
439 13287642 : if (so->keyData != NULL)
440 13275122 : pfree(so->keyData);
441 : /* so->arrayKeys and so->orderProcs are in arrayContext */
442 13287642 : if (so->arrayContext != NULL)
443 1138 : MemoryContextDelete(so->arrayContext);
444 13287642 : if (so->killedItems != NULL)
445 161098 : pfree(so->killedItems);
446 13287642 : if (so->currTuples != NULL)
447 235392 : pfree(so->currTuples);
448 : /* so->markTuples should not be pfree'd, see btrescan */
449 13287642 : pfree(so);
450 13287642 : }
451 :
452 : /*
453 : * btmarkpos() -- save current scan position
454 : */
455 : void
456 130042 : btmarkpos(IndexScanDesc scan)
457 : {
458 130042 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
459 :
460 : /* There may be an old mark with a pin (but no lock). */
461 130042 : BTScanPosUnpinIfPinned(so->markPos);
462 :
463 : /*
464 : * Just record the current itemIndex. If we later step to next page
465 : * before releasing the marked position, _bt_steppage makes a full copy of
466 : * the currPos struct in markPos. If (as often happens) the mark is moved
467 : * before we leave the page, we don't have to do that work.
468 : */
469 130042 : if (BTScanPosIsValid(so->currPos))
470 130042 : so->markItemIndex = so->currPos.itemIndex;
471 : else
472 : {
473 0 : BTScanPosInvalidate(so->markPos);
474 0 : so->markItemIndex = -1;
475 : }
476 130042 : }
477 :
478 : /*
479 : * btrestrpos() -- restore scan to last saved position
480 : */
481 : void
482 54018 : btrestrpos(IndexScanDesc scan)
483 : {
484 54018 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
485 :
486 54018 : if (so->markItemIndex >= 0)
487 : {
488 : /*
489 : * The scan has never moved to a new page since the last mark. Just
490 : * restore the itemIndex.
491 : *
492 : * NB: In this case we can't count on anything in so->markPos to be
493 : * accurate.
494 : */
495 53910 : so->currPos.itemIndex = so->markItemIndex;
496 : }
497 : else
498 : {
499 : /*
500 : * The scan moved to a new page after last mark or restore, and we are
501 : * now restoring to the marked page. We aren't holding any read
502 : * locks, but if we're still holding the pin for the current position,
503 : * we must drop it.
504 : */
505 108 : if (BTScanPosIsValid(so->currPos))
506 : {
507 : /* Before leaving current page, deal with any killed items */
508 108 : if (so->numKilled > 0)
509 0 : _bt_killitems(scan);
510 108 : BTScanPosUnpinIfPinned(so->currPos);
511 : }
512 :
513 108 : if (BTScanPosIsValid(so->markPos))
514 : {
515 : /* bump pin on mark buffer for assignment to current buffer */
516 108 : if (BTScanPosIsPinned(so->markPos))
517 0 : IncrBufferRefCount(so->markPos.buf);
518 108 : memcpy(&so->currPos, &so->markPos,
519 : offsetof(BTScanPosData, items[1]) +
520 108 : so->markPos.lastItem * sizeof(BTScanPosItem));
521 108 : if (so->currTuples)
522 0 : memcpy(so->currTuples, so->markTuples,
523 0 : so->markPos.nextTupleOffset);
524 : /* Reset the scan's array keys (see _bt_steppage for why) */
525 108 : if (so->numArrayKeys)
526 : {
527 0 : _bt_start_array_keys(scan, so->currPos.dir);
528 0 : so->needPrimScan = false;
529 : }
530 : }
531 : else
532 0 : BTScanPosInvalidate(so->currPos);
533 : }
534 54018 : }
535 :
536 : /*
537 : * btestimateparallelscan -- estimate storage for BTParallelScanDescData
538 : */
539 : Size
540 64 : btestimateparallelscan(int nkeys, int norderbys)
541 : {
542 : /* Pessimistically assume all input scankeys will be output with arrays */
543 64 : return offsetof(BTParallelScanDescData, btps_arrElems) + sizeof(int) * nkeys;
544 : }
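/*
 * Worked example (illustrative): with nkeys = 2 the request is
 * offsetof(BTParallelScanDescData, btps_arrElems) + 2 * sizeof(int), i.e.
 * the fixed descriptor fields plus one btps_arrElems slot per scan key,
 * reserved whether or not a given key ends up having an array.
 */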
545 :
546 : /*
547 : * btinitparallelscan -- initialize BTParallelScanDesc for parallel btree scan
548 : */
549 : void
550 64 : btinitparallelscan(void *target)
551 : {
552 64 : BTParallelScanDesc bt_target = (BTParallelScanDesc) target;
553 :
554 64 : SpinLockInit(&bt_target->btps_mutex);
555 64 : bt_target->btps_nextScanPage = InvalidBlockNumber;
556 64 : bt_target->btps_lastCurrPage = InvalidBlockNumber;
557 64 : bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
558 64 : ConditionVariableInit(&bt_target->btps_cv);
559 64 : }
560 :
561 : /*
562 : * btparallelrescan() -- reset parallel scan
563 : */
564 : void
565 24 : btparallelrescan(IndexScanDesc scan)
566 : {
567 : BTParallelScanDesc btscan;
568 24 : ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
569 :
570 : Assert(parallel_scan);
571 :
572 24 : btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
573 : parallel_scan->ps_offset);
574 :
575 : /*
576 : * In theory, we don't need to acquire the spinlock here, because there
577 : * shouldn't be any other workers running at this point, but we do so for
578 : * consistency.
579 : */
580 24 : SpinLockAcquire(&btscan->btps_mutex);
581 24 : btscan->btps_nextScanPage = InvalidBlockNumber;
582 24 : btscan->btps_lastCurrPage = InvalidBlockNumber;
583 24 : btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
584 24 : SpinLockRelease(&btscan->btps_mutex);
585 24 : }
586 :
587 : /*
588 : * _bt_parallel_seize() -- Begin the process of advancing the scan to a new
589 : * page. Other scans must wait until we call _bt_parallel_release()
590 : * or _bt_parallel_done().
591 : *
592 : * The return value is true if we successfully seized the scan and false
593 : * if we did not. The latter case occurs when no pages remain, or when
594 : * another primitive index scan is scheduled that caller's backend cannot
595 : * start just yet (only backends that call from _bt_first are capable of
596 : * starting primitive index scans, which they indicate by passing first=true).
597 : *
598 : * If the return value is true, *next_scan_page returns the next page of the
599 : * scan, and *last_curr_page returns the page that *next_scan_page came from.
600 : * An invalid *next_scan_page means the scan hasn't yet started, or that
601 : * caller needs to start the next primitive index scan (if it's the latter
602 : * case we'll set so.needPrimScan).
603 : *
604 : * Callers should ignore the value of *next_scan_page and *last_curr_page if
605 : * the return value is false.
606 : */
607 : bool
608 1658 : _bt_parallel_seize(IndexScanDesc scan, BlockNumber *next_scan_page,
609 : BlockNumber *last_curr_page, bool first)
610 : {
611 1658 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
612 1658 : bool exit_loop = false,
613 1658 : status = true,
614 1658 : endscan = false;
615 1658 : ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
616 : BTParallelScanDesc btscan;
617 :
618 1658 : *next_scan_page = InvalidBlockNumber;
619 1658 : *last_curr_page = InvalidBlockNumber;
620 :
621 : /*
622 : * Reset so->currPos, and initialize moreLeft/moreRight such that the next
623 : * call to _bt_readnextpage treats this backend similarly to a serial
624 : * backend that steps from *last_curr_page to *next_scan_page (unless this
625 : * backend's so->currPos is initialized by _bt_readfirstpage before then).
626 : */
627 1658 : BTScanPosInvalidate(so->currPos);
628 1658 : so->currPos.moreLeft = so->currPos.moreRight = true;
629 :
630 1658 : if (first)
631 : {
632 : /*
633 : * Initialize array related state when called from _bt_first, assuming
634 : * that this will be the first primitive index scan for the scan
635 : */
636 446 : so->needPrimScan = false;
637 446 : so->scanBehind = false;
638 446 : so->oppositeDirCheck = false;
639 : }
640 : else
641 : {
642 : /*
643 : * Don't attempt to seize the scan when it requires another primitive
644 : * index scan, since caller's backend cannot start it right now
645 : */
646 1212 : if (so->needPrimScan)
647 0 : return false;
648 : }
649 :
650 1658 : btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
651 : parallel_scan->ps_offset);
652 :
653 : while (1)
654 : {
655 1658 : SpinLockAcquire(&btscan->btps_mutex);
656 :
657 1658 : if (btscan->btps_pageStatus == BTPARALLEL_DONE)
658 : {
659 : /* We're done with this parallel index scan */
660 312 : status = false;
661 : }
662 1346 : else if (btscan->btps_pageStatus == BTPARALLEL_IDLE &&
663 1222 : btscan->btps_nextScanPage == P_NONE)
664 : {
665 : /* End this parallel index scan */
666 10 : status = false;
667 10 : endscan = true;
668 : }
669 1336 : else if (btscan->btps_pageStatus == BTPARALLEL_NEED_PRIMSCAN)
670 : {
671 : Assert(so->numArrayKeys);
672 :
673 36 : if (first)
674 : {
675 : /* Can start scheduled primitive scan right away, so do so */
676 36 : btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
677 72 : for (int i = 0; i < so->numArrayKeys; i++)
678 : {
679 36 : BTArrayKeyInfo *array = &so->arrayKeys[i];
680 36 : ScanKey skey = &so->keyData[array->scan_key];
681 :
682 36 : array->cur_elem = btscan->btps_arrElems[i];
683 36 : skey->sk_argument = array->elem_values[array->cur_elem];
684 : }
685 36 : exit_loop = true;
686 : }
687 : else
688 : {
689 : /*
690 : * Don't attempt to seize the scan when it requires another
691 : * primitive index scan, since caller's backend cannot start
692 : * it right now
693 : */
694 0 : status = false;
695 : }
696 :
697 : /*
698 : * Either way, update backend local state to indicate that a
699 : * pending primitive scan is required
700 : */
701 36 : so->needPrimScan = true;
702 36 : so->scanBehind = false;
703 36 : so->oppositeDirCheck = false;
704 : }
705 1300 : else if (btscan->btps_pageStatus != BTPARALLEL_ADVANCING)
706 : {
707 : /*
708 : * We have successfully seized control of the scan for the purpose
709 : * of advancing it to a new page!
710 : */
711 1300 : btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
712 : Assert(btscan->btps_nextScanPage != P_NONE);
713 1300 : *next_scan_page = btscan->btps_nextScanPage;
714 1300 : *last_curr_page = btscan->btps_lastCurrPage;
715 1300 : exit_loop = true;
716 : }
717 1658 : SpinLockRelease(&btscan->btps_mutex);
718 1658 : if (exit_loop || !status)
719 : break;
720 0 : ConditionVariableSleep(&btscan->btps_cv, WAIT_EVENT_BTREE_PAGE);
721 : }
722 1658 : ConditionVariableCancelSleep();
723 :
724 : /* When the scan has reached the rightmost (or leftmost) page, end it */
725 1658 : if (endscan)
726 10 : _bt_parallel_done(scan);
727 :
728 1658 : return status;
729 : }
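/*
 * Illustrative sketch of the caller-side protocol (simplified pseudo-code;
 * the real callers are _bt_first and _bt_readnextpage in nbtsearch.c):
 *
 *   if (!_bt_parallel_seize(scan, &next_scan_page, &last_curr_page, false))
 *       ... scan is finished, or a new primitive index scan is pending that
 *           only a _bt_first caller may start; give up for now ...
 *
 *   ... read block next_scan_page and note its sibling link ...
 *
 *   _bt_parallel_release(scan, next_sibling, next_scan_page);
 */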
730 :
731 : /*
732 : * _bt_parallel_release() -- Complete the process of advancing the scan to a
733 : * new page. We now have the new value btps_nextScanPage; another backend
734 : * can now begin advancing the scan.
735 : *
736 : * Callers whose scan uses array keys must save their curr_page argument so
737 : * that it can be passed to _bt_parallel_primscan_schedule, should caller
738 : * determine that another primitive index scan is required.
739 : *
740 : * If caller's next_scan_page is P_NONE, the scan has reached the index's
741 : * rightmost/leftmost page. This is treated as reaching the end of the scan
742 : * within _bt_parallel_seize.
743 : *
744 : * Note: unlike the serial case, parallel scans don't need to remember both
745 : * sibling links. next_scan_page is whichever link is next given the scan's
746 : * direction. That's all we'll ever need, since the direction of a parallel
747 : * scan can never change.
748 : */
749 : void
750 1336 : _bt_parallel_release(IndexScanDesc scan, BlockNumber next_scan_page,
751 : BlockNumber curr_page)
752 : {
753 1336 : ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
754 : BTParallelScanDesc btscan;
755 :
756 : Assert(BlockNumberIsValid(next_scan_page));
757 :
758 1336 : btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
759 : parallel_scan->ps_offset);
760 :
761 1336 : SpinLockAcquire(&btscan->btps_mutex);
762 1336 : btscan->btps_nextScanPage = next_scan_page;
763 1336 : btscan->btps_lastCurrPage = curr_page;
764 1336 : btscan->btps_pageStatus = BTPARALLEL_IDLE;
765 1336 : SpinLockRelease(&btscan->btps_mutex);
766 1336 : ConditionVariableSignal(&btscan->btps_cv);
767 1336 : }
768 :
769 : /*
770 : * _bt_parallel_done() -- Mark the parallel scan as complete.
771 : *
772 : * When there are no pages left to scan, this function should be called to
773 : * notify other workers. Otherwise, they might wait forever for the scan to
774 : * advance to the next page.
775 : */
776 : void
777 6356902 : _bt_parallel_done(IndexScanDesc scan)
778 : {
779 6356902 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
780 6356902 : ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
781 : BTParallelScanDesc btscan;
782 6356902 : bool status_changed = false;
783 :
784 : Assert(!BTScanPosIsValid(so->currPos));
785 :
786 : /* Do nothing, for non-parallel scans */
787 6356902 : if (parallel_scan == NULL)
788 6356744 : return;
789 :
790 : /*
791 : * Should not mark parallel scan done when there's still a pending
792 : * primitive index scan
793 : */
794 158 : if (so->needPrimScan)
795 36 : return;
796 :
797 122 : btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
798 : parallel_scan->ps_offset);
799 :
800 : /*
801 : * Mark the parallel scan as done, unless some other process did so
802 : * already
803 : */
804 122 : SpinLockAcquire(&btscan->btps_mutex);
805 : Assert(btscan->btps_pageStatus != BTPARALLEL_NEED_PRIMSCAN);
806 122 : if (btscan->btps_pageStatus != BTPARALLEL_DONE)
807 : {
808 88 : btscan->btps_pageStatus = BTPARALLEL_DONE;
809 88 : status_changed = true;
810 : }
811 122 : SpinLockRelease(&btscan->btps_mutex);
812 :
813 : /* wake up all the workers associated with this parallel scan */
814 122 : if (status_changed)
815 88 : ConditionVariableBroadcast(&btscan->btps_cv);
816 : }
817 :
818 : /*
819 : * _bt_parallel_primscan_schedule() -- Schedule another primitive index scan.
820 : *
821 : * Caller passes the curr_page most recently passed to _bt_parallel_release
822 : * by its backend. Caller successfully schedules the next primitive index scan
823 : * if the shared parallel state hasn't been seized since caller's backend last
824 : * advanced the scan.
825 : */
826 : void
827 36 : _bt_parallel_primscan_schedule(IndexScanDesc scan, BlockNumber curr_page)
828 : {
829 36 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
830 36 : ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
831 : BTParallelScanDesc btscan;
832 :
833 : Assert(so->numArrayKeys);
834 :
835 36 : btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
836 : parallel_scan->ps_offset);
837 :
838 36 : SpinLockAcquire(&btscan->btps_mutex);
839 36 : if (btscan->btps_lastCurrPage == curr_page &&
840 36 : btscan->btps_pageStatus == BTPARALLEL_IDLE)
841 : {
842 36 : btscan->btps_nextScanPage = InvalidBlockNumber;
843 36 : btscan->btps_lastCurrPage = InvalidBlockNumber;
844 36 : btscan->btps_pageStatus = BTPARALLEL_NEED_PRIMSCAN;
845 :
846 : /* Serialize scan's current array keys */
847 72 : for (int i = 0; i < so->numArrayKeys; i++)
848 : {
849 36 : BTArrayKeyInfo *array = &so->arrayKeys[i];
850 :
851 36 : btscan->btps_arrElems[i] = array->cur_elem;
852 : }
853 : }
854 36 : SpinLockRelease(&btscan->btps_mutex);
855 36 : }
856 :
857 : /*
858 : * Bulk deletion of all index entries pointing to a set of heap tuples.
859 : * The set of target tuples is specified via a callback routine that tells
860 : * whether any given heap tuple (identified by ItemPointer) is being deleted.
861 : *
862 : * Result: a palloc'd struct containing statistical info for VACUUM displays.
863 : */
864 : IndexBulkDeleteResult *
865 2528 : btbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
866 : IndexBulkDeleteCallback callback, void *callback_state)
867 : {
868 2528 : Relation rel = info->index;
869 : BTCycleId cycleid;
870 :
871 : /* allocate stats if first time through, else re-use existing struct */
872 2528 : if (stats == NULL)
873 2528 : stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
874 :
875 : /* Establish the vacuum cycle ID to use for this scan */
876 : /* The ENSURE stuff ensures we clean up shared memory on failure */
877 2528 : PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
878 : {
879 2528 : cycleid = _bt_start_vacuum(rel);
880 :
881 2528 : btvacuumscan(info, stats, callback, callback_state, cycleid);
882 : }
883 2528 : PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
884 2528 : _bt_end_vacuum(rel);
885 :
886 2528 : return stats;
887 : }
888 :
889 : /*
890 : * Post-VACUUM cleanup.
891 : *
892 : * Result: a palloc'd struct containing statistical info for VACUUM displays.
893 : */
894 : IndexBulkDeleteResult *
895 161946 : btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
896 : {
897 : BlockNumber num_delpages;
898 :
899 : /* No-op in ANALYZE ONLY mode */
900 161946 : if (info->analyze_only)
901 16278 : return stats;
902 :
903 : /*
904 : * If btbulkdelete was called, we need not do anything (we just maintain
905 : * the information used within _bt_vacuum_needs_cleanup() by calling
906 : * _bt_set_cleanup_info() below).
907 : *
908 : * If btbulkdelete was _not_ called, then we have a choice to make: we
909 : * must decide whether or not a btvacuumscan() call is needed now (i.e.
910 : * whether the ongoing VACUUM operation can entirely avoid a physical scan
911 : * of the index). A call to _bt_vacuum_needs_cleanup() decides it for us
912 : * now.
913 : */
914 145668 : if (stats == NULL)
915 : {
916 : /* Check if VACUUM operation can entirely avoid btvacuumscan() call */
917 143744 : if (!_bt_vacuum_needs_cleanup(info->index))
918 143736 : return NULL;
919 :
920 : /*
921 : * Since we aren't going to actually delete any leaf items, there's no
922 : * need to go through all the vacuum-cycle-ID pushups here.
923 : *
924 : * Posting list tuples are a source of inaccuracy for cleanup-only
925 : * scans. btvacuumscan() will assume that the number of index tuples
926 : * from each page can be used as num_index_tuples, even though
927 : * num_index_tuples is supposed to represent the number of TIDs in the
928 : * index. This naive approach can underestimate the number of tuples
929 : * in the index significantly.
930 : *
931 : * We handle the problem by making num_index_tuples an estimate in
932 : * cleanup-only case.
933 : */
934 8 : stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
935 8 : btvacuumscan(info, stats, NULL, NULL, 0);
936 8 : stats->estimated_count = true;
937 : }
938 :
939 : /*
940 : * Maintain num_delpages value in metapage for _bt_vacuum_needs_cleanup().
941 : *
942 : * num_delpages is the number of deleted pages now in the index that were
943 : * not safe to place in the FSM to be recycled just yet. num_delpages is
944 : * greater than 0 only when _bt_pagedel() actually deleted pages during
945 : * our call to btvacuumscan(). Even then, _bt_pendingfsm_finalize() must
946 : * have failed to place any newly deleted pages in the FSM just moments
947 : * ago. (Actually, there are edge cases where recycling of the current
948 : * VACUUM's newly deleted pages does not even become safe by the time the
949 : * next VACUUM comes around. See nbtree/README.)
950 : */
951 : Assert(stats->pages_deleted >= stats->pages_free);
952 1932 : num_delpages = stats->pages_deleted - stats->pages_free;
953 1932 : _bt_set_cleanup_info(info->index, num_delpages);
954 :
955 : /*
956 : * It's quite possible for us to be fooled by concurrent page splits into
957 : * double-counting some index tuples, so disbelieve any total that exceeds
958 : * the underlying heap's count ... if we know that accurately. Otherwise
959 : * this might just make matters worse.
960 : */
961 1932 : if (!info->estimated_count)
962 : {
963 1884 : if (stats->num_index_tuples > info->num_heap_tuples)
964 12 : stats->num_index_tuples = info->num_heap_tuples;
965 : }
966 :
967 1932 : return stats;
968 : }
969 :
970 : /*
971 : * btvacuumscan --- scan the index for VACUUMing purposes
972 : *
973 : * This combines the functions of looking for leaf tuples that are deletable
974 : * according to the vacuum callback, looking for empty pages that can be
975 : * deleted, and looking for old deleted pages that can be recycled. Both
976 : * btbulkdelete and btvacuumcleanup invoke this (the latter only if no
977 : * btbulkdelete call occurred and _bt_vacuum_needs_cleanup returned true).
978 : *
979 : * The caller is responsible for initially allocating/zeroing a stats struct
980 : * and for obtaining a vacuum cycle ID if necessary.
981 : */
982 : static void
983 2536 : btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
984 : IndexBulkDeleteCallback callback, void *callback_state,
985 : BTCycleId cycleid)
986 : {
987 2536 : Relation rel = info->index;
988 : BTVacState vstate;
989 : BlockNumber num_pages;
990 : BlockNumber scanblkno;
991 : bool needLock;
992 :
993 : /*
994 : * Reset fields that track information about the entire index now. This
995 : * avoids double-counting in the case where a single VACUUM command
996 : * requires multiple scans of the index.
997 : *
998 : * Avoid resetting the tuples_removed and pages_newly_deleted fields here,
999 : * since they track information about the VACUUM command, and so must last
1000 : * across each call to btvacuumscan().
1001 : *
1002 : * (Note that pages_free is treated as state about the whole index, not
1003 : * the current VACUUM. This is appropriate because RecordFreeIndexPage()
1004 : * calls are idempotent, and get repeated for the same deleted pages in
1005 : * some scenarios. The point for us is to track the number of recyclable
1006 : * pages in the index at the end of the VACUUM command.)
1007 : */
1008 2536 : stats->num_pages = 0;
1009 2536 : stats->num_index_tuples = 0;
1010 2536 : stats->pages_deleted = 0;
1011 2536 : stats->pages_free = 0;
1012 :
1013 : /* Set up info to pass down to btvacuumpage */
1014 2536 : vstate.info = info;
1015 2536 : vstate.stats = stats;
1016 2536 : vstate.callback = callback;
1017 2536 : vstate.callback_state = callback_state;
1018 2536 : vstate.cycleid = cycleid;
1019 :
1020 : /* Create a temporary memory context to run _bt_pagedel in */
1021 2536 : vstate.pagedelcontext = AllocSetContextCreate(CurrentMemoryContext,
1022 : "_bt_pagedel",
1023 : ALLOCSET_DEFAULT_SIZES);
1024 :
1025 : /* Initialize vstate fields used by _bt_pendingfsm_finalize */
1026 2536 : vstate.bufsize = 0;
1027 2536 : vstate.maxbufsize = 0;
1028 2536 : vstate.pendingpages = NULL;
1029 2536 : vstate.npendingpages = 0;
1030 : /* Consider applying _bt_pendingfsm_finalize optimization */
1031 2536 : _bt_pendingfsm_init(rel, &vstate, (callback == NULL));
1032 :
1033 : /*
1034 : * The outer loop iterates over all index pages except the metapage, in
1035 : * physical order (we hope the kernel will cooperate in providing
1036 : * read-ahead for speed). It is critical that we visit all leaf pages,
1037 : * including ones added after we start the scan, else we might fail to
1038 : * delete some deletable tuples. Hence, we must repeatedly check the
1039 : * relation length. We must acquire the relation-extension lock while
1040 : * doing so to avoid a race condition: if someone else is extending the
1041 : * relation, there is a window where bufmgr/smgr have created a new
1042 : * all-zero page but it hasn't yet been write-locked by _bt_getbuf(). If
1043 : * we manage to scan such a page here, we'll improperly assume it can be
1044 : * recycled. Taking the lock synchronizes things enough to prevent a
1045 : * problem: either num_pages won't include the new page, or _bt_getbuf
1046 : * already has write lock on the buffer and it will be fully initialized
1047 : * before we can examine it. Also, we need not worry if a page is added
1048 : * immediately after we look; the page splitting code already has
1049 : * write-lock on the left page before it adds a right page, so we must
1050 : * already have processed any tuples due to be moved into such a page.
1051 : *
1052 : * XXX: Now that new pages are locked with RBM_ZERO_AND_LOCK, I don't
1053 : * think the use of the extension lock is still required.
1054 : *
1055 : * We can skip locking for new or temp relations, however, since no one
1056 : * else could be accessing them.
1057 : */
1058 2536 : needLock = !RELATION_IS_LOCAL(rel);
1059 :
1060 2536 : scanblkno = BTREE_METAPAGE + 1;
1061 : for (;;)
1062 : {
1063 : /* Get the current relation length */
1064 4806 : if (needLock)
1065 4802 : LockRelationForExtension(rel, ExclusiveLock);
1066 4806 : num_pages = RelationGetNumberOfBlocks(rel);
1067 4806 : if (needLock)
1068 4802 : UnlockRelationForExtension(rel, ExclusiveLock);
1069 :
1070 4806 : if (info->report_progress)
1071 940 : pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_TOTAL,
1072 : num_pages);
1073 :
1074 : /* Quit if we've scanned the whole relation */
1075 4806 : if (scanblkno >= num_pages)
1076 2536 : break;
1077 : /* Iterate over pages, then loop back to recheck length */
1078 23464 : for (; scanblkno < num_pages; scanblkno++)
1079 : {
1080 21194 : btvacuumpage(&vstate, scanblkno);
1081 21194 : if (info->report_progress)
1082 978 : pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE,
1083 : scanblkno);
1084 : }
1085 : }
1086 :
1087 : /* Set statistics num_pages field to final size of index */
1088 2536 : stats->num_pages = num_pages;
1089 :
1090 2536 : MemoryContextDelete(vstate.pagedelcontext);
1091 :
1092 : /*
1093 : * If there were any calls to _bt_pagedel() during scan of the index then
1094 : * see if any of the resulting pages can be placed in the FSM now. When
1095 : * it's not safe we'll have to leave it up to a future VACUUM operation.
1096 : *
1097 : * Finally, if we placed any pages in the FSM (either just now or during
1098 : * the scan), forcibly update the upper-level FSM pages to ensure that
1099 : * searchers can find them.
1100 : */
1101 2536 : _bt_pendingfsm_finalize(rel, &vstate);
1102 2536 : if (stats->pages_free > 0)
1103 28 : IndexFreeSpaceMapVacuum(rel);
1104 2536 : }
1105 :
1106 : /*
1107 : * btvacuumpage --- VACUUM one page
1108 : *
1109 : * This processes a single page for btvacuumscan(). In some cases we must
1110 : * backtrack to re-examine and VACUUM pages that were the scanblkno during
1111 : * a previous call here. This is how we handle page splits (that happened
1112 : * after our cycleid was acquired) whose right half page happened to reuse
1113 : * a block that we might have processed at some point before it was
1114 : * recycled (i.e. before the page split).
1115 : */
1116 : static void
1117 21194 : btvacuumpage(BTVacState *vstate, BlockNumber scanblkno)
1118 : {
1119 21194 : IndexVacuumInfo *info = vstate->info;
1120 21194 : IndexBulkDeleteResult *stats = vstate->stats;
1121 21194 : IndexBulkDeleteCallback callback = vstate->callback;
1122 21194 : void *callback_state = vstate->callback_state;
1123 21194 : Relation rel = info->index;
1124 21194 : Relation heaprel = info->heaprel;
1125 : bool attempt_pagedel;
1126 : BlockNumber blkno,
1127 : backtrack_to;
1128 : Buffer buf;
1129 : Page page;
1130 : BTPageOpaque opaque;
1131 :
1132 21194 : blkno = scanblkno;
1133 :
1134 21194 : backtrack:
1135 :
1136 21194 : attempt_pagedel = false;
1137 21194 : backtrack_to = P_NONE;
1138 :
1139 : /* call vacuum_delay_point while not holding any buffer lock */
1140 21194 : vacuum_delay_point(false);
1141 :
1142 : /*
1143 : * We can't use _bt_getbuf() here because it always applies
1144 : * _bt_checkpage(), which will barf on an all-zero page. We want to
1145 : * recycle all-zero pages, not fail. Also, we want to use a nondefault
1146 : * buffer access strategy.
1147 : */
1148 21194 : buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
1149 : info->strategy);
1150 21194 : _bt_lockbuf(rel, buf, BT_READ);
1151 21194 : page = BufferGetPage(buf);
1152 21194 : opaque = NULL;
1153 21194 : if (!PageIsNew(page))
1154 : {
1155 21194 : _bt_checkpage(rel, buf);
1156 21194 : opaque = BTPageGetOpaque(page);
1157 : }
1158 :
1159 : Assert(blkno <= scanblkno);
1160 21194 : if (blkno != scanblkno)
1161 : {
1162 : /*
1163 : * We're backtracking.
1164 : *
1165 : * We followed a right link to a sibling leaf page (a page that
1166 : * happens to be from a block located before scanblkno). The only
1167 : * case we want to do anything with is a live leaf page having the
1168 : * current vacuum cycle ID.
1169 : *
1170 : * The page had better be in a state that's consistent with what we
1171 : * expect. Check for conditions that imply corruption in passing. It
1172 : * can't be half-dead because only an interrupted VACUUM process can
1173 : * leave pages in that state, so we'd definitely have dealt with it
1174 : * back when the page was the scanblkno page (half-dead pages are
1175 : * always marked fully deleted by _bt_pagedel(), barring corruption).
1176 : */
1177 0 : if (!opaque || !P_ISLEAF(opaque) || P_ISHALFDEAD(opaque))
1178 : {
1179 : Assert(false);
1180 0 : ereport(LOG,
1181 : (errcode(ERRCODE_INDEX_CORRUPTED),
1182 : errmsg_internal("right sibling %u of scanblkno %u unexpectedly in an inconsistent state in index \"%s\"",
1183 : blkno, scanblkno, RelationGetRelationName(rel))));
1184 0 : _bt_relbuf(rel, buf);
1185 0 : return;
1186 : }
1187 :
1188 : /*
1189 : * We may have already processed the page in an earlier call, when the
1190 : * page was scanblkno. This happens when the leaf page split occurred
1191 : * after the scan began, but before the right sibling page became the
1192 : * scanblkno.
1193 : *
1194 : * Page may also have been deleted by current btvacuumpage() call,
1195 : * since _bt_pagedel() sometimes deletes the right sibling page of
1196 : * scanblkno in passing (it does so after we decided where to
1197 : * backtrack to). We don't need to process this page as a deleted
1198 : * page a second time now (in fact, it would be wrong to count it as a
1199 : * deleted page in the bulk delete statistics a second time).
1200 : */
1201 0 : if (opaque->btpo_cycleid != vstate->cycleid || P_ISDELETED(opaque))
1202 : {
1203 : /* Done with current scanblkno (and all lower split pages) */
1204 0 : _bt_relbuf(rel, buf);
1205 0 : return;
1206 : }
1207 : }
1208 :
1209 21194 : if (!opaque || BTPageIsRecyclable(page, heaprel))
1210 : {
1211 : /* Okay to recycle this page (which could be leaf or internal) */
1212 144 : RecordFreeIndexPage(rel, blkno);
1213 144 : stats->pages_deleted++;
1214 144 : stats->pages_free++;
1215 : }
1216 21050 : else if (P_ISDELETED(opaque))
1217 : {
1218 : /*
1219 : * Already deleted page (which could be leaf or internal). Can't
1220 : * recycle yet.
1221 : */
1222 108 : stats->pages_deleted++;
1223 : }
1224 20942 : else if (P_ISHALFDEAD(opaque))
1225 : {
1226 : /* Half-dead leaf page (from interrupted VACUUM) -- finish deleting */
1227 0 : attempt_pagedel = true;
1228 :
1229 : /*
1230 : * _bt_pagedel() will increment both pages_newly_deleted and
1231 : * pages_deleted stats in all cases (barring corruption)
1232 : */
1233 : }
1234 20942 : else if (P_ISLEAF(opaque))
1235 : {
1236 : OffsetNumber deletable[MaxIndexTuplesPerPage];
1237 : int ndeletable;
1238 : BTVacuumPosting updatable[MaxIndexTuplesPerPage];
1239 : int nupdatable;
1240 : OffsetNumber offnum,
1241 : minoff,
1242 : maxoff;
1243 : int nhtidsdead,
1244 : nhtidslive;
1245 :
1246 : /*
1247 : * Trade in the initial read lock for a full cleanup lock on this
1248 : * page. We must get such a lock on every leaf page over the course
1249 : * of the vacuum scan, whether or not it actually contains any
1250 : * deletable tuples --- see nbtree/README.
1251 : */
1252 19674 : _bt_upgradelockbufcleanup(rel, buf);
1253 :
1254 : /*
1255 : * Check whether we need to backtrack to earlier pages. What we are
1256 : * concerned about is a page split that happened since we started the
1257 : * vacuum scan. If the split moved tuples on the right half of the
1258 : * split (i.e. the tuples that sort high) to a block that we already
1259 : * passed over, then we might have missed the tuples. We need to
1260 : * backtrack now. (Must do this before possibly clearing btpo_cycleid
1261 : * or deleting scanblkno page below!)
1262 : */
1263 19674 : if (vstate->cycleid != 0 &&
1264 19574 : opaque->btpo_cycleid == vstate->cycleid &&
1265 4 : !(opaque->btpo_flags & BTP_SPLIT_END) &&
1266 4 : !P_RIGHTMOST(opaque) &&
1267 2 : opaque->btpo_next < scanblkno)
1268 0 : backtrack_to = opaque->btpo_next;
1269 :
1270 19674 : ndeletable = 0;
1271 19674 : nupdatable = 0;
1272 19674 : minoff = P_FIRSTDATAKEY(opaque);
1273 19674 : maxoff = PageGetMaxOffsetNumber(page);
1274 19674 : nhtidsdead = 0;
1275 19674 : nhtidslive = 0;
1276 19674 : if (callback)
1277 : {
1278 : /* btbulkdelete callback tells us what to delete (or update) */
1279 3877014 : for (offnum = minoff;
1280 : offnum <= maxoff;
1281 3857440 : offnum = OffsetNumberNext(offnum))
1282 : {
1283 : IndexTuple itup;
1284 :
1285 3857440 : itup = (IndexTuple) PageGetItem(page,
1286 3857440 : PageGetItemId(page, offnum));
1287 :
1288 : Assert(!BTreeTupleIsPivot(itup));
1289 3857440 : if (!BTreeTupleIsPosting(itup))
1290 : {
1291 : /* Regular tuple, standard table TID representation */
1292 3739280 : if (callback(&itup->t_tid, callback_state))
1293 : {
1294 1438228 : deletable[ndeletable++] = offnum;
1295 1438228 : nhtidsdead++;
1296 : }
1297 : else
1298 2301052 : nhtidslive++;
1299 : }
1300 : else
1301 : {
1302 : BTVacuumPosting vacposting;
1303 : int nremaining;
1304 :
1305 : /* Posting list tuple */
1306 118160 : vacposting = btreevacuumposting(vstate, itup, offnum,
1307 : &nremaining);
1308 118160 : if (vacposting == NULL)
1309 : {
1310 : /*
1311 : * All table TIDs from the posting tuple remain, so no
1312 : * delete or update required
1313 : */
1314 : Assert(nremaining == BTreeTupleGetNPosting(itup));
1315 : }
1316 74062 : else if (nremaining > 0)
1317 : {
1318 :
1319 : /*
1320 : * Store metadata about posting list tuple in
1321 : * updatable array for entire page. Existing tuple
1322 : * will be updated during the later call to
1323 : * _bt_delitems_vacuum().
1324 : */
1325 : Assert(nremaining < BTreeTupleGetNPosting(itup));
1326 35116 : updatable[nupdatable++] = vacposting;
1327 35116 : nhtidsdead += BTreeTupleGetNPosting(itup) - nremaining;
1328 : }
1329 : else
1330 : {
1331 : /*
1332 : * All table TIDs from the posting list must be
1333 : * deleted. We'll delete the index tuple completely
1334 : * (no update required).
1335 : */
1336 : Assert(nremaining == 0);
1337 38946 : deletable[ndeletable++] = offnum;
1338 38946 : nhtidsdead += BTreeTupleGetNPosting(itup);
1339 38946 : pfree(vacposting);
1340 : }
1341 :
1342 118160 : nhtidslive += nremaining;
1343 : }
1344 : }
1345 : }
1346 :
1347 : /*
1348 : * Apply any needed deletes or updates. We issue just one
1349 : * _bt_delitems_vacuum() call per page, so as to minimize WAL traffic.
1350 : */
1351 19674 : if (ndeletable > 0 || nupdatable > 0)
1352 : {
1353 : Assert(nhtidsdead >= ndeletable + nupdatable);
1354 12728 : _bt_delitems_vacuum(rel, buf, deletable, ndeletable, updatable,
1355 : nupdatable);
1356 :
1357 12728 : stats->tuples_removed += nhtidsdead;
1358 : /* must recompute maxoff */
1359 12728 : maxoff = PageGetMaxOffsetNumber(page);
1360 :
1361 : /* can't leak memory here */
1362 47844 : for (int i = 0; i < nupdatable; i++)
1363 35116 : pfree(updatable[i]);
1364 : }
1365 : else
1366 : {
1367 : /*
1368 : * If the leaf page has been split during this vacuum cycle, it
1369 : * seems worth expending a write to clear btpo_cycleid even if we
1370 : * don't have any deletions to do. (If we do, _bt_delitems_vacuum
1371 : * takes care of this.) This ensures we won't process the page
1372 : * again.
1373 : *
1374 : * We treat this like a hint-bit update because there's no need to
1375 : * WAL-log it.
1376 : */
1377 : Assert(nhtidsdead == 0);
1378 6946 : if (vstate->cycleid != 0 &&
1379 6846 : opaque->btpo_cycleid == vstate->cycleid)
1380 : {
1381 4 : opaque->btpo_cycleid = 0;
1382 4 : MarkBufferDirtyHint(buf, true);
1383 : }
1384 : }
1385 :
1386 : /*
1387 : * If the leaf page is now empty, try to delete it; else count the
1388 : * live tuples (live table TIDs in posting lists are counted as
1389 : * separate live tuples). We don't delete when backtracking, though,
1390 : * since that would require teaching _bt_pagedel() about backtracking
1391 : * (doesn't seem worth adding more complexity to deal with that).
1392 : *
1393 : * We don't count the number of live TIDs during cleanup-only calls to
1394 : * btvacuumscan (i.e. when callback is not set). We count the number
1395 : * of index tuples directly instead. This avoids the expense of
1396 : * directly examining all of the tuples on each page. VACUUM will
1397 : * treat num_index_tuples as an estimate in cleanup-only case, so it
1398 : * doesn't matter that this underestimates num_index_tuples
1399 : * significantly in some cases.
1400 : */
1401 19674 : if (minoff > maxoff)
1402 5574 : attempt_pagedel = (blkno == scanblkno);
1403 14100 : else if (callback)
1404 14008 : stats->num_index_tuples += nhtidslive;
1405 : else
1406 92 : stats->num_index_tuples += maxoff - minoff + 1;
1407 :
1408 : Assert(!attempt_pagedel || nhtidslive == 0);
1409 : }
1410 :
1411 21194 : if (attempt_pagedel)
1412 : {
1413 : MemoryContext oldcontext;
1414 :
1415 : /* Run pagedel in a temp context to avoid memory leakage */
1416 5574 : MemoryContextReset(vstate->pagedelcontext);
1417 5574 : oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext);
1418 :
1419 : /*
1420 : * _bt_pagedel maintains the bulk delete stats on our behalf;
1421 : * pages_newly_deleted and pages_deleted are likely to be incremented
1422 : * during call
1423 : */
1424 : Assert(blkno == scanblkno);
1425 5574 : _bt_pagedel(rel, buf, vstate);
1426 :
1427 5574 : MemoryContextSwitchTo(oldcontext);
1428 : /* pagedel released buffer, so we shouldn't */
1429 : }
1430 : else
1431 15620 : _bt_relbuf(rel, buf);
1432 :
1433 21194 : if (backtrack_to != P_NONE)
1434 : {
1435 0 : blkno = backtrack_to;
1436 0 : goto backtrack;
1437 : }
1438 : }
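/*
 * Illustrative backtracking scenario (simplified): suppose this VACUUM has
 * already processed block 5, and a leaf page at block 10 then splits, placing
 * its new right sibling in recycled block 5.  When block 10 becomes the
 * scanblkno, its btpo_cycleid matches the current cycle and its btpo_next (5)
 * is less than scanblkno (10), so backtrack_to is set and we jump back to
 * re-vacuum block 5 before continuing with block 11.
 */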
1439 :
1440 : /*
1441 : * btreevacuumposting --- determine TIDs still needed in posting list
1442 : *
1443 : * Returns metadata describing how to build replacement tuple without the TIDs
1444 : * that VACUUM needs to delete. Returned value is NULL in the common case
1445 : * where no changes are needed to caller's posting list tuple (we avoid
1446 : * allocating memory here as an optimization).
1447 : *
1448 : * The number of TIDs that should remain in the posting list tuple is set for
1449 : * caller in *nremaining.
1450 : */
1451 : static BTVacuumPosting
1452 118160 : btreevacuumposting(BTVacState *vstate, IndexTuple posting,
1453 : OffsetNumber updatedoffset, int *nremaining)
1454 : {
1455 118160 : int live = 0;
1456 118160 : int nitem = BTreeTupleGetNPosting(posting);
1457 118160 : ItemPointer items = BTreeTupleGetPosting(posting);
1458 118160 : BTVacuumPosting vacposting = NULL;
1459 :
1460 669108 : for (int i = 0; i < nitem; i++)
1461 : {
1462 550948 : if (!vstate->callback(items + i, vstate->callback_state))
1463 : {
1464 : /* Live table TID */
1465 298296 : live++;
1466 : }
1467 252652 : else if (vacposting == NULL)
1468 : {
1469 : /*
1470 : * First dead table TID encountered.
1471 : *
1472 : * It's now clear that we need to delete one or more dead table
1473 : * TIDs, so start maintaining metadata describing how to update
1474 : * existing posting list tuple.
1475 : */
1476 74062 : vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) +
1477 : nitem * sizeof(uint16));
1478 :
1479 74062 : vacposting->itup = posting;
1480 74062 : vacposting->updatedoffset = updatedoffset;
1481 74062 : vacposting->ndeletedtids = 0;
1482 74062 : vacposting->deletetids[vacposting->ndeletedtids++] = i;
1483 : }
1484 : else
1485 : {
1486 : /* Second or subsequent dead table TID */
1487 178590 : vacposting->deletetids[vacposting->ndeletedtids++] = i;
1488 : }
1489 : }
1490 :
1491 118160 : *nremaining = live;
1492 118160 : return vacposting;
1493 : }
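/*
 * Worked example (illustrative): for a posting list tuple holding 5 TIDs of
 * which the callback reports the TIDs at offsets 1 and 3 as dead, the result
 * has ndeletedtids == 2 with deletetids[] == {1, 3}, and *nremaining is set
 * to 3.  If no TID is dead the result is NULL; if all 5 are dead the result
 * has ndeletedtids == 5 and *nremaining == 0, which btvacuumpage treats as a
 * deletion of the whole tuple.
 */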
1494 :
1495 : /*
1496 : * btcanreturn() -- Check whether btree indexes support index-only scans.
1497 : *
1498 : * btrees always do, so this is trivial.
1499 : */
1500 : bool
1501 1322776 : btcanreturn(Relation index, int attno)
1502 : {
1503 1322776 : return true;
1504 : }
1505 :
1506 : /*
1507 : * btgettreeheight() -- Compute tree height for use by btcostestimate().
1508 : */
1509 : int
1510 848592 : btgettreeheight(Relation rel)
1511 : {
1512 848592 : return _bt_getrootheight(rel);
1513 : }
1514 :
1515 : CompareType
1516 0 : bttranslatestrategy(StrategyNumber strategy, Oid opfamily)
1517 : {
1518 0 : switch (strategy)
1519 : {
1520 0 : case BTLessStrategyNumber:
1521 0 : return COMPARE_LT;
1522 0 : case BTLessEqualStrategyNumber:
1523 0 : return COMPARE_LE;
1524 0 : case BTEqualStrategyNumber:
1525 0 : return COMPARE_EQ;
1526 0 : case BTGreaterEqualStrategyNumber:
1527 0 : return COMPARE_GE;
1528 0 : case BTGreaterStrategyNumber:
1529 0 : return COMPARE_GT;
1530 0 : default:
1531 0 : return COMPARE_INVALID;
1532 : }
1533 : }
1534 :
1535 : StrategyNumber
1536 323406 : bttranslatecmptype(CompareType cmptype, Oid opfamily)
1537 : {
1538 323406 : switch (cmptype)
1539 : {
1540 0 : case COMPARE_LT:
1541 0 : return BTLessStrategyNumber;
1542 0 : case COMPARE_LE:
1543 0 : return BTLessEqualStrategyNumber;
1544 323406 : case COMPARE_EQ:
1545 323406 : return BTEqualStrategyNumber;
1546 0 : case COMPARE_GE:
1547 0 : return BTGreaterEqualStrategyNumber;
1548 0 : case COMPARE_GT:
1549 0 : return BTGreaterStrategyNumber;
1550 0 : default:
1551 0 : return InvalidStrategy;
1552 : }
1553 : }
|