Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * nbtpage.c
4 : * BTree-specific page management code for the Postgres btree access
5 : * method.
6 : *
7 : * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
8 : * Portions Copyright (c) 1994, Regents of the University of California
9 : *
10 : *
11 : * IDENTIFICATION
12 : * src/backend/access/nbtree/nbtpage.c
13 : *
14 : * NOTES
15 : * Postgres btree pages look like ordinary relation pages. The opaque
16 : * data at high addresses includes pointers to left and right siblings
17 : * and flag data describing page state. The first page in a btree, page
18 : * zero, is special -- it stores meta-information describing the tree.
19 : * Pages one and higher store the actual tree data.
20 : *
21 : *-------------------------------------------------------------------------
22 : */
23 : #include "postgres.h"
24 :
25 : #include "access/nbtree.h"
26 : #include "access/nbtxlog.h"
27 : #include "access/tableam.h"
28 : #include "access/transam.h"
29 : #include "access/xlog.h"
30 : #include "access/xloginsert.h"
31 : #include "common/int.h"
32 : #include "miscadmin.h"
33 : #include "storage/indexfsm.h"
34 : #include "storage/predicate.h"
35 : #include "storage/procarray.h"
36 : #include "utils/injection_point.h"
37 : #include "utils/memdebug.h"
38 : #include "utils/memutils.h"
39 : #include "utils/snapmgr.h"
40 :
41 : static BTMetaPageData *_bt_getmeta(Relation rel, Buffer metabuf);
42 : static void _bt_delitems_delete(Relation rel, Buffer buf,
43 : TransactionId snapshotConflictHorizon,
44 : bool isCatalogRel,
45 : OffsetNumber *deletable, int ndeletable,
46 : BTVacuumPosting *updatable, int nupdatable);
47 : static char *_bt_delitems_update(BTVacuumPosting *updatable, int nupdatable,
48 : OffsetNumber *updatedoffsets,
49 : Size *updatedbuflen, bool needswal);
50 : static bool _bt_mark_page_halfdead(Relation rel, Relation heaprel,
51 : Buffer leafbuf, BTStack stack);
52 : static bool _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf,
53 : BlockNumber scanblkno,
54 : bool *rightsib_empty,
55 : BTVacState *vstate);
56 : static bool _bt_lock_subtree_parent(Relation rel, Relation heaprel,
57 : BlockNumber child, BTStack stack,
58 : Buffer *subtreeparent, OffsetNumber *poffset,
59 : BlockNumber *topparent,
60 : BlockNumber *topparentrightsib);
61 : static void _bt_pendingfsm_add(BTVacState *vstate, BlockNumber target,
62 : FullTransactionId safexid);
63 :
64 : /*
65 : * _bt_initmetapage() -- Fill a page buffer with a correct metapage image
 : *
 : * Both the true root and the "fast root" fields are seeded from
 : * rootbknum/level, and allequalimage records whether deduplication is
 : * safe for this index. This routine writes no WAL; it only builds the
 : * in-memory page image.
66 : */
67 : void
68 30925 : _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level,
69 : bool allequalimage)
70 : {
71 : BTMetaPageData *metad;
72 : BTPageOpaque metaopaque;
73 :
74 30925 : _bt_pageinit(page, BLCKSZ);
75 :
76 30925 : metad = BTPageGetMeta(page);
77 30925 : metad->btm_magic = BTREE_MAGIC;
78 30925 : metad->btm_version = BTREE_VERSION;
79 30925 : metad->btm_root = rootbknum;
80 30925 : metad->btm_level = level;
81 30925 : metad->btm_fastroot = rootbknum;
82 30925 : metad->btm_fastlevel = level;
83 30925 : metad->btm_last_cleanup_num_delpages = 0;
84 30925 : metad->btm_last_cleanup_num_heap_tuples = -1.0;
85 30925 : metad->btm_allequalimage = allequalimage;
86 :
87 30925 : metaopaque = BTPageGetOpaque(page);
88 30925 : metaopaque->btpo_flags = BTP_META;
89 :
90 : /*
91 : * Set pd_lower just past the end of the metadata. This is essential,
92 : * because without doing so, metadata will be lost if xlog.c compresses
93 : * the page.
94 : */
95 30925 : ((PageHeader) page)->pd_lower =
96 30925 : ((char *) metad + sizeof(BTMetaPageData)) - (char *) page;
97 30925 : }
98 :
99 : /*
100 : * _bt_upgrademetapage() -- Upgrade a meta-page from an old format to version
101 : * 3, the last version that can be updated without broadly affecting
102 : * on-disk compatibility. (A REINDEX is required to upgrade to v4.)
103 : *
104 : * This routine does purely in-memory image upgrade. Caller is
105 : * responsible for locking, WAL-logging etc.
 : *
 : * (Versions 2 and 3 are both !heapkeyspace formats; see
 : * _bt_metaversion. btm_allequalimage remains false here, since only a
 : * REINDEX to v4 can ever set it.)
106 : */
107 : void
108 0 : _bt_upgrademetapage(Page page)
109 : {
110 : BTMetaPageData *metad;
 : /* metaopaque is referenced only by the Asserts below */
111 : BTPageOpaque metaopaque PG_USED_FOR_ASSERTS_ONLY;
112 :
113 0 : metad = BTPageGetMeta(page);
114 0 : metaopaque = BTPageGetOpaque(page);
115 :
116 : /* It must be really a meta page of upgradable version */
117 : Assert(metaopaque->btpo_flags & BTP_META);
118 : Assert(metad->btm_version < BTREE_NOVAC_VERSION);
119 : Assert(metad->btm_version >= BTREE_MIN_VERSION);
120 :
121 : /* Set version number and fill extra fields added into version 3 */
122 0 : metad->btm_version = BTREE_NOVAC_VERSION;
123 0 : metad->btm_last_cleanup_num_delpages = 0;
124 0 : metad->btm_last_cleanup_num_heap_tuples = -1.0;
125 : /* Only a REINDEX can set this field */
126 : Assert(!metad->btm_allequalimage);
127 0 : metad->btm_allequalimage = false;
128 :
129 : /* Adjust pd_lower (see _bt_initmetapage() for details) */
130 0 : ((PageHeader) page)->pd_lower =
131 0 : ((char *) metad + sizeof(BTMetaPageData)) - (char *) page;
132 0 : }
133 :
134 : /*
135 : * Get metadata from share-locked buffer containing metapage, while performing
136 : * standard sanity checks.
137 : *
138 : * Callers that cache data returned here in local cache should note that an
139 : * on-the-fly upgrade using _bt_upgrademetapage() can change the version field
140 : * and BTREE_NOVAC_VERSION specific fields without invalidating local cache.
 : *
 : * The returned pointer points directly into the metapage buffer, so it
 : * is only valid while the caller keeps metabuf pinned and locked.
 : * Raises ERROR (ERRCODE_INDEX_CORRUPTED) on a bad magic/flag or an
 : * unsupported on-disk version.
141 : */
142 : static BTMetaPageData *
143 1115819 : _bt_getmeta(Relation rel, Buffer metabuf)
144 : {
145 : Page metapg;
146 : BTPageOpaque metaopaque;
147 : BTMetaPageData *metad;
148 :
149 1115819 : metapg = BufferGetPage(metabuf);
150 1115819 : metaopaque = BTPageGetOpaque(metapg);
151 1115819 : metad = BTPageGetMeta(metapg);
152 :
153 : /* sanity-check the metapage */
154 1115819 : if (!P_ISMETA(metaopaque) ||
155 1115819 : metad->btm_magic != BTREE_MAGIC)
156 0 : ereport(ERROR,
157 : (errcode(ERRCODE_INDEX_CORRUPTED),
158 : errmsg("index \"%s\" is not a btree",
159 : RelationGetRelationName(rel))));
160 :
161 1115819 : if (metad->btm_version < BTREE_MIN_VERSION ||
162 1115819 : metad->btm_version > BTREE_VERSION)
163 0 : ereport(ERROR,
164 : (errcode(ERRCODE_INDEX_CORRUPTED),
165 : errmsg("version mismatch in index \"%s\": file version %d, "
166 : "current version %d, minimal supported version %d",
167 : RelationGetRelationName(rel),
168 : metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION)));
169 :
170 1115819 : return metad;
171 : }
172 :
173 : /*
174 : * _bt_vacuum_needs_cleanup() -- Checks if index needs cleanup
175 : *
176 : * Called by btvacuumcleanup when btbulkdelete was never called because no
177 : * index tuples needed to be deleted.
 : *
 : * Returns true to force a full btvacuumscan call: either the metapage
 : * still needs its on-the-fly version upgrade, or the previously recorded
 : * number of deleted pages exceeds 5% of the index (see test below).
178 : */
179 : bool
180 131966 : _bt_vacuum_needs_cleanup(Relation rel)
181 : {
182 : Buffer metabuf;
183 : Page metapg;
184 : BTMetaPageData *metad;
185 : uint32 btm_version;
186 : BlockNumber prev_num_delpages;
187 :
188 : /*
189 : * Copy details from metapage to local variables quickly.
190 : *
191 : * Note that we deliberately avoid using cached version of metapage here.
192 : */
193 131966 : metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
194 131966 : metapg = BufferGetPage(metabuf);
195 131966 : metad = BTPageGetMeta(metapg);
196 131966 : btm_version = metad->btm_version;
197 :
198 131966 : if (btm_version < BTREE_NOVAC_VERSION)
199 : {
200 : /*
201 : * Metapage needs to be dynamically upgraded to store fields that are
202 : * only present when btm_version >= BTREE_NOVAC_VERSION
203 : */
204 0 : _bt_relbuf(rel, metabuf);
205 0 : return true;
206 : }
207 :
208 131966 : prev_num_delpages = metad->btm_last_cleanup_num_delpages;
209 131966 : _bt_relbuf(rel, metabuf);
210 :
211 : /*
212 : * Trigger cleanup in rare cases where prev_num_delpages exceeds 5% of the
213 : * total size of the index. We can reasonably expect (though are not
214 : * guaranteed) to be able to recycle this many pages if we decide to do a
215 : * btvacuumscan call during the ongoing btvacuumcleanup. For further
216 : * details see the nbtree/README section on placing deleted pages in the
217 : * FSM.
218 : */
219 131966 : if (prev_num_delpages > 0 &&
220 7 : prev_num_delpages > RelationGetNumberOfBlocks(rel) / 20)
221 7 : return true;
222 :
223 131959 : return false;
224 : }
225 :
226 : /*
227 : * _bt_set_cleanup_info() -- Update metapage for btvacuumcleanup.
228 : *
229 : * Called at the end of btvacuumcleanup, when num_delpages value has been
230 : * finalized.
 : *
 : * The metapage is left untouched (no dirtying, no WAL) when the stored
 : * btm_last_cleanup_num_delpages already matches num_delpages, which is
 : * the common case.
231 : */
232 : void
233 1425 : _bt_set_cleanup_info(Relation rel, BlockNumber num_delpages)
234 : {
235 : Buffer metabuf;
236 : Page metapg;
237 : BTMetaPageData *metad;
238 : XLogRecPtr recptr;
239 :
240 : /*
241 : * On-disk compatibility note: The btm_last_cleanup_num_delpages metapage
242 : * field started out as a TransactionId field called btm_oldest_btpo_xact.
243 : * Both "versions" are just uint32 fields. It was convenient to repurpose
244 : * the field when we began to use 64-bit XIDs in deleted pages.
245 : *
246 : * It's possible that a pg_upgrade'd database will contain an XID value in
247 : * what is now recognized as the metapage's btm_last_cleanup_num_delpages
248 : * field. _bt_vacuum_needs_cleanup() may even believe that this value
249 : * indicates that there are lots of pages that it needs to recycle, when
250 : * in reality there are only one or two. The worst that can happen is
251 : * that there will be a call to btvacuumscan a little earlier, which will
252 : * set btm_last_cleanup_num_delpages to a sane value when we're called.
253 : *
254 : * Note also that the metapage's btm_last_cleanup_num_heap_tuples field is
255 : * no longer used as of PostgreSQL 14. We set it to -1.0 on rewrite, just
256 : * to be consistent.
257 : */
258 1425 : metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
259 1425 : metapg = BufferGetPage(metabuf);
260 1425 : metad = BTPageGetMeta(metapg);
261 :
262 : /* Don't miss chance to upgrade index/metapage when BTREE_MIN_VERSION */
263 1425 : if (metad->btm_version >= BTREE_NOVAC_VERSION &&
264 1425 : metad->btm_last_cleanup_num_delpages == num_delpages)
265 : {
266 : /* Usually means index continues to have num_delpages of 0 */
267 1325 : _bt_relbuf(rel, metabuf);
268 1325 : return;
269 : }
270 :
271 : /* trade in our read lock for a write lock */
272 100 : _bt_unlockbuf(rel, metabuf);
273 100 : _bt_lockbuf(rel, metabuf, BT_WRITE);
 :
 : /*
 : * NOTE(review): the fields are not rechecked after the lock upgrade --
 : * presumably safe because only VACUUM writes them and one VACUUM runs
 : * per index at a time; confirm against vacuum's locking rules.
 : */
274 :
275 100 : START_CRIT_SECTION();
276 :
277 : /* upgrade meta-page if needed */
278 100 : if (metad->btm_version < BTREE_NOVAC_VERSION)
279 0 : _bt_upgrademetapage(metapg);
280 :
281 : /* update cleanup-related information */
282 100 : metad->btm_last_cleanup_num_delpages = num_delpages;
283 100 : metad->btm_last_cleanup_num_heap_tuples = -1.0;
284 100 : MarkBufferDirty(metabuf);
285 :
286 : /* write wal record if needed */
287 100 : if (RelationNeedsWAL(rel))
288 100 : {
289 : xl_btree_metadata md;
290 :
291 100 : XLogBeginInsert();
292 100 : XLogRegisterBuffer(0, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
293 :
294 : Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
295 100 : md.version = metad->btm_version;
296 100 : md.root = metad->btm_root;
297 100 : md.level = metad->btm_level;
298 100 : md.fastroot = metad->btm_fastroot;
299 100 : md.fastlevel = metad->btm_fastlevel;
300 100 : md.last_cleanup_num_delpages = num_delpages;
301 100 : md.allequalimage = metad->btm_allequalimage;
302 :
303 100 : XLogRegisterBufData(0, &md, sizeof(xl_btree_metadata));
304 :
305 100 : recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_META_CLEANUP);
306 : }
307 : else
 : /* unlogged/temp relation: use a fake LSN to advance the page LSN */
308 0 : recptr = XLogGetFakeLSN(rel);
309 :
310 100 : PageSetLSN(metapg, recptr);
311 :
312 100 : END_CRIT_SECTION();
313 :
314 100 : _bt_relbuf(rel, metabuf);
315 : }
316 :
317 : /*
318 : * _bt_getroot() -- Get the root page of the btree.
319 : *
320 : * Since the root page can move around the btree file, we have to read
321 : * its location from the metadata page, and then read the root page
322 : * itself. If no root page exists yet, we have to create one.
323 : *
324 : * The access type parameter (BT_READ or BT_WRITE) controls whether
325 : * a new root page will be created or not. If access = BT_READ,
326 : * and no root page exists, we just return InvalidBuffer. For
327 : * BT_WRITE, we try to create the root page if it doesn't exist.
328 : * NOTE that the returned root page will have only a read lock set
329 : * on it even if access = BT_WRITE!
330 : *
331 : * If access = BT_WRITE, heaprel must be set; otherwise caller can just
332 : * pass NULL. See _bt_allocbuf for an explanation.
333 : *
334 : * The returned page is not necessarily the true root --- it could be
335 : * a "fast root" (a page that is alone in its level due to deletions).
336 : * Also, if the root page is split while we are "in flight" to it,
337 : * what we will return is the old root, which is now just the leftmost
338 : * page on a probably-not-very-wide level. For most purposes this is
339 : * as good as or better than the true root, so we do not bother to
340 : * insist on finding the true root. We do, however, guarantee to
341 : * return a live (not deleted or half-dead) page.
342 : *
343 : * On successful return, the root page is pinned and read-locked.
344 : * The metadata page is not locked or pinned on exit.
345 : */
346 : Buffer
347 14922069 : _bt_getroot(Relation rel, Relation heaprel, int access)
348 : {
349 : Buffer metabuf;
350 : Buffer rootbuf;
351 : Page rootpage;
352 : BTPageOpaque rootopaque;
353 : BlockNumber rootblkno;
354 : uint32 rootlevel;
355 : BTMetaPageData *metad;
356 : XLogRecPtr recptr;
357 :
358 : Assert(access == BT_READ || heaprel != NULL);
359 :
360 : /*
361 : * Try to use previously-cached metapage data to find the root. This
362 : * normally saves one buffer access per index search, which is a very
363 : * helpful savings in bufmgr traffic and hence contention.
364 : */
365 14922069 : if (rel->rd_amcache != NULL)
366 : {
367 14585259 : metad = (BTMetaPageData *) rel->rd_amcache;
368 : /* We shouldn't have cached it if any of these fail */
369 : Assert(metad->btm_magic == BTREE_MAGIC);
370 : Assert(metad->btm_version >= BTREE_MIN_VERSION);
371 : Assert(metad->btm_version <= BTREE_VERSION);
372 : Assert(!metad->btm_allequalimage ||
373 : metad->btm_version > BTREE_NOVAC_VERSION);
374 : Assert(metad->btm_root != P_NONE);
375 :
376 14585259 : rootblkno = metad->btm_fastroot;
377 : Assert(rootblkno != P_NONE);
378 14585259 : rootlevel = metad->btm_fastlevel;
379 :
380 14585259 : rootbuf = _bt_getbuf(rel, rootblkno, BT_READ);
381 14585259 : rootpage = BufferGetPage(rootbuf);
382 14585259 : rootopaque = BTPageGetOpaque(rootpage);
383 :
384 : /*
385 : * Since the cache might be stale, we check the page more carefully
386 : * here than normal. We *must* check that it's not deleted. If it's
387 : * not alone on its level, then we reject too --- this may be overly
388 : * paranoid but better safe than sorry. Note we don't check P_ISROOT,
389 : * because that's not set in a "fast root".
390 : */
391 14585259 : if (!P_IGNORE(rootopaque) &&
392 14585259 : rootopaque->btpo_level == rootlevel &&
393 14585259 : P_LEFTMOST(rootopaque) &&
394 14585259 : P_RIGHTMOST(rootopaque))
395 : {
396 : /* OK, accept cached page as the root */
397 14584342 : return rootbuf;
398 : }
399 917 : _bt_relbuf(rel, rootbuf);
400 : /* Cache is stale, throw it away */
401 917 : if (rel->rd_amcache)
402 917 : pfree(rel->rd_amcache);
403 917 : rel->rd_amcache = NULL;
404 : }
405 :
 : /* Slow path: consult the metapage itself */
406 337727 : metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
407 337727 : metad = _bt_getmeta(rel, metabuf);
408 :
409 : /* if no root page initialized yet, do it */
410 337727 : if (metad->btm_root == P_NONE)
411 : {
412 : Page metapg;
413 :
414 : /* If access = BT_READ, caller doesn't want us to create root yet */
415 336541 : if (access == BT_READ)
416 : {
417 329334 : _bt_relbuf(rel, metabuf);
418 329334 : return InvalidBuffer;
419 : }
420 :
421 : /* trade in our read lock for a write lock */
422 7207 : _bt_unlockbuf(rel, metabuf);
423 7207 : _bt_lockbuf(rel, metabuf, BT_WRITE);
424 :
425 : /*
426 : * Race condition: if someone else initialized the metadata between
427 : * the time we released the read lock and acquired the write lock, we
428 : * must avoid doing it again.
429 : */
430 7207 : if (metad->btm_root != P_NONE)
431 : {
432 : /*
433 : * Metadata initialized by someone else. In order to guarantee no
434 : * deadlocks, we have to release the metadata page and start all
435 : * over again. (Is that really true? But it's hardly worth trying
436 : * to optimize this case.)
437 : */
438 0 : _bt_relbuf(rel, metabuf);
 : /* retry from the top; recursion should terminate, since the
 : * root now exists and this branch won't be re-entered */
439 0 : return _bt_getroot(rel, heaprel, access);
440 : }
441 :
442 : /*
443 : * Get, initialize, write, and leave a lock of the appropriate type on
444 : * the new root page. Since this is the first page in the tree, it's
445 : * a leaf as well as the root.
446 : */
447 7207 : rootbuf = _bt_allocbuf(rel, heaprel);
448 7207 : rootblkno = BufferGetBlockNumber(rootbuf);
449 7207 : rootpage = BufferGetPage(rootbuf);
450 7207 : rootopaque = BTPageGetOpaque(rootpage);
451 7207 : rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
452 7207 : rootopaque->btpo_flags = (BTP_LEAF | BTP_ROOT);
453 7207 : rootopaque->btpo_level = 0;
454 7207 : rootopaque->btpo_cycleid = 0;
455 : /* Get raw page pointer for metapage */
456 7207 : metapg = BufferGetPage(metabuf);
457 :
458 : /* NO ELOG(ERROR) till meta is updated */
459 7207 : START_CRIT_SECTION();
460 :
461 : /* upgrade metapage if needed */
462 7207 : if (metad->btm_version < BTREE_NOVAC_VERSION)
463 0 : _bt_upgrademetapage(metapg);
464 :
465 7207 : metad->btm_root = rootblkno;
466 7207 : metad->btm_level = 0;
467 7207 : metad->btm_fastroot = rootblkno;
468 7207 : metad->btm_fastlevel = 0;
469 7207 : metad->btm_last_cleanup_num_delpages = 0;
470 7207 : metad->btm_last_cleanup_num_heap_tuples = -1.0;
471 :
472 7207 : MarkBufferDirty(rootbuf);
473 7207 : MarkBufferDirty(metabuf);
474 :
475 : /* XLOG stuff */
476 7207 : if (RelationNeedsWAL(rel))
477 6892 : {
478 : xl_btree_newroot xlrec;
479 : xl_btree_metadata md;
480 :
481 6892 : XLogBeginInsert();
482 6892 : XLogRegisterBuffer(0, rootbuf, REGBUF_WILL_INIT);
483 6892 : XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
484 :
485 : Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
486 6892 : md.version = metad->btm_version;
487 6892 : md.root = rootblkno;
488 6892 : md.level = 0;
489 6892 : md.fastroot = rootblkno;
490 6892 : md.fastlevel = 0;
491 6892 : md.last_cleanup_num_delpages = 0;
492 6892 : md.allequalimage = metad->btm_allequalimage;
493 :
494 6892 : XLogRegisterBufData(2, &md, sizeof(xl_btree_metadata));
495 :
496 6892 : xlrec.rootblk = rootblkno;
497 6892 : xlrec.level = 0;
498 :
499 6892 : XLogRegisterData(&xlrec, SizeOfBtreeNewroot);
500 :
501 6892 : recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT);
502 : }
503 : else
 : /* unlogged/temp relation: fake LSN still advances the page LSN */
504 315 : recptr = XLogGetFakeLSN(rel);
505 :
506 7207 : PageSetLSN(rootpage, recptr);
507 7207 : PageSetLSN(metapg, recptr);
508 :
509 7207 : END_CRIT_SECTION();
510 :
511 : /*
512 : * swap root write lock for read lock. There is no danger of anyone
513 : * else accessing the new root page while it's unlocked, since no one
514 : * else knows where it is yet.
515 : */
516 7207 : _bt_unlockbuf(rel, rootbuf);
517 7207 : _bt_lockbuf(rel, rootbuf, BT_READ);
518 :
519 : /* okay, metadata is correct, release lock on it without caching */
520 7207 : _bt_relbuf(rel, metabuf);
521 : }
522 : else
523 : {
524 1186 : rootblkno = metad->btm_fastroot;
525 : Assert(rootblkno != P_NONE);
526 1186 : rootlevel = metad->btm_fastlevel;
527 :
528 : /*
529 : * Cache the metapage data for next time
530 : */
531 1186 : rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
532 : sizeof(BTMetaPageData));
533 1186 : memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
534 :
535 : /*
536 : * We are done with the metapage; arrange to release it via first
537 : * _bt_relandgetbuf call
538 : */
539 1186 : rootbuf = metabuf;
540 :
541 : for (;;)
542 : {
543 1186 : rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);
544 1186 : rootpage = BufferGetPage(rootbuf);
545 1186 : rootopaque = BTPageGetOpaque(rootpage);
546 :
547 1186 : if (!P_IGNORE(rootopaque))
548 1186 : break;
549 :
550 : /* it's dead, Jim. step right one page */
551 0 : if (P_RIGHTMOST(rootopaque))
552 0 : elog(ERROR, "no live root page found in index \"%s\"",
553 : RelationGetRelationName(rel));
554 0 : rootblkno = rootopaque->btpo_next;
555 : }
556 :
557 1186 : if (rootopaque->btpo_level != rootlevel)
558 0 : elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
559 : rootblkno, RelationGetRelationName(rel),
560 : rootopaque->btpo_level, rootlevel);
561 : }
562 :
563 : /*
564 : * By here, we have a pin and read lock on the root page, and no lock set
565 : * on the metadata page. Return the root page's buffer.
566 : */
567 8393 : return rootbuf;
568 : }
569 :
570 : /*
571 : * _bt_gettrueroot() -- Get the true root page of the btree.
572 : *
573 : * This is the same as the BT_READ case of _bt_getroot(), except
574 : * we follow the true-root link not the fast-root link.
575 : *
576 : * By the time we acquire lock on the root page, it might have been split and
577 : * not be the true root anymore. This is okay for the present uses of this
578 : * routine; we only really need to be able to move up at least one tree level
579 : * from whatever non-root page we were at. If we ever do need to lock the
580 : * one true root page, we could loop here, re-reading the metapage on each
581 : * failure. (Note that it wouldn't do to hold the lock on the metapage while
582 : * moving to the root --- that'd deadlock against any concurrent root split.)
 : *
 : * Returns InvalidBuffer if the index has no root page yet. The metapage
 : * sanity checks below deliberately mirror _bt_getmeta().
583 : */
584 : Buffer
585 16 : _bt_gettrueroot(Relation rel)
586 : {
587 : Buffer metabuf;
588 : Page metapg;
589 : BTPageOpaque metaopaque;
590 : Buffer rootbuf;
591 : Page rootpage;
592 : BTPageOpaque rootopaque;
593 : BlockNumber rootblkno;
594 : uint32 rootlevel;
595 : BTMetaPageData *metad;
596 :
597 : /*
598 : * We don't try to use cached metapage data here, since (a) this path is
599 : * not performance-critical, and (b) if we are here it suggests our cache
600 : * is out-of-date anyway. In light of point (b), it's probably safest to
601 : * actively flush any cached metapage info.
602 : */
603 16 : if (rel->rd_amcache)
604 16 : pfree(rel->rd_amcache);
605 16 : rel->rd_amcache = NULL;
606 :
607 16 : metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
608 16 : metapg = BufferGetPage(metabuf);
609 16 : metaopaque = BTPageGetOpaque(metapg);
610 16 : metad = BTPageGetMeta(metapg);
611 :
612 16 : if (!P_ISMETA(metaopaque) ||
613 16 : metad->btm_magic != BTREE_MAGIC)
614 0 : ereport(ERROR,
615 : (errcode(ERRCODE_INDEX_CORRUPTED),
616 : errmsg("index \"%s\" is not a btree",
617 : RelationGetRelationName(rel))));
618 :
619 16 : if (metad->btm_version < BTREE_MIN_VERSION ||
620 16 : metad->btm_version > BTREE_VERSION)
621 0 : ereport(ERROR,
622 : (errcode(ERRCODE_INDEX_CORRUPTED),
623 : errmsg("version mismatch in index \"%s\": file version %d, "
624 : "current version %d, minimal supported version %d",
625 : RelationGetRelationName(rel),
626 : metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION)));
627 :
628 : /* if no root page initialized yet, fail */
629 16 : if (metad->btm_root == P_NONE)
630 : {
631 0 : _bt_relbuf(rel, metabuf);
632 0 : return InvalidBuffer;
633 : }
634 :
635 16 : rootblkno = metad->btm_root;
636 16 : rootlevel = metad->btm_level;
637 :
638 : /*
639 : * We are done with the metapage; arrange to release it via first
640 : * _bt_relandgetbuf call
641 : */
642 16 : rootbuf = metabuf;
643 :
644 : for (;;)
645 : {
646 16 : rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);
647 16 : rootpage = BufferGetPage(rootbuf);
648 16 : rootopaque = BTPageGetOpaque(rootpage);
649 :
650 16 : if (!P_IGNORE(rootopaque))
651 16 : break;
652 :
653 : /* it's dead, Jim. step right one page */
654 0 : if (P_RIGHTMOST(rootopaque))
655 0 : elog(ERROR, "no live root page found in index \"%s\"",
656 : RelationGetRelationName(rel));
657 0 : rootblkno = rootopaque->btpo_next;
658 : }
659 :
660 16 : if (rootopaque->btpo_level != rootlevel)
661 0 : elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
662 : rootblkno, RelationGetRelationName(rel),
663 : rootopaque->btpo_level, rootlevel);
664 :
665 16 : return rootbuf;
666 : }
667 :
668 : /*
669 : * _bt_getrootheight() -- Get the height of the btree search tree.
670 : *
671 : * We return the level (counting from zero) of the current fast root.
672 : * This represents the number of tree levels we'd have to descend through
673 : * to start any btree index search.
674 : *
675 : * This is used by the planner for cost-estimation purposes. Since it's
676 : * only an estimate, slightly-stale data is fine, hence we don't worry
677 : * about updating previously cached data.
 : *
 : * Reports 0 (and skips populating the metapage cache) when the index
 : * has no root page yet; otherwise returns btm_fastlevel.
678 : */
679 : int
680 3236248 : _bt_getrootheight(Relation rel)
681 : {
682 : BTMetaPageData *metad;
683 :
684 3236248 : if (rel->rd_amcache == NULL)
685 : {
686 : Buffer metabuf;
687 :
688 66312 : metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
689 66312 : metad = _bt_getmeta(rel, metabuf);
690 :
691 : /*
692 : * If there's no root page yet, _bt_getroot() doesn't expect a cache
693 : * to be made, so just stop here and report the index height is zero.
694 : * (XXX perhaps _bt_getroot() should be changed to allow this case.)
695 : */
696 66312 : if (metad->btm_root == P_NONE)
697 : {
698 38500 : _bt_relbuf(rel, metabuf);
699 38500 : return 0;
700 : }
701 :
702 : /*
703 : * Cache the metapage data for next time
704 : */
705 27812 : rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
706 : sizeof(BTMetaPageData));
707 27812 : memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
708 27812 : _bt_relbuf(rel, metabuf);
709 : }
710 :
711 : /* Get cached page */
712 3197748 : metad = (BTMetaPageData *) rel->rd_amcache;
713 : /* We shouldn't have cached it if any of these fail */
714 : Assert(metad->btm_magic == BTREE_MAGIC);
715 : Assert(metad->btm_version >= BTREE_MIN_VERSION);
716 : Assert(metad->btm_version <= BTREE_VERSION);
717 : Assert(!metad->btm_allequalimage ||
718 : metad->btm_version > BTREE_NOVAC_VERSION);
719 : Assert(metad->btm_fastroot != P_NONE);
720 :
721 3197748 : return metad->btm_fastlevel;
722 : }
723 :
724 : /*
725 : * _bt_metaversion() -- Get version/status info from metapage.
726 : *
727 : * Sets caller's *heapkeyspace and *allequalimage arguments using data
728 : * from the B-Tree metapage (could be locally-cached version). This
729 : * information needs to be stashed in insertion scankey, so we provide a
730 : * single function that fetches both at once.
731 : *
732 : * This is used to determine the rules that must be used to descend a
733 : * btree. Version 4 indexes treat heap TID as a tiebreaker attribute.
734 : * pg_upgrade'd version 3 indexes need extra steps to preserve reasonable
735 : * performance when inserting a new BTScanInsert-wise duplicate tuple
736 : * among many leaf pages already full of such duplicates.
737 : *
738 : * Also sets allequalimage field, which indicates whether or not it is
739 : * safe to apply deduplication. We rely on the assumption that
740 : * btm_allequalimage will be zero'ed on heapkeyspace indexes that were
741 : * pg_upgrade'd from Postgres 12.
 : *
 : * When the index has no root page yet, the outputs are read straight
 : * from the metapage without populating rel->rd_amcache.
742 : */
743 : void
744 16953664 : _bt_metaversion(Relation rel, bool *heapkeyspace, bool *allequalimage)
745 : {
746 : BTMetaPageData *metad;
747 :
748 16953664 : if (rel->rd_amcache == NULL)
749 : {
750 : Buffer metabuf;
751 :
752 711780 : metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
753 711780 : metad = _bt_getmeta(rel, metabuf);
754 :
755 : /*
756 : * If there's no root page yet, _bt_getroot() doesn't expect a cache
757 : * to be made, so just stop here. (XXX perhaps _bt_getroot() should
758 : * be changed to allow this case.)
759 : */
760 711780 : if (metad->btm_root == P_NONE)
761 : {
762 331736 : *heapkeyspace = metad->btm_version > BTREE_NOVAC_VERSION;
763 331736 : *allequalimage = metad->btm_allequalimage;
764 :
765 331736 : _bt_relbuf(rel, metabuf);
766 331736 : return;
767 : }
768 :
769 : /*
770 : * Cache the metapage data for next time
771 : *
772 : * An on-the-fly version upgrade performed by _bt_upgrademetapage()
773 : * can change the nbtree version for an index without invalidating any
774 : * local cache. This is okay because it can only happen when moving
775 : * from version 2 to version 3, both of which are !heapkeyspace
776 : * versions.
777 : */
778 380044 : rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
779 : sizeof(BTMetaPageData));
780 380044 : memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
781 380044 : _bt_relbuf(rel, metabuf);
782 : }
783 :
784 : /* Get cached page */
785 16621928 : metad = (BTMetaPageData *) rel->rd_amcache;
786 : /* We shouldn't have cached it if any of these fail */
787 : Assert(metad->btm_magic == BTREE_MAGIC);
788 : Assert(metad->btm_version >= BTREE_MIN_VERSION);
789 : Assert(metad->btm_version <= BTREE_VERSION);
790 : Assert(!metad->btm_allequalimage ||
791 : metad->btm_version > BTREE_NOVAC_VERSION);
792 : Assert(metad->btm_fastroot != P_NONE);
793 :
794 16621928 : *heapkeyspace = metad->btm_version > BTREE_NOVAC_VERSION;
795 16621928 : *allequalimage = metad->btm_allequalimage;
796 : }
797 :
798 : /*
799 : * _bt_checkpage() -- Verify that a freshly-read page looks sane.
 : *
 : * Raises ERROR (does not return a status) if the page is all-zeroes or
 : * its special space is not sized for BTPageOpaqueData.
800 : */
801 : void
802 28097169 : _bt_checkpage(Relation rel, Buffer buf)
803 : {
804 28097169 : Page page = BufferGetPage(buf);
805 :
806 : /*
807 : * ReadBuffer verifies that every newly-read page passes
808 : * PageHeaderIsValid, which means it either contains a reasonably sane
809 : * page header or is all-zero. We have to defend against the all-zero
810 : * case, however.
811 : */
812 28097169 : if (PageIsNew(page))
813 0 : ereport(ERROR,
814 : (errcode(ERRCODE_INDEX_CORRUPTED),
815 : errmsg("index \"%s\" contains unexpected zero page at block %u",
816 : RelationGetRelationName(rel),
817 : BufferGetBlockNumber(buf)),
818 : errhint("Please REINDEX it.")));
819 :
820 : /*
821 : * Additionally check that the special area looks sane.
822 : */
823 28097169 : if (PageGetSpecialSize(page) != MAXALIGN(sizeof(BTPageOpaqueData)))
824 0 : ereport(ERROR,
825 : (errcode(ERRCODE_INDEX_CORRUPTED),
826 : errmsg("index \"%s\" contains corrupted page at block %u",
827 : RelationGetRelationName(rel),
828 : BufferGetBlockNumber(buf)),
829 : errhint("Please REINDEX it.")));
830 28097169 : }
831 :
832 : /*
833 : * _bt_getbuf() -- Get an existing block in a buffer, for read or write.
834 : *
835 : * The general rule in nbtree is that it's never okay to access a
836 : * page without holding both a buffer pin and a buffer lock on
837 : * the page's buffer.
838 : *
839 : * When this routine returns, the appropriate lock is set on the
840 : * requested buffer and its reference count has been incremented
841 : * (ie, the buffer is "locked and pinned"). Also, we apply
842 : * _bt_checkpage to sanity-check the page, and perform Valgrind
843 : * client requests that help Valgrind detect unsafe page accesses.
844 : *
845 : * Note: raw LockBuffer() calls are disallowed in nbtree; all
846 : * buffer lock requests need to go through wrapper functions such
847 : * as _bt_lockbuf().
 : *
 : * blkno must refer to an existing block; use _bt_allocbuf() to
 : * allocate a new page instead.
848 : */
849 : Buffer
850 15972354 : _bt_getbuf(Relation rel, BlockNumber blkno, int access)
851 : {
852 : Buffer buf;
853 :
854 : Assert(BlockNumberIsValid(blkno));
855 :
856 : /* Read an existing block of the relation */
857 15972354 : buf = ReadBuffer(rel, blkno);
858 15972354 : _bt_lockbuf(rel, buf, access);
859 15972354 : _bt_checkpage(rel, buf);
860 :
861 15972354 : return buf;
862 : }
863 :
864 : /*
865 : * _bt_allocbuf() -- Allocate a new block/page.
866 : *
867 : * Returns a write-locked buffer containing an unallocated nbtree page.
868 : *
869 : * Callers are required to pass a valid heaprel. We need heaprel so that we
870 : * can handle generating a snapshotConflictHorizon that makes reusing a page
871 : * from the FSM safe for queries that may be running on standbys.
872 : */
873 : Buffer
874 22107 : _bt_allocbuf(Relation rel, Relation heaprel)
875 : {
876 : Buffer buf;
877 : BlockNumber blkno;
878 : Page page;
879 :
880 : Assert(heaprel != NULL);
881 :
882 : /*
883 : * First see if the FSM knows of any free pages.
884 : *
885 : * We can't trust the FSM's report unreservedly; we have to check that the
886 : * page is still free. (For example, an already-free page could have been
887 : * re-used between the time the last VACUUM scanned it and the time the
888 : * VACUUM made its FSM updates.)
889 : *
890 : * In fact, it's worse than that: we can't even assume that it's safe to
891 : * take a lock on the reported page. If somebody else has a lock on it,
892 : * or even worse our own caller does, we could deadlock. (The own-caller
893 : * scenario is actually not improbable. Consider an index on a serial or
894 : * timestamp column. Nearly all splits will be at the rightmost page, so
895 : * it's entirely likely that _bt_split will call us while holding a lock
896 : * on the page most recently acquired from FSM. A VACUUM running
897 : * concurrently with the previous split could well have placed that page
898 : * back in FSM.)
899 : *
900 : * To get around that, we ask for only a conditional lock on the reported
901 : * page. If we fail, then someone else is using the page, and we may
902 : * reasonably assume it's not free. (If we happen to be wrong, the worst
903 : * consequence is the page will be lost to use till the next VACUUM, which
904 : * is no big problem.)
905 : */
906 : for (;;)
907 : {
908 22107 : blkno = GetFreeIndexPage(rel);
909 22107 : if (blkno == InvalidBlockNumber)
910 21900 : break;
911 207 : buf = ReadBuffer(rel, blkno);
912 207 : if (_bt_conditionallockbuf(rel, buf))
913 : {
914 207 : page = BufferGetPage(buf);
915 :
916 : /*
917 : * It's possible to find an all-zeroes page in an index. For
918 : * example, a backend might successfully extend the relation one
919 : * page and then crash before it is able to make a WAL entry for
920 : * adding the page. If we find a zeroed page then reclaim it
921 : * immediately.
922 : */
923 207 : if (PageIsNew(page))
924 : {
925 : /* Okay to use page. Initialize and return it. */
926 0 : _bt_pageinit(page, BufferGetPageSize(buf));
927 0 : return buf;
928 : }
929 :
930 207 : if (BTPageIsRecyclable(page, heaprel))
931 : {
932 : /*
933 : * If we are generating WAL for Hot Standby then create a WAL
934 : * record that will allow us to conflict with queries running
935 : * on standby, in case they have snapshots older than safexid
936 : * value
937 : */
938 207 : if (RelationNeedsWAL(rel) && XLogStandbyInfoActive())
939 : {
940 : xl_btree_reuse_page xlrec_reuse;
941 :
942 : /*
943 : * Note that we don't register the buffer with the record,
944 : * because this operation doesn't modify the page (that
945 : * already happened, back when VACUUM deleted the page).
946 : * This record only exists to provide a conflict point for
947 : * Hot Standby. See record REDO routine comments.
948 : */
949 144 : xlrec_reuse.locator = rel->rd_locator;
950 144 : xlrec_reuse.block = blkno;
951 144 : xlrec_reuse.snapshotConflictHorizon = BTPageGetDeleteXid(page);
952 144 : xlrec_reuse.isCatalogRel =
953 144 : RelationIsAccessibleInLogicalDecoding(heaprel);
954 :
955 144 : XLogBeginInsert();
956 144 : XLogRegisterData(&xlrec_reuse, SizeOfBtreeReusePage);
957 :
958 144 : XLogInsert(RM_BTREE_ID, XLOG_BTREE_REUSE_PAGE);
959 : }
960 :
961 : /* Okay to use page. Re-initialize and return it. */
962 207 : _bt_pageinit(page, BufferGetPageSize(buf));
963 207 : return buf;
964 : }
 : /* Not recyclable after all: drop lock and pin, then ask the FSM again */
965 0 : elog(DEBUG2, "FSM returned nonrecyclable page");
966 0 : _bt_relbuf(rel, buf);
967 : }
968 : else
969 : {
970 0 : elog(DEBUG2, "FSM returned nonlockable page");
971 : /* couldn't get lock, so just drop pin */
972 0 : ReleaseBuffer(buf);
973 : }
974 : }
975 :
976 : /*
977 : * Extend the relation by one page. Need to use RBM_ZERO_AND_LOCK or we
978 : * risk a race condition against btvacuumscan --- see comments therein.
979 : * This forces us to repeat the valgrind request that _bt_lockbuf()
980 : * otherwise would make, as we can't use _bt_lockbuf() without introducing
981 : * a race.
982 : */
983 21900 : buf = ExtendBufferedRel(BMR_REL(rel), MAIN_FORKNUM, NULL, EB_LOCK_FIRST);
984 21900 : if (!RelationUsesLocalBuffers(rel))
985 : VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ);
986 :
987 : /* Initialize the new page before returning it */
988 21900 : page = BufferGetPage(buf);
989 : Assert(PageIsNew(page));
990 21900 : _bt_pageinit(page, BufferGetPageSize(buf));
991 :
992 21900 : return buf;
993 : }
994 :
995 : /*
996 : * _bt_relandgetbuf() -- release a locked buffer and get another one.
997 : *
998 : * This is equivalent to _bt_relbuf followed by _bt_getbuf. Also, if obuf is
999 : * InvalidBuffer then it reduces to just _bt_getbuf; allowing this case
1000 : * simplifies some callers.
1001 : *
1002 : * The original motivation for using this was to avoid two entries to the
1003 : * bufmgr when one would do. However, now it's mainly just a notational
1004 : * convenience. The only case where it saves work over _bt_relbuf/_bt_getbuf
1005 : * is when the target page is the same one already in the buffer.
1006 : */
1007 : Buffer
1008 12058574 : _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
1009 : {
1010 : Buffer buf;
1011 :
1012 : Assert(BlockNumberIsValid(blkno));
1013 12058574 : if (BufferIsValid(obuf))
1014 12048034 : _bt_unlockbuf(rel, obuf);
 : /* ReleaseAndReadBuffer avoids a pin/unpin cycle when obuf already holds blkno */
1015 12058574 : buf = ReleaseAndReadBuffer(obuf, rel, blkno);
1016 12058574 : _bt_lockbuf(rel, buf, access);
1017 :
1018 12058574 : _bt_checkpage(rel, buf);
1019 12058574 : return buf;
1020 : }
1021 :
1022 : /*
1023 : * _bt_relbuf() -- release a locked buffer.
1024 : *
1025 : * Lock and pin (refcount) are both dropped.
1026 : */
1027 : void
1028 13325308 : _bt_relbuf(Relation rel, Buffer buf)
1029 : {
 : /* release the lock first, then drop the pin */
1030 13325308 : _bt_unlockbuf(rel, buf);
1031 13325308 : ReleaseBuffer(buf);
1032 : }
1033 :
1034 : /*
1035 : * _bt_lockbuf() -- lock a pinned buffer.
1036 : *
1037 : * Lock is acquired without acquiring another pin. This is like a raw
1038 : * LockBuffer() call, but performs extra steps needed by Valgrind.
1039 : *
1040 : * Note: Caller may need to call _bt_checkpage() with buf when pin on buf
1041 : * wasn't originally acquired in _bt_getbuf() or _bt_relandgetbuf().
1042 : */
1043 : void
1044 28619665 : _bt_lockbuf(Relation rel, Buffer buf, int access)
1045 : {
1046 : /* LockBuffer() asserts that pin is held by this backend */
1047 28619665 : LockBuffer(buf, access);
1048 :
1049 : /*
1050 : * It doesn't matter that _bt_unlockbuf() won't get called in the event of
1051 : * an nbtree error (e.g. a unique violation error). That won't cause
1052 : * Valgrind false positives.
1053 : *
1054 : * The nbtree client requests are superimposed on top of the bufmgr.c
1055 : * buffer pin client requests. In the event of an nbtree error the buffer
1056 : * will certainly get marked as defined when the backend once again
1057 : * acquires its first pin on the buffer. (Of course, if the backend never
1058 : * touches the buffer again then it doesn't matter that it remains
1059 : * non-accessible to Valgrind.)
1060 : *
1061 : * Note: When an IndexTuple C pointer gets computed using an ItemId read
1062 : * from a page while a lock was held, the C pointer becomes unsafe to
1063 : * dereference forever as soon as the lock is released. Valgrind can only
1064 : * detect cases where the pointer gets dereferenced with no _current_
1065 : * lock/pin held, though.
1066 : */
 : /* Valgrind bookkeeping is only performed for shared (non-local) buffers */
1067 28619665 : if (!RelationUsesLocalBuffers(rel))
1068 : VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ);
1069 28619665 : }
1070 :
1071 : /*
1072 : * _bt_unlockbuf() -- unlock a pinned buffer.
1073 : */
1074 : void
1075 28669859 : _bt_unlockbuf(Relation rel, Buffer buf)
1076 : {
1077 : /*
1078 : * Buffer is pinned and locked, which means that it is expected to be
1079 : * defined and addressable. Check that proactively.
1080 : */
1081 : VALGRIND_CHECK_MEM_IS_DEFINED(BufferGetPage(buf), BLCKSZ);
1082 :
1083 : /* LockBuffer() asserts that pin is held by this backend */
1084 28669859 : LockBuffer(buf, BUFFER_LOCK_UNLOCK);
1085 :
 : /* Make the page inaccessible to Valgrind now that the lock is given up */
1086 28669859 : if (!RelationUsesLocalBuffers(rel))
1087 : VALGRIND_MAKE_MEM_NOACCESS(BufferGetPage(buf), BLCKSZ);
1088 : }
1089 :
1090 : /*
1091 : * _bt_conditionallockbuf() -- conditionally BT_WRITE lock pinned
1092 : * buffer.
1093 : *
1094 : * Note: Caller may need to call _bt_checkpage() with buf when pin on buf
1095 : * wasn't originally acquired in _bt_getbuf() or _bt_relandgetbuf().
1096 : */
1097 : bool
1098 28822 : _bt_conditionallockbuf(Relation rel, Buffer buf)
1099 : {
1100 : /* ConditionalLockBuffer() asserts that pin is held by this backend */
1101 28822 : if (!ConditionalLockBuffer(buf))
 : /* somebody else holds the lock; report failure without waiting */
1102 521 : return false;
1103 :
1104 28301 : if (!RelationUsesLocalBuffers(rel))
1105 : VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ);
1106 :
1107 28301 : return true;
1108 : }
1109 :
1110 : /*
1111 : * _bt_upgradelockbufcleanup() -- upgrade lock to a full cleanup lock.
1112 : */
1113 : void
1114 14279 : _bt_upgradelockbufcleanup(Relation rel, Buffer buf)
1115 : {
1116 : /*
1117 : * Buffer is pinned and locked, which means that it is expected to be
1118 : * defined and addressable. Check that proactively.
1119 : */
1120 : VALGRIND_CHECK_MEM_IS_DEFINED(BufferGetPage(buf), BLCKSZ);
1121 :
1122 : /* LockBuffer() asserts that pin is held by this backend */
1123 14279 : LockBuffer(buf, BUFFER_LOCK_UNLOCK);
 : /* NOTE: between the unlock above and the cleanup lock we hold only a pin */
1124 14279 : LockBufferForCleanup(buf);
1125 : }
1126 :
1127 : /*
1128 : * _bt_pageinit() -- Initialize a new page.
1129 : *
1130 : * On return, the page header is initialized; data space is empty;
1131 : * special space is zeroed out.
1132 : */
1133 : void
1134 102628 : _bt_pageinit(Page page, Size size)
1135 : {
 : /* special space is sized to hold the btree opaque data */
1136 102628 : PageInit(page, size, sizeof(BTPageOpaqueData));
1137 : }
1138 :
1139 : /*
1140 : * Delete item(s) from a btree leaf page during VACUUM.
1141 : *
1142 : * This routine assumes that the caller already has a full cleanup lock on
1143 : * the buffer. Also, the given deletable and updatable arrays *must* be
1144 : * sorted in ascending order.
1145 : *
1146 : * Routine deals with deleting TIDs when some (but not all) of the heap TIDs
1147 : * in an existing posting list item are to be removed. This works by
1148 : * updating/overwriting an existing item with caller's new version of the item
1149 : * (a version that lacks the TIDs that are to be deleted).
1150 : *
1151 : * We record VACUUMs and b-tree deletes differently in WAL. Deletes must
1152 : * generate their own snapshotConflictHorizon directly from the tableam,
1153 : * whereas VACUUMs rely on the initial VACUUM table scan performing
1154 : * WAL-logging that takes care of the issue for the table's indexes
1155 : * indirectly. Also, we remove the VACUUM cycle ID from pages, which b-tree
1156 : * deletes don't do.
1157 : */
1158 : void
1159 8731 : _bt_delitems_vacuum(Relation rel, Buffer buf,
1160 : OffsetNumber *deletable, int ndeletable,
1161 : BTVacuumPosting *updatable, int nupdatable)
1162 : {
1163 8731 : Page page = BufferGetPage(buf);
1164 : BTPageOpaque opaque;
1165 8731 : bool needswal = RelationNeedsWAL(rel);
1166 8731 : char *updatedbuf = NULL;
1167 8731 : Size updatedbuflen = 0;
1168 : OffsetNumber updatedoffsets[MaxIndexTuplesPerPage];
1169 : XLogRecPtr recptr;
1170 :
1171 : /* Shouldn't be called unless there's something to do */
1172 : Assert(ndeletable > 0 || nupdatable > 0);
1173 :
1174 : /* Generate new version of posting lists without deleted TIDs */
1175 8731 : if (nupdatable > 0)
1176 956 : updatedbuf = _bt_delitems_update(updatable, nupdatable,
1177 : updatedoffsets, &updatedbuflen,
1178 : needswal);
1179 :
1180 : /* No ereport(ERROR) until changes are logged */
1181 8731 : START_CRIT_SECTION();
1182 :
1183 : /*
1184 : * Handle posting tuple updates.
1185 : *
1186 : * Deliberately do this before handling simple deletes. If we did it the
1187 : * other way around (i.e. WAL record order -- simple deletes before
1188 : * updates) then we'd have to make compensating changes to the 'updatable'
1189 : * array of offset numbers.
1190 : *
1191 : * PageIndexTupleOverwrite() won't unset each item's LP_DEAD bit when it
1192 : * happens to already be set. It's important that we not interfere with
1193 : * any future simple index tuple deletion operations.
1194 : */
1195 31934 : for (int i = 0; i < nupdatable; i++)
1196 : {
1197 23203 : OffsetNumber updatedoffset = updatedoffsets[i];
1198 : IndexTuple itup;
1199 : Size itemsz;
1200 :
1201 23203 : itup = updatable[i]->itup;
1202 23203 : itemsz = MAXALIGN(IndexTupleSize(itup));
1203 23203 : if (!PageIndexTupleOverwrite(page, updatedoffset, itup, itemsz))
1204 0 : elog(PANIC, "failed to update partially dead item in block %u of index \"%s\"",
1205 : BufferGetBlockNumber(buf), RelationGetRelationName(rel));
1206 : }
1207 :
1208 : /* Now handle simple deletes of entire tuples */
1209 8731 : if (ndeletable > 0)
1210 8442 : PageIndexMultiDelete(page, deletable, ndeletable);
1211 :
1212 : /*
1213 : * We can clear the vacuum cycle ID since this page has certainly been
1214 : * processed by the current vacuum scan.
1215 : */
1216 8731 : opaque = BTPageGetOpaque(page);
1217 8731 : opaque->btpo_cycleid = 0;
1218 :
1219 : /*
1220 : * Clear the BTP_HAS_GARBAGE page flag.
1221 : *
1222 : * This flag indicates the presence of LP_DEAD items on the page (though
1223 : * not reliably). Note that we only rely on it with pg_upgrade'd
1224 : * !heapkeyspace indexes. That's why clearing it here won't usually
1225 : * interfere with simple index tuple deletion.
1226 : */
1227 8731 : opaque->btpo_flags &= ~BTP_HAS_GARBAGE;
1228 :
1229 8731 : MarkBufferDirty(buf);
1230 :
1231 : /* XLOG stuff */
1232 8731 : if (needswal)
1233 : {
1234 : xl_btree_vacuum xlrec_vacuum;
1235 :
1236 8730 : xlrec_vacuum.ndeleted = ndeletable;
1237 8730 : xlrec_vacuum.nupdated = nupdatable;
1238 :
1239 8730 : XLogBeginInsert();
1240 8730 : XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
1241 8730 : XLogRegisterData(&xlrec_vacuum, SizeOfBtreeVacuum);
1242 :
1243 8730 : if (ndeletable > 0)
1244 8441 : XLogRegisterBufData(0, deletable,
1245 : ndeletable * sizeof(OffsetNumber));
1246 :
1247 8730 : if (nupdatable > 0)
1248 : {
1249 956 : XLogRegisterBufData(0, updatedoffsets,
1250 : nupdatable * sizeof(OffsetNumber));
1251 956 : XLogRegisterBufData(0, updatedbuf, updatedbuflen);
1252 : }
1253 :
1254 8730 : recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_VACUUM);
1255 : }
1256 : else
 : /* relation doesn't need WAL: still advance the page LSN using a fake value */
1257 1 : recptr = XLogGetFakeLSN(rel);
1258 :
1259 8731 : PageSetLSN(page, recptr);
1260 :
1261 8731 : END_CRIT_SECTION();
1262 :
1263 : /* can't leak memory here */
1264 8731 : if (updatedbuf != NULL)
1265 956 : pfree(updatedbuf);
1266 : /* free tuples allocated within _bt_delitems_update() */
1267 31934 : for (int i = 0; i < nupdatable; i++)
1268 23203 : pfree(updatable[i]->itup);
1269 : }
1270 :
1271 : /*
1272 : * Delete item(s) from a btree leaf page during single-page cleanup.
1273 : *
1274 : * This routine assumes that the caller has pinned and write locked the
1275 : * buffer. Also, the given deletable and updatable arrays *must* be sorted in
1276 : * ascending order.
1277 : *
1278 : * Routine deals with deleting TIDs when some (but not all) of the heap TIDs
1279 : * in an existing posting list item are to be removed. This works by
1280 : * updating/overwriting an existing item with caller's new version of the item
1281 : * (a version that lacks the TIDs that are to be deleted).
1282 : *
1283 : * This is nearly the same as _bt_delitems_vacuum as far as what it does to
1284 : * the page, but it needs its own snapshotConflictHorizon and isCatalogRel
1285 : * (from the tableam). This is used by the REDO routine to generate recovery
1286 : * conflicts. The other difference is that only _bt_delitems_vacuum will
1287 : * clear page's VACUUM cycle ID.
1288 : */
1289 : static void
1290 5534 : _bt_delitems_delete(Relation rel, Buffer buf,
1291 : TransactionId snapshotConflictHorizon, bool isCatalogRel,
1292 : OffsetNumber *deletable, int ndeletable,
1293 : BTVacuumPosting *updatable, int nupdatable)
1294 : {
1295 5534 : Page page = BufferGetPage(buf);
1296 : BTPageOpaque opaque;
1297 5534 : bool needswal = RelationNeedsWAL(rel);
1298 5534 : char *updatedbuf = NULL;
1299 5534 : Size updatedbuflen = 0;
1300 : OffsetNumber updatedoffsets[MaxIndexTuplesPerPage];
1301 : XLogRecPtr recptr;
1302 :
1303 : /* Shouldn't be called unless there's something to do */
1304 : Assert(ndeletable > 0 || nupdatable > 0);
1305 :
1306 : /* Generate new versions of posting lists without deleted TIDs */
1307 5534 : if (nupdatable > 0)
1308 504 : updatedbuf = _bt_delitems_update(updatable, nupdatable,
1309 : updatedoffsets, &updatedbuflen,
1310 : needswal);
1311 :
1312 : /* No ereport(ERROR) until changes are logged */
1313 5534 : START_CRIT_SECTION();
1314 :
1315 : /* Handle updates and deletes just like _bt_delitems_vacuum */
1316 12207 : for (int i = 0; i < nupdatable; i++)
1317 : {
1318 6673 : OffsetNumber updatedoffset = updatedoffsets[i];
1319 : IndexTuple itup;
1320 : Size itemsz;
1321 :
1322 6673 : itup = updatable[i]->itup;
1323 6673 : itemsz = MAXALIGN(IndexTupleSize(itup));
1324 6673 : if (!PageIndexTupleOverwrite(page, updatedoffset, itup, itemsz))
1325 0 : elog(PANIC, "failed to update partially dead item in block %u of index \"%s\"",
1326 : BufferGetBlockNumber(buf), RelationGetRelationName(rel));
1327 : }
1328 :
1329 5534 : if (ndeletable > 0)
1330 5468 : PageIndexMultiDelete(page, deletable, ndeletable);
1331 :
1332 : /*
1333 : * Unlike _bt_delitems_vacuum, we *must not* clear the vacuum cycle ID at
1334 : * this point. The VACUUM command alone controls vacuum cycle IDs.
1335 : */
1336 5534 : opaque = BTPageGetOpaque(page);
1337 :
1338 : /*
1339 : * Clear the BTP_HAS_GARBAGE page flag.
1340 : *
1341 : * This flag indicates the presence of LP_DEAD items on the page (though
1342 : * not reliably). Note that we only rely on it with pg_upgrade'd
1343 : * !heapkeyspace indexes.
1344 : */
1345 5534 : opaque->btpo_flags &= ~BTP_HAS_GARBAGE;
1346 :
1347 5534 : MarkBufferDirty(buf);
1348 :
1349 : /* XLOG stuff */
1350 5534 : if (needswal)
1351 : {
1352 : xl_btree_delete xlrec_delete;
1353 :
1354 5510 : xlrec_delete.snapshotConflictHorizon = snapshotConflictHorizon;
1355 5510 : xlrec_delete.ndeleted = ndeletable;
1356 5510 : xlrec_delete.nupdated = nupdatable;
1357 5510 : xlrec_delete.isCatalogRel = isCatalogRel;
1358 :
1359 5510 : XLogBeginInsert();
1360 5510 : XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
1361 5510 : XLogRegisterData(&xlrec_delete, SizeOfBtreeDelete);
1362 :
1363 5510 : if (ndeletable > 0)
1364 5444 : XLogRegisterBufData(0, deletable,
1365 : ndeletable * sizeof(OffsetNumber));
1366 :
1367 5510 : if (nupdatable > 0)
1368 : {
1369 504 : XLogRegisterBufData(0, updatedoffsets,
1370 : nupdatable * sizeof(OffsetNumber));
1371 504 : XLogRegisterBufData(0, updatedbuf, updatedbuflen);
1372 : }
1373 :
1374 5510 : recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DELETE);
1375 : }
1376 : else
 : /* relation doesn't need WAL: still advance the page LSN using a fake value */
1377 24 : recptr = XLogGetFakeLSN(rel);
1378 :
1379 5534 : PageSetLSN(page, recptr);
1380 :
1381 5534 : END_CRIT_SECTION();
1382 :
1383 : /* can't leak memory here */
1384 5534 : if (updatedbuf != NULL)
1385 504 : pfree(updatedbuf);
1386 : /* free tuples allocated within _bt_delitems_update() */
1387 12207 : for (int i = 0; i < nupdatable; i++)
1388 6673 : pfree(updatable[i]->itup);
1389 : }
1390 :
1391 : /*
1392 : * Set up state needed to delete TIDs from posting list tuples via "updating"
1393 : * the tuple. Performs steps common to both _bt_delitems_vacuum and
1394 : * _bt_delitems_delete. These steps must take place before each function's
1395 : * critical section begins.
1396 : *
1397 : * updatable and nupdatable are inputs, though note that we will use
1398 : * _bt_update_posting() to replace the original itup with a pointer to a final
1399 : * version in palloc()'d memory. Caller should free the tuples when its done.
1400 : *
1401 : * The first nupdatable entries from updatedoffsets are set to the page offset
1402 : * number for posting list tuples that caller updates. This is mostly useful
1403 : * because caller may need to WAL-log the page offsets (though we always do
1404 : * this for caller out of convenience).
1405 : *
1406 : * Returns buffer consisting of an array of xl_btree_update structs that
1407 : * describe the steps we perform here for caller (though only when needswal is
1408 : * true). Also sets *updatedbuflen to the final size of the buffer. This
1409 : * buffer is used by caller when WAL logging is required.
1410 : */
1411 : static char *
1412 1460 : _bt_delitems_update(BTVacuumPosting *updatable, int nupdatable,
1413 : OffsetNumber *updatedoffsets, Size *updatedbuflen,
1414 : bool needswal)
1415 : {
1416 1460 : char *updatedbuf = NULL;
1417 1460 : Size buflen = 0;
1418 :
1419 : /* Shouldn't be called unless there's something to do */
1420 : Assert(nupdatable > 0);
1421 :
1422 31336 : for (int i = 0; i < nupdatable; i++)
1423 : {
1424 29876 : BTVacuumPosting vacposting = updatable[i];
1425 : Size itemsz;
1426 :
1427 : /* Replace work area IndexTuple with updated version */
1428 29876 : _bt_update_posting(vacposting);
1429 :
1430 : /* Keep track of size of xl_btree_update for updatedbuf in passing */
1431 29876 : itemsz = SizeOfBtreeUpdate + vacposting->ndeletedtids * sizeof(uint16);
1432 29876 : buflen += itemsz;
1433 :
1434 : /* Build updatedoffsets buffer in passing */
1435 29876 : updatedoffsets[i] = vacposting->updatedoffset;
1436 : }
1437 :
1438 : /* XLOG stuff */
1439 1460 : if (needswal)
1440 : {
1441 1460 : Size offset = 0;
1442 :
1443 : /* Allocate, set final size for caller */
1444 1460 : updatedbuf = palloc(buflen);
1445 1460 : *updatedbuflen = buflen;
1446 31336 : for (int i = 0; i < nupdatable; i++)
1447 : {
1448 29876 : BTVacuumPosting vacposting = updatable[i];
1449 : Size itemsz;
1450 : xl_btree_update update;
1451 :
1452 29876 : update.ndeletedtids = vacposting->ndeletedtids;
 : /* copy just the fixed-size xl_btree_update header portion */
1453 29876 : memcpy(updatedbuf + offset, &update.ndeletedtids,
1454 : SizeOfBtreeUpdate);
1455 29876 : offset += SizeOfBtreeUpdate;
1456 :
1457 29876 : itemsz = update.ndeletedtids * sizeof(uint16);
1458 29876 : memcpy(updatedbuf + offset, vacposting->deletetids, itemsz);
1459 29876 : offset += itemsz;
1460 : }
1461 : }
1462 :
1463 1460 : return updatedbuf;
1464 : }
1465 :
1466 : /*
1467 : * Comparator used by _bt_delitems_delete_check() to restore deltids array
1468 : * back to its original leaf-page-wise sort order
1469 : */
1470 : static int
1471 3290188 : _bt_delitems_cmp(const void *a, const void *b)
1472 : {
1473 3290188 : const TM_IndexDelete *indexdelete1 = a;
1474 3290188 : const TM_IndexDelete *indexdelete2 = b;
1475 :
1476 : Assert(indexdelete1->id != indexdelete2->id);
1477 :
 : /* overflow-safe int16 comparison helper from common/int.h */
1478 3290188 : return pg_cmp_s16(indexdelete1->id, indexdelete2->id);
1479 : }
1480 :
1481 : /*
1482 : * Try to delete item(s) from a btree leaf page during single-page cleanup.
1483 : *
1484 : * nbtree interface to table_index_delete_tuples(). Deletes a subset of index
1485 : * tuples from caller's deltids array: those whose TIDs are found safe to
1486 : * delete by the tableam (or already marked LP_DEAD in index, and so already
1487 : * known to be deletable by our simple index deletion caller). We physically
1488 : * delete index tuples from buf leaf page last of all (for index tuples where
1489 : * that is known to be safe following our table_index_delete_tuples() call).
1490 : *
1491 : * Simple index deletion caller only includes TIDs from index tuples marked
1492 : * LP_DEAD, as well as extra TIDs it found on the same leaf page that can be
1493 : * included without increasing the total number of distinct table blocks for
1494 : * the deletion operation as a whole. This approach often allows us to delete
1495 : * some extra index tuples that were practically free for tableam to check in
1496 : * passing (when they actually turn out to be safe to delete). It probably
1497 : * only makes sense for the tableam to go ahead with these extra checks when
1498 : * it is block-oriented (otherwise the checks probably won't be practically
1499 : * free, which we rely on). The tableam interface requires the tableam side
1500 : * to handle the problem, though, so this is okay (we as an index AM are free
1501 : * to make the simplifying assumption that all tableams must be block-based).
1502 : *
1503 : * Bottom-up index deletion caller provides all the TIDs from the leaf page,
1504 : * without expecting that tableam will check most of them. The tableam has
1505 : * considerable discretion around which entries/blocks it checks. Our role in
1506 : * costing the bottom-up deletion operation is strictly advisory.
1507 : *
1508 : * Note: Caller must have added deltids entries (i.e. entries that go in
1509 : * delstate's main array) in leaf-page-wise order: page offset number order,
1510 : * TID order among entries taken from the same posting list tuple (tiebreak on
1511 : * TID). This order is convenient to work with here.
1512 : *
1513 : * Note: We also rely on the id field of each deltids element "capturing" this
1514 : * original leaf-page-wise order. That is, we expect to be able to get back
1515 : * to the original leaf-page-wise order just by sorting deltids on the id
1516 : * field (tableam will sort deltids for its own reasons, so we'll need to put
1517 : * it back in leaf-page-wise order afterwards).
1518 : */
1519 : void
1520 7424 : _bt_delitems_delete_check(Relation rel, Buffer buf, Relation heapRel,
1521 : TM_IndexDeleteOp *delstate)
1522 : {
1523 7424 : Page page = BufferGetPage(buf);
1524 : TransactionId snapshotConflictHorizon;
1525 : bool isCatalogRel;
1526 7424 : OffsetNumber postingidxoffnum = InvalidOffsetNumber;
1527 7424 : int ndeletable = 0,
1528 7424 : nupdatable = 0;
1529 : OffsetNumber deletable[MaxIndexTuplesPerPage];
1530 : BTVacuumPosting updatable[MaxIndexTuplesPerPage];
1531 :
1532 : /* Use tableam interface to determine which tuples to delete first */
1533 7424 : snapshotConflictHorizon = table_index_delete_tuples(heapRel, delstate);
1534 7424 : isCatalogRel = RelationIsAccessibleInLogicalDecoding(heapRel);
1535 :
1536 : /* Should not WAL-log snapshotConflictHorizon unless it's required */
1537 7424 : if (!XLogStandbyInfoActive())
1538 1991 : snapshotConflictHorizon = InvalidTransactionId;
1539 :
1540 : /*
1541 : * Construct a leaf-page-wise description of what _bt_delitems_delete()
1542 : * needs to do to physically delete index tuples from the page.
1543 : *
1544 : * Must sort deltids array to restore leaf-page-wise order (original order
1545 : * before call to tableam). This is the order that the loop expects.
1546 : *
1547 : * Note that deltids array might be a lot smaller now. It might even have
1548 : * no entries at all (with bottom-up deletion caller), in which case there
1549 : * is nothing left to do.
1550 : */
1551 7424 : qsort(delstate->deltids, delstate->ndeltids, sizeof(TM_IndexDelete),
1552 : _bt_delitems_cmp);
1553 7424 : if (delstate->ndeltids == 0)
1554 : {
 : /* tableam found nothing safe to delete -- only expected for bottom-up caller */
1555 : Assert(delstate->bottomup);
1556 1890 : return;
1557 : }
1558 :
1559 : /* We definitely have to delete at least one index tuple (or one TID) */
1560 486986 : for (int i = 0; i < delstate->ndeltids; i++)
1561 : {
1562 481452 : TM_IndexStatus *dstatus = delstate->status + delstate->deltids[i].id;
1563 481452 : OffsetNumber idxoffnum = dstatus->idxoffnum;
1564 481452 : ItemId itemid = PageGetItemId(page, idxoffnum);
1565 481452 : IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
1566 : int nestedi,
1567 : nitem;
1568 : BTVacuumPosting vacposting;
1569 :
1570 : Assert(OffsetNumberIsValid(idxoffnum));
1571 :
1572 481452 : if (idxoffnum == postingidxoffnum)
1573 : {
1574 : /*
1575 : * This deltid entry is a TID from a posting list tuple that has
1576 : * already been completely processed
1577 : */
1578 : Assert(BTreeTupleIsPosting(itup));
1579 : Assert(ItemPointerCompare(BTreeTupleGetHeapTID(itup),
1580 : &delstate->deltids[i].tid) < 0);
1581 : Assert(ItemPointerCompare(BTreeTupleGetMaxHeapTID(itup),
1582 : &delstate->deltids[i].tid) >= 0);
1583 18334 : continue;
1584 : }
1585 :
1586 463118 : if (!BTreeTupleIsPosting(itup))
1587 : {
1588 : /* Plain non-pivot tuple */
1589 : Assert(ItemPointerEquals(&itup->t_tid, &delstate->deltids[i].tid));
1590 447046 : if (dstatus->knowndeletable)
1591 361732 : deletable[ndeletable++] = idxoffnum;
1592 447046 : continue;
1593 : }
1594 :
1595 : /*
1596 : * itup is a posting list tuple whose lowest deltids entry (which may
1597 : * or may not be for the first TID from itup) is considered here now.
1598 : * We should process all of the deltids entries for the posting list
1599 : * together now, though (not just the lowest). Remember to skip over
1600 : * later itup-related entries during later iterations of outermost
1601 : * loop.
1602 : */
1603 16072 : postingidxoffnum = idxoffnum; /* Remember work in outermost loop */
1604 16072 : nestedi = i; /* Initialize for first itup deltids entry */
1605 16072 : vacposting = NULL; /* Describes final action for itup */
1606 16072 : nitem = BTreeTupleGetNPosting(itup);
1607 73590 : for (int p = 0; p < nitem; p++)
1608 : {
1609 57518 : ItemPointer ptid = BTreeTupleGetPostingN(itup, p);
1610 57518 : int ptidcmp = -1;
1611 :
1612 : /*
1613 : * This nested loop reuses work across ptid TIDs taken from itup.
1614 : * We take advantage of the fact that both itup's TIDs and deltids
1615 : * entries (within a single itup/posting list grouping) must both
1616 : * be in ascending TID order.
1617 : */
1618 82734 : for (; nestedi < delstate->ndeltids; nestedi++)
1619 : {
1620 79304 : TM_IndexDelete *tcdeltid = &delstate->deltids[nestedi];
1621 79304 : TM_IndexStatus *tdstatus = (delstate->status + tcdeltid->id);
1622 :
1623 : /* Stop once we get past all itup related deltids entries */
1624 : Assert(tdstatus->idxoffnum >= idxoffnum);
1625 79304 : if (tdstatus->idxoffnum != idxoffnum)
1626 15124 : break;
1627 :
1628 : /* Skip past non-deletable itup related entries up front */
1629 64180 : if (!tdstatus->knowndeletable)
1630 5018 : continue;
1631 :
1632 : /* Entry is first partial ptid match (or an exact match)? */
1633 59162 : ptidcmp = ItemPointerCompare(&tcdeltid->tid, ptid);
1634 59162 : if (ptidcmp >= 0)
1635 : {
1636 : /* Greater than or equal (partial or exact) match... */
1637 38964 : break;
1638 : }
1639 : }
1640 :
1641 : /* ...exact ptid match to a deletable deltids entry? */
1642 57518 : if (ptidcmp != 0)
1643 28130 : continue;
1644 :
1645 : /* Exact match for deletable deltids entry -- ptid gets deleted */
1646 29388 : if (vacposting == NULL)
1647 : {
 : /* first deletable TID for this posting list: allocate tracking state */
1648 14511 : vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) +
1649 : nitem * sizeof(uint16));
1650 14511 : vacposting->itup = itup;
1651 14511 : vacposting->updatedoffset = idxoffnum;
1652 14511 : vacposting->ndeletedtids = 0;
1653 : }
1654 29388 : vacposting->deletetids[vacposting->ndeletedtids++] = p;
1655 : }
1656 :
1657 : /* Final decision on itup, a posting list tuple */
1658 :
1659 16072 : if (vacposting == NULL)
1660 : {
1661 : /* No TIDs to delete from itup -- do nothing */
1662 : }
1663 14511 : else if (vacposting->ndeletedtids == nitem)
1664 : {
1665 : /* Straight delete of itup (to delete all TIDs) */
1666 7838 : deletable[ndeletable++] = idxoffnum;
1667 : /* Turns out we won't need granular information */
1668 7838 : pfree(vacposting);
1669 : }
1670 : else
1671 : {
1672 : /* Delete some (but not all) TIDs from itup */
1673 : Assert(vacposting->ndeletedtids > 0 &&
1674 : vacposting->ndeletedtids < nitem);
1675 6673 : updatable[nupdatable++] = vacposting;
1676 : }
1677 : }
1678 :
1679 : /* Physically delete tuples (or TIDs) using deletable (or updatable) */
1680 5534 : _bt_delitems_delete(rel, buf, snapshotConflictHorizon, isCatalogRel,
1681 : deletable, ndeletable, updatable, nupdatable);
1682 :
1683 : /* be tidy */
1684 12207 : for (int i = 0; i < nupdatable; i++)
1685 6673 : pfree(updatable[i]);
1686 : }
1687 :
1688 : /*
1689 : * Check that leftsib page (the btpo_prev of target page) is not marked with
1690 : * INCOMPLETE_SPLIT flag. Used during page deletion.
1691 : *
1692 : * Returning true indicates that page flag is set in leftsib (which is
1693 : * definitely still the left sibling of target). When that happens, the
1694 : * target doesn't have a downlink in parent, and the page deletion algorithm
1695 : * isn't prepared to handle that. Deletion of the target page (or the whole
1696 : * subtree that contains the target page) cannot take place.
1697 : *
1698 : * Caller should not have a lock on the target page itself, since pages on the
1699 : * same level must always be locked left to right to avoid deadlocks.
1700 : */
1701 : static bool
1702 3571 : _bt_leftsib_splitflag(Relation rel, BlockNumber leftsib, BlockNumber target)
1703 : {
1704 : Buffer buf;
1705 : Page page;
1706 : BTPageOpaque opaque;
1707 : bool result;
1708 :
1709 : /* Easy case: No left sibling */
1710 3571 : if (leftsib == P_NONE)
1711 2846 : return false;
1712 :
1713 725 : buf = _bt_getbuf(rel, leftsib, BT_READ);
1714 725 : page = BufferGetPage(buf);
1715 725 : opaque = BTPageGetOpaque(page);
1716 :
1717 : /*
1718 : * If the left sibling was concurrently split, so that its next-pointer
1719 : * doesn't point to the current page anymore, the split that created
1720 : * target must be completed. Caller can reasonably expect that there will
1721 : * be a downlink to the target page that it can relocate using its stack.
1722 : * (We don't allow splitting an incompletely split page again until the
1723 : * previous split has been completed.)
1724 : */
1725 725 : result = (opaque->btpo_next == target && P_INCOMPLETE_SPLIT(opaque));
1726 725 : _bt_relbuf(rel, buf);
1727 :
1728 725 : return result;
1729 : }
1730 :
1731 : /*
1732 : * Check that leafrightsib page (the btpo_next of target leaf page) is not
1733 : * marked with ISHALFDEAD flag. Used during page deletion.
1734 : *
1735 : * Returning true indicates that page flag is set in leafrightsib, so page
1736 : * deletion cannot go ahead. Our caller is not prepared to deal with the case
1737 : * where the parent page does not have a pivot tuples whose downlink points to
1738 : * leafrightsib (due to an earlier interrupted VACUUM operation). It doesn't
1739 : * seem worth going to the trouble of teaching our caller to deal with it.
1740 : * The situation will be resolved after VACUUM finishes the deletion of the
1741 : * half-dead page (when a future VACUUM operation reaches the target page
1742 : * again).
1743 : *
1744 : * _bt_leftsib_splitflag() is called for both leaf pages and internal pages.
1745 : * _bt_rightsib_halfdeadflag() is only called for leaf pages, though. This is
1746 : * okay because of the restriction on deleting pages that are the rightmost
1747 : * page of their parent (i.e. that such deletions can only take place when the
1748 : * entire subtree must be deleted). The leaf level check made here will apply
1749 : * to a right "cousin" leaf page rather than a simple right sibling leaf page
1750 : * in cases where caller actually goes on to attempt deleting pages that are
1751 : * above the leaf page. The right cousin leaf page is representative of the
1752 : * left edge of the subtree to the right of the to-be-deleted subtree as a
1753 : * whole, which is exactly the condition that our caller cares about.
1754 : * (Besides, internal pages are never marked half-dead, so it isn't even
1755 : * possible to _directly_ assess if an internal page is part of some other
1756 : * to-be-deleted subtree.)
1757 : */
1758 : static bool
1759 3447 : _bt_rightsib_halfdeadflag(Relation rel, BlockNumber leafrightsib)
1760 : {
1761 : Buffer buf;
1762 : Page page;
1763 : BTPageOpaque opaque;
1764 : bool result;
1765 :
1766 : Assert(leafrightsib != P_NONE);
1767 :
1768 3447 : buf = _bt_getbuf(rel, leafrightsib, BT_READ);
1769 3447 : page = BufferGetPage(buf);
1770 3447 : opaque = BTPageGetOpaque(page);
1771 :
1772 : Assert(P_ISLEAF(opaque) && !P_ISDELETED(opaque));
1773 3447 : result = P_ISHALFDEAD(opaque);
1774 3447 : _bt_relbuf(rel, buf);
1775 :
1776 3447 : return result;
1777 : }
1778 :
1779 : /*
1780 : * _bt_pagedel() -- Delete a leaf page from the b-tree, if legal to do so.
1781 : *
1782 : * This action unlinks the leaf page from the b-tree structure, removing all
1783 : * pointers leading to it --- but not touching its own left and right links.
1784 : * The page cannot be physically reclaimed right away, since other processes
1785 : * may currently be trying to follow links leading to the page; they have to
1786 : * be allowed to use its right-link to recover. See nbtree/README.
1787 : *
1788 : * On entry, the target buffer must be pinned and locked (either read or write
1789 : * lock is OK). The page must be an empty leaf page, which may be half-dead
1790 : * already (a half-dead page should only be passed to us when an earlier
1791 : * VACUUM operation was interrupted, though). Note in particular that caller
1792 : * should never pass a buffer containing an existing deleted page here. The
1793 : * lock and pin on caller's buffer will be dropped before we return.
1794 : *
1795 : * Maintains bulk delete stats for caller, which are taken from vstate. We
1796 : * need to cooperate closely with caller here so that whole VACUUM operation
1797 : * reliably avoids any double counting of subsidiary-to-leafbuf pages that we
1798 : * delete in passing. If such pages happen to be from a block number that is
1799 : * ahead of the current scanblkno position, then caller is expected to count
1800 : * them directly later on. It's simpler for us to understand caller's
1801 : * requirements than it would be for caller to understand when or how a
1802 : * deleted page became deleted after the fact.
1803 : *
1804 : * NOTE: this leaks memory. Rather than trying to clean up everything
1805 : * carefully, it's better to run it in a temp context that can be reset
1806 : * frequently.
1807 : */
void
_bt_pagedel(Relation rel, Buffer leafbuf, BTVacState *vstate)
{
	BlockNumber rightsib;
	bool		rightsib_empty;
	Page		page;
	BTPageOpaque opaque;

	/*
	 * Save original leafbuf block number from caller.  Only deleted blocks
	 * that are <= scanblkno are added to bulk delete stat's pages_deleted
	 * count.
	 */
	BlockNumber scanblkno = BufferGetBlockNumber(leafbuf);

	/*
	 * "stack" is a search stack leading (approximately) to the target page.
	 * It is initially NULL, but when iterating, we keep it to avoid
	 * duplicated search effort.
	 *
	 * Also, when "stack" is not NULL, we have already checked that the
	 * current page is not the right half of an incomplete split, i.e. the
	 * left sibling does not have its INCOMPLETE_SPLIT flag set, including
	 * when the current target page is to the right of caller's initial page
	 * (the scanblkno page).
	 */
	BTStack		stack = NULL;

	/* Each iteration attempts deletion of one empty leaf page */
	for (;;)
	{
		page = BufferGetPage(leafbuf);
		opaque = BTPageGetOpaque(page);

		/*
		 * Internal pages are never deleted directly, only as part of deleting
		 * the whole subtree all the way down to leaf level.
		 *
		 * Also check for deleted pages here.  Caller never passes us a fully
		 * deleted page.  Only VACUUM can delete pages, so there can't have
		 * been a concurrent deletion.  Assume that we reached any deleted
		 * page encountered here by following a sibling link, and that the
		 * index is corrupt.
		 */
		Assert(!P_ISDELETED(opaque));
		if (!P_ISLEAF(opaque) || P_ISDELETED(opaque))
		{
			/*
			 * Pre-9.4 page deletion only marked internal pages as half-dead,
			 * but now we only use that flag on leaf pages. The old algorithm
			 * was never supposed to leave half-dead pages in the tree, it was
			 * just a transient state, but it was nevertheless possible in
			 * error scenarios. We don't know how to deal with them here. They
			 * are harmless as far as searches are considered, but inserts
			 * into the deleted keyspace could add out-of-order downlinks in
			 * the upper levels. Log a notice, hopefully the admin will notice
			 * and reindex.
			 */
			if (P_ISHALFDEAD(opaque))
				ereport(LOG,
						(errcode(ERRCODE_INDEX_CORRUPTED),
						 errmsg("index \"%s\" contains a half-dead internal page",
								RelationGetRelationName(rel)),
						 errhint("This can be caused by an interrupted VACUUM in version 9.3 or older, before upgrade. Please REINDEX it.")));

			if (P_ISDELETED(opaque))
				ereport(LOG,
						(errcode(ERRCODE_INDEX_CORRUPTED),
						 errmsg_internal("found deleted block %u while following right link from block %u in index \"%s\"",
										 BufferGetBlockNumber(leafbuf),
										 scanblkno,
										 RelationGetRelationName(rel))));

			_bt_relbuf(rel, leafbuf);
			return;
		}

		/*
		 * We can never delete rightmost pages nor root pages.  While at it,
		 * check that page is empty, since it's possible that the leafbuf page
		 * was empty a moment ago, but has since had some inserts.
		 *
		 * To keep the algorithm simple, we also never delete an incompletely
		 * split page (they should be rare enough that this doesn't make any
		 * meaningful difference to disk usage):
		 *
		 * The INCOMPLETE_SPLIT flag on the page tells us if the page is the
		 * left half of an incomplete split, but ensuring that it's not the
		 * right half is more complicated.  For that, we have to check that
		 * the left sibling doesn't have its INCOMPLETE_SPLIT flag set using
		 * _bt_leftsib_splitflag().  On the first iteration, we temporarily
		 * release the lock on scanblkno/leafbuf, check the left sibling, and
		 * construct a search stack to scanblkno.  On subsequent iterations,
		 * we know we stepped right from a page that passed these tests, so
		 * it's OK.
		 */
		if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) ||
			P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page) ||
			P_INCOMPLETE_SPLIT(opaque))
		{
			/* Should never fail to delete a half-dead page */
			Assert(!P_ISHALFDEAD(opaque));

			_bt_relbuf(rel, leafbuf);
			return;
		}

		/*
		 * First, remove downlink pointing to the page (or a parent of the
		 * page, if we are going to delete a taller subtree), and mark the
		 * leafbuf page half-dead
		 */
		if (!P_ISHALFDEAD(opaque))
		{
			/*
			 * We need an approximate pointer to the page's parent page.  We
			 * use a variant of the standard search mechanism to search for
			 * the page's high key; this will give us a link to either the
			 * current parent or someplace to its left (if there are multiple
			 * equal high keys, which is possible with !heapkeyspace indexes).
			 *
			 * Also check if this is the right-half of an incomplete split
			 * (see comment above).
			 */
			if (!stack)
			{
				BTScanInsert itup_key;
				ItemId		itemid;
				IndexTuple	targetkey;
				BlockNumber leftsib,
							leafblkno;
				Buffer		sleafbuf;

				itemid = PageGetItemId(page, P_HIKEY);
				targetkey = CopyIndexTuple((IndexTuple) PageGetItem(page, itemid));

				leftsib = opaque->btpo_prev;
				leafblkno = BufferGetBlockNumber(leafbuf);

				/*
				 * To avoid deadlocks, we'd better drop the leaf page lock
				 * before going further.
				 */
				_bt_unlockbuf(rel, leafbuf);

				/*
				 * Check that the left sibling of leafbuf (if any) is not
				 * marked with INCOMPLETE_SPLIT flag before proceeding
				 */
				Assert(leafblkno == scanblkno);
				if (_bt_leftsib_splitflag(rel, leftsib, leafblkno))
				{
					ReleaseBuffer(leafbuf);
					return;
				}

				/*
				 * We need an insertion scan key, so build one.
				 *
				 * _bt_search searches for the leaf page that contains any
				 * matching non-pivot tuples, but we need it to "search" for
				 * the high key pivot from the page that we're set to delete.
				 * Compensate for the mismatch by having _bt_search locate the
				 * last position < equal-to-untruncated-prefix non-pivots.
				 */
				itup_key = _bt_mkscankey(rel, targetkey);

				/* Set up a BTLessStrategyNumber-like insertion scan key */
				itup_key->nextkey = false;
				itup_key->backward = true;
				stack = _bt_search(rel, NULL, itup_key, &sleafbuf, BT_READ, true);
				/* won't need a second lock or pin on leafbuf */
				_bt_relbuf(rel, sleafbuf);

				/*
				 * Re-lock the leaf page, and start over to use our stack
				 * within _bt_mark_page_halfdead.  We must do it that way
				 * because it's possible that leafbuf can no longer be
				 * deleted.  We need to recheck.
				 *
				 * Note: We can't simply hold on to the sleafbuf lock instead,
				 * because it's barely possible that sleafbuf is not the same
				 * page as leafbuf.  This happens when leafbuf split after our
				 * original lock was dropped, but before _bt_search finished
				 * its descent.  We rely on the assumption that we'll find
				 * leafbuf isn't safe to delete anymore in this scenario.
				 * (Page deletion can cope with the stack being to the left of
				 * leafbuf, but not to the right of leafbuf.)
				 */
				_bt_lockbuf(rel, leafbuf, BT_WRITE);
				continue;
			}

			/*
			 * See if it's safe to delete the leaf page, and determine how
			 * many parent/internal pages above the leaf level will be
			 * deleted.  If it's safe then _bt_mark_page_halfdead will also
			 * perform the first phase of deletion, which includes marking the
			 * leafbuf page half-dead.
			 */
			Assert(P_ISLEAF(opaque) && !P_IGNORE(opaque));
			if (!_bt_mark_page_halfdead(rel, vstate->info->heaprel, leafbuf,
										stack))
			{
				_bt_relbuf(rel, leafbuf);
				return;
			}
		}
		else
		{
			/* Resuming deletion of a page an earlier VACUUM left half-dead */
			INJECTION_POINT("nbtree-finish-half-dead-page-vacuum", NULL);
		}

		/*
		 * Then unlink it from its siblings.  Each call to
		 * _bt_unlink_halfdead_page unlinks the topmost page from the subtree,
		 * making it shallower.  Iterate until the leafbuf page is deleted.
		 */
		rightsib_empty = false;
		Assert(P_ISLEAF(opaque) && P_ISHALFDEAD(opaque));
		while (P_ISHALFDEAD(opaque))
		{
			/* Check for interrupts in _bt_unlink_halfdead_page */
			if (!_bt_unlink_halfdead_page(rel, leafbuf, scanblkno,
										  &rightsib_empty, vstate))
			{
				/*
				 * _bt_unlink_halfdead_page should never fail, since we
				 * established that deletion is generally safe in
				 * _bt_mark_page_halfdead -- index must be corrupt.
				 *
				 * Note that _bt_unlink_halfdead_page already released the
				 * lock and pin on leafbuf for us.
				 */
				Assert(false);
				return;
			}
		}

		Assert(P_ISLEAF(opaque) && P_ISDELETED(opaque));

		/* Remember the deleted page's right sibling before dropping our pin */
		rightsib = opaque->btpo_next;

		_bt_relbuf(rel, leafbuf);

		/*
		 * Check here, as calling loops will have locks held, preventing
		 * interrupts from being processed.
		 */
		CHECK_FOR_INTERRUPTS();

		/*
		 * The page has now been deleted.  If its right sibling is completely
		 * empty, it's possible that the reason we haven't deleted it earlier
		 * is that it was the rightmost child of the parent.  Now that we
		 * removed the downlink for this page, the right sibling might now be
		 * the only child of the parent, and could be removed.  It would be
		 * picked up by the next vacuum anyway, but might as well try to
		 * remove it now, so loop back to process the right sibling.
		 *
		 * Note: This relies on the assumption that _bt_getstackbuf() will be
		 * able to reuse our original descent stack with a different child
		 * block (provided that the child block is to the right of the
		 * original leaf page reached by _bt_search()).  It will even update
		 * the descent stack each time we loop around, avoiding repeated work.
		 */
		if (!rightsib_empty)
			break;

		leafbuf = _bt_getbuf(rel, rightsib, BT_WRITE);
	}
}
2079 :
2080 : /*
2081 : * First stage of page deletion.
2082 : *
2083 : * Establish the height of the to-be-deleted subtree with leafbuf at its
2084 : * lowest level, remove the downlink to the subtree, and mark leafbuf
2085 : * half-dead. The final to-be-deleted subtree is usually just leafbuf itself,
2086 : * but may include additional internal pages (at most one per level of the
2087 : * tree below the root).
2088 : *
2089 : * Caller must pass a valid heaprel, since it's just about possible that our
2090 : * call to _bt_lock_subtree_parent will need to allocate a new index page to
2091 : * complete a page split. Every call to _bt_allocbuf needs to pass a heaprel.
2092 : *
2093 : * Returns 'false' if leafbuf is unsafe to delete, usually because leafbuf is
2094 : * the rightmost child of its parent (and parent has more than one downlink).
2095 : * Returns 'true' when the first stage of page deletion completed
2096 : * successfully.
2097 : */
static bool
_bt_mark_page_halfdead(Relation rel, Relation heaprel, Buffer leafbuf,
					   BTStack stack)
{
	BlockNumber leafblkno;
	BlockNumber leafrightsib;
	BlockNumber topparent;
	BlockNumber topparentrightsib;
	ItemId		itemid;
	Page		page;
	BTPageOpaque opaque;
	Buffer		subtreeparent;
	OffsetNumber poffset;
	OffsetNumber nextoffset;
	IndexTuple	itup;
	IndexTupleData trunctuple;
	XLogRecPtr	recptr;

	page = BufferGetPage(leafbuf);
	opaque = BTPageGetOpaque(page);

	/* leafbuf must be an empty, non-rightmost, non-root leaf page */
	Assert(!P_RIGHTMOST(opaque) && !P_ISROOT(opaque) &&
		   P_ISLEAF(opaque) && !P_IGNORE(opaque) &&
		   P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page));
	Assert(heaprel != NULL);

	/*
	 * Save info about the leaf page.
	 */
	leafblkno = BufferGetBlockNumber(leafbuf);
	leafrightsib = opaque->btpo_next;

	/*
	 * Before attempting to lock the parent page, check that the right sibling
	 * is not in half-dead state.  A half-dead right sibling would have no
	 * downlink in the parent, which would be highly confusing later when we
	 * delete the downlink.  It would fail the "right sibling of target page
	 * is also the next child in parent page" cross-check below.
	 */
	if (_bt_rightsib_halfdeadflag(rel, leafrightsib))
	{
		elog(DEBUG1, "could not delete page %u because its right sibling %u is half-dead",
			 leafblkno, leafrightsib);
		return false;
	}

	/*
	 * We cannot delete a page that is the rightmost child of its immediate
	 * parent, unless it is the only child --- in which case the parent has to
	 * be deleted too, and the same condition applies recursively to it.  We
	 * have to check this condition all the way up before trying to delete,
	 * and lock the parent of the root of the to-be-deleted subtree (the
	 * "subtree parent").  _bt_lock_subtree_parent() locks the subtree parent
	 * for us.  We remove the downlink to the "top parent" page (subtree root
	 * page) from the subtree parent page below.
	 *
	 * Initialize topparent to be leafbuf page now.  The final to-be-deleted
	 * subtree is often a degenerate one page subtree consisting only of the
	 * leafbuf page.  When that happens, the leafbuf page is the final subtree
	 * root page/top parent page.
	 */
	topparent = leafblkno;
	topparentrightsib = leafrightsib;
	if (!_bt_lock_subtree_parent(rel, heaprel, leafblkno, stack,
								 &subtreeparent, &poffset,
								 &topparent, &topparentrightsib))
		return false;

	page = BufferGetPage(subtreeparent);
	opaque = BTPageGetOpaque(page);

#ifdef USE_ASSERT_CHECKING

	/*
	 * This is just an assertion because _bt_lock_subtree_parent should have
	 * guaranteed tuple has the expected contents
	 */
	itemid = PageGetItemId(page, poffset);
	itup = (IndexTuple) PageGetItem(page, itemid);
	Assert(BTreeTupleGetDownLink(itup) == topparent);
#endif

	/* Fetch the pivot tuple that immediately follows topparent's downlink */
	nextoffset = OffsetNumberNext(poffset);
	itemid = PageGetItemId(page, nextoffset);
	itup = (IndexTuple) PageGetItem(page, itemid);

	/*
	 * Check that the parent-page index items we're about to delete/overwrite
	 * in subtree parent page contain what we expect.  This can fail if the
	 * index has become corrupt for some reason.  When that happens we back
	 * out of deletion of the leafbuf subtree.  (This is just like the case
	 * where _bt_lock_subtree_parent() cannot "re-find" leafbuf's downlink.)
	 */
	if (BTreeTupleGetDownLink(itup) != topparentrightsib)
	{
		ereport(LOG,
				(errcode(ERRCODE_INDEX_CORRUPTED),
				 errmsg_internal("right sibling %u of block %u is not next child %u of block %u in index \"%s\"",
								 topparentrightsib, topparent,
								 BTreeTupleGetDownLink(itup),
								 BufferGetBlockNumber(subtreeparent),
								 RelationGetRelationName(rel))));

		_bt_relbuf(rel, subtreeparent);
		Assert(false);
		return false;
	}

	/*
	 * Any insert which would have gone on the leaf block will now go to its
	 * right sibling.  In other words, the key space moves right.
	 */
	PredicateLockPageCombine(rel, leafblkno, leafrightsib);

	/* No ereport(ERROR) until changes are logged */
	START_CRIT_SECTION();

	/*
	 * Update parent of subtree.  We want to delete the downlink to the top
	 * parent page/root of the subtree, and the *following* key.  Easiest way
	 * is to copy the right sibling's downlink over the downlink that points
	 * to top parent page, and then delete the right sibling's original pivot
	 * tuple.
	 *
	 * Lanin and Shasha make the key space move left when deleting a page,
	 * whereas the key space moves right here.  That's why we cannot simply
	 * delete the pivot tuple with the downlink to the top parent page.  See
	 * nbtree/README.
	 */
	page = BufferGetPage(subtreeparent);
	opaque = BTPageGetOpaque(page);

	itemid = PageGetItemId(page, poffset);
	itup = (IndexTuple) PageGetItem(page, itemid);
	BTreeTupleSetDownLink(itup, topparentrightsib);

	nextoffset = OffsetNumberNext(poffset);
	PageIndexTupleDelete(page, nextoffset);

	/*
	 * Mark the leaf page as half-dead, and stamp it with a link to the top
	 * parent page.  When the leaf page is also the top parent page, the link
	 * is set to InvalidBlockNumber.
	 */
	page = BufferGetPage(leafbuf);
	opaque = BTPageGetOpaque(page);
	opaque->btpo_flags |= BTP_HALF_DEAD;

	/* Replace leafbuf's high key with a truncated "top parent link" tuple */
	Assert(PageGetMaxOffsetNumber(page) == P_HIKEY);
	MemSet(&trunctuple, 0, sizeof(IndexTupleData));
	trunctuple.t_info = sizeof(IndexTupleData);
	if (topparent != leafblkno)
		BTreeTupleSetTopParent(&trunctuple, topparent);
	else
		BTreeTupleSetTopParent(&trunctuple, InvalidBlockNumber);

	if (!PageIndexTupleOverwrite(page, P_HIKEY, &trunctuple, IndexTupleSize(&trunctuple)))
		elog(ERROR, "could not overwrite high key in half-dead page");

	/* Must mark buffers dirty before XLogInsert */
	MarkBufferDirty(subtreeparent);
	MarkBufferDirty(leafbuf);

	/* XLOG stuff */
	if (RelationNeedsWAL(rel))
	{
		xl_btree_mark_page_halfdead xlrec;

		xlrec.poffset = poffset;
		xlrec.leafblk = leafblkno;
		if (topparent != leafblkno)
			xlrec.topparent = topparent;
		else
			xlrec.topparent = InvalidBlockNumber;

		XLogBeginInsert();
		XLogRegisterBuffer(0, leafbuf, REGBUF_WILL_INIT);
		XLogRegisterBuffer(1, subtreeparent, REGBUF_STANDARD);

		page = BufferGetPage(leafbuf);
		opaque = BTPageGetOpaque(page);
		xlrec.leftblk = opaque->btpo_prev;
		xlrec.rightblk = opaque->btpo_next;

		XLogRegisterData(&xlrec, SizeOfBtreeMarkPageHalfDead);

		recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_MARK_PAGE_HALFDEAD);
	}
	else
		recptr = XLogGetFakeLSN(rel);	/* no WAL needed; use a fake LSN */

	/* Stamp both modified pages with the same LSN */
	page = BufferGetPage(subtreeparent);
	PageSetLSN(page, recptr);
	page = BufferGetPage(leafbuf);
	PageSetLSN(page, recptr);

	END_CRIT_SECTION();

	_bt_relbuf(rel, subtreeparent);
	return true;
}
2299 :
2300 : /*
2301 : * Second stage of page deletion.
2302 : *
2303 : * Unlinks a single page (in the subtree undergoing deletion) from its
2304 : * siblings. Also marks the page deleted.
2305 : *
2306 : * To get rid of the whole subtree, including the leaf page itself, call here
2307 : * until the leaf page is deleted. The original "top parent" established in
2308 : * the first stage of deletion is deleted in the first call here, while the
2309 : * leaf page is deleted in the last call here. Note that the leaf page itself
2310 : * is often the initial top parent page.
2311 : *
2312 : * Returns 'false' if the page could not be unlinked (shouldn't happen). If
2313 : * the right sibling of the current target page is empty, *rightsib_empty is
2314 : * set to true, allowing caller to delete the target's right sibling page in
2315 : * passing. Note that *rightsib_empty is only actually used by caller when
2316 : * target page is leafbuf, following last call here for leafbuf/the subtree
2317 : * containing leafbuf. (We always set *rightsib_empty for caller, just to be
2318 : * consistent.)
2319 : *
2320 : * Must hold pin and lock on leafbuf at entry (read or write doesn't matter).
2321 : * On success exit, we'll be holding pin and write lock. On failure exit,
2322 : * we'll release both pin and lock before returning (we define it that way
2323 : * to avoid having to reacquire a lock we already released).
2324 : */
2325 : static bool
2326 3562 : _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
2327 : bool *rightsib_empty, BTVacState *vstate)
2328 : {
2329 3562 : BlockNumber leafblkno = BufferGetBlockNumber(leafbuf);
2330 3562 : IndexBulkDeleteResult *stats = vstate->stats;
2331 : BlockNumber leafleftsib;
2332 : BlockNumber leafrightsib;
2333 : BlockNumber target;
2334 : BlockNumber leftsib;
2335 : BlockNumber rightsib;
2336 3562 : Buffer lbuf = InvalidBuffer;
2337 : Buffer buf;
2338 : Buffer rbuf;
2339 3562 : Buffer metabuf = InvalidBuffer;
2340 3562 : Page metapg = NULL;
2341 3562 : BTMetaPageData *metad = NULL;
2342 : ItemId itemid;
2343 : Page page;
2344 : BTPageOpaque opaque;
2345 : FullTransactionId safexid;
2346 : bool rightsib_is_rightmost;
2347 : uint32 targetlevel;
2348 : IndexTuple leafhikey;
2349 : BlockNumber leaftopparent;
2350 : XLogRecPtr recptr;
2351 :
2352 3562 : page = BufferGetPage(leafbuf);
2353 3562 : opaque = BTPageGetOpaque(page);
2354 :
2355 : Assert(P_ISLEAF(opaque) && !P_ISDELETED(opaque) && P_ISHALFDEAD(opaque));
2356 :
2357 : /*
2358 : * Remember some information about the leaf page.
2359 : */
2360 3562 : itemid = PageGetItemId(page, P_HIKEY);
2361 3562 : leafhikey = (IndexTuple) PageGetItem(page, itemid);
2362 3562 : target = BTreeTupleGetTopParent(leafhikey);
2363 3562 : leafleftsib = opaque->btpo_prev;
2364 3562 : leafrightsib = opaque->btpo_next;
2365 :
2366 3562 : _bt_unlockbuf(rel, leafbuf);
2367 :
2368 3562 : INJECTION_POINT("nbtree-leave-page-half-dead", NULL);
2369 :
2370 : /*
2371 : * Check here, as calling loops will have locks held, preventing
2372 : * interrupts from being processed.
2373 : */
2374 3562 : CHECK_FOR_INTERRUPTS();
2375 :
2376 : /* Unlink the current top parent of the subtree */
2377 3562 : if (!BlockNumberIsValid(target))
2378 : {
2379 : /* Target is leaf page (or leaf page is top parent, if you prefer) */
2380 3443 : target = leafblkno;
2381 :
2382 3443 : buf = leafbuf;
2383 3443 : leftsib = leafleftsib;
2384 3443 : targetlevel = 0;
2385 : }
2386 : else
2387 : {
2388 : /* Target is the internal page taken from leaf's top parent link */
2389 : Assert(target != leafblkno);
2390 :
2391 : /* Fetch the block number of the target's left sibling */
2392 119 : buf = _bt_getbuf(rel, target, BT_READ);
2393 119 : page = BufferGetPage(buf);
2394 119 : opaque = BTPageGetOpaque(page);
2395 119 : leftsib = opaque->btpo_prev;
2396 119 : targetlevel = opaque->btpo_level;
2397 : Assert(targetlevel > 0);
2398 :
2399 : /*
2400 : * To avoid deadlocks, we'd better drop the target page lock before
2401 : * going further.
2402 : */
2403 119 : _bt_unlockbuf(rel, buf);
2404 : }
2405 :
2406 : /*
2407 : * We have to lock the pages we need to modify in the standard order:
2408 : * moving right, then up. Else we will deadlock against other writers.
2409 : *
2410 : * So, first lock the leaf page, if it's not the target. Then find and
2411 : * write-lock the current left sibling of the target page. The sibling
2412 : * that was current a moment ago could have split, so we may have to move
2413 : * right.
2414 : */
2415 3562 : if (target != leafblkno)
2416 119 : _bt_lockbuf(rel, leafbuf, BT_WRITE);
2417 3562 : if (leftsib != P_NONE)
2418 : {
2419 712 : lbuf = _bt_getbuf(rel, leftsib, BT_WRITE);
2420 712 : page = BufferGetPage(lbuf);
2421 712 : opaque = BTPageGetOpaque(page);
2422 712 : while (P_ISDELETED(opaque) || opaque->btpo_next != target)
2423 : {
2424 0 : bool leftsibvalid = true;
2425 :
2426 : /*
2427 : * Before we follow the link from the page that was the left
2428 : * sibling mere moments ago, validate its right link. This
2429 : * reduces the opportunities for loop to fail to ever make any
2430 : * progress in the presence of index corruption.
2431 : *
2432 : * Note: we rely on the assumption that there can only be one
2433 : * vacuum process running at a time (against the same index).
2434 : */
2435 0 : if (P_RIGHTMOST(opaque) || P_ISDELETED(opaque) ||
2436 0 : leftsib == opaque->btpo_next)
2437 0 : leftsibvalid = false;
2438 :
2439 0 : leftsib = opaque->btpo_next;
2440 0 : _bt_relbuf(rel, lbuf);
2441 :
2442 0 : if (!leftsibvalid)
2443 : {
2444 : /*
2445 : * This is known to fail in the field; sibling link corruption
2446 : * is relatively common. Press on with vacuuming rather than
2447 : * just throwing an ERROR.
2448 : */
2449 0 : ereport(LOG,
2450 : (errcode(ERRCODE_INDEX_CORRUPTED),
2451 : errmsg_internal("valid left sibling for deletion target could not be located: "
2452 : "left sibling %u of target %u with leafblkno %u and scanblkno %u on level %u of index \"%s\"",
2453 : leftsib, target, leafblkno, scanblkno,
2454 : targetlevel, RelationGetRelationName(rel))));
2455 :
2456 : /* Must release all pins and locks on failure exit */
2457 0 : ReleaseBuffer(buf);
2458 0 : if (target != leafblkno)
2459 0 : _bt_relbuf(rel, leafbuf);
2460 :
2461 0 : return false;
2462 : }
2463 :
2464 0 : CHECK_FOR_INTERRUPTS();
2465 :
2466 : /* step right one page */
2467 0 : lbuf = _bt_getbuf(rel, leftsib, BT_WRITE);
2468 0 : page = BufferGetPage(lbuf);
2469 0 : opaque = BTPageGetOpaque(page);
2470 : }
2471 : }
2472 : else
2473 2850 : lbuf = InvalidBuffer;
2474 :
2475 : /* Next write-lock the target page itself */
2476 3562 : _bt_lockbuf(rel, buf, BT_WRITE);
2477 3562 : page = BufferGetPage(buf);
2478 3562 : opaque = BTPageGetOpaque(page);
2479 :
2480 : /*
2481 : * Check page is still empty etc, else abandon deletion. This is just for
2482 : * paranoia's sake; a half-dead page cannot resurrect because there can be
2483 : * only one vacuum process running at a time.
2484 : */
2485 3562 : if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque))
2486 0 : elog(ERROR, "target page changed status unexpectedly in block %u of index \"%s\"",
2487 : target, RelationGetRelationName(rel));
2488 :
2489 3562 : if (opaque->btpo_prev != leftsib)
2490 0 : ereport(ERROR,
2491 : (errcode(ERRCODE_INDEX_CORRUPTED),
2492 : errmsg_internal("target page left link unexpectedly changed from %u to %u in block %u of index \"%s\"",
2493 : leftsib, opaque->btpo_prev, target,
2494 : RelationGetRelationName(rel))));
2495 :
2496 3562 : if (target == leafblkno)
2497 : {
2498 3443 : if (P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page) ||
2499 3443 : !P_ISLEAF(opaque) || !P_ISHALFDEAD(opaque))
2500 0 : elog(ERROR, "target leaf page changed status unexpectedly in block %u of index \"%s\"",
2501 : target, RelationGetRelationName(rel));
2502 :
2503 : /* Leaf page is also target page: don't set leaftopparent */
2504 3443 : leaftopparent = InvalidBlockNumber;
2505 : }
2506 : else
2507 : {
2508 : IndexTuple finaldataitem;
2509 :
2510 119 : if (P_FIRSTDATAKEY(opaque) != PageGetMaxOffsetNumber(page) ||
2511 119 : P_ISLEAF(opaque))
2512 0 : elog(ERROR, "target internal page on level %u changed status unexpectedly in block %u of index \"%s\"",
2513 : targetlevel, target, RelationGetRelationName(rel));
2514 :
2515 : /* Target is internal: set leaftopparent for next call here... */
2516 119 : itemid = PageGetItemId(page, P_FIRSTDATAKEY(opaque));
2517 119 : finaldataitem = (IndexTuple) PageGetItem(page, itemid);
2518 119 : leaftopparent = BTreeTupleGetDownLink(finaldataitem);
2519 : /* ...except when it would be a redundant pointer-to-self */
2520 119 : if (leaftopparent == leafblkno)
2521 59 : leaftopparent = InvalidBlockNumber;
2522 : }
2523 :
2524 : /* No leaftopparent for level 0 (leaf page) or level 1 target */
2525 : Assert(!BlockNumberIsValid(leaftopparent) || targetlevel > 1);
2526 :
2527 : /*
2528 : * And next write-lock the (current) right sibling.
2529 : */
2530 3562 : rightsib = opaque->btpo_next;
2531 3562 : rbuf = _bt_getbuf(rel, rightsib, BT_WRITE);
2532 3562 : page = BufferGetPage(rbuf);
2533 3562 : opaque = BTPageGetOpaque(page);
2534 :
2535 : /*
2536 : * Validate target's right sibling page. Its left link must point back to
2537 : * the target page.
2538 : */
2539 3562 : if (opaque->btpo_prev != target)
2540 : {
2541 : /*
2542 : * This is known to fail in the field; sibling link corruption is
2543 : * relatively common. Press on with vacuuming rather than just
2544 : * throwing an ERROR (same approach used for left-sibling's-right-link
2545 : * validation check a moment ago).
2546 : */
2547 0 : ereport(LOG,
2548 : (errcode(ERRCODE_INDEX_CORRUPTED),
2549 : errmsg_internal("right sibling's left-link doesn't match: "
2550 : "right sibling %u of target %u with leafblkno %u "
2551 : "and scanblkno %u spuriously links to non-target %u "
2552 : "on level %u of index \"%s\"",
2553 : rightsib, target, leafblkno,
2554 : scanblkno, opaque->btpo_prev,
2555 : targetlevel, RelationGetRelationName(rel))));
2556 :
2557 : /* Must release all pins and locks on failure exit */
2558 0 : if (BufferIsValid(lbuf))
2559 0 : _bt_relbuf(rel, lbuf);
2560 0 : _bt_relbuf(rel, rbuf);
2561 0 : _bt_relbuf(rel, buf);
2562 0 : if (target != leafblkno)
2563 0 : _bt_relbuf(rel, leafbuf);
2564 :
2565 0 : return false;
2566 : }
2567 :
2568 3562 : rightsib_is_rightmost = P_RIGHTMOST(opaque);
2569 3562 : *rightsib_empty = (P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page));
2570 :
2571 : /*
2572 : * If we are deleting the next-to-last page on the target's level, then
2573 : * the rightsib is a candidate to become the new fast root. (In theory, it
2574 : * might be possible to push the fast root even further down, but the odds
2575 : * of doing so are slim, and the locking considerations daunting.)
2576 : *
2577 : * We can safely acquire a lock on the metapage here --- see comments for
2578 : * _bt_newlevel().
2579 : */
2580 3562 : if (leftsib == P_NONE && rightsib_is_rightmost)
2581 : {
2582 37 : page = BufferGetPage(rbuf);
2583 37 : opaque = BTPageGetOpaque(page);
2584 37 : if (P_RIGHTMOST(opaque))
2585 : {
2586 : /* rightsib will be the only one left on the level */
2587 37 : metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
2588 37 : metapg = BufferGetPage(metabuf);
2589 37 : metad = BTPageGetMeta(metapg);
2590 :
2591 : /*
2592 : * The expected case here is btm_fastlevel == targetlevel+1; if
2593 : * the fastlevel is <= targetlevel, something is wrong, and we
2594 : * choose to overwrite it to fix it.
2595 : */
2596 37 : if (metad->btm_fastlevel > targetlevel + 1)
2597 : {
2598 : /* no update wanted */
2599 0 : _bt_relbuf(rel, metabuf);
2600 0 : metabuf = InvalidBuffer;
2601 : }
2602 : }
2603 : }
2604 :
2605 : /*
2606 : * Here we begin doing the deletion.
2607 : */
2608 :
2609 : /* No ereport(ERROR) until changes are logged */
2610 3562 : START_CRIT_SECTION();
2611 :
2612 : /*
2613 : * Update siblings' side-links. Note the target page's side-links will
2614 : * continue to point to the siblings. Asserts here are just rechecking
2615 : * things we already verified above.
2616 : */
2617 3562 : if (BufferIsValid(lbuf))
2618 : {
2619 712 : page = BufferGetPage(lbuf);
2620 712 : opaque = BTPageGetOpaque(page);
2621 : Assert(opaque->btpo_next == target);
2622 712 : opaque->btpo_next = rightsib;
2623 : }
2624 3562 : page = BufferGetPage(rbuf);
2625 3562 : opaque = BTPageGetOpaque(page);
2626 : Assert(opaque->btpo_prev == target);
2627 3562 : opaque->btpo_prev = leftsib;
2628 :
2629 : /*
2630 : * If we deleted a parent of the targeted leaf page, instead of the leaf
2631 : * itself, update the leaf to point to the next remaining child in the
2632 : * subtree.
2633 : *
2634 : * Note: We rely on the fact that a buffer pin on the leaf page has been
2635 : * held since leafhikey was initialized. This is safe, though only
2636 : * because the page was already half-dead at that point. The leaf page
2637 : * cannot have been modified by any other backend during the period when
2638 : * no lock was held.
2639 : */
2640 3562 : if (target != leafblkno)
2641 119 : BTreeTupleSetTopParent(leafhikey, leaftopparent);
2642 :
2643 : /*
2644 : * Mark the page itself deleted. It can be recycled when all current
2645 : * transactions are gone. Storing GetTopTransactionId() would work, but
2646 : * we're in VACUUM and would not otherwise have an XID. Having already
2647 : * updated links to the target, ReadNextFullTransactionId() suffices as an
2648 : * upper bound. Any scan having retained a now-stale link is advertising
2649 : * in its PGPROC an xmin less than or equal to the value we read here. It
2650 : * will continue to do so, holding back the xmin horizon, for the duration
2651 : * of that scan.
2652 : */
2653 3562 : page = BufferGetPage(buf);
2654 3562 : opaque = BTPageGetOpaque(page);
2655 : Assert(P_ISHALFDEAD(opaque) || !P_ISLEAF(opaque));
2656 :
2657 : /*
2658 : * Store upper bound XID that's used to determine when deleted page is no
2659 : * longer needed as a tombstone
2660 : */
2661 3562 : safexid = ReadNextFullTransactionId();
2662 3562 : BTPageSetDeleted(page, safexid);
2663 3562 : opaque->btpo_cycleid = 0;
2664 :
2665 : /* And update the metapage, if needed */
2666 3562 : if (BufferIsValid(metabuf))
2667 : {
2668 : /* upgrade metapage if needed */
2669 37 : if (metad->btm_version < BTREE_NOVAC_VERSION)
2670 0 : _bt_upgrademetapage(metapg);
2671 37 : metad->btm_fastroot = rightsib;
2672 37 : metad->btm_fastlevel = targetlevel;
2673 37 : MarkBufferDirty(metabuf);
2674 : }
2675 :
2676 : /* Must mark buffers dirty before XLogInsert */
2677 3562 : MarkBufferDirty(rbuf);
2678 3562 : MarkBufferDirty(buf);
2679 3562 : if (BufferIsValid(lbuf))
2680 712 : MarkBufferDirty(lbuf);
2681 3562 : if (target != leafblkno)
2682 119 : MarkBufferDirty(leafbuf);
2683 :
2684 : /* XLOG stuff */
2685 3562 : if (RelationNeedsWAL(rel))
2686 3562 : {
2687 : xl_btree_unlink_page xlrec;
2688 : xl_btree_metadata xlmeta;
2689 : uint8 xlinfo;
2690 :
2691 3562 : XLogBeginInsert();
2692 :
2693 3562 : XLogRegisterBuffer(0, buf, REGBUF_WILL_INIT);
2694 3562 : if (BufferIsValid(lbuf))
2695 712 : XLogRegisterBuffer(1, lbuf, REGBUF_STANDARD);
2696 3562 : XLogRegisterBuffer(2, rbuf, REGBUF_STANDARD);
2697 3562 : if (target != leafblkno)
2698 119 : XLogRegisterBuffer(3, leafbuf, REGBUF_WILL_INIT);
2699 :
2700 : /* information stored on the target/to-be-unlinked block */
2701 3562 : xlrec.leftsib = leftsib;
2702 3562 : xlrec.rightsib = rightsib;
2703 3562 : xlrec.level = targetlevel;
2704 3562 : xlrec.safexid = safexid;
2705 :
2706 : /* information needed to recreate the leaf block (if not the target) */
2707 3562 : xlrec.leafleftsib = leafleftsib;
2708 3562 : xlrec.leafrightsib = leafrightsib;
2709 3562 : xlrec.leaftopparent = leaftopparent;
2710 :
2711 3562 : XLogRegisterData(&xlrec, SizeOfBtreeUnlinkPage);
2712 :
2713 3562 : if (BufferIsValid(metabuf))
2714 : {
2715 37 : XLogRegisterBuffer(4, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
2716 :
2717 : Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
2718 37 : xlmeta.version = metad->btm_version;
2719 37 : xlmeta.root = metad->btm_root;
2720 37 : xlmeta.level = metad->btm_level;
2721 37 : xlmeta.fastroot = metad->btm_fastroot;
2722 37 : xlmeta.fastlevel = metad->btm_fastlevel;
2723 37 : xlmeta.last_cleanup_num_delpages = metad->btm_last_cleanup_num_delpages;
2724 37 : xlmeta.allequalimage = metad->btm_allequalimage;
2725 :
2726 37 : XLogRegisterBufData(4, &xlmeta, sizeof(xl_btree_metadata));
2727 37 : xlinfo = XLOG_BTREE_UNLINK_PAGE_META;
2728 : }
2729 : else
2730 3525 : xlinfo = XLOG_BTREE_UNLINK_PAGE;
2731 :
2732 3562 : recptr = XLogInsert(RM_BTREE_ID, xlinfo);
2733 : }
2734 : else
2735 0 : recptr = XLogGetFakeLSN(rel);
2736 :
2737 3562 : if (BufferIsValid(metabuf))
2738 37 : PageSetLSN(metapg, recptr);
2739 3562 : page = BufferGetPage(rbuf);
2740 3562 : PageSetLSN(page, recptr);
2741 3562 : page = BufferGetPage(buf);
2742 3562 : PageSetLSN(page, recptr);
2743 3562 : if (BufferIsValid(lbuf))
2744 : {
2745 712 : page = BufferGetPage(lbuf);
2746 712 : PageSetLSN(page, recptr);
2747 : }
2748 3562 : if (target != leafblkno)
2749 : {
2750 119 : page = BufferGetPage(leafbuf);
2751 119 : PageSetLSN(page, recptr);
2752 : }
2753 :
2754 3562 : END_CRIT_SECTION();
2755 :
2756 : /* release metapage */
2757 3562 : if (BufferIsValid(metabuf))
2758 37 : _bt_relbuf(rel, metabuf);
2759 :
2760 : /* release siblings */
2761 3562 : if (BufferIsValid(lbuf))
2762 712 : _bt_relbuf(rel, lbuf);
2763 3562 : _bt_relbuf(rel, rbuf);
2764 :
2765 : /* If the target is not leafbuf, we're done with it now -- release it */
2766 3562 : if (target != leafblkno)
2767 119 : _bt_relbuf(rel, buf);
2768 :
2769 : /*
2770 : * Maintain pages_newly_deleted, which is simply the number of pages
2771 : * deleted by the ongoing VACUUM operation.
2772 : *
2773 : * Maintain pages_deleted in a way that takes into account how
2774 : * btvacuumpage() will count deleted pages that have yet to become
2775 : * scanblkno -- only count page when it's not going to get that treatment
2776 : * later on.
2777 : */
2778 3562 : stats->pages_newly_deleted++;
2779 3562 : if (target <= scanblkno)
2780 3458 : stats->pages_deleted++;
2781 :
2782 : /*
2783 : * Remember information about the target page (now a newly deleted page)
2784 : * in dedicated vstate space for later. The page will be considered as a
2785 : * candidate to place in the FSM at the end of the current btvacuumscan()
2786 : * call.
2787 : */
2788 3562 : _bt_pendingfsm_add(vstate, target, safexid);
2789 :
2790 : /* Success - hold on to lock on leafbuf (might also have been target) */
2791 3562 : return true;
2792 : }
2793 :
/*
 * Establish how tall the to-be-deleted subtree will be during the first stage
 * of page deletion.
 *
 * Caller's child argument is the block number of the page caller wants to
 * delete (this is leafbuf's block number, except when we're called
 * recursively).  stack is a search stack leading to it.  Note that we will
 * update the stack entry(s) to reflect current downlink positions --- this is
 * similar to the corresponding point in page split handling.
 *
 * If "first stage" caller cannot go ahead with deleting _any_ pages, returns
 * false.  Returns true on success, in which case caller can use certain
 * details established here to perform the first stage of deletion.  This
 * function is the last point at which page deletion may be deemed unsafe
 * (barring index corruption, or unexpected concurrent page deletions).
 *
 * We write lock the parent of the root of the to-be-deleted subtree for
 * caller on success (i.e. we leave our lock on the *subtreeparent buffer for
 * caller).  Caller will have to remove a downlink from *subtreeparent.  We
 * also set a *subtreeparent offset number in *poffset, to indicate the
 * location of the pivot tuple that contains the relevant downlink.
 *
 * The root of the to-be-deleted subtree is called the "top parent".  Note
 * that the leafbuf page is often the final "top parent" page (you can think
 * of the leafbuf page as a degenerate single page subtree when that happens).
 * Caller should initialize *topparent to the target leafbuf page block number
 * (while *topparentrightsib should be set to leafbuf's right sibling block
 * number).  We will update *topparent (and *topparentrightsib) for caller
 * here, though only when it turns out that caller will delete at least one
 * internal page (i.e. only when caller needs to store a valid link to the top
 * parent block in the leafbuf page using BTreeTupleSetTopParent()).
 */
static bool
_bt_lock_subtree_parent(Relation rel, Relation heaprel, BlockNumber child,
						BTStack stack, Buffer *subtreeparent,
						OffsetNumber *poffset, BlockNumber *topparent,
						BlockNumber *topparentrightsib)
{
	BlockNumber parent,
				leftsibparent;
	OffsetNumber parentoffset,
				maxoff;
	Buffer		pbuf;
	Page		page;
	BTPageOpaque opaque;

	/*
	 * Locate the pivot tuple whose downlink points to "child".  Write lock
	 * the parent page itself.
	 */
	pbuf = _bt_getstackbuf(rel, heaprel, stack, child);
	if (pbuf == InvalidBuffer)
	{
		/*
		 * Failed to "re-find" a pivot tuple whose downlink matched our child
		 * block number on the parent level -- the index must be corrupt.
		 * Don't even try to delete the leafbuf subtree.  Just report the
		 * issue and press on with vacuuming the index.
		 *
		 * Note: _bt_getstackbuf() recovers from concurrent page splits that
		 * take place on the parent level.  Its approach is a near-exhaustive
		 * linear search.  This also gives it a surprisingly good chance of
		 * recovering in the event of a buggy or inconsistent opclass.  But we
		 * don't rely on that here.
		 */
		ereport(LOG,
				(errcode(ERRCODE_INDEX_CORRUPTED),
				 errmsg_internal("failed to re-find parent key in index \"%s\" for deletion target page %u",
								 RelationGetRelationName(rel), child)));
		Assert(false);
		return false;
	}

	/* _bt_getstackbuf() refreshed the stack entry to match current state */
	parent = stack->bts_blkno;
	parentoffset = stack->bts_offset;

	page = BufferGetPage(pbuf);
	opaque = BTPageGetOpaque(page);
	maxoff = PageGetMaxOffsetNumber(page);
	/* remember left sibling now; we drop our lock on pbuf before using it */
	leftsibparent = opaque->btpo_prev;

	/*
	 * _bt_getstackbuf() completes page splits on returned parent buffer when
	 * required.
	 *
	 * In general it's a bad idea for VACUUM to use up more disk space, which
	 * is why page deletion does not finish incomplete page splits most of the
	 * time.  We allow this limited exception because the risk is much lower,
	 * and the potential downside of not proceeding is much higher: A single
	 * internal page with the INCOMPLETE_SPLIT flag set might otherwise
	 * prevent us from deleting hundreds of empty leaf pages from one level
	 * down.
	 */
	Assert(!P_INCOMPLETE_SPLIT(opaque));

	if (parentoffset < maxoff)
	{
		/*
		 * Child is not the rightmost child in parent, so it's safe to delete
		 * the subtree whose root/topparent is child page
		 */
		*subtreeparent = pbuf;
		*poffset = parentoffset;
		return true;
	}

	/*
	 * Child is the rightmost child of parent.
	 *
	 * Since it's the rightmost child of parent, deleting the child (or
	 * deleting the subtree whose root/topparent is the child page) is only
	 * safe when it's also possible to delete the parent.
	 */
	Assert(parentoffset == maxoff);
	if (parentoffset != P_FIRSTDATAKEY(opaque) || P_RIGHTMOST(opaque))
	{
		/*
		 * Child isn't parent's only child, or parent is rightmost on its
		 * entire level.  Definitely cannot delete any pages.
		 *
		 * (The P_RIGHTMOST test also bounds the recursion below: the true
		 * root is rightmost on its level, so we can never recurse past it.)
		 */
		_bt_relbuf(rel, pbuf);
		return false;
	}

	/*
	 * Now make sure that the parent deletion is itself safe by examining the
	 * child's grandparent page.  Recurse, passing the parent page as the
	 * child page (child's grandparent is the parent on the next level up). If
	 * parent deletion is unsafe, then child deletion must also be unsafe (in
	 * which case caller cannot delete any pages at all).
	 */
	*topparent = parent;
	*topparentrightsib = opaque->btpo_next;

	/*
	 * Release lock on parent before recursing.
	 *
	 * It's OK to release page locks on parent before recursive call locks
	 * grandparent.  An internal page can only acquire an entry if the child
	 * is split, but that cannot happen as long as we still hold a lock on the
	 * leafbuf page.
	 */
	_bt_relbuf(rel, pbuf);

	/*
	 * Before recursing, check that the left sibling of parent (if any) is not
	 * marked with INCOMPLETE_SPLIT flag first (must do so after we drop the
	 * parent lock).
	 *
	 * Note: We deliberately avoid completing incomplete splits here.
	 */
	if (_bt_leftsib_splitflag(rel, leftsibparent, parent))
		return false;

	/* Recurse to examine child page's grandparent page */
	return _bt_lock_subtree_parent(rel, heaprel, parent, stack->bts_parent,
								   subtreeparent, poffset,
								   topparent, topparentrightsib);
}
2953 :
2954 : /*
2955 : * Initialize local memory state used by VACUUM for _bt_pendingfsm_finalize
2956 : * optimization.
2957 : *
2958 : * Called at the start of a btvacuumscan(). Caller's cleanuponly argument
2959 : * indicates if ongoing VACUUM has not (and will not) call btbulkdelete().
2960 : *
2961 : * We expect to allocate memory inside VACUUM's top-level memory context here.
2962 : * The working buffer is subject to a limit based on work_mem. Our strategy
2963 : * when the array can no longer grow within the bounds of that limit is to
2964 : * stop saving additional newly deleted pages, while proceeding as usual with
2965 : * the pages that we can fit.
2966 : */
2967 : void
2968 1806 : _bt_pendingfsm_init(Relation rel, BTVacState *vstate, bool cleanuponly)
2969 : {
2970 : Size maxbufsize;
2971 :
2972 : /*
2973 : * Don't bother with optimization in cleanup-only case -- we don't expect
2974 : * any newly deleted pages. Besides, cleanup-only calls to btvacuumscan()
2975 : * can only take place because this optimization didn't work out during
2976 : * the last VACUUM.
2977 : */
2978 1806 : if (cleanuponly)
2979 7 : return;
2980 :
2981 : /*
2982 : * Cap maximum size of array so that we always respect work_mem. Avoid
2983 : * int overflow here.
2984 : */
2985 1799 : vstate->bufsize = 256;
2986 1799 : maxbufsize = (work_mem * (Size) 1024) / sizeof(BTPendingFSM);
2987 1799 : maxbufsize = Min(maxbufsize, MaxAllocSize / sizeof(BTPendingFSM));
2988 : /* BTVacState.maxbufsize has type int */
2989 1799 : maxbufsize = Min(maxbufsize, INT_MAX);
2990 : /* Stay sane with small work_mem */
2991 1799 : maxbufsize = Max(maxbufsize, vstate->bufsize);
2992 1799 : vstate->maxbufsize = (int) maxbufsize;
2993 :
2994 : /* Allocate buffer, indicate that there are currently 0 pending pages */
2995 1799 : vstate->pendingpages = palloc_array(BTPendingFSM, vstate->bufsize);
2996 1799 : vstate->npendingpages = 0;
2997 : }
2998 :
2999 : /*
3000 : * Place any newly deleted pages (i.e. pages that _bt_pagedel() deleted during
3001 : * the ongoing VACUUM operation) into the free space map -- though only when
3002 : * it is actually safe to do so by now.
3003 : *
3004 : * Called at the end of a btvacuumscan(), just before free space map vacuuming
3005 : * takes place.
3006 : *
3007 : * Frees memory allocated by _bt_pendingfsm_init(), if any.
3008 : */
3009 : void
3010 1806 : _bt_pendingfsm_finalize(Relation rel, BTVacState *vstate)
3011 : {
3012 1806 : IndexBulkDeleteResult *stats = vstate->stats;
3013 1806 : Relation heaprel = vstate->info->heaprel;
3014 :
3015 : Assert(stats->pages_newly_deleted >= vstate->npendingpages);
3016 : Assert(heaprel != NULL);
3017 :
3018 1806 : if (vstate->npendingpages == 0)
3019 : {
3020 : /* Just free memory when nothing to do */
3021 1713 : if (vstate->pendingpages)
3022 1706 : pfree(vstate->pendingpages);
3023 :
3024 1713 : return;
3025 : }
3026 :
3027 : #ifdef DEBUG_BTREE_PENDING_FSM
3028 :
3029 : /*
3030 : * Debugging aid: Sleep for 5 seconds to greatly increase the chances of
3031 : * placing pending pages in the FSM. Note that the optimization will
3032 : * never be effective without some other backend concurrently consuming an
3033 : * XID.
3034 : */
3035 : pg_usleep(5000000L);
3036 : #endif
3037 :
3038 : /*
3039 : * Recompute VACUUM XID boundaries.
3040 : *
3041 : * We don't actually care about the oldest non-removable XID. Computing
3042 : * the oldest such XID has a useful side-effect that we rely on: it
3043 : * forcibly updates the XID horizon state for this backend. This step is
3044 : * essential; GlobalVisCheckRemovableFullXid() will not reliably recognize
3045 : * that it is now safe to recycle newly deleted pages without this step.
3046 : */
3047 93 : GetOldestNonRemovableTransactionId(heaprel);
3048 :
3049 117 : for (int i = 0; i < vstate->npendingpages; i++)
3050 : {
3051 114 : BlockNumber target = vstate->pendingpages[i].target;
3052 114 : FullTransactionId safexid = vstate->pendingpages[i].safexid;
3053 :
3054 : /*
3055 : * Do the equivalent of checking BTPageIsRecyclable(), but without
3056 : * accessing the page again a second time.
3057 : *
3058 : * Give up on finding the first non-recyclable page -- all later pages
3059 : * must be non-recyclable too, since _bt_pendingfsm_add() adds pages
3060 : * to the array in safexid order.
3061 : */
3062 114 : if (!GlobalVisCheckRemovableFullXid(heaprel, safexid))
3063 90 : break;
3064 :
3065 24 : RecordFreeIndexPage(rel, target);
3066 24 : stats->pages_free++;
3067 : }
3068 :
3069 93 : pfree(vstate->pendingpages);
3070 : }
3071 :
3072 : /*
3073 : * Maintain array of pages that were deleted during current btvacuumscan()
3074 : * call, for use in _bt_pendingfsm_finalize()
3075 : */
3076 : static void
3077 3562 : _bt_pendingfsm_add(BTVacState *vstate,
3078 : BlockNumber target,
3079 : FullTransactionId safexid)
3080 : {
3081 : Assert(vstate->npendingpages <= vstate->bufsize);
3082 : Assert(vstate->bufsize <= vstate->maxbufsize);
3083 :
3084 : #ifdef USE_ASSERT_CHECKING
3085 :
3086 : /*
3087 : * Verify an assumption made by _bt_pendingfsm_finalize(): pages from the
3088 : * array will always be in safexid order (since that is the order that we
3089 : * save them in here)
3090 : */
3091 : if (vstate->npendingpages > 0)
3092 : {
3093 : FullTransactionId lastsafexid =
3094 : vstate->pendingpages[vstate->npendingpages - 1].safexid;
3095 :
3096 : Assert(FullTransactionIdFollowsOrEquals(safexid, lastsafexid));
3097 : }
3098 : #endif
3099 :
3100 : /*
3101 : * If temp buffer reaches maxbufsize/work_mem capacity then we discard
3102 : * information about this page.
3103 : *
3104 : * Note that this also covers the case where we opted to not use the
3105 : * optimization in _bt_pendingfsm_init().
3106 : */
3107 3562 : if (vstate->npendingpages == vstate->maxbufsize)
3108 0 : return;
3109 :
3110 : /* Consider enlarging buffer */
3111 3562 : if (vstate->npendingpages == vstate->bufsize)
3112 : {
3113 5 : int newbufsize = vstate->bufsize * 2;
3114 :
3115 : /* Respect work_mem */
3116 5 : if (newbufsize > vstate->maxbufsize)
3117 0 : newbufsize = vstate->maxbufsize;
3118 :
3119 5 : vstate->bufsize = newbufsize;
3120 5 : vstate->pendingpages =
3121 5 : repalloc(vstate->pendingpages,
3122 5 : sizeof(BTPendingFSM) * vstate->bufsize);
3123 : }
3124 :
3125 : /* Save metadata for newly deleted page */
3126 3562 : vstate->pendingpages[vstate->npendingpages].target = target;
3127 3562 : vstate->pendingpages[vstate->npendingpages].safexid = safexid;
3128 3562 : vstate->npendingpages++;
3129 : }
|