LCOV - code coverage report
Current view: top level - src/backend/access/nbtree - nbtpage.c (source / functions)
Test: PostgreSQL 18devel
Date: 2024-11-21 09:14:53
Coverage:     Lines: 740 of 832 hit (88.9 %)
          Functions:  32 of  33 hit (97.0 %)

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * nbtpage.c
       4             :  *    BTree-specific page management code for the Postgres btree access
       5             :  *    method.
       6             :  *
       7             :  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
       8             :  * Portions Copyright (c) 1994, Regents of the University of California
       9             :  *
      10             :  *
      11             :  * IDENTIFICATION
      12             :  *    src/backend/access/nbtree/nbtpage.c
      13             :  *
      14             :  *  NOTES
      15             :  *     Postgres btree pages look like ordinary relation pages.  The opaque
      16             :  *     data at high addresses includes pointers to left and right siblings
      17             :  *     and flag data describing page state.  The first page in a btree, page
      18             :  *     zero, is special -- it stores meta-information describing the tree.
      19             :  *     Pages one and higher store the actual tree data.
      20             :  *
      21             :  *-------------------------------------------------------------------------
      22             :  */
      23             : #include "postgres.h"
      24             : 
      25             : #include "access/nbtree.h"
      26             : #include "access/nbtxlog.h"
      27             : #include "access/tableam.h"
      28             : #include "access/transam.h"
      29             : #include "access/xlog.h"
      30             : #include "access/xloginsert.h"
      31             : #include "common/int.h"
      32             : #include "miscadmin.h"
      33             : #include "storage/indexfsm.h"
      34             : #include "storage/predicate.h"
      35             : #include "storage/procarray.h"
      36             : #include "utils/memdebug.h"
      37             : #include "utils/memutils.h"
      38             : #include "utils/snapmgr.h"
      39             : 
      40             : static BTMetaPageData *_bt_getmeta(Relation rel, Buffer metabuf);
      41             : static void _bt_delitems_delete(Relation rel, Buffer buf,
      42             :                                 TransactionId snapshotConflictHorizon,
      43             :                                 bool isCatalogRel,
      44             :                                 OffsetNumber *deletable, int ndeletable,
      45             :                                 BTVacuumPosting *updatable, int nupdatable);
      46             : static char *_bt_delitems_update(BTVacuumPosting *updatable, int nupdatable,
      47             :                                  OffsetNumber *updatedoffsets,
      48             :                                  Size *updatedbuflen, bool needswal);
      49             : static bool _bt_mark_page_halfdead(Relation rel, Relation heaprel,
      50             :                                    Buffer leafbuf, BTStack stack);
      51             : static bool _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf,
      52             :                                      BlockNumber scanblkno,
      53             :                                      bool *rightsib_empty,
      54             :                                      BTVacState *vstate);
      55             : static bool _bt_lock_subtree_parent(Relation rel, Relation heaprel,
      56             :                                     BlockNumber child, BTStack stack,
      57             :                                     Buffer *subtreeparent, OffsetNumber *poffset,
      58             :                                     BlockNumber *topparent,
      59             :                                     BlockNumber *topparentrightsib);
      60             : static void _bt_pendingfsm_add(BTVacState *vstate, BlockNumber target,
      61             :                                FullTransactionId safexid);
      62             : 
      63             : /*
      64             :  *  _bt_initmetapage() -- Fill a page buffer with a correct metapage image
      65             :  */
      66             : void
      67       47524 : _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level,
      68             :                  bool allequalimage)
      69             : {
      70             :     BTMetaPageData *metad;
      71             :     BTPageOpaque metaopaque;
      72             : 
      73       47524 :     _bt_pageinit(page, BLCKSZ);
      74             : 
      75       47524 :     metad = BTPageGetMeta(page);
      76       47524 :     metad->btm_magic = BTREE_MAGIC;
      77       47524 :     metad->btm_version = BTREE_VERSION;
      78       47524 :     metad->btm_root = rootbknum;
      79       47524 :     metad->btm_level = level;
      80       47524 :     metad->btm_fastroot = rootbknum;
      81       47524 :     metad->btm_fastlevel = level;
      82       47524 :     metad->btm_last_cleanup_num_delpages = 0;
      83       47524 :     metad->btm_last_cleanup_num_heap_tuples = -1.0;
      84       47524 :     metad->btm_allequalimage = allequalimage;
      85             : 
      86       47524 :     metaopaque = BTPageGetOpaque(page);
      87       47524 :     metaopaque->btpo_flags = BTP_META;
      88             : 
      89             :     /*
      90             :      * Set pd_lower just past the end of the metadata.  This is essential,
      91             :      * because without doing so, metadata will be lost if xlog.c compresses
      92             :      * the page.
      93             :      */
      94       47524 :     ((PageHeader) page)->pd_lower =
      95       47524 :         ((char *) metad + sizeof(BTMetaPageData)) - (char *) page;
      96       47524 : }
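                      : 
                      : /*
                      :  * Illustration (a hedged sketch, not verbatim from any caller):
                      :  * _bt_initmetapage() only fills an in-memory page image, so a caller
                      :  * such as btbuildempty(), which creates an empty index whose metapage
                      :  * has no root yet, initializes the image and then writes and WAL-logs
                      :  * it itself, roughly:
                      :  *
                      :  *     Page        metapage = (Page) palloc(BLCKSZ);
                      :  *
                      :  *     _bt_initmetapage(metapage, P_NONE, 0, allequalimage);
                      :  *     ... write the image to block BTREE_METAPAGE and WAL-log it ...
                      :  */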
      97             : 
      98             : /*
      99             :  *  _bt_upgrademetapage() -- Upgrade a meta-page from an old format to version
     100             :  *      3, the last version that can be updated without broadly affecting
     101             :  *      on-disk compatibility.  (A REINDEX is required to upgrade to v4.)
     102             :  *
     103             :  *      This routine does purely in-memory image upgrade.  Caller is
     104             :  *      responsible for locking, WAL-logging etc.
     105             :  */
     106             : void
     107           0 : _bt_upgrademetapage(Page page)
     108             : {
     109             :     BTMetaPageData *metad;
     110             :     BTPageOpaque metaopaque PG_USED_FOR_ASSERTS_ONLY;
     111             : 
     112           0 :     metad = BTPageGetMeta(page);
     113           0 :     metaopaque = BTPageGetOpaque(page);
     114             : 
      115             :     /* It really must be a meta page of an upgradable version */
     116             :     Assert(metaopaque->btpo_flags & BTP_META);
     117             :     Assert(metad->btm_version < BTREE_NOVAC_VERSION);
     118             :     Assert(metad->btm_version >= BTREE_MIN_VERSION);
     119             : 
     120             :     /* Set version number and fill extra fields added into version 3 */
     121           0 :     metad->btm_version = BTREE_NOVAC_VERSION;
     122           0 :     metad->btm_last_cleanup_num_delpages = 0;
     123           0 :     metad->btm_last_cleanup_num_heap_tuples = -1.0;
     124             :     /* Only a REINDEX can set this field */
     125             :     Assert(!metad->btm_allequalimage);
     126           0 :     metad->btm_allequalimage = false;
     127             : 
     128             :     /* Adjust pd_lower (see _bt_initmetapage() for details) */
     129           0 :     ((PageHeader) page)->pd_lower =
     130           0 :         ((char *) metad + sizeof(BTMetaPageData)) - (char *) page;
     131           0 : }
     132             : 
     133             : /*
     134             :  * Get metadata from share-locked buffer containing metapage, while performing
     135             :  * standard sanity checks.
     136             :  *
      137             :  * Callers that cache the data returned here locally should note that an
      138             :  * on-the-fly upgrade using _bt_upgrademetapage() can change the version field
      139             :  * and BTREE_NOVAC_VERSION-specific fields without invalidating that cache.
     140             :  */
     141             : static BTMetaPageData *
     142     1667176 : _bt_getmeta(Relation rel, Buffer metabuf)
     143             : {
     144             :     Page        metapg;
     145             :     BTPageOpaque metaopaque;
     146             :     BTMetaPageData *metad;
     147             : 
     148     1667176 :     metapg = BufferGetPage(metabuf);
     149     1667176 :     metaopaque = BTPageGetOpaque(metapg);
     150     1667176 :     metad = BTPageGetMeta(metapg);
     151             : 
     152             :     /* sanity-check the metapage */
     153     1667176 :     if (!P_ISMETA(metaopaque) ||
     154     1667176 :         metad->btm_magic != BTREE_MAGIC)
     155           0 :         ereport(ERROR,
     156             :                 (errcode(ERRCODE_INDEX_CORRUPTED),
     157             :                  errmsg("index \"%s\" is not a btree",
     158             :                         RelationGetRelationName(rel))));
     159             : 
     160     1667176 :     if (metad->btm_version < BTREE_MIN_VERSION ||
     161     1667176 :         metad->btm_version > BTREE_VERSION)
     162           0 :         ereport(ERROR,
     163             :                 (errcode(ERRCODE_INDEX_CORRUPTED),
     164             :                  errmsg("version mismatch in index \"%s\": file version %d, "
     165             :                         "current version %d, minimal supported version %d",
     166             :                         RelationGetRelationName(rel),
     167             :                         metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION)));
     168             : 
     169     1667176 :     return metad;
     170             : }
     171             : 
     172             : /*
     173             :  * _bt_vacuum_needs_cleanup() -- Checks if index needs cleanup
     174             :  *
     175             :  * Called by btvacuumcleanup when btbulkdelete was never called because no
     176             :  * index tuples needed to be deleted.
     177             :  */
     178             : bool
     179      118422 : _bt_vacuum_needs_cleanup(Relation rel)
     180             : {
     181             :     Buffer      metabuf;
     182             :     Page        metapg;
     183             :     BTMetaPageData *metad;
     184             :     uint32      btm_version;
     185             :     BlockNumber prev_num_delpages;
     186             : 
     187             :     /*
     188             :      * Copy details from metapage to local variables quickly.
     189             :      *
      190             :      * Note that we deliberately avoid using the cached metapage here.
     191             :      */
     192      118422 :     metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
     193      118422 :     metapg = BufferGetPage(metabuf);
     194      118422 :     metad = BTPageGetMeta(metapg);
     195      118422 :     btm_version = metad->btm_version;
     196             : 
     197      118422 :     if (btm_version < BTREE_NOVAC_VERSION)
     198             :     {
     199             :         /*
     200             :          * Metapage needs to be dynamically upgraded to store fields that are
     201             :          * only present when btm_version >= BTREE_NOVAC_VERSION
     202             :          */
     203           0 :         _bt_relbuf(rel, metabuf);
     204           0 :         return true;
     205             :     }
     206             : 
     207      118422 :     prev_num_delpages = metad->btm_last_cleanup_num_delpages;
     208      118422 :     _bt_relbuf(rel, metabuf);
     209             : 
     210             :     /*
     211             :      * Trigger cleanup in rare cases where prev_num_delpages exceeds 5% of the
     212             :      * total size of the index.  We can reasonably expect (though are not
     213             :      * guaranteed) to be able to recycle this many pages if we decide to do a
     214             :      * btvacuumscan call during the ongoing btvacuumcleanup.  For further
     215             :      * details see the nbtree/README section on placing deleted pages in the
     216             :      * FSM.
     217             :      */
     218      118422 :     if (prev_num_delpages > 0 &&
     219          14 :         prev_num_delpages > RelationGetNumberOfBlocks(rel) / 20)
     220          14 :         return true;
     221             : 
     222      118408 :     return false;
     223             : }
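                      : 
                      : /*
                      :  * Worked example of the 5% trigger above (numbers are illustrative):
                      :  * for an index of 4,000 blocks, RelationGetNumberOfBlocks(rel) / 20
                      :  * is 200, so a call like
                      :  *
                      :  *     _bt_vacuum_needs_cleanup(rel)
                      :  *
                      :  * only returns true once the previous btvacuumcleanup recorded more
                      :  * than 200 deleted-but-unrecycled pages.
                      :  */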
     224             : 
     225             : /*
     226             :  * _bt_set_cleanup_info() -- Update metapage for btvacuumcleanup.
     227             :  *
     228             :  * Called at the end of btvacuumcleanup, when num_delpages value has been
     229             :  * finalized.
     230             :  */
     231             : void
     232        1810 : _bt_set_cleanup_info(Relation rel, BlockNumber num_delpages)
     233             : {
     234             :     Buffer      metabuf;
     235             :     Page        metapg;
     236             :     BTMetaPageData *metad;
     237             : 
     238             :     /*
     239             :      * On-disk compatibility note: The btm_last_cleanup_num_delpages metapage
     240             :      * field started out as a TransactionId field called btm_oldest_btpo_xact.
     241             :      * Both "versions" are just uint32 fields.  It was convenient to repurpose
     242             :      * the field when we began to use 64-bit XIDs in deleted pages.
     243             :      *
     244             :      * It's possible that a pg_upgrade'd database will contain an XID value in
     245             :      * what is now recognized as the metapage's btm_last_cleanup_num_delpages
     246             :      * field.  _bt_vacuum_needs_cleanup() may even believe that this value
     247             :      * indicates that there are lots of pages that it needs to recycle, when
     248             :      * in reality there are only one or two.  The worst that can happen is
     249             :      * that there will be a call to btvacuumscan a little earlier, which will
     250             :      * set btm_last_cleanup_num_delpages to a sane value when we're called.
     251             :      *
     252             :      * Note also that the metapage's btm_last_cleanup_num_heap_tuples field is
     253             :      * no longer used as of PostgreSQL 14.  We set it to -1.0 on rewrite, just
     254             :      * to be consistent.
     255             :      */
     256        1810 :     metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
     257        1810 :     metapg = BufferGetPage(metabuf);
     258        1810 :     metad = BTPageGetMeta(metapg);
     259             : 
      260             :     /* Don't miss the chance to upgrade a BTREE_MIN_VERSION index/metapage */
     261        1810 :     if (metad->btm_version >= BTREE_NOVAC_VERSION &&
     262        1810 :         metad->btm_last_cleanup_num_delpages == num_delpages)
     263             :     {
     264             :         /* Usually means index continues to have num_delpages of 0 */
     265        1696 :         _bt_relbuf(rel, metabuf);
     266        1696 :         return;
     267             :     }
     268             : 
     269             :     /* trade in our read lock for a write lock */
     270         114 :     _bt_unlockbuf(rel, metabuf);
     271         114 :     _bt_lockbuf(rel, metabuf, BT_WRITE);
     272             : 
     273         114 :     START_CRIT_SECTION();
     274             : 
     275             :     /* upgrade meta-page if needed */
     276         114 :     if (metad->btm_version < BTREE_NOVAC_VERSION)
     277           0 :         _bt_upgrademetapage(metapg);
     278             : 
     279             :     /* update cleanup-related information */
     280         114 :     metad->btm_last_cleanup_num_delpages = num_delpages;
     281         114 :     metad->btm_last_cleanup_num_heap_tuples = -1.0;
     282         114 :     MarkBufferDirty(metabuf);
     283             : 
     284             :     /* write wal record if needed */
     285         114 :     if (RelationNeedsWAL(rel))
     286             :     {
     287             :         xl_btree_metadata md;
     288             :         XLogRecPtr  recptr;
     289             : 
     290         114 :         XLogBeginInsert();
     291         114 :         XLogRegisterBuffer(0, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
     292             : 
     293             :         Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
     294         114 :         md.version = metad->btm_version;
     295         114 :         md.root = metad->btm_root;
     296         114 :         md.level = metad->btm_level;
     297         114 :         md.fastroot = metad->btm_fastroot;
     298         114 :         md.fastlevel = metad->btm_fastlevel;
     299         114 :         md.last_cleanup_num_delpages = num_delpages;
     300         114 :         md.allequalimage = metad->btm_allequalimage;
     301             : 
     302         114 :         XLogRegisterBufData(0, (char *) &md, sizeof(xl_btree_metadata));
     303             : 
     304         114 :         recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_META_CLEANUP);
     305             : 
     306         114 :         PageSetLSN(metapg, recptr);
     307             :     }
     308             : 
     309         114 :     END_CRIT_SECTION();
     310             : 
     311         114 :     _bt_relbuf(rel, metabuf);
     312             : }
     313             : 
     314             : /*
     315             :  *  _bt_getroot() -- Get the root page of the btree.
     316             :  *
     317             :  *      Since the root page can move around the btree file, we have to read
     318             :  *      its location from the metadata page, and then read the root page
     319             :  *      itself.  If no root page exists yet, we have to create one.
     320             :  *
     321             :  *      The access type parameter (BT_READ or BT_WRITE) controls whether
     322             :  *      a new root page will be created or not.  If access = BT_READ,
     323             :  *      and no root page exists, we just return InvalidBuffer.  For
     324             :  *      BT_WRITE, we try to create the root page if it doesn't exist.
     325             :  *      NOTE that the returned root page will have only a read lock set
     326             :  *      on it even if access = BT_WRITE!
     327             :  *
     328             :  *      If access = BT_WRITE, heaprel must be set; otherwise caller can just
     329             :  *      pass NULL.  See _bt_allocbuf for an explanation.
     330             :  *
     331             :  *      The returned page is not necessarily the true root --- it could be
     332             :  *      a "fast root" (a page that is alone in its level due to deletions).
     333             :  *      Also, if the root page is split while we are "in flight" to it,
     334             :  *      what we will return is the old root, which is now just the leftmost
     335             :  *      page on a probably-not-very-wide level.  For most purposes this is
     336             :  *      as good as or better than the true root, so we do not bother to
     337             :  *      insist on finding the true root.  We do, however, guarantee to
     338             :  *      return a live (not deleted or half-dead) page.
     339             :  *
     340             :  *      On successful return, the root page is pinned and read-locked.
     341             :  *      The metadata page is not locked or pinned on exit.
     342             :  */
     343             : Buffer
     344    19985794 : _bt_getroot(Relation rel, Relation heaprel, int access)
     345             : {
     346             :     Buffer      metabuf;
     347             :     Buffer      rootbuf;
     348             :     Page        rootpage;
     349             :     BTPageOpaque rootopaque;
     350             :     BlockNumber rootblkno;
     351             :     uint32      rootlevel;
     352             :     BTMetaPageData *metad;
     353             : 
     354             :     Assert(access == BT_READ || heaprel != NULL);
     355             : 
     356             :     /*
     357             :      * Try to use previously-cached metapage data to find the root.  This
     358             :      * normally saves one buffer access per index search, which is a very
     359             :      * helpful savings in bufmgr traffic and hence contention.
     360             :      */
     361    19985794 :     if (rel->rd_amcache != NULL)
     362             :     {
     363    19460866 :         metad = (BTMetaPageData *) rel->rd_amcache;
     364             :         /* We shouldn't have cached it if any of these fail */
     365             :         Assert(metad->btm_magic == BTREE_MAGIC);
     366             :         Assert(metad->btm_version >= BTREE_MIN_VERSION);
     367             :         Assert(metad->btm_version <= BTREE_VERSION);
     368             :         Assert(!metad->btm_allequalimage ||
     369             :                metad->btm_version > BTREE_NOVAC_VERSION);
     370             :         Assert(metad->btm_root != P_NONE);
     371             : 
     372    19460866 :         rootblkno = metad->btm_fastroot;
     373             :         Assert(rootblkno != P_NONE);
     374    19460866 :         rootlevel = metad->btm_fastlevel;
     375             : 
     376    19460866 :         rootbuf = _bt_getbuf(rel, rootblkno, BT_READ);
     377    19460866 :         rootpage = BufferGetPage(rootbuf);
     378    19460866 :         rootopaque = BTPageGetOpaque(rootpage);
     379             : 
     380             :         /*
     381             :          * Since the cache might be stale, we check the page more carefully
     382             :          * here than normal.  We *must* check that it's not deleted. If it's
     383             :          * not alone on its level, then we reject too --- this may be overly
     384             :          * paranoid but better safe than sorry.  Note we don't check P_ISROOT,
     385             :          * because that's not set in a "fast root".
     386             :          */
     387    19460866 :         if (!P_IGNORE(rootopaque) &&
     388    19460866 :             rootopaque->btpo_level == rootlevel &&
     389    19460866 :             P_LEFTMOST(rootopaque) &&
     390    19460866 :             P_RIGHTMOST(rootopaque))
     391             :         {
     392             :             /* OK, accept cached page as the root */
     393    19459418 :             return rootbuf;
     394             :         }
     395        1448 :         _bt_relbuf(rel, rootbuf);
     396             :         /* Cache is stale, throw it away */
     397        1448 :         if (rel->rd_amcache)
     398        1448 :             pfree(rel->rd_amcache);
     399        1448 :         rel->rd_amcache = NULL;
     400             :     }
     401             : 
     402      526376 :     metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
     403      526376 :     metad = _bt_getmeta(rel, metabuf);
     404             : 
     405             :     /* if no root page initialized yet, do it */
     406      526376 :     if (metad->btm_root == P_NONE)
     407             :     {
     408             :         Page        metapg;
     409             : 
     410             :         /* If access = BT_READ, caller doesn't want us to create root yet */
     411      524448 :         if (access == BT_READ)
     412             :         {
     413      512962 :             _bt_relbuf(rel, metabuf);
     414      512962 :             return InvalidBuffer;
     415             :         }
     416             : 
     417             :         /* trade in our read lock for a write lock */
     418       11486 :         _bt_unlockbuf(rel, metabuf);
     419       11486 :         _bt_lockbuf(rel, metabuf, BT_WRITE);
     420             : 
     421             :         /*
     422             :          * Race condition:  if someone else initialized the metadata between
     423             :          * the time we released the read lock and acquired the write lock, we
     424             :          * must avoid doing it again.
     425             :          */
     426       11486 :         if (metad->btm_root != P_NONE)
     427             :         {
     428             :             /*
     429             :              * Metadata initialized by someone else.  In order to guarantee no
     430             :              * deadlocks, we have to release the metadata page and start all
     431             :              * over again.  (Is that really true? But it's hardly worth trying
     432             :              * to optimize this case.)
     433             :              */
     434           0 :             _bt_relbuf(rel, metabuf);
     435           0 :             return _bt_getroot(rel, heaprel, access);
     436             :         }
     437             : 
     438             :         /*
     439             :          * Get, initialize, write, and leave a lock of the appropriate type on
     440             :          * the new root page.  Since this is the first page in the tree, it's
     441             :          * a leaf as well as the root.
     442             :          */
     443       11486 :         rootbuf = _bt_allocbuf(rel, heaprel);
     444       11486 :         rootblkno = BufferGetBlockNumber(rootbuf);
     445       11486 :         rootpage = BufferGetPage(rootbuf);
     446       11486 :         rootopaque = BTPageGetOpaque(rootpage);
     447       11486 :         rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
     448       11486 :         rootopaque->btpo_flags = (BTP_LEAF | BTP_ROOT);
     449       11486 :         rootopaque->btpo_level = 0;
     450       11486 :         rootopaque->btpo_cycleid = 0;
     451             :         /* Get raw page pointer for metapage */
     452       11486 :         metapg = BufferGetPage(metabuf);
     453             : 
     454             :         /* NO ELOG(ERROR) till meta is updated */
     455       11486 :         START_CRIT_SECTION();
     456             : 
     457             :         /* upgrade metapage if needed */
     458       11486 :         if (metad->btm_version < BTREE_NOVAC_VERSION)
     459           0 :             _bt_upgrademetapage(metapg);
     460             : 
     461       11486 :         metad->btm_root = rootblkno;
     462       11486 :         metad->btm_level = 0;
     463       11486 :         metad->btm_fastroot = rootblkno;
     464       11486 :         metad->btm_fastlevel = 0;
     465       11486 :         metad->btm_last_cleanup_num_delpages = 0;
     466       11486 :         metad->btm_last_cleanup_num_heap_tuples = -1.0;
     467             : 
     468       11486 :         MarkBufferDirty(rootbuf);
     469       11486 :         MarkBufferDirty(metabuf);
     470             : 
     471             :         /* XLOG stuff */
     472       11486 :         if (RelationNeedsWAL(rel))
     473             :         {
     474             :             xl_btree_newroot xlrec;
     475             :             XLogRecPtr  recptr;
     476             :             xl_btree_metadata md;
     477             : 
     478       11028 :             XLogBeginInsert();
     479       11028 :             XLogRegisterBuffer(0, rootbuf, REGBUF_WILL_INIT);
     480       11028 :             XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
     481             : 
     482             :             Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
     483       11028 :             md.version = metad->btm_version;
     484       11028 :             md.root = rootblkno;
     485       11028 :             md.level = 0;
     486       11028 :             md.fastroot = rootblkno;
     487       11028 :             md.fastlevel = 0;
     488       11028 :             md.last_cleanup_num_delpages = 0;
     489       11028 :             md.allequalimage = metad->btm_allequalimage;
     490             : 
     491       11028 :             XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata));
     492             : 
     493       11028 :             xlrec.rootblk = rootblkno;
     494       11028 :             xlrec.level = 0;
     495             : 
     496       11028 :             XLogRegisterData((char *) &xlrec, SizeOfBtreeNewroot);
     497             : 
     498       11028 :             recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT);
     499             : 
     500       11028 :             PageSetLSN(rootpage, recptr);
     501       11028 :             PageSetLSN(metapg, recptr);
     502             :         }
     503             : 
     504       11486 :         END_CRIT_SECTION();
     505             : 
     506             :         /*
     507             :          * swap root write lock for read lock.  There is no danger of anyone
     508             :          * else accessing the new root page while it's unlocked, since no one
     509             :          * else knows where it is yet.
     510             :          */
     511       11486 :         _bt_unlockbuf(rel, rootbuf);
     512       11486 :         _bt_lockbuf(rel, rootbuf, BT_READ);
     513             : 
     514             :         /* okay, metadata is correct, release lock on it without caching */
     515       11486 :         _bt_relbuf(rel, metabuf);
     516             :     }
     517             :     else
     518             :     {
     519        1928 :         rootblkno = metad->btm_fastroot;
     520             :         Assert(rootblkno != P_NONE);
     521        1928 :         rootlevel = metad->btm_fastlevel;
     522             : 
     523             :         /*
     524             :          * Cache the metapage data for next time
     525             :          */
     526        1928 :         rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
     527             :                                              sizeof(BTMetaPageData));
     528        1928 :         memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
     529             : 
     530             :         /*
     531             :          * We are done with the metapage; arrange to release it via first
     532             :          * _bt_relandgetbuf call
     533             :          */
     534        1928 :         rootbuf = metabuf;
     535             : 
     536             :         for (;;)
     537             :         {
     538        1928 :             rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);
     539        1928 :             rootpage = BufferGetPage(rootbuf);
     540        1928 :             rootopaque = BTPageGetOpaque(rootpage);
     541             : 
     542        1928 :             if (!P_IGNORE(rootopaque))
     543        1928 :                 break;
     544             : 
     545             :             /* it's dead, Jim.  step right one page */
     546           0 :             if (P_RIGHTMOST(rootopaque))
     547           0 :                 elog(ERROR, "no live root page found in index \"%s\"",
     548             :                      RelationGetRelationName(rel));
     549           0 :             rootblkno = rootopaque->btpo_next;
     550             :         }
     551             : 
     552        1928 :         if (rootopaque->btpo_level != rootlevel)
     553           0 :             elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
     554             :                  rootblkno, RelationGetRelationName(rel),
     555             :                  rootopaque->btpo_level, rootlevel);
     556             :     }
     557             : 
     558             :     /*
     559             :      * By here, we have a pin and read lock on the root page, and no lock set
     560             :      * on the metadata page.  Return the root page's buffer.
     561             :      */
     562       13414 :     return rootbuf;
     563             : }
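                      : 
                      : /*
                      :  * Caller-side sketch (modeled loosely on _bt_search(); hedged, not
                      :  * verbatim): BT_READ callers must be prepared for an empty index,
                      :  * since no root page is created in that case:
                      :  *
                      :  *     Buffer      rootbuf = _bt_getroot(rel, NULL, BT_READ);
                      :  *
                      :  *     if (!BufferIsValid(rootbuf))
                      :  *         ...                handle the empty-index case
                      :  *
                      :  * On success, rootbuf is pinned and read-locked; release it with
                      :  * _bt_relbuf() (or step down the tree via _bt_relandgetbuf()).
                      :  */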
     564             : 
     565             : /*
     566             :  *  _bt_gettrueroot() -- Get the true root page of the btree.
     567             :  *
     568             :  *      This is the same as the BT_READ case of _bt_getroot(), except
     569             :  *      we follow the true-root link not the fast-root link.
     570             :  *
     571             :  * By the time we acquire lock on the root page, it might have been split and
     572             :  * not be the true root anymore.  This is okay for the present uses of this
     573             :  * routine; we only really need to be able to move up at least one tree level
     574             :  * from whatever non-root page we were at.  If we ever do need to lock the
     575             :  * one true root page, we could loop here, re-reading the metapage on each
     576             :  * failure.  (Note that it wouldn't do to hold the lock on the metapage while
     577             :  * moving to the root --- that'd deadlock against any concurrent root split.)
     578             :  */
     579             : Buffer
     580          24 : _bt_gettrueroot(Relation rel)
     581             : {
     582             :     Buffer      metabuf;
     583             :     Page        metapg;
     584             :     BTPageOpaque metaopaque;
     585             :     Buffer      rootbuf;
     586             :     Page        rootpage;
     587             :     BTPageOpaque rootopaque;
     588             :     BlockNumber rootblkno;
     589             :     uint32      rootlevel;
     590             :     BTMetaPageData *metad;
     591             : 
     592             :     /*
     593             :      * We don't try to use cached metapage data here, since (a) this path is
     594             :      * not performance-critical, and (b) if we are here it suggests our cache
     595             :      * is out-of-date anyway.  In light of point (b), it's probably safest to
     596             :      * actively flush any cached metapage info.
     597             :      */
     598          24 :     if (rel->rd_amcache)
     599          24 :         pfree(rel->rd_amcache);
     600          24 :     rel->rd_amcache = NULL;
     601             : 
     602          24 :     metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
     603          24 :     metapg = BufferGetPage(metabuf);
     604          24 :     metaopaque = BTPageGetOpaque(metapg);
     605          24 :     metad = BTPageGetMeta(metapg);
     606             : 
     607          24 :     if (!P_ISMETA(metaopaque) ||
     608          24 :         metad->btm_magic != BTREE_MAGIC)
     609           0 :         ereport(ERROR,
     610             :                 (errcode(ERRCODE_INDEX_CORRUPTED),
     611             :                  errmsg("index \"%s\" is not a btree",
     612             :                         RelationGetRelationName(rel))));
     613             : 
     614          24 :     if (metad->btm_version < BTREE_MIN_VERSION ||
     615          24 :         metad->btm_version > BTREE_VERSION)
     616           0 :         ereport(ERROR,
     617             :                 (errcode(ERRCODE_INDEX_CORRUPTED),
     618             :                  errmsg("version mismatch in index \"%s\": file version %d, "
     619             :                         "current version %d, minimal supported version %d",
     620             :                         RelationGetRelationName(rel),
     621             :                         metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION)));
     622             : 
     623             :     /* if no root page initialized yet, fail */
     624          24 :     if (metad->btm_root == P_NONE)
     625             :     {
     626           0 :         _bt_relbuf(rel, metabuf);
     627           0 :         return InvalidBuffer;
     628             :     }
     629             : 
     630          24 :     rootblkno = metad->btm_root;
     631          24 :     rootlevel = metad->btm_level;
     632             : 
     633             :     /*
     634             :      * We are done with the metapage; arrange to release it via first
     635             :      * _bt_relandgetbuf call
     636             :      */
     637          24 :     rootbuf = metabuf;
     638             : 
     639             :     for (;;)
     640             :     {
     641          24 :         rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);
     642          24 :         rootpage = BufferGetPage(rootbuf);
     643          24 :         rootopaque = BTPageGetOpaque(rootpage);
     644             : 
     645          24 :         if (!P_IGNORE(rootopaque))
     646          24 :             break;
     647             : 
     648             :         /* it's dead, Jim.  step right one page */
     649           0 :         if (P_RIGHTMOST(rootopaque))
     650           0 :             elog(ERROR, "no live root page found in index \"%s\"",
     651             :                  RelationGetRelationName(rel));
     652           0 :         rootblkno = rootopaque->btpo_next;
     653             :     }
     654             : 
     655          24 :     if (rootopaque->btpo_level != rootlevel)
     656           0 :         elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
     657             :              rootblkno, RelationGetRelationName(rel),
     658             :              rootopaque->btpo_level, rootlevel);
     659             : 
     660          24 :     return rootbuf;
     661             : }
     662             : 
     663             : /*
     664             :  *  _bt_getrootheight() -- Get the height of the btree search tree.
     665             :  *
     666             :  *      We return the level (counting from zero) of the current fast root.
     667             :  *      This represents the number of tree levels we'd have to descend through
     668             :  *      to start any btree index search.
     669             :  *
     670             :  *      This is used by the planner for cost-estimation purposes.  Since it's
     671             :  *      only an estimate, slightly-stale data is fine, hence we don't worry
     672             :  *      about updating previously cached data.
     673             :  */
     674             : int
     675     4303774 : _bt_getrootheight(Relation rel)
     676             : {
     677             :     BTMetaPageData *metad;
     678             : 
     679     4303774 :     if (rel->rd_amcache == NULL)
     680             :     {
     681             :         Buffer      metabuf;
     682             : 
     683       69380 :         metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
     684       69380 :         metad = _bt_getmeta(rel, metabuf);
     685             : 
     686             :         /*
     687             :          * If there's no root page yet, _bt_getroot() doesn't expect a cache
     688             :          * to be made, so just stop here and report the index height is zero.
     689             :          * (XXX perhaps _bt_getroot() should be changed to allow this case.)
     690             :          */
     691       69380 :         if (metad->btm_root == P_NONE)
     692             :         {
     693       30952 :             _bt_relbuf(rel, metabuf);
     694       30952 :             return 0;
     695             :         }
     696             : 
     697             :         /*
     698             :          * Cache the metapage data for next time
     699             :          */
     700       38428 :         rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
     701             :                                              sizeof(BTMetaPageData));
     702       38428 :         memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
     703       38428 :         _bt_relbuf(rel, metabuf);
     704             :     }
     705             : 
     706             :     /* Get cached page */
     707     4272822 :     metad = (BTMetaPageData *) rel->rd_amcache;
     708             :     /* We shouldn't have cached it if any of these fail */
     709             :     Assert(metad->btm_magic == BTREE_MAGIC);
     710             :     Assert(metad->btm_version >= BTREE_MIN_VERSION);
     711             :     Assert(metad->btm_version <= BTREE_VERSION);
     712             :     Assert(!metad->btm_allequalimage ||
     713             :            metad->btm_version > BTREE_NOVAC_VERSION);
     714             :     Assert(metad->btm_fastroot != P_NONE);
     715             : 
     716     4272822 :     return metad->btm_fastlevel;
     717             : }
     718             : 
     719             : /*
     720             :  *  _bt_metaversion() -- Get version/status info from metapage.
     721             :  *
     722             :  *      Sets caller's *heapkeyspace and *allequalimage arguments using data
     723             :  *      from the B-Tree metapage (could be locally-cached version).  This
     724             :  *      information needs to be stashed in insertion scankey, so we provide a
     725             :  *      single function that fetches both at once.
     726             :  *
     727             :  *      This is used to determine the rules that must be used to descend a
     728             :  *      btree.  Version 4 indexes treat heap TID as a tiebreaker attribute.
     729             :  *      pg_upgrade'd version 3 indexes need extra steps to preserve reasonable
     730             :  *      performance when inserting a new BTScanInsert-wise duplicate tuple
     731             :  *      among many leaf pages already full of such duplicates.
     732             :  *
     733             :  *      Also sets allequalimage field, which indicates whether or not it is
     734             :  *      safe to apply deduplication.  We rely on the assumption that
     735             :  *      btm_allequalimage will be zero'ed on heapkeyspace indexes that were
     736             :  *      pg_upgrade'd from Postgres 12.
     737             :  */
     738             : void
     739    24014036 : _bt_metaversion(Relation rel, bool *heapkeyspace, bool *allequalimage)
     740             : {
     741             :     BTMetaPageData *metad;
     742             : 
     743    24014036 :     if (rel->rd_amcache == NULL)
     744             :     {
     745             :         Buffer      metabuf;
     746             : 
     747     1071420 :         metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
     748     1071420 :         metad = _bt_getmeta(rel, metabuf);
     749             : 
     750             :         /*
     751             :          * If there's no root page yet, _bt_getroot() doesn't expect a cache
     752             :          * to be made, so just stop here.  (XXX perhaps _bt_getroot() should
     753             :          * be changed to allow this case.)
     754             :          */
     755     1071420 :         if (metad->btm_root == P_NONE)
     756             :         {
     757      516350 :             *heapkeyspace = metad->btm_version > BTREE_NOVAC_VERSION;
     758      516350 :             *allequalimage = metad->btm_allequalimage;
     759             : 
     760      516350 :             _bt_relbuf(rel, metabuf);
     761      516350 :             return;
     762             :         }
     763             : 
     764             :         /*
     765             :          * Cache the metapage data for next time
     766             :          *
     767             :          * An on-the-fly version upgrade performed by _bt_upgrademetapage()
     768             :          * can change the nbtree version for an index without invalidating any
     769             :          * local cache.  This is okay because it can only happen when moving
     770             :          * from version 2 to version 3, both of which are !heapkeyspace
     771             :          * versions.
     772             :          */
     773      555070 :         rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
     774             :                                              sizeof(BTMetaPageData));
     775      555070 :         memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
     776      555070 :         _bt_relbuf(rel, metabuf);
     777             :     }
     778             : 
     779             :     /* Get cached page */
     780    23497686 :     metad = (BTMetaPageData *) rel->rd_amcache;
     781             :     /* We shouldn't have cached it if any of these fail */
     782             :     Assert(metad->btm_magic == BTREE_MAGIC);
     783             :     Assert(metad->btm_version >= BTREE_MIN_VERSION);
     784             :     Assert(metad->btm_version <= BTREE_VERSION);
     785             :     Assert(!metad->btm_allequalimage ||
     786             :            metad->btm_version > BTREE_NOVAC_VERSION);
     787             :     Assert(metad->btm_fastroot != P_NONE);
     788             : 
     789    23497686 :     *heapkeyspace = metad->btm_version > BTREE_NOVAC_VERSION;
     790    23497686 :     *allequalimage = metad->btm_allequalimage;
     791             : }
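                      : 
                      : /*
                      :  * Expected call shape (a sketch modeled on _bt_mkscankey() in
                      :  * nbtutils.c, which stashes both flags in the insertion scankey):
                      :  *
                      :  *     BTScanInsert key = ...allocate insertion scankey...;
                      :  *
                      :  *     _bt_metaversion(rel, &key->heapkeyspace, &key->allequalimage);
                      :  */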
     792             : 
     793             : /*
     794             :  *  _bt_checkpage() -- Verify that a freshly-read page looks sane.
     795             :  */
     796             : void
     797    37698122 : _bt_checkpage(Relation rel, Buffer buf)
     798             : {
     799    37698122 :     Page        page = BufferGetPage(buf);
     800             : 
     801             :     /*
     802             :      * ReadBuffer verifies that every newly-read page passes
     803             :      * PageHeaderIsValid, which means it either contains a reasonably sane
     804             :      * page header or is all-zero.  We have to defend against the all-zero
     805             :      * case, however.
     806             :      */
     807    37698122 :     if (PageIsNew(page))
     808           0 :         ereport(ERROR,
     809             :                 (errcode(ERRCODE_INDEX_CORRUPTED),
     810             :                  errmsg("index \"%s\" contains unexpected zero page at block %u",
     811             :                         RelationGetRelationName(rel),
     812             :                         BufferGetBlockNumber(buf)),
     813             :                  errhint("Please REINDEX it.")));
     814             : 
     815             :     /*
     816             :      * Additionally check that the special area looks sane.
     817             :      */
     818    37698122 :     if (PageGetSpecialSize(page) != MAXALIGN(sizeof(BTPageOpaqueData)))
     819           0 :         ereport(ERROR,
     820             :                 (errcode(ERRCODE_INDEX_CORRUPTED),
     821             :                  errmsg("index \"%s\" contains corrupted page at block %u",
     822             :                         RelationGetRelationName(rel),
     823             :                         BufferGetBlockNumber(buf)),
     824             :                  errhint("Please REINDEX it.")));
     825    37698122 : }
     826             : 
     827             : /*
     828             :  *  _bt_getbuf() -- Get an existing block in a buffer, for read or write.
     829             :  *
     830             :  *      The general rule in nbtree is that it's never okay to access a
     831             :  *      page without holding both a buffer pin and a buffer lock on
     832             :  *      the page's buffer.
     833             :  *
     834             :  *      When this routine returns, the appropriate lock is set on the
     835             :  *      requested buffer and its reference count has been incremented
     836             :  *      (ie, the buffer is "locked and pinned").  Also, we apply
     837             :  *      _bt_checkpage to sanity-check the page, and perform Valgrind
     838             :  *      client requests that help Valgrind detect unsafe page accesses.
     839             :  *
     840             :  *      Note: raw LockBuffer() calls are disallowed in nbtree; all
     841             :  *      buffer lock requests need to go through wrapper functions such
     842             :  *      as _bt_lockbuf().
     843             :  */
     844             : Buffer
     845    21448136 : _bt_getbuf(Relation rel, BlockNumber blkno, int access)
     846             : {
     847             :     Buffer      buf;
     848             : 
     849             :     Assert(BlockNumberIsValid(blkno));
     850             : 
     851             :     /* Read an existing block of the relation */
     852    21448136 :     buf = ReadBuffer(rel, blkno);
     853    21448136 :     _bt_lockbuf(rel, buf, access);
     854    21448136 :     _bt_checkpage(rel, buf);
     855             : 
     856    21448136 :     return buf;
     857             : }
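                      : 
                      : /*
                      :  * Typical usage (sketch; the same pattern appears in
                      :  * _bt_vacuum_needs_cleanup() above): pair every _bt_getbuf() with a
                      :  * _bt_relbuf(), or hand the buffer on via _bt_relandgetbuf():
                      :  *
                      :  *     Buffer      metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
                      :  *     Page        metapg = BufferGetPage(metabuf);
                      :  *
                      :  *     ... read what is needed while the lock is held ...
                      :  *     _bt_relbuf(rel, metabuf);    releases lock and pin together
                      :  */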
     858             : 
     859             : /*
     860             :  *  _bt_allocbuf() -- Allocate a new block/page.
     861             :  *
     862             :  * Returns a write-locked buffer containing an unallocated nbtree page.
     863             :  *
     864             :  * Callers are required to pass a valid heaprel.  We need heaprel so that we
     865             :  * can handle generating a snapshotConflictHorizon that makes reusing a page
     866             :  * from the FSM safe for queries that may be running on standbys.
     867             :  */
     868             : Buffer
     869       34332 : _bt_allocbuf(Relation rel, Relation heaprel)
     870             : {
     871             :     Buffer      buf;
     872             :     BlockNumber blkno;
     873             :     Page        page;
     874             : 
     875             :     Assert(heaprel != NULL);
     876             : 
     877             :     /*
     878             :      * First see if the FSM knows of any free pages.
     879             :      *
     880             :      * We can't trust the FSM's report unreservedly; we have to check that the
     881             :      * page is still free.  (For example, an already-free page could have been
     882             :      * re-used between the time the last VACUUM scanned it and the time the
     883             :      * VACUUM made its FSM updates.)
     884             :      *
     885             :      * In fact, it's worse than that: we can't even assume that it's safe to
     886             :      * take a lock on the reported page.  If somebody else has a lock on it,
     887             :      * or even worse our own caller does, we could deadlock.  (The own-caller
     888             :      * scenario is actually not improbable. Consider an index on a serial or
     889             :      * timestamp column.  Nearly all splits will be at the rightmost page, so
     890             :      * it's entirely likely that _bt_split will call us while holding a lock
     891             :      * on the page most recently acquired from FSM. A VACUUM running
     892             :      * concurrently with the previous split could well have placed that page
     893             :      * back in FSM.)
     894             :      *
     895             :      * To get around that, we ask for only a conditional lock on the reported
     896             :      * page.  If we fail, then someone else is using the page, and we may
     897             :      * reasonably assume it's not free.  (If we happen to be wrong, the worst
     898             :      * consequence is the page will be lost to use till the next VACUUM, which
     899             :      * is no big problem.)
     900             :      */
     901             :     for (;;)
     902             :     {
     903       34332 :         blkno = GetFreeIndexPage(rel);
     904       34332 :         if (blkno == InvalidBlockNumber)
     905       34310 :             break;
     906          22 :         buf = ReadBuffer(rel, blkno);
     907          22 :         if (_bt_conditionallockbuf(rel, buf))
     908             :         {
     909          22 :             page = BufferGetPage(buf);
     910             : 
     911             :             /*
     912             :              * It's possible to find an all-zeroes page in an index.  For
     913             :              * example, a backend might successfully extend the relation one
     914             :              * page and then crash before it is able to make a WAL entry for
     915             :              * adding the page.  If we find a zeroed page then reclaim it
     916             :              * immediately.
     917             :              */
     918          22 :             if (PageIsNew(page))
     919             :             {
     920             :                 /* Okay to use page.  Initialize and return it. */
     921           0 :                 _bt_pageinit(page, BufferGetPageSize(buf));
     922           0 :                 return buf;
     923             :             }
     924             : 
     925          22 :             if (BTPageIsRecyclable(page, heaprel))
     926             :             {
     927             :                 /*
     928             :                  * If we are generating WAL for Hot Standby then create a WAL
     929             :                  * record that will allow us to conflict with queries running
     930             :                  * on standby, in case they have snapshots older than safexid
     931             :                  * value
     932             :                  */
     933          22 :                 if (RelationNeedsWAL(rel) && XLogStandbyInfoActive())
     934             :                 {
     935             :                     xl_btree_reuse_page xlrec_reuse;
     936             : 
     937             :                     /*
     938             :                      * Note that we don't register the buffer with the record,
     939             :                      * because this operation doesn't modify the page (that
     940             :                      * already happened, back when VACUUM deleted the page).
     941             :                      * This record only exists to provide a conflict point for
     942             :                      * Hot Standby.  See record REDO routine comments.
     943             :                      */
     944          22 :                     xlrec_reuse.locator = rel->rd_locator;
     945          22 :                     xlrec_reuse.block = blkno;
     946          22 :                     xlrec_reuse.snapshotConflictHorizon = BTPageGetDeleteXid(page);
     947          22 :                     xlrec_reuse.isCatalogRel =
     948          22 :                         RelationIsAccessibleInLogicalDecoding(heaprel);
     949             : 
     950          22 :                     XLogBeginInsert();
     951          22 :                     XLogRegisterData((char *) &xlrec_reuse, SizeOfBtreeReusePage);
     952             : 
     953          22 :                     XLogInsert(RM_BTREE_ID, XLOG_BTREE_REUSE_PAGE);
     954             :                 }
     955             : 
     956             :                 /* Okay to use page.  Re-initialize and return it. */
     957          22 :                 _bt_pageinit(page, BufferGetPageSize(buf));
     958          22 :                 return buf;
     959             :             }
     960           0 :             elog(DEBUG2, "FSM returned nonrecyclable page");
     961           0 :             _bt_relbuf(rel, buf);
     962             :         }
     963             :         else
     964             :         {
     965           0 :             elog(DEBUG2, "FSM returned nonlockable page");
     966             :             /* couldn't get lock, so just drop pin */
     967           0 :             ReleaseBuffer(buf);
     968             :         }
     969             :     }
     970             : 
     971             :     /*
      972             :      * Extend the relation by one page.  Must lock the new page before its
      973             :      * buffer is released (EB_LOCK_FIRST) or we risk a race condition against
      974             :      * btvacuumscan --- see comments therein.  This forces us to repeat the
      975             :      * valgrind request that _bt_lockbuf() otherwise would make, as we can't
      976             :      * use _bt_lockbuf() without introducing a race.
     977             :      */
     978       34310 :     buf = ExtendBufferedRel(BMR_REL(rel), MAIN_FORKNUM, NULL, EB_LOCK_FIRST);
     979       34310 :     if (!RelationUsesLocalBuffers(rel))
     980             :         VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ);
     981             : 
     982             :     /* Initialize the new page before returning it */
     983       34310 :     page = BufferGetPage(buf);
     984             :     Assert(PageIsNew(page));
     985       34310 :     _bt_pageinit(page, BufferGetPageSize(buf));
     986             : 
     987       34310 :     return buf;
     988             : }
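
The routine above is nbtree's page allocator: it first tries blocks recorded in the free space map, checking each candidate with BTPageIsRecyclable() and emitting the conflict-only XLOG_BTREE_REUSE_PAGE record when required, and only extends the relation once the FSM comes up empty. Below is a minimal sketch of the caller's side of the contract, assuming the PostgreSQL 16+ signature _bt_allocbuf(rel, heaprel) and the usual access/nbtree.h declarations; the flag setup is illustrative only, not the full page-split protocol.

    /*
     * Sketch: consuming a freshly allocated page.  _bt_allocbuf() returns
     * a pinned, write-locked buffer whose page has already been passed
     * through _bt_pageinit(); the caller fills in the special space.
     */
    Buffer      newbuf = _bt_allocbuf(rel, heaprel);
    Page        newpage = BufferGetPage(newbuf);
    BTPageOpaque newopaque = BTPageGetOpaque(newpage);

    newopaque->btpo_prev = newopaque->btpo_next = P_NONE;
    newopaque->btpo_flags = BTP_LEAF;       /* illustrative flag choice */
    newopaque->btpo_cycleid = 0;

    /* ... WAL-log the change and link the page into the tree here ... */
    _bt_relbuf(rel, newbuf);                /* drop lock and pin when done */
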
     989             : 
     990             : /*
     991             :  *  _bt_relandgetbuf() -- release a locked buffer and get another one.
     992             :  *
     993             :  * This is equivalent to _bt_relbuf followed by _bt_getbuf.  Also, if obuf is
     994             :  * InvalidBuffer then it reduces to just _bt_getbuf; allowing this case
     995             :  * simplifies some callers.
     996             :  *
     997             :  * The original motivation for using this was to avoid two entries to the
     998             :  * bufmgr when one would do.  However, now it's mainly just a notational
     999             :  * convenience.  The only case where it saves work over _bt_relbuf/_bt_getbuf
    1000             :  * is when the target page is the same one already in the buffer.
    1001             :  */
    1002             : Buffer
    1003    16123190 : _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
    1004             : {
    1005             :     Buffer      buf;
    1006             : 
    1007             :     Assert(BlockNumberIsValid(blkno));
    1008    16123190 :     if (BufferIsValid(obuf))
    1009    16107386 :         _bt_unlockbuf(rel, obuf);
    1010    16123190 :     buf = ReleaseAndReadBuffer(obuf, rel, blkno);
    1011    16123190 :     _bt_lockbuf(rel, buf, access);
    1012             : 
    1013    16123190 :     _bt_checkpage(rel, buf);
    1014    16123190 :     return buf;
    1015             : }
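
A typical use of _bt_relandgetbuf() is stepping right along a single tree level while never holding more than one buffer lock at a time. A minimal sketch, assuming buf is already pinned and read-locked and the page is known not to be rightmost:

    Page        page = BufferGetPage(buf);
    BTPageOpaque opaque = BTPageGetOpaque(page);
    BlockNumber rightsib = opaque->btpo_next;   /* assumes !P_RIGHTMOST(opaque) */

    /* unlocks buf, swaps the pin only if needed, then locks the sibling */
    buf = _bt_relandgetbuf(rel, buf, rightsib, BT_READ);
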
    1016             : 
    1017             : /*
    1018             :  *  _bt_relbuf() -- release a locked buffer.
    1019             :  *
    1020             :  * Lock and pin (refcount) are both dropped.
    1021             :  */
    1022             : void
    1023     9035236 : _bt_relbuf(Relation rel, Buffer buf)
    1024             : {
    1025     9035236 :     _bt_unlockbuf(rel, buf);
    1026     9035236 :     ReleaseBuffer(buf);
    1027     9035236 : }
    1028             : 
    1029             : /*
    1030             :  *  _bt_lockbuf() -- lock a pinned buffer.
    1031             :  *
    1032             :  * Lock is acquired without acquiring another pin.  This is like a raw
    1033             :  * LockBuffer() call, but performs extra steps needed by Valgrind.
    1034             :  *
    1035             :  * Note: Caller may need to call _bt_checkpage() with buf when pin on buf
    1036             :  * wasn't originally acquired in _bt_getbuf() or _bt_relandgetbuf().
    1037             :  */
    1038             : void
    1039    38473112 : _bt_lockbuf(Relation rel, Buffer buf, int access)
    1040             : {
    1041             :     /* LockBuffer() asserts that pin is held by this backend */
    1042    38473112 :     LockBuffer(buf, access);
    1043             : 
    1044             :     /*
    1045             :      * It doesn't matter that _bt_unlockbuf() won't get called in the event of
    1046             :      * an nbtree error (e.g. a unique violation error).  That won't cause
    1047             :      * Valgrind false positives.
    1048             :      *
    1049             :      * The nbtree client requests are superimposed on top of the bufmgr.c
    1050             :      * buffer pin client requests.  In the event of an nbtree error the buffer
    1051             :      * will certainly get marked as defined when the backend once again
    1052             :      * acquires its first pin on the buffer. (Of course, if the backend never
    1053             :      * touches the buffer again then it doesn't matter that it remains
    1054             :      * non-accessible to Valgrind.)
    1055             :      *
    1056             :      * Note: When an IndexTuple C pointer gets computed using an ItemId read
    1057             :      * from a page while a lock was held, the C pointer becomes unsafe to
    1058             :      * dereference forever as soon as the lock is released.  Valgrind can only
    1059             :      * detect cases where the pointer gets dereferenced with no _current_
    1060             :      * lock/pin held, though.
    1061             :      */
    1062    38473112 :     if (!RelationUsesLocalBuffers(rel))
    1063             :         VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ);
    1064    38473112 : }
    1065             : 
    1066             : /*
    1067             :  *  _bt_unlockbuf() -- unlock a pinned buffer.
    1068             :  */
    1069             : void
    1070    38572828 : _bt_unlockbuf(Relation rel, Buffer buf)
    1071             : {
    1072             :     /*
    1073             :      * Buffer is pinned and locked, which means that it is expected to be
    1074             :      * defined and addressable.  Check that proactively.
    1075             :      */
    1076             :     VALGRIND_CHECK_MEM_IS_DEFINED(BufferGetPage(buf), BLCKSZ);
    1077             : 
    1078             :     /* LockBuffer() asserts that pin is held by this backend */
    1079    38572828 :     LockBuffer(buf, BUFFER_LOCK_UNLOCK);
    1080             : 
    1081    38572828 :     if (!RelationUsesLocalBuffers(rel))
    1082             :         VALGRIND_MAKE_MEM_NOACCESS(BufferGetPage(buf), BLCKSZ);
    1083    38572828 : }
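
Taken as a pair, _bt_lockbuf() and _bt_unlockbuf() make the page's memory addressable to Valgrind only while a lock is held, which is what lets Valgrind flag the stale-pointer hazard described above. A minimal sketch of the intended discipline, with rel, buf, and offnum assumed to be supplied by the caller:

    ItemId      itemid;
    IndexTuple  itup;

    _bt_lockbuf(rel, buf, BT_READ);
    itemid = PageGetItemId(BufferGetPage(buf), offnum);
    itup = (IndexTuple) PageGetItem(BufferGetPage(buf), itemid);
    /* copy out whatever is needed while the lock is still held */
    _bt_unlockbuf(rel, buf);

    /* under Valgrind, dereferencing itup past this point is reported */
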
    1084             : 
    1085             : /*
    1086             :  *  _bt_conditionallockbuf() -- conditionally BT_WRITE lock pinned
    1087             :  *  buffer.
    1088             :  *
    1089             :  * Note: Caller may need to call _bt_checkpage() with buf when pin on buf
    1090             :  * wasn't originally acquired in _bt_getbuf() or _bt_relandgetbuf().
    1091             :  */
    1092             : bool
    1093       66226 : _bt_conditionallockbuf(Relation rel, Buffer buf)
    1094             : {
    1095             :     /* ConditionalLockBuffer() asserts that pin is held by this backend */
    1096       66226 :     if (!ConditionalLockBuffer(buf))
    1097         806 :         return false;
    1098             : 
    1099       65420 :     if (!RelationUsesLocalBuffers(rel))
    1100             :         VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ);
    1101             : 
    1102       65420 :     return true;
    1103             : }
    1104             : 
    1105             : /*
    1106             :  *  _bt_upgradelockbufcleanup() -- upgrade lock to a full cleanup lock.
    1107             :  */
    1108             : void
    1109       16812 : _bt_upgradelockbufcleanup(Relation rel, Buffer buf)
    1110             : {
    1111             :     /*
    1112             :      * Buffer is pinned and locked, which means that it is expected to be
    1113             :      * defined and addressable.  Check that proactively.
    1114             :      */
    1115             :     VALGRIND_CHECK_MEM_IS_DEFINED(BufferGetPage(buf), BLCKSZ);
    1116             : 
    1117             :     /* LockBuffer() asserts that pin is held by this backend */
    1118       16812 :     LockBuffer(buf, BUFFER_LOCK_UNLOCK);
    1119       16812 :     LockBufferForCleanup(buf);
    1120       16812 : }
    1121             : 
    1122             : /*
    1123             :  *  _bt_pageinit() -- Initialize a new page.
    1124             :  *
    1125             :  * On return, the page header is initialized; data space is empty;
    1126             :  * special space is zeroed out.
    1127             :  */
    1128             : void
    1129      162038 : _bt_pageinit(Page page, Size size)
    1130             : {
    1131      162038 :     PageInit(page, size, sizeof(BTPageOpaqueData));
    1132      162038 : }
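
Because _bt_pageinit() sizes the special space as sizeof(BTPageOpaqueData), BTPageGetOpaque() can later recover the opaque area from any page initialized here, and the zeroed special space leaves the sibling links at P_NONE (which is zero). A minimal standalone sketch, using PGAlignedBlock from c.h for a correctly aligned scratch page:

    PGAlignedBlock scratch;
    Page        page = (Page) scratch.data;
    BTPageOpaque opaque;

    _bt_pageinit(page, BLCKSZ);
    opaque = BTPageGetOpaque(page);     /* points into the special space */
    Assert(opaque->btpo_prev == P_NONE && opaque->btpo_next == P_NONE);
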
    1133             : 
    1134             : /*
    1135             :  * Delete item(s) from a btree leaf page during VACUUM.
    1136             :  *
    1137             :  * This routine assumes that the caller already has a full cleanup lock on
    1138             :  * the buffer.  Also, the given deletable and updatable arrays *must* be
    1139             :  * sorted in ascending order.
    1140             :  *
    1141             :  * Routine deals with deleting TIDs when some (but not all) of the heap TIDs
    1142             :  * in an existing posting list item are to be removed.  This works by
    1143             :  * updating/overwriting an existing item with caller's new version of the item
    1144             :  * (a version that lacks the TIDs that are to be deleted).
    1145             :  *
    1146             :  * We record VACUUMs and b-tree deletes differently in WAL.  Deletes must
    1147             :  * generate their own snapshotConflictHorizon directly from the tableam,
    1148             :  * whereas VACUUMs rely on the initial VACUUM table scan performing
    1149             :  * WAL-logging that takes care of the issue for the table's indexes
    1150             :  * indirectly.  Also, we remove the VACUUM cycle ID from pages, which b-tree
    1151             :  * deletes don't do.
    1152             :  */
    1153             : void
    1154       11392 : _bt_delitems_vacuum(Relation rel, Buffer buf,
    1155             :                     OffsetNumber *deletable, int ndeletable,
    1156             :                     BTVacuumPosting *updatable, int nupdatable)
    1157             : {
    1158       11392 :     Page        page = BufferGetPage(buf);
    1159             :     BTPageOpaque opaque;
    1160       11392 :     bool        needswal = RelationNeedsWAL(rel);
    1161       11392 :     char       *updatedbuf = NULL;
    1162       11392 :     Size        updatedbuflen = 0;
    1163             :     OffsetNumber updatedoffsets[MaxIndexTuplesPerPage];
    1164             : 
    1165             :     /* Shouldn't be called unless there's something to do */
    1166             :     Assert(ndeletable > 0 || nupdatable > 0);
    1167             : 
    1168             :     /* Generate new version of posting lists without deleted TIDs */
    1169       11392 :     if (nupdatable > 0)
    1170         974 :         updatedbuf = _bt_delitems_update(updatable, nupdatable,
    1171             :                                          updatedoffsets, &updatedbuflen,
    1172             :                                          needswal);
    1173             : 
    1174             :     /* No ereport(ERROR) until changes are logged */
    1175       11392 :     START_CRIT_SECTION();
    1176             : 
    1177             :     /*
    1178             :      * Handle posting tuple updates.
    1179             :      *
    1180             :      * Deliberately do this before handling simple deletes.  If we did it the
    1181             :      * other way around (i.e. WAL record order -- simple deletes before
    1182             :      * updates) then we'd have to make compensating changes to the 'updatable'
    1183             :      * array of offset numbers.
    1184             :      *
    1185             :      * PageIndexTupleOverwrite() won't unset each item's LP_DEAD bit when it
    1186             :      * happens to already be set.  It's important that we not interfere with
    1187             :      * any future simple index tuple deletion operations.
    1188             :      */
    1189       38214 :     for (int i = 0; i < nupdatable; i++)
    1190             :     {
    1191       26822 :         OffsetNumber updatedoffset = updatedoffsets[i];
    1192             :         IndexTuple  itup;
    1193             :         Size        itemsz;
    1194             : 
    1195       26822 :         itup = updatable[i]->itup;
    1196       26822 :         itemsz = MAXALIGN(IndexTupleSize(itup));
    1197       26822 :         if (!PageIndexTupleOverwrite(page, updatedoffset, (Item) itup,
    1198             :                                      itemsz))
    1199           0 :             elog(PANIC, "failed to update partially dead item in block %u of index \"%s\"",
    1200             :                  BufferGetBlockNumber(buf), RelationGetRelationName(rel));
    1201             :     }
    1202             : 
    1203             :     /* Now handle simple deletes of entire tuples */
    1204       11392 :     if (ndeletable > 0)
    1205       11116 :         PageIndexMultiDelete(page, deletable, ndeletable);
    1206             : 
    1207             :     /*
    1208             :      * We can clear the vacuum cycle ID since this page has certainly been
    1209             :      * processed by the current vacuum scan.
    1210             :      */
    1211       11392 :     opaque = BTPageGetOpaque(page);
    1212       11392 :     opaque->btpo_cycleid = 0;
    1213             : 
    1214             :     /*
    1215             :      * Clear the BTP_HAS_GARBAGE page flag.
    1216             :      *
    1217             :      * This flag indicates the presence of LP_DEAD items on the page (though
    1218             :      * not reliably).  Note that we only rely on it with pg_upgrade'd
    1219             :      * !heapkeyspace indexes.  That's why clearing it here won't usually
    1220             :      * interfere with simple index tuple deletion.
    1221             :      */
    1222       11392 :     opaque->btpo_flags &= ~BTP_HAS_GARBAGE;
    1223             : 
    1224       11392 :     MarkBufferDirty(buf);
    1225             : 
    1226             :     /* XLOG stuff */
    1227       11392 :     if (needswal)
    1228             :     {
    1229             :         XLogRecPtr  recptr;
    1230             :         xl_btree_vacuum xlrec_vacuum;
    1231             : 
    1232       11390 :         xlrec_vacuum.ndeleted = ndeletable;
    1233       11390 :         xlrec_vacuum.nupdated = nupdatable;
    1234             : 
    1235       11390 :         XLogBeginInsert();
    1236       11390 :         XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
    1237       11390 :         XLogRegisterData((char *) &xlrec_vacuum, SizeOfBtreeVacuum);
    1238             : 
    1239       11390 :         if (ndeletable > 0)
    1240       11114 :             XLogRegisterBufData(0, (char *) deletable,
    1241             :                                 ndeletable * sizeof(OffsetNumber));
    1242             : 
    1243       11390 :         if (nupdatable > 0)
    1244             :         {
    1245         974 :             XLogRegisterBufData(0, (char *) updatedoffsets,
    1246             :                                 nupdatable * sizeof(OffsetNumber));
    1247         974 :             XLogRegisterBufData(0, updatedbuf, updatedbuflen);
    1248             :         }
    1249             : 
    1250       11390 :         recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_VACUUM);
    1251             : 
    1252       11390 :         PageSetLSN(page, recptr);
    1253             :     }
    1254             : 
    1255       11392 :     END_CRIT_SECTION();
    1256             : 
    1257             :     /* can't leak memory here */
    1258       11392 :     if (updatedbuf != NULL)
    1259         974 :         pfree(updatedbuf);
    1260             :     /* free tuples allocated within _bt_delitems_update() */
    1261       38214 :     for (int i = 0; i < nupdatable; i++)
    1262       26822 :         pfree(updatable[i]->itup);
    1263       11392 : }
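
The caller's half of this contract (cf. btvacuumpage() in nbtree.c) accumulates offsets in ascending order while walking the page's line pointer array, then makes at most one call per page while holding the cleanup lock. A minimal sketch with the page-scan loop elided:

    OffsetNumber deletable[MaxIndexTuplesPerPage];
    BTVacuumPosting updatable[MaxIndexTuplesPerPage];
    int         ndeletable = 0,
                nupdatable = 0;

    /* ... walk items in page offset order, appending to the two arrays ... */

    if (ndeletable > 0 || nupdatable > 0)
        _bt_delitems_vacuum(rel, buf, deletable, ndeletable,
                            updatable, nupdatable);
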
    1264             : 
    1265             : /*
    1266             :  * Delete item(s) from a btree leaf page during single-page cleanup.
    1267             :  *
    1268             :  * This routine assumes that the caller has pinned and write locked the
    1269             :  * buffer.  Also, the given deletable and updatable arrays *must* be sorted in
    1270             :  * ascending order.
    1271             :  *
    1272             :  * Routine deals with deleting TIDs when some (but not all) of the heap TIDs
    1273             :  * in an existing posting list item are to be removed.  This works by
    1274             :  * updating/overwriting an existing item with caller's new version of the item
    1275             :  * (a version that lacks the TIDs that are to be deleted).
    1276             :  *
    1277             :  * This is nearly the same as _bt_delitems_vacuum as far as what it does to
    1278             :  * the page, but it needs its own snapshotConflictHorizon and isCatalogRel
    1279             :  * (from the tableam).  This is used by the REDO routine to generate recovery
    1280             :  * conflicts.  The other difference is that only _bt_delitems_vacuum will
    1281             :  * clear page's VACUUM cycle ID.
    1282             :  */
    1283             : static void
    1284        7682 : _bt_delitems_delete(Relation rel, Buffer buf,
    1285             :                     TransactionId snapshotConflictHorizon, bool isCatalogRel,
    1286             :                     OffsetNumber *deletable, int ndeletable,
    1287             :                     BTVacuumPosting *updatable, int nupdatable)
    1288             : {
    1289        7682 :     Page        page = BufferGetPage(buf);
    1290             :     BTPageOpaque opaque;
    1291        7682 :     bool        needswal = RelationNeedsWAL(rel);
    1292        7682 :     char       *updatedbuf = NULL;
    1293        7682 :     Size        updatedbuflen = 0;
    1294             :     OffsetNumber updatedoffsets[MaxIndexTuplesPerPage];
    1295             : 
    1296             :     /* Shouldn't be called unless there's something to do */
    1297             :     Assert(ndeletable > 0 || nupdatable > 0);
    1298             : 
    1299             :     /* Generate new versions of posting lists without deleted TIDs */
    1300        7682 :     if (nupdatable > 0)
    1301         760 :         updatedbuf = _bt_delitems_update(updatable, nupdatable,
    1302             :                                          updatedoffsets, &updatedbuflen,
    1303             :                                          needswal);
    1304             : 
    1305             :     /* No ereport(ERROR) until changes are logged */
    1306        7682 :     START_CRIT_SECTION();
    1307             : 
    1308             :     /* Handle updates and deletes just like _bt_delitems_vacuum */
    1309       19982 :     for (int i = 0; i < nupdatable; i++)
    1310             :     {
    1311       12300 :         OffsetNumber updatedoffset = updatedoffsets[i];
    1312             :         IndexTuple  itup;
    1313             :         Size        itemsz;
    1314             : 
    1315       12300 :         itup = updatable[i]->itup;
    1316       12300 :         itemsz = MAXALIGN(IndexTupleSize(itup));
    1317       12300 :         if (!PageIndexTupleOverwrite(page, updatedoffset, (Item) itup,
    1318             :                                      itemsz))
    1319           0 :             elog(PANIC, "failed to update partially dead item in block %u of index \"%s\"",
    1320             :                  BufferGetBlockNumber(buf), RelationGetRelationName(rel));
    1321             :     }
    1322             : 
    1323        7682 :     if (ndeletable > 0)
    1324        7566 :         PageIndexMultiDelete(page, deletable, ndeletable);
    1325             : 
    1326             :     /*
    1327             :      * Unlike _bt_delitems_vacuum, we *must not* clear the vacuum cycle ID at
    1328             :      * this point.  The VACUUM command alone controls vacuum cycle IDs.
    1329             :      */
    1330        7682 :     opaque = BTPageGetOpaque(page);
    1331             : 
    1332             :     /*
    1333             :      * Clear the BTP_HAS_GARBAGE page flag.
    1334             :      *
    1335             :      * This flag indicates the presence of LP_DEAD items on the page (though
    1336             :      * not reliably).  Note that we only rely on it with pg_upgrade'd
    1337             :      * !heapkeyspace indexes.
    1338             :      */
    1339        7682 :     opaque->btpo_flags &= ~BTP_HAS_GARBAGE;
    1340             : 
    1341        7682 :     MarkBufferDirty(buf);
    1342             : 
    1343             :     /* XLOG stuff */
    1344        7682 :     if (needswal)
    1345             :     {
    1346             :         XLogRecPtr  recptr;
    1347             :         xl_btree_delete xlrec_delete;
    1348             : 
    1349        7634 :         xlrec_delete.snapshotConflictHorizon = snapshotConflictHorizon;
    1350        7634 :         xlrec_delete.ndeleted = ndeletable;
    1351        7634 :         xlrec_delete.nupdated = nupdatable;
    1352        7634 :         xlrec_delete.isCatalogRel = isCatalogRel;
    1353             : 
    1354        7634 :         XLogBeginInsert();
    1355        7634 :         XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
    1356        7634 :         XLogRegisterData((char *) &xlrec_delete, SizeOfBtreeDelete);
    1357             : 
    1358        7634 :         if (ndeletable > 0)
    1359        7518 :             XLogRegisterBufData(0, (char *) deletable,
    1360             :                                 ndeletable * sizeof(OffsetNumber));
    1361             : 
    1362        7634 :         if (nupdatable > 0)
    1363             :         {
    1364         760 :             XLogRegisterBufData(0, (char *) updatedoffsets,
    1365             :                                 nupdatable * sizeof(OffsetNumber));
    1366         760 :             XLogRegisterBufData(0, updatedbuf, updatedbuflen);
    1367             :         }
    1368             : 
    1369        7634 :         recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DELETE);
    1370             : 
    1371        7634 :         PageSetLSN(page, recptr);
    1372             :     }
    1373             : 
    1374        7682 :     END_CRIT_SECTION();
    1375             : 
    1376             :     /* can't leak memory here */
    1377        7682 :     if (updatedbuf != NULL)
    1378         760 :         pfree(updatedbuf);
    1379             :     /* free tuples allocated within _bt_delitems_update() */
    1380       19982 :     for (int i = 0; i < nupdatable; i++)
    1381       12300 :         pfree(updatable[i]->itup);
    1382        7682 : }
    1383             : 
    1384             : /*
    1385             :  * Set up state needed to delete TIDs from posting list tuples via "updating"
    1386             :  * the tuple.  Performs steps common to both _bt_delitems_vacuum and
    1387             :  * _bt_delitems_delete.  These steps must take place before each function's
    1388             :  * critical section begins.
    1389             :  *
    1390             :  * updatable and nupdatable are inputs, though note that we will use
    1391             :  * _bt_update_posting() to replace the original itup with a pointer to a final
     1392             :  * version in palloc()'d memory.  Caller should free the tuples when it's done.
    1393             :  *
    1394             :  * The first nupdatable entries from updatedoffsets are set to the page offset
    1395             :  * number for posting list tuples that caller updates.  This is mostly useful
    1396             :  * because caller may need to WAL-log the page offsets (though we always do
     1397             :  * because caller may need to WAL-log the page offsets (though we always fill
     1398             :  * the array for caller's convenience, even when WAL logging isn't needed).
    1399             :  * Returns buffer consisting of an array of xl_btree_update structs that
    1400             :  * describe the steps we perform here for caller (though only when needswal is
    1401             :  * true).  Also sets *updatedbuflen to the final size of the buffer.  This
    1402             :  * buffer is used by caller when WAL logging is required.
    1403             :  */
    1404             : static char *
    1405        1734 : _bt_delitems_update(BTVacuumPosting *updatable, int nupdatable,
    1406             :                     OffsetNumber *updatedoffsets, Size *updatedbuflen,
    1407             :                     bool needswal)
    1408             : {
    1409        1734 :     char       *updatedbuf = NULL;
    1410        1734 :     Size        buflen = 0;
    1411             : 
    1412             :     /* Shouldn't be called unless there's something to do */
    1413             :     Assert(nupdatable > 0);
    1414             : 
    1415       40856 :     for (int i = 0; i < nupdatable; i++)
    1416             :     {
    1417       39122 :         BTVacuumPosting vacposting = updatable[i];
    1418             :         Size        itemsz;
    1419             : 
    1420             :         /* Replace work area IndexTuple with updated version */
    1421       39122 :         _bt_update_posting(vacposting);
    1422             : 
    1423             :         /* Keep track of size of xl_btree_update for updatedbuf in passing */
    1424       39122 :         itemsz = SizeOfBtreeUpdate + vacposting->ndeletedtids * sizeof(uint16);
    1425       39122 :         buflen += itemsz;
    1426             : 
    1427             :         /* Build updatedoffsets buffer in passing */
    1428       39122 :         updatedoffsets[i] = vacposting->updatedoffset;
    1429             :     }
    1430             : 
    1431             :     /* XLOG stuff */
    1432        1734 :     if (needswal)
    1433             :     {
    1434        1734 :         Size        offset = 0;
    1435             : 
    1436             :         /* Allocate, set final size for caller */
    1437        1734 :         updatedbuf = palloc(buflen);
    1438        1734 :         *updatedbuflen = buflen;
    1439       40856 :         for (int i = 0; i < nupdatable; i++)
    1440             :         {
    1441       39122 :             BTVacuumPosting vacposting = updatable[i];
    1442             :             Size        itemsz;
    1443             :             xl_btree_update update;
    1444             : 
    1445       39122 :             update.ndeletedtids = vacposting->ndeletedtids;
    1446       39122 :             memcpy(updatedbuf + offset, &update.ndeletedtids,
    1447             :                    SizeOfBtreeUpdate);
    1448       39122 :             offset += SizeOfBtreeUpdate;
    1449             : 
    1450       39122 :             itemsz = update.ndeletedtids * sizeof(uint16);
    1451       39122 :             memcpy(updatedbuf + offset, vacposting->deletetids, itemsz);
    1452       39122 :             offset += itemsz;
    1453             :         }
    1454             :     }
    1455             : 
    1456        1734 :     return updatedbuf;
    1457             : }
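
The buffer returned here is a packed sequence of variable-length entries: a SizeOfBtreeUpdate header holding ndeletedtids, immediately followed by that many uint16 posting-list indexes. A minimal decoding sketch that mirrors the encoding loop above (the real consumer is the btree REDO code in nbtxlog.c); the header is read with memcpy, matching how the encoder writes it:

    char       *ptr = updatedbuf;

    for (int i = 0; i < nupdatable; i++)
    {
        xl_btree_update update;
        uint16     *deletetids;

        memcpy(&update, ptr, SizeOfBtreeUpdate);
        deletetids = (uint16 *) (ptr + SizeOfBtreeUpdate);

        /*
         * update.ndeletedtids entries in deletetids[] name the TIDs to
         * remove from the posting list tuple at page offset
         * updatedoffsets[i].
         */
        ptr += SizeOfBtreeUpdate + update.ndeletedtids * sizeof(uint16);
    }
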
    1458             : 
    1459             : /*
    1460             :  * Comparator used by _bt_delitems_delete_check() to restore deltids array
    1461             :  * back to its original leaf-page-wise sort order
    1462             :  */
    1463             : static int
    1464     4869718 : _bt_delitems_cmp(const void *a, const void *b)
    1465             : {
    1466     4869718 :     TM_IndexDelete *indexdelete1 = (TM_IndexDelete *) a;
    1467     4869718 :     TM_IndexDelete *indexdelete2 = (TM_IndexDelete *) b;
    1468             : 
    1469             :     Assert(indexdelete1->id != indexdelete2->id);
    1470             : 
    1471     4869718 :     return pg_cmp_s16(indexdelete1->id, indexdelete2->id);
    1472             : }
    1473             : 
    1474             : /*
    1475             :  * Try to delete item(s) from a btree leaf page during single-page cleanup.
    1476             :  *
    1477             :  * nbtree interface to table_index_delete_tuples().  Deletes a subset of index
    1478             :  * tuples from caller's deltids array: those whose TIDs are found safe to
    1479             :  * delete by the tableam (or already marked LP_DEAD in index, and so already
    1480             :  * known to be deletable by our simple index deletion caller).  We physically
    1481             :  * delete index tuples from buf leaf page last of all (for index tuples where
    1482             :  * that is known to be safe following our table_index_delete_tuples() call).
    1483             :  *
    1484             :  * Simple index deletion caller only includes TIDs from index tuples marked
    1485             :  * LP_DEAD, as well as extra TIDs it found on the same leaf page that can be
    1486             :  * included without increasing the total number of distinct table blocks for
    1487             :  * the deletion operation as a whole.  This approach often allows us to delete
    1488             :  * some extra index tuples that were practically free for tableam to check in
    1489             :  * passing (when they actually turn out to be safe to delete).  It probably
    1490             :  * only makes sense for the tableam to go ahead with these extra checks when
    1491             :  * it is block-oriented (otherwise the checks probably won't be practically
    1492             :  * free, which we rely on).  The tableam interface requires the tableam side
    1493             :  * to handle the problem, though, so this is okay (we as an index AM are free
    1494             :  * to make the simplifying assumption that all tableams must be block-based).
    1495             :  *
    1496             :  * Bottom-up index deletion caller provides all the TIDs from the leaf page,
    1497             :  * without expecting that tableam will check most of them.  The tableam has
    1498             :  * considerable discretion around which entries/blocks it checks.  Our role in
    1499             :  * costing the bottom-up deletion operation is strictly advisory.
    1500             :  *
    1501             :  * Note: Caller must have added deltids entries (i.e. entries that go in
    1502             :  * delstate's main array) in leaf-page-wise order: page offset number order,
    1503             :  * TID order among entries taken from the same posting list tuple (tiebreak on
    1504             :  * TID).  This order is convenient to work with here.
    1505             :  *
    1506             :  * Note: We also rely on the id field of each deltids element "capturing" this
    1507             :  * original leaf-page-wise order.  That is, we expect to be able to get back
    1508             :  * to the original leaf-page-wise order just by sorting deltids on the id
    1509             :  * field (tableam will sort deltids for its own reasons, so we'll need to put
    1510             :  * it back in leaf-page-wise order afterwards).
    1511             :  */
    1512             : void
    1513       10596 : _bt_delitems_delete_check(Relation rel, Buffer buf, Relation heapRel,
    1514             :                           TM_IndexDeleteOp *delstate)
    1515             : {
    1516       10596 :     Page        page = BufferGetPage(buf);
    1517             :     TransactionId snapshotConflictHorizon;
    1518             :     bool        isCatalogRel;
    1519       10596 :     OffsetNumber postingidxoffnum = InvalidOffsetNumber;
    1520       10596 :     int         ndeletable = 0,
    1521       10596 :                 nupdatable = 0;
    1522             :     OffsetNumber deletable[MaxIndexTuplesPerPage];
    1523             :     BTVacuumPosting updatable[MaxIndexTuplesPerPage];
    1524             : 
    1525             :     /* Use tableam interface to determine which tuples to delete first */
    1526       10596 :     snapshotConflictHorizon = table_index_delete_tuples(heapRel, delstate);
    1527       10596 :     isCatalogRel = RelationIsAccessibleInLogicalDecoding(heapRel);
    1528             : 
    1529             :     /* Should not WAL-log snapshotConflictHorizon unless it's required */
    1530       10596 :     if (!XLogStandbyInfoActive())
    1531        2994 :         snapshotConflictHorizon = InvalidTransactionId;
    1532             : 
    1533             :     /*
    1534             :      * Construct a leaf-page-wise description of what _bt_delitems_delete()
    1535             :      * needs to do to physically delete index tuples from the page.
    1536             :      *
    1537             :      * Must sort deltids array to restore leaf-page-wise order (original order
    1538             :      * before call to tableam).  This is the order that the loop expects.
    1539             :      *
    1540             :      * Note that deltids array might be a lot smaller now.  It might even have
    1541             :      * no entries at all (with bottom-up deletion caller), in which case there
    1542             :      * is nothing left to do.
    1543             :      */
    1544       10596 :     qsort(delstate->deltids, delstate->ndeltids, sizeof(TM_IndexDelete),
    1545             :           _bt_delitems_cmp);
    1546       10596 :     if (delstate->ndeltids == 0)
    1547             :     {
    1548             :         Assert(delstate->bottomup);
    1549        2914 :         return;
    1550             :     }
    1551             : 
    1552             :     /* We definitely have to delete at least one index tuple (or one TID) */
    1553      713556 :     for (int i = 0; i < delstate->ndeltids; i++)
    1554             :     {
    1555      705874 :         TM_IndexStatus *dstatus = delstate->status + delstate->deltids[i].id;
    1556      705874 :         OffsetNumber idxoffnum = dstatus->idxoffnum;
    1557      705874 :         ItemId      itemid = PageGetItemId(page, idxoffnum);
    1558      705874 :         IndexTuple  itup = (IndexTuple) PageGetItem(page, itemid);
    1559             :         int         nestedi,
    1560             :                     nitem;
    1561             :         BTVacuumPosting vacposting;
    1562             : 
    1563             :         Assert(OffsetNumberIsValid(idxoffnum));
    1564             : 
    1565      705874 :         if (idxoffnum == postingidxoffnum)
    1566             :         {
    1567             :             /*
    1568             :              * This deltid entry is a TID from a posting list tuple that has
    1569             :              * already been completely processed
    1570             :              */
    1571             :             Assert(BTreeTupleIsPosting(itup));
    1572             :             Assert(ItemPointerCompare(BTreeTupleGetHeapTID(itup),
    1573             :                                       &delstate->deltids[i].tid) < 0);
    1574             :             Assert(ItemPointerCompare(BTreeTupleGetMaxHeapTID(itup),
    1575             :                                       &delstate->deltids[i].tid) >= 0);
    1576       27344 :             continue;
    1577             :         }
    1578             : 
    1579      678530 :         if (!BTreeTupleIsPosting(itup))
    1580             :         {
    1581             :             /* Plain non-pivot tuple */
    1582             :             Assert(ItemPointerEquals(&itup->t_tid, &delstate->deltids[i].tid));
    1583      650794 :             if (dstatus->knowndeletable)
    1584      531366 :                 deletable[ndeletable++] = idxoffnum;
    1585      650794 :             continue;
    1586             :         }
    1587             : 
    1588             :         /*
    1589             :          * itup is a posting list tuple whose lowest deltids entry (which may
    1590             :          * or may not be for the first TID from itup) is considered here now.
    1591             :          * We should process all of the deltids entries for the posting list
    1592             :          * together now, though (not just the lowest).  Remember to skip over
     1593             :          * later itup-related entries during later iterations of the
     1594             :          * outermost loop.
    1595             :          */
    1596       27736 :         postingidxoffnum = idxoffnum;   /* Remember work in outermost loop */
    1597       27736 :         nestedi = i;            /* Initialize for first itup deltids entry */
    1598       27736 :         vacposting = NULL;      /* Describes final action for itup */
    1599       27736 :         nitem = BTreeTupleGetNPosting(itup);
    1600      123782 :         for (int p = 0; p < nitem; p++)
    1601             :         {
    1602       96046 :             ItemPointer ptid = BTreeTupleGetPostingN(itup, p);
    1603       96046 :             int         ptidcmp = -1;
    1604             : 
    1605             :             /*
    1606             :              * This nested loop reuses work across ptid TIDs taken from itup.
    1607             :              * We take advantage of the fact that both itup's TIDs and deltids
    1608             :              * entries (within a single itup/posting list grouping) must both
    1609             :              * be in ascending TID order.
    1610             :              */
    1611      137086 :             for (; nestedi < delstate->ndeltids; nestedi++)
    1612             :             {
    1613      131584 :                 TM_IndexDelete *tcdeltid = &delstate->deltids[nestedi];
    1614      131584 :                 TM_IndexStatus *tdstatus = (delstate->status + tcdeltid->id);
    1615             : 
    1616             :                 /* Stop once we get past all itup related deltids entries */
    1617             :                 Assert(tdstatus->idxoffnum >= idxoffnum);
    1618      131584 :                 if (tdstatus->idxoffnum != idxoffnum)
    1619       30076 :                     break;
    1620             : 
    1621             :                 /* Skip past non-deletable itup related entries up front */
    1622      101508 :                 if (!tdstatus->knowndeletable)
    1623        7698 :                     continue;
    1624             : 
    1625             :                 /* Entry is first partial ptid match (or an exact match)? */
    1626       93810 :                 ptidcmp = ItemPointerCompare(&tcdeltid->tid, ptid);
    1627       93810 :                 if (ptidcmp >= 0)
    1628             :                 {
    1629             :                     /* Greater than or equal (partial or exact) match... */
    1630       60468 :                     break;
    1631             :                 }
    1632             :             }
    1633             : 
    1634             :             /* ...exact ptid match to a deletable deltids entry? */
    1635       96046 :             if (ptidcmp != 0)
    1636       48664 :                 continue;
    1637             : 
    1638             :             /* Exact match for deletable deltids entry -- ptid gets deleted */
    1639       47382 :             if (vacposting == NULL)
    1640             :             {
    1641       25420 :                 vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) +
    1642             :                                     nitem * sizeof(uint16));
    1643       25420 :                 vacposting->itup = itup;
    1644       25420 :                 vacposting->updatedoffset = idxoffnum;
    1645       25420 :                 vacposting->ndeletedtids = 0;
    1646             :             }
    1647       47382 :             vacposting->deletetids[vacposting->ndeletedtids++] = p;
    1648             :         }
    1649             : 
    1650             :         /* Final decision on itup, a posting list tuple */
    1651             : 
    1652       27736 :         if (vacposting == NULL)
    1653             :         {
    1654             :             /* No TIDs to delete from itup -- do nothing */
    1655             :         }
    1656       25420 :         else if (vacposting->ndeletedtids == nitem)
    1657             :         {
    1658             :             /* Straight delete of itup (to delete all TIDs) */
    1659       13120 :             deletable[ndeletable++] = idxoffnum;
    1660             :             /* Turns out we won't need granular information */
    1661       13120 :             pfree(vacposting);
    1662             :         }
    1663             :         else
    1664             :         {
    1665             :             /* Delete some (but not all) TIDs from itup */
    1666             :             Assert(vacposting->ndeletedtids > 0 &&
    1667             :                    vacposting->ndeletedtids < nitem);
    1668       12300 :             updatable[nupdatable++] = vacposting;
    1669             :         }
    1670             :     }
    1671             : 
    1672             :     /* Physically delete tuples (or TIDs) using deletable (or updatable) */
    1673        7682 :     _bt_delitems_delete(rel, buf, snapshotConflictHorizon, isCatalogRel,
    1674             :                         deletable, ndeletable, updatable, nupdatable);
    1675             : 
    1676             :     /* be tidy */
    1677       19982 :     for (int i = 0; i < nupdatable; i++)
    1678       12300 :         pfree(updatable[i]);
    1679             : }
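
The shape of the caller's delstate follows from the ordering rules described above. A minimal sketch of a simple-deletion caller (cf. _bt_simpledel_pass() in nbtinsert.c), with field names taken from TM_IndexDeleteOp in tableam.h; the entry-filling loop is elided:

    TM_IndexDeleteOp delstate;

    delstate.irel = rel;
    delstate.iblknum = BufferGetBlockNumber(buf);
    delstate.bottomup = false;          /* simple deletion, not bottom-up */
    delstate.bottomupfreespace = 0;
    delstate.ndeltids = 0;
    delstate.deltids = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexDelete));
    delstate.status = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexStatus));

    /*
     * Append entries in leaf-page-wise order, setting each entry's id to
     * its own array index so that the qsort on _bt_delitems_cmp above can
     * restore this order after the tableam is done with the array.
     */

    _bt_delitems_delete_check(rel, buf, heapRel, &delstate);
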
    1680             : 
    1681             : /*
    1682             :  * Check that leftsib page (the btpo_prev of target page) is not marked with
    1683             :  * INCOMPLETE_SPLIT flag.  Used during page deletion.
    1684             :  *
    1685             :  * Returning true indicates that page flag is set in leftsib (which is
    1686             :  * definitely still the left sibling of target).  When that happens, the
    1687             :  * target doesn't have a downlink in parent, and the page deletion algorithm
    1688             :  * isn't prepared to handle that.  Deletion of the target page (or the whole
    1689             :  * subtree that contains the target page) cannot take place.
    1690             :  *
    1691             :  * Caller should not have a lock on the target page itself, since pages on the
    1692             :  * same level must always be locked left to right to avoid deadlocks.
    1693             :  */
    1694             : static bool
    1695        5472 : _bt_leftsib_splitflag(Relation rel, BlockNumber leftsib, BlockNumber target)
    1696             : {
    1697             :     Buffer      buf;
    1698             :     Page        page;
    1699             :     BTPageOpaque opaque;
    1700             :     bool        result;
    1701             : 
    1702             :     /* Easy case: No left sibling */
    1703        5472 :     if (leftsib == P_NONE)
    1704        4456 :         return false;
    1705             : 
    1706        1016 :     buf = _bt_getbuf(rel, leftsib, BT_READ);
    1707        1016 :     page = BufferGetPage(buf);
    1708        1016 :     opaque = BTPageGetOpaque(page);
    1709             : 
    1710             :     /*
    1711             :      * If the left sibling was concurrently split, so that its next-pointer
    1712             :      * doesn't point to the current page anymore, the split that created
    1713             :      * target must be completed.  Caller can reasonably expect that there will
    1714             :      * be a downlink to the target page that it can relocate using its stack.
    1715             :      * (We don't allow splitting an incompletely split page again until the
    1716             :      * previous split has been completed.)
    1717             :      */
    1718        1016 :     result = (opaque->btpo_next == target && P_INCOMPLETE_SPLIT(opaque));
    1719        1016 :     _bt_relbuf(rel, buf);
    1720             : 
    1721        1016 :     return result;
    1722             : }
    1723             : 
    1724             : /*
    1725             :  * Check that leafrightsib page (the btpo_next of target leaf page) is not
    1726             :  * marked with ISHALFDEAD flag.  Used during page deletion.
    1727             :  *
    1728             :  * Returning true indicates that page flag is set in leafrightsib, so page
    1729             :  * deletion cannot go ahead.  Our caller is not prepared to deal with the case
     1730             :  * where the parent page does not have a pivot tuple whose downlink points to
    1731             :  * leafrightsib (due to an earlier interrupted VACUUM operation).  It doesn't
    1732             :  * seem worth going to the trouble of teaching our caller to deal with it.
    1733             :  * The situation will be resolved after VACUUM finishes the deletion of the
    1734             :  * half-dead page (when a future VACUUM operation reaches the target page
    1735             :  * again).
    1736             :  *
    1737             :  * _bt_leftsib_splitflag() is called for both leaf pages and internal pages.
    1738             :  * _bt_rightsib_halfdeadflag() is only called for leaf pages, though.  This is
    1739             :  * okay because of the restriction on deleting pages that are the rightmost
    1740             :  * page of their parent (i.e. that such deletions can only take place when the
    1741             :  * entire subtree must be deleted).  The leaf level check made here will apply
    1742             :  * to a right "cousin" leaf page rather than a simple right sibling leaf page
    1743             :  * in cases where caller actually goes on to attempt deleting pages that are
    1744             :  * above the leaf page.  The right cousin leaf page is representative of the
    1745             :  * left edge of the subtree to the right of the to-be-deleted subtree as a
    1746             :  * whole, which is exactly the condition that our caller cares about.
    1747             :  * (Besides, internal pages are never marked half-dead, so it isn't even
    1748             :  * possible to _directly_ assess if an internal page is part of some other
    1749             :  * to-be-deleted subtree.)
    1750             :  */
    1751             : static bool
    1752        5320 : _bt_rightsib_halfdeadflag(Relation rel, BlockNumber leafrightsib)
    1753             : {
    1754             :     Buffer      buf;
    1755             :     Page        page;
    1756             :     BTPageOpaque opaque;
    1757             :     bool        result;
    1758             : 
    1759             :     Assert(leafrightsib != P_NONE);
    1760             : 
    1761        5320 :     buf = _bt_getbuf(rel, leafrightsib, BT_READ);
    1762        5320 :     page = BufferGetPage(buf);
    1763        5320 :     opaque = BTPageGetOpaque(page);
    1764             : 
    1765             :     Assert(P_ISLEAF(opaque) && !P_ISDELETED(opaque));
    1766        5320 :     result = P_ISHALFDEAD(opaque);
    1767        5320 :     _bt_relbuf(rel, buf);
    1768             : 
    1769        5320 :     return result;
    1770             : }
    1771             : 
    1772             : /*
    1773             :  * _bt_pagedel() -- Delete a leaf page from the b-tree, if legal to do so.
    1774             :  *
    1775             :  * This action unlinks the leaf page from the b-tree structure, removing all
    1776             :  * pointers leading to it --- but not touching its own left and right links.
    1777             :  * The page cannot be physically reclaimed right away, since other processes
    1778             :  * may currently be trying to follow links leading to the page; they have to
    1779             :  * be allowed to use its right-link to recover.  See nbtree/README.
    1780             :  *
    1781             :  * On entry, the target buffer must be pinned and locked (either read or write
    1782             :  * lock is OK).  The page must be an empty leaf page, which may be half-dead
    1783             :  * already (a half-dead page should only be passed to us when an earlier
    1784             :  * VACUUM operation was interrupted, though).  Note in particular that caller
    1785             :  * should never pass a buffer containing an existing deleted page here.  The
    1786             :  * lock and pin on caller's buffer will be dropped before we return.
    1787             :  *
    1788             :  * Maintains bulk delete stats for caller, which are taken from vstate.  We
    1789             :  * need to cooperate closely with caller here so that whole VACUUM operation
    1790             :  * reliably avoids any double counting of subsidiary-to-leafbuf pages that we
    1791             :  * delete in passing.  If such pages happen to be from a block number that is
    1792             :  * ahead of the current scanblkno position, then caller is expected to count
    1793             :  * them directly later on.  It's simpler for us to understand caller's
    1794             :  * requirements than it would be for caller to understand when or how a
    1795             :  * deleted page became deleted after the fact.
    1796             :  *
    1797             :  * NOTE: this leaks memory.  Rather than trying to clean up everything
    1798             :  * carefully, it's better to run it in a temp context that can be reset
    1799             :  * frequently.
    1800             :  */
    1801             : void
    1802        5474 : _bt_pagedel(Relation rel, Buffer leafbuf, BTVacState *vstate)
    1803             : {
    1804             :     BlockNumber rightsib;
    1805             :     bool        rightsib_empty;
    1806             :     Page        page;
    1807             :     BTPageOpaque opaque;
    1808             : 
    1809             :     /*
    1810             :      * Save original leafbuf block number from caller.  Only deleted blocks
    1811             :      * that are <= scanblkno are added to bulk delete stat's pages_deleted
    1812             :      * count.
    1813             :      */
    1814        5474 :     BlockNumber scanblkno = BufferGetBlockNumber(leafbuf);
    1815             : 
    1816             :     /*
    1817             :      * "stack" is a search stack leading (approximately) to the target page.
    1818             :      * It is initially NULL, but when iterating, we keep it to avoid
    1819             :      * duplicated search effort.
    1820             :      *
    1821             :      * Also, when "stack" is not NULL, we have already checked that the
    1822             :      * current page is not the right half of an incomplete split, i.e. the
    1823             :      * left sibling does not have its INCOMPLETE_SPLIT flag set, including
    1824             :      * when the current target page is to the right of caller's initial page
    1825             :      * (the scanblkno page).
    1826             :      */
    1827        5474 :     BTStack     stack = NULL;
    1828             : 
    1829             :     for (;;)
    1830             :     {
    1831       10794 :         page = BufferGetPage(leafbuf);
    1832       10794 :         opaque = BTPageGetOpaque(page);
    1833             : 
    1834             :         /*
    1835             :          * Internal pages are never deleted directly, only as part of deleting
    1836             :          * the whole subtree all the way down to leaf level.
    1837             :          *
    1838             :          * Also check for deleted pages here.  Caller never passes us a fully
    1839             :          * deleted page.  Only VACUUM can delete pages, so there can't have
    1840             :          * been a concurrent deletion.  Assume that we reached any deleted
    1841             :          * page encountered here by following a sibling link, and that the
    1842             :          * index is corrupt.
    1843             :          */
    1844             :         Assert(!P_ISDELETED(opaque));
    1845       10794 :         if (!P_ISLEAF(opaque) || P_ISDELETED(opaque))
    1846             :         {
    1847             :             /*
    1848             :              * Pre-9.4 page deletion only marked internal pages as half-dead,
    1849             :              * but now we only use that flag on leaf pages. The old algorithm
    1850             :              * was never supposed to leave half-dead pages in the tree, it was
    1851             :              * just a transient state, but it was nevertheless possible in
    1852             :              * error scenarios. We don't know how to deal with them here. They
     1853             :              * are harmless as far as searches are concerned, but inserts
    1854             :              * into the deleted keyspace could add out-of-order downlinks in
    1855             :              * the upper levels. Log a notice, hopefully the admin will notice
    1856             :              * and reindex.
    1857             :              */
    1858           0 :             if (P_ISHALFDEAD(opaque))
    1859           0 :                 ereport(LOG,
    1860             :                         (errcode(ERRCODE_INDEX_CORRUPTED),
    1861             :                          errmsg("index \"%s\" contains a half-dead internal page",
    1862             :                                 RelationGetRelationName(rel)),
    1863             :                          errhint("This can be caused by an interrupted VACUUM in version 9.3 or older, before upgrade. Please REINDEX it.")));
    1864             : 
    1865           0 :             if (P_ISDELETED(opaque))
    1866           0 :                 ereport(LOG,
    1867             :                         (errcode(ERRCODE_INDEX_CORRUPTED),
    1868             :                          errmsg_internal("found deleted block %u while following right link from block %u in index \"%s\"",
    1869             :                                          BufferGetBlockNumber(leafbuf),
    1870             :                                          scanblkno,
    1871             :                                          RelationGetRelationName(rel))));
    1872             : 
    1873           0 :             _bt_relbuf(rel, leafbuf);
    1874         166 :             return;
    1875             :         }
    1876             : 
    1877             :         /*
     1878             :          * We can never delete rightmost pages or root pages.  While at it,
     1879             :          * check that the page is empty, since it's possible that the leafbuf
     1880             :          * page was empty a moment ago but has since had some inserts.
    1881             :          *
    1882             :          * To keep the algorithm simple, we also never delete an incompletely
    1883             :          * split page (they should be rare enough that this doesn't make any
    1884             :          * meaningful difference to disk usage):
    1885             :          *
    1886             :          * The INCOMPLETE_SPLIT flag on the page tells us if the page is the
    1887             :          * left half of an incomplete split, but ensuring that it's not the
    1888             :          * right half is more complicated.  For that, we have to check that
    1889             :          * the left sibling doesn't have its INCOMPLETE_SPLIT flag set using
    1890             :          * _bt_leftsib_splitflag().  On the first iteration, we temporarily
    1891             :          * release the lock on scanblkno/leafbuf, check the left sibling, and
    1892             :          * construct a search stack to scanblkno.  On subsequent iterations,
    1893             :          * we know we stepped right from a page that passed these tests, so
    1894             :          * it's OK.
    1895             :          */
    1896       10794 :         if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) ||
    1897       10634 :             P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page) ||
    1898       10634 :             P_INCOMPLETE_SPLIT(opaque))
    1899             :         {
    1900             :             /* Should never fail to delete a half-dead page */
    1901             :             Assert(!P_ISHALFDEAD(opaque));
    1902             : 
    1903         160 :             _bt_relbuf(rel, leafbuf);
    1904         160 :             return;
    1905             :         }
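                      : 
                      :         /*
                      :          * Illustrative sketch of the bail-out test above: it is the
                      :          * negation of a single "still deletable" predicate.  No such
                      :          * helper exists in nbtree; the name is hypothetical:
                      :          *
                      :          *     static inline bool
                      :          *     bt_leaf_maybe_deletable(Page page, BTPageOpaque opaque)
                      :          *     {
                      :          *         return !P_RIGHTMOST(opaque) && !P_ISROOT(opaque) &&
                      :          *                P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page) &&
                      :          *                !P_INCOMPLETE_SPLIT(opaque);
                      :          *     }
                      :          */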
    1906             : 
    1907             :         /*
    1908             :          * First, remove downlink pointing to the page (or a parent of the
    1909             :          * page, if we are going to delete a taller subtree), and mark the
    1910             :          * leafbuf page half-dead
    1911             :          */
    1912       10634 :         if (!P_ISHALFDEAD(opaque))
    1913             :         {
    1914             :             /*
    1915             :              * We need an approximate pointer to the page's parent page.  We
    1916             :              * use a variant of the standard search mechanism to search for
    1917             :              * the page's high key; this will give us a link to either the
    1918             :              * current parent or someplace to its left (if there are multiple
    1919             :              * equal high keys, which is possible with !heapkeyspace indexes).
    1920             :              *
    1921             :              * Also check if this is the right-half of an incomplete split
    1922             :              * (see comment above).
    1923             :              */
    1924       10634 :             if (!stack)
    1925             :             {
    1926             :                 BTScanInsert itup_key;
    1927             :                 ItemId      itemid;
    1928             :                 IndexTuple  targetkey;
    1929             :                 BlockNumber leftsib,
    1930             :                             leafblkno;
    1931             :                 Buffer      sleafbuf;
    1932             : 
    1933        5314 :                 itemid = PageGetItemId(page, P_HIKEY);
    1934        5314 :                 targetkey = CopyIndexTuple((IndexTuple) PageGetItem(page, itemid));
    1935             : 
    1936        5314 :                 leftsib = opaque->btpo_prev;
    1937        5314 :                 leafblkno = BufferGetBlockNumber(leafbuf);
    1938             : 
    1939             :                 /*
    1940             :                  * To avoid deadlocks, we'd better drop the leaf page lock
    1941             :                  * before going further.
    1942             :                  */
    1943        5314 :                 _bt_unlockbuf(rel, leafbuf);
    1944             : 
    1945             :                 /*
    1946             :                  * Check that the left sibling of leafbuf (if any) is not
    1947             :                  * marked with INCOMPLETE_SPLIT flag before proceeding
    1948             :                  */
    1949             :                 Assert(leafblkno == scanblkno);
    1950        5314 :                 if (_bt_leftsib_splitflag(rel, leftsib, leafblkno))
    1951             :                 {
    1952           0 :                     ReleaseBuffer(leafbuf);
    1953           0 :                     return;
    1954             :                 }
    1955             : 
    1956             :                 /*
    1957             :                  * We need an insertion scan key, so build one.
    1958             :                  *
    1959             :                  * _bt_search searches for the leaf page that contains any
    1960             :                  * matching non-pivot tuples, but we need it to "search" for
    1961             :                  * the high key pivot from the page that we're set to delete.
     1962             :                  * Compensate for the mismatch by having _bt_search locate the
     1963             :                  * last position before non-pivots equal to the untruncated prefix.
    1964             :                  */
    1965        5314 :                 itup_key = _bt_mkscankey(rel, targetkey);
    1966             : 
    1967             :                 /* Set up a BTLessStrategyNumber-like insertion scan key */
    1968        5314 :                 itup_key->nextkey = false;
    1969        5314 :                 itup_key->backward = true;
    1970        5314 :                 stack = _bt_search(rel, NULL, itup_key, &sleafbuf, BT_READ);
    1971             :                 /* won't need a second lock or pin on leafbuf */
    1972        5314 :                 _bt_relbuf(rel, sleafbuf);
    1973             : 
    1974             :                 /*
    1975             :                  * Re-lock the leaf page, and start over to use our stack
    1976             :                  * within _bt_mark_page_halfdead.  We must do it that way
    1977             :                  * because it's possible that leafbuf can no longer be
    1978             :                  * deleted.  We need to recheck.
    1979             :                  *
    1980             :                  * Note: We can't simply hold on to the sleafbuf lock instead,
    1981             :                  * because it's barely possible that sleafbuf is not the same
    1982             :                  * page as leafbuf.  This happens when leafbuf split after our
    1983             :                  * original lock was dropped, but before _bt_search finished
    1984             :                  * its descent.  We rely on the assumption that we'll find
    1985             :                  * leafbuf isn't safe to delete anymore in this scenario.
    1986             :                  * (Page deletion can cope with the stack being to the left of
    1987             :                  * leafbuf, but not to the right of leafbuf.)
    1988             :                  */
    1989        5314 :                 _bt_lockbuf(rel, leafbuf, BT_WRITE);
    1990        5314 :                 continue;
    1991             :             }
    1992             : 
    1993             :             /*
    1994             :              * See if it's safe to delete the leaf page, and determine how
    1995             :              * many parent/internal pages above the leaf level will be
    1996             :              * deleted.  If it's safe then _bt_mark_page_halfdead will also
    1997             :              * perform the first phase of deletion, which includes marking the
    1998             :              * leafbuf page half-dead.
    1999             :              */
    2000             :             Assert(P_ISLEAF(opaque) && !P_IGNORE(opaque));
    2001        5320 :             if (!_bt_mark_page_halfdead(rel, vstate->info->heaprel, leafbuf,
    2002             :                                         stack))
    2003             :             {
    2004           6 :                 _bt_relbuf(rel, leafbuf);
    2005           6 :                 return;
    2006             :             }
    2007             :         }
    2008             : 
    2009             :         /*
    2010             :          * Then unlink it from its siblings.  Each call to
    2011             :          * _bt_unlink_halfdead_page unlinks the topmost page from the subtree,
    2012             :          * making it shallower.  Iterate until the leafbuf page is deleted.
    2013             :          */
    2014        5314 :         rightsib_empty = false;
    2015             :         Assert(P_ISLEAF(opaque) && P_ISHALFDEAD(opaque));
    2016       10772 :         while (P_ISHALFDEAD(opaque))
    2017             :         {
    2018             :             /* Check for interrupts in _bt_unlink_halfdead_page */
    2019        5458 :             if (!_bt_unlink_halfdead_page(rel, leafbuf, scanblkno,
    2020             :                                           &rightsib_empty, vstate))
    2021             :             {
    2022             :                 /*
    2023             :                  * _bt_unlink_halfdead_page should never fail, since we
    2024             :                  * established that deletion is generally safe in
    2025             :                  * _bt_mark_page_halfdead -- index must be corrupt.
    2026             :                  *
    2027             :                  * Note that _bt_unlink_halfdead_page already released the
    2028             :                  * lock and pin on leafbuf for us.
    2029             :                  */
    2030             :                 Assert(false);
    2031           0 :                 return;
    2032             :             }
    2033             :         }
    2034             : 
    2035             :         Assert(P_ISLEAF(opaque) && P_ISDELETED(opaque));
    2036             : 
    2037        5314 :         rightsib = opaque->btpo_next;
    2038             : 
    2039        5314 :         _bt_relbuf(rel, leafbuf);
    2040             : 
    2041             :         /*
    2042             :          * Check here, as calling loops will have locks held, preventing
    2043             :          * interrupts from being processed.
    2044             :          */
    2045        5314 :         CHECK_FOR_INTERRUPTS();
    2046             : 
    2047             :         /*
     2048             :          * The page has now been deleted.  If its right sibling is completely
     2049             :          * empty, it's possible that the reason we haven't deleted it earlier
     2050             :          * is that it was the rightmost child of the parent.  Now that we have
     2051             :          * removed the downlink for this page, the right sibling might now be
     2052             :          * the only child of the parent, and could be removed.  It would be
     2053             :          * picked up by the next vacuum anyway, but we might as well try to
     2054             :          * remove it now, so loop back to process the right sibling.
    2055             :          *
    2056             :          * Note: This relies on the assumption that _bt_getstackbuf() will be
    2057             :          * able to reuse our original descent stack with a different child
    2058             :          * block (provided that the child block is to the right of the
    2059             :          * original leaf page reached by _bt_search()). It will even update
    2060             :          * the descent stack each time we loop around, avoiding repeated work.
    2061             :          */
    2062        5314 :         if (!rightsib_empty)
    2063        5308 :             break;
    2064             : 
    2065           6 :         leafbuf = _bt_getbuf(rel, rightsib, BT_WRITE);
    2066             :     }
    2067             : }
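                      : 
                      : /*
                      :  * Illustrative recap of the loop above (not part of the original
                      :  * source), for the common single-page case: _bt_mark_page_halfdead()
                      :  * removes the parent's downlink and marks leafbuf half-dead, and a
                      :  * single _bt_unlink_halfdead_page() call then detaches leafbuf from
                      :  * its siblings and marks it fully deleted.  For a taller subtree,
                      :  * each unlink call deletes the then-topmost page, and the final call
                      :  * deletes leafbuf itself.
                      :  */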
    2068             : 
    2069             : /*
    2070             :  * First stage of page deletion.
    2071             :  *
    2072             :  * Establish the height of the to-be-deleted subtree with leafbuf at its
    2073             :  * lowest level, remove the downlink to the subtree, and mark leafbuf
    2074             :  * half-dead.  The final to-be-deleted subtree is usually just leafbuf itself,
    2075             :  * but may include additional internal pages (at most one per level of the
    2076             :  * tree below the root).
    2077             :  *
    2078             :  * Caller must pass a valid heaprel, since it's just about possible that our
    2079             :  * call to _bt_lock_subtree_parent will need to allocate a new index page to
    2080             :  * complete a page split.  Every call to _bt_allocbuf needs to pass a heaprel.
    2081             :  *
    2082             :  * Returns 'false' if leafbuf is unsafe to delete, usually because leafbuf is
    2083             :  * the rightmost child of its parent (and parent has more than one downlink).
    2084             :  * Returns 'true' when the first stage of page deletion completed
    2085             :  * successfully.
    2086             :  */
    2087             : static bool
    2088        5320 : _bt_mark_page_halfdead(Relation rel, Relation heaprel, Buffer leafbuf,
    2089             :                        BTStack stack)
    2090             : {
    2091             :     BlockNumber leafblkno;
    2092             :     BlockNumber leafrightsib;
    2093             :     BlockNumber topparent;
    2094             :     BlockNumber topparentrightsib;
    2095             :     ItemId      itemid;
    2096             :     Page        page;
    2097             :     BTPageOpaque opaque;
    2098             :     Buffer      subtreeparent;
    2099             :     OffsetNumber poffset;
    2100             :     OffsetNumber nextoffset;
    2101             :     IndexTuple  itup;
    2102             :     IndexTupleData trunctuple;
    2103             : 
    2104        5320 :     page = BufferGetPage(leafbuf);
    2105        5320 :     opaque = BTPageGetOpaque(page);
    2106             : 
    2107             :     Assert(!P_RIGHTMOST(opaque) && !P_ISROOT(opaque) &&
    2108             :            P_ISLEAF(opaque) && !P_IGNORE(opaque) &&
    2109             :            P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page));
    2110             :     Assert(heaprel != NULL);
    2111             : 
    2112             :     /*
    2113             :      * Save info about the leaf page.
    2114             :      */
    2115        5320 :     leafblkno = BufferGetBlockNumber(leafbuf);
    2116        5320 :     leafrightsib = opaque->btpo_next;
    2117             : 
    2118             :     /*
    2119             :      * Before attempting to lock the parent page, check that the right sibling
    2120             :      * is not in half-dead state.  A half-dead right sibling would have no
    2121             :      * downlink in the parent, which would be highly confusing later when we
    2122             :      * delete the downlink.  It would fail the "right sibling of target page
    2123             :      * is also the next child in parent page" cross-check below.
    2124             :      */
    2125        5320 :     if (_bt_rightsib_halfdeadflag(rel, leafrightsib))
    2126             :     {
    2127           0 :         elog(DEBUG1, "could not delete page %u because its right sibling %u is half-dead",
    2128             :              leafblkno, leafrightsib);
    2129           0 :         return false;
    2130             :     }
    2131             : 
    2132             :     /*
    2133             :      * We cannot delete a page that is the rightmost child of its immediate
    2134             :      * parent, unless it is the only child --- in which case the parent has to
    2135             :      * be deleted too, and the same condition applies recursively to it. We
    2136             :      * have to check this condition all the way up before trying to delete,
    2137             :      * and lock the parent of the root of the to-be-deleted subtree (the
    2138             :      * "subtree parent").  _bt_lock_subtree_parent() locks the subtree parent
    2139             :      * for us.  We remove the downlink to the "top parent" page (subtree root
    2140             :      * page) from the subtree parent page below.
    2141             :      *
    2142             :      * Initialize topparent to be leafbuf page now.  The final to-be-deleted
    2143             :      * subtree is often a degenerate one page subtree consisting only of the
    2144             :      * leafbuf page.  When that happens, the leafbuf page is the final subtree
    2145             :      * root page/top parent page.
    2146             :      */
    2147        5320 :     topparent = leafblkno;
    2148        5320 :     topparentrightsib = leafrightsib;
    2149        5320 :     if (!_bt_lock_subtree_parent(rel, heaprel, leafblkno, stack,
    2150             :                                  &subtreeparent, &poffset,
    2151             :                                  &topparent, &topparentrightsib))
    2152           6 :         return false;
    2153             : 
    2154        5314 :     page = BufferGetPage(subtreeparent);
    2155        5314 :     opaque = BTPageGetOpaque(page);
    2156             : 
    2157             : #ifdef USE_ASSERT_CHECKING
    2158             : 
    2159             :     /*
    2160             :      * This is just an assertion because _bt_lock_subtree_parent should have
     2161             :      * guaranteed that the tuple has the expected contents.
    2162             :      */
    2163             :     itemid = PageGetItemId(page, poffset);
    2164             :     itup = (IndexTuple) PageGetItem(page, itemid);
    2165             :     Assert(BTreeTupleGetDownLink(itup) == topparent);
    2166             : #endif
    2167             : 
    2168        5314 :     nextoffset = OffsetNumberNext(poffset);
    2169        5314 :     itemid = PageGetItemId(page, nextoffset);
    2170        5314 :     itup = (IndexTuple) PageGetItem(page, itemid);
    2171             : 
    2172             :     /*
    2173             :      * Check that the parent-page index items we're about to delete/overwrite
     2174             :      * in the subtree parent page contain what we expect.  This can fail if the
    2175             :      * index has become corrupt for some reason.  When that happens we back
    2176             :      * out of deletion of the leafbuf subtree.  (This is just like the case
    2177             :      * where _bt_lock_subtree_parent() cannot "re-find" leafbuf's downlink.)
    2178             :      */
    2179        5314 :     if (BTreeTupleGetDownLink(itup) != topparentrightsib)
    2180             :     {
    2181           0 :         ereport(LOG,
    2182             :                 (errcode(ERRCODE_INDEX_CORRUPTED),
    2183             :                  errmsg_internal("right sibling %u of block %u is not next child %u of block %u in index \"%s\"",
    2184             :                                  topparentrightsib, topparent,
    2185             :                                  BTreeTupleGetDownLink(itup),
    2186             :                                  BufferGetBlockNumber(subtreeparent),
    2187             :                                  RelationGetRelationName(rel))));
    2188             : 
    2189           0 :         _bt_relbuf(rel, subtreeparent);
    2190             :         Assert(false);
    2191           0 :         return false;
    2192             :     }
    2193             : 
    2194             :     /*
    2195             :      * Any insert which would have gone on the leaf block will now go to its
    2196             :      * right sibling.  In other words, the key space moves right.
    2197             :      */
    2198        5314 :     PredicateLockPageCombine(rel, leafblkno, leafrightsib);
    2199             : 
    2200             :     /* No ereport(ERROR) until changes are logged */
    2201        5314 :     START_CRIT_SECTION();
    2202             : 
    2203             :     /*
    2204             :      * Update parent of subtree.  We want to delete the downlink to the top
    2205             :      * parent page/root of the subtree, and the *following* key.  Easiest way
    2206             :      * is to copy the right sibling's downlink over the downlink that points
    2207             :      * to top parent page, and then delete the right sibling's original pivot
    2208             :      * tuple.
    2209             :      *
    2210             :      * Lanin and Shasha make the key space move left when deleting a page,
    2211             :      * whereas the key space moves right here.  That's why we cannot simply
    2212             :      * delete the pivot tuple with the downlink to the top parent page.  See
    2213             :      * nbtree/README.
    2214             :      */
    2215        5314 :     page = BufferGetPage(subtreeparent);
    2216        5314 :     opaque = BTPageGetOpaque(page);
    2217             : 
    2218        5314 :     itemid = PageGetItemId(page, poffset);
    2219        5314 :     itup = (IndexTuple) PageGetItem(page, itemid);
    2220        5314 :     BTreeTupleSetDownLink(itup, topparentrightsib);
    2221             : 
    2222        5314 :     nextoffset = OffsetNumberNext(poffset);
    2223        5314 :     PageIndexTupleDelete(page, nextoffset);
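                      : 
                      :     /*
                      :      * Worked example with hypothetical block numbers (not from the
                      :      * original source): suppose the subtree parent held pivots
                      :      * (..., k1 -> 4, k2 -> 5, ...), where block 4 is the top parent
                      :      * being deleted and block 5 is its right sibling.  After the two
                      :      * steps above the parent holds (..., k1 -> 5, ...): k1 keeps its
                      :      * separator key but now points to block 5, and k2's pivot tuple
                      :      * is gone.  Block 4's key space has moved right into block 5.
                      :      */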
    2224             : 
    2225             :     /*
    2226             :      * Mark the leaf page as half-dead, and stamp it with a link to the top
    2227             :      * parent page.  When the leaf page is also the top parent page, the link
    2228             :      * is set to InvalidBlockNumber.
    2229             :      */
    2230        5314 :     page = BufferGetPage(leafbuf);
    2231        5314 :     opaque = BTPageGetOpaque(page);
    2232        5314 :     opaque->btpo_flags |= BTP_HALF_DEAD;
    2233             : 
    2234             :     Assert(PageGetMaxOffsetNumber(page) == P_HIKEY);
    2235        5314 :     MemSet(&trunctuple, 0, sizeof(IndexTupleData));
    2236        5314 :     trunctuple.t_info = sizeof(IndexTupleData);
    2237        5314 :     if (topparent != leafblkno)
    2238          76 :         BTreeTupleSetTopParent(&trunctuple, topparent);
    2239             :     else
    2240        5238 :         BTreeTupleSetTopParent(&trunctuple, InvalidBlockNumber);
    2241             : 
    2242        5314 :     if (!PageIndexTupleOverwrite(page, P_HIKEY, (Item) &trunctuple,
    2243        5314 :                                  IndexTupleSize(&trunctuple)))
    2244           0 :         elog(ERROR, "could not overwrite high key in half-dead page");
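                      : 
                      :     /*
                      :      * Illustrative sketch (not in the original source) of the leaf
                      :      * page state at this point: the page is empty except for the
                      :      * minimal trunctuple high key, BTP_HALF_DEAD is set, and the high
                      :      * key's top-parent link records the subtree root (or nothing when
                      :      * leafbuf is the whole subtree).  _bt_unlink_halfdead_page() later
                      :      * reads the link back with BTreeTupleGetTopParent() to decide
                      :      * which page to unlink next.
                      :      */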
    2245             : 
    2246             :     /* Must mark buffers dirty before XLogInsert */
    2247        5314 :     MarkBufferDirty(subtreeparent);
    2248        5314 :     MarkBufferDirty(leafbuf);
    2249             : 
    2250             :     /* XLOG stuff */
    2251        5314 :     if (RelationNeedsWAL(rel))
    2252             :     {
    2253             :         xl_btree_mark_page_halfdead xlrec;
    2254             :         XLogRecPtr  recptr;
    2255             : 
    2256        5314 :         xlrec.poffset = poffset;
    2257        5314 :         xlrec.leafblk = leafblkno;
    2258        5314 :         if (topparent != leafblkno)
    2259          76 :             xlrec.topparent = topparent;
    2260             :         else
    2261        5238 :             xlrec.topparent = InvalidBlockNumber;
    2262             : 
    2263        5314 :         XLogBeginInsert();
    2264        5314 :         XLogRegisterBuffer(0, leafbuf, REGBUF_WILL_INIT);
    2265        5314 :         XLogRegisterBuffer(1, subtreeparent, REGBUF_STANDARD);
    2266             : 
    2267        5314 :         page = BufferGetPage(leafbuf);
    2268        5314 :         opaque = BTPageGetOpaque(page);
    2269        5314 :         xlrec.leftblk = opaque->btpo_prev;
    2270        5314 :         xlrec.rightblk = opaque->btpo_next;
    2271             : 
    2272        5314 :         XLogRegisterData((char *) &xlrec, SizeOfBtreeMarkPageHalfDead);
    2273             : 
    2274        5314 :         recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_MARK_PAGE_HALFDEAD);
    2275             : 
    2276        5314 :         page = BufferGetPage(subtreeparent);
    2277        5314 :         PageSetLSN(page, recptr);
    2278        5314 :         page = BufferGetPage(leafbuf);
    2279        5314 :         PageSetLSN(page, recptr);
    2280             :     }
    2281             : 
    2282        5314 :     END_CRIT_SECTION();
    2283             : 
    2284        5314 :     _bt_relbuf(rel, subtreeparent);
    2285        5314 :     return true;
    2286             : }
    2287             : 
    2288             : /*
    2289             :  * Second stage of page deletion.
    2290             :  *
    2291             :  * Unlinks a single page (in the subtree undergoing deletion) from its
    2292             :  * siblings.  Also marks the page deleted.
    2293             :  *
    2294             :  * To get rid of the whole subtree, including the leaf page itself, call here
    2295             :  * until the leaf page is deleted.  The original "top parent" established in
    2296             :  * the first stage of deletion is deleted in the first call here, while the
    2297             :  * leaf page is deleted in the last call here.  Note that the leaf page itself
    2298             :  * is often the initial top parent page.
    2299             :  *
    2300             :  * Returns 'false' if the page could not be unlinked (shouldn't happen).  If
    2301             :  * the right sibling of the current target page is empty, *rightsib_empty is
    2302             :  * set to true, allowing caller to delete the target's right sibling page in
    2303             :  * passing.  Note that *rightsib_empty is only actually used by caller when
    2304             :  * target page is leafbuf, following last call here for leafbuf/the subtree
    2305             :  * containing leafbuf.  (We always set *rightsib_empty for caller, just to be
    2306             :  * consistent.)
    2307             :  *
    2308             :  * Must hold pin and lock on leafbuf at entry (read or write doesn't matter).
    2309             :  * On success exit, we'll be holding pin and write lock.  On failure exit,
    2310             :  * we'll release both pin and lock before returning (we define it that way
    2311             :  * to avoid having to reacquire a lock we already released).
    2312             :  */
    2313             : static bool
    2314        5458 : _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
    2315             :                          bool *rightsib_empty, BTVacState *vstate)
    2316             : {
    2317        5458 :     BlockNumber leafblkno = BufferGetBlockNumber(leafbuf);
    2318        5458 :     IndexBulkDeleteResult *stats = vstate->stats;
    2319             :     BlockNumber leafleftsib;
    2320             :     BlockNumber leafrightsib;
    2321             :     BlockNumber target;
    2322             :     BlockNumber leftsib;
    2323             :     BlockNumber rightsib;
    2324        5458 :     Buffer      lbuf = InvalidBuffer;
    2325             :     Buffer      buf;
    2326             :     Buffer      rbuf;
    2327        5458 :     Buffer      metabuf = InvalidBuffer;
    2328        5458 :     Page        metapg = NULL;
    2329        5458 :     BTMetaPageData *metad = NULL;
    2330             :     ItemId      itemid;
    2331             :     Page        page;
    2332             :     BTPageOpaque opaque;
    2333             :     FullTransactionId safexid;
    2334             :     bool        rightsib_is_rightmost;
    2335             :     uint32      targetlevel;
    2336             :     IndexTuple  leafhikey;
    2337             :     BlockNumber leaftopparent;
    2338             : 
    2339        5458 :     page = BufferGetPage(leafbuf);
    2340        5458 :     opaque = BTPageGetOpaque(page);
    2341             : 
    2342             :     Assert(P_ISLEAF(opaque) && !P_ISDELETED(opaque) && P_ISHALFDEAD(opaque));
    2343             : 
    2344             :     /*
    2345             :      * Remember some information about the leaf page.
    2346             :      */
    2347        5458 :     itemid = PageGetItemId(page, P_HIKEY);
    2348        5458 :     leafhikey = (IndexTuple) PageGetItem(page, itemid);
    2349        5458 :     target = BTreeTupleGetTopParent(leafhikey);
    2350        5458 :     leafleftsib = opaque->btpo_prev;
    2351        5458 :     leafrightsib = opaque->btpo_next;
    2352             : 
    2353        5458 :     _bt_unlockbuf(rel, leafbuf);
    2354             : 
    2355             :     /*
    2356             :      * Check here, as calling loops will have locks held, preventing
    2357             :      * interrupts from being processed.
    2358             :      */
    2359        5458 :     CHECK_FOR_INTERRUPTS();
    2360             : 
    2361             :     /* Unlink the current top parent of the subtree */
    2362        5458 :     if (!BlockNumberIsValid(target))
    2363             :     {
    2364             :         /* Target is leaf page (or leaf page is top parent, if you prefer) */
    2365        5314 :         target = leafblkno;
    2366             : 
    2367        5314 :         buf = leafbuf;
    2368        5314 :         leftsib = leafleftsib;
    2369        5314 :         targetlevel = 0;
    2370             :     }
    2371             :     else
    2372             :     {
    2373             :         /* Target is the internal page taken from leaf's top parent link */
    2374             :         Assert(target != leafblkno);
    2375             : 
    2376             :         /* Fetch the block number of the target's left sibling */
    2377         144 :         buf = _bt_getbuf(rel, target, BT_READ);
    2378         144 :         page = BufferGetPage(buf);
    2379         144 :         opaque = BTPageGetOpaque(page);
    2380         144 :         leftsib = opaque->btpo_prev;
    2381         144 :         targetlevel = opaque->btpo_level;
    2382             :         Assert(targetlevel > 0);
    2383             : 
    2384             :         /*
    2385             :          * To avoid deadlocks, we'd better drop the target page lock before
    2386             :          * going further.
    2387             :          */
    2388         144 :         _bt_unlockbuf(rel, buf);
    2389             :     }
    2390             : 
    2391             :     /*
    2392             :      * We have to lock the pages we need to modify in the standard order:
    2393             :      * moving right, then up.  Else we will deadlock against other writers.
    2394             :      *
    2395             :      * So, first lock the leaf page, if it's not the target.  Then find and
    2396             :      * write-lock the current left sibling of the target page.  The sibling
    2397             :      * that was current a moment ago could have split, so we may have to move
    2398             :      * right.
    2399             :      */
    2400        5458 :     if (target != leafblkno)
    2401         144 :         _bt_lockbuf(rel, leafbuf, BT_WRITE);
    2402        5458 :     if (leftsib != P_NONE)
    2403             :     {
    2404         996 :         lbuf = _bt_getbuf(rel, leftsib, BT_WRITE);
    2405         996 :         page = BufferGetPage(lbuf);
    2406         996 :         opaque = BTPageGetOpaque(page);
    2407         996 :         while (P_ISDELETED(opaque) || opaque->btpo_next != target)
    2408             :         {
    2409           0 :             bool        leftsibvalid = true;
    2410             : 
    2411             :             /*
    2412             :              * Before we follow the link from the page that was the left
    2413             :              * sibling mere moments ago, validate its right link.  This
     2414             :              * reduces the opportunities for the loop to fail to ever make any
    2415             :              * progress in the presence of index corruption.
    2416             :              *
    2417             :              * Note: we rely on the assumption that there can only be one
    2418             :              * vacuum process running at a time (against the same index).
    2419             :              */
    2420           0 :             if (P_RIGHTMOST(opaque) || P_ISDELETED(opaque) ||
    2421           0 :                 leftsib == opaque->btpo_next)
    2422           0 :                 leftsibvalid = false;
    2423             : 
    2424           0 :             leftsib = opaque->btpo_next;
    2425           0 :             _bt_relbuf(rel, lbuf);
    2426             : 
    2427           0 :             if (!leftsibvalid)
    2428             :             {
    2429             :                 /*
    2430             :                  * This is known to fail in the field; sibling link corruption
    2431             :                  * is relatively common.  Press on with vacuuming rather than
    2432             :                  * just throwing an ERROR.
    2433             :                  */
    2434           0 :                 ereport(LOG,
    2435             :                         (errcode(ERRCODE_INDEX_CORRUPTED),
    2436             :                          errmsg_internal("valid left sibling for deletion target could not be located: "
    2437             :                                          "left sibling %u of target %u with leafblkno %u and scanblkno %u on level %u of index \"%s\"",
    2438             :                                          leftsib, target, leafblkno, scanblkno,
    2439             :                                          targetlevel, RelationGetRelationName(rel))));
    2440             : 
    2441             :                 /* Must release all pins and locks on failure exit */
    2442           0 :                 ReleaseBuffer(buf);
    2443           0 :                 if (target != leafblkno)
    2444           0 :                     _bt_relbuf(rel, leafbuf);
    2445             : 
    2446           0 :                 return false;
    2447             :             }
    2448             : 
    2449           0 :             CHECK_FOR_INTERRUPTS();
    2450             : 
    2451             :             /* step right one page */
    2452           0 :             lbuf = _bt_getbuf(rel, leftsib, BT_WRITE);
    2453           0 :             page = BufferGetPage(lbuf);
    2454           0 :             opaque = BTPageGetOpaque(page);
    2455             :         }
    2456             :     }
    2457             :     else
    2458        4462 :         lbuf = InvalidBuffer;
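                      : 
                      :     /*
                      :      * Illustrative summary (not in the original source) of the lock
                      :      * acquisition order used above and below, for target T with left
                      :      * sibling L and right sibling R:
                      :      *
                      :      *     leafbuf (when distinct from T) -> L -> T -> R -> metapage
                      :      *
                      :      * i.e. always left to right, then up, per the deadlock-avoidance
                      :      * rule above.  L is reached by moving right from its last known
                      :      * position, since it may have split in the meantime.
                      :      */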
    2459             : 
    2460             :     /* Next write-lock the target page itself */
    2461        5458 :     _bt_lockbuf(rel, buf, BT_WRITE);
    2462        5458 :     page = BufferGetPage(buf);
    2463        5458 :     opaque = BTPageGetOpaque(page);
    2464             : 
    2465             :     /*
     2466             :      * Check that the page is still empty etc., else abandon deletion.  This
     2467             :      * is just for paranoia's sake; a half-dead page cannot be resurrected,
     2468             :      * because there can be only one vacuum process running at a time.
    2469             :      */
    2470        5458 :     if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque))
    2471           0 :         elog(ERROR, "target page changed status unexpectedly in block %u of index \"%s\"",
    2472             :              target, RelationGetRelationName(rel));
    2473             : 
    2474        5458 :     if (opaque->btpo_prev != leftsib)
    2475           0 :         ereport(ERROR,
    2476             :                 (errcode(ERRCODE_INDEX_CORRUPTED),
    2477             :                  errmsg_internal("target page left link unexpectedly changed from %u to %u in block %u of index \"%s\"",
    2478             :                                  leftsib, opaque->btpo_prev, target,
    2479             :                                  RelationGetRelationName(rel))));
    2480             : 
    2481        5458 :     if (target == leafblkno)
    2482             :     {
    2483        5314 :         if (P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page) ||
    2484        5314 :             !P_ISLEAF(opaque) || !P_ISHALFDEAD(opaque))
    2485           0 :             elog(ERROR, "target leaf page changed status unexpectedly in block %u of index \"%s\"",
    2486             :                  target, RelationGetRelationName(rel));
    2487             : 
    2488             :         /* Leaf page is also target page: don't set leaftopparent */
    2489        5314 :         leaftopparent = InvalidBlockNumber;
    2490             :     }
    2491             :     else
    2492             :     {
    2493             :         IndexTuple  finaldataitem;
    2494             : 
    2495         144 :         if (P_FIRSTDATAKEY(opaque) != PageGetMaxOffsetNumber(page) ||
    2496         144 :             P_ISLEAF(opaque))
    2497           0 :             elog(ERROR, "target internal page on level %u changed status unexpectedly in block %u of index \"%s\"",
    2498             :                  targetlevel, target, RelationGetRelationName(rel));
    2499             : 
    2500             :         /* Target is internal: set leaftopparent for next call here...  */
    2501         144 :         itemid = PageGetItemId(page, P_FIRSTDATAKEY(opaque));
    2502         144 :         finaldataitem = (IndexTuple) PageGetItem(page, itemid);
    2503         144 :         leaftopparent = BTreeTupleGetDownLink(finaldataitem);
    2504             :         /* ...except when it would be a redundant pointer-to-self */
    2505         144 :         if (leaftopparent == leafblkno)
    2506          76 :             leaftopparent = InvalidBlockNumber;
    2507             :     }
    2508             : 
    2509             :     /* No leaftopparent for level 0 (leaf page) or level 1 target */
    2510             :     Assert(!BlockNumberIsValid(leaftopparent) || targetlevel > 1);
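                      : 
                      :     /*
                      :      * Worked example (illustrative, not from the original source):
                      :      * take a three-page subtree of leafbuf, its parent I1, and its
                      :      * grandparent I2 (the subtree root, so the initial top parent).
                      :      * The first call here unlinks I2 and re-points the leaf high key's
                      :      * top-parent link at I1; the second call unlinks I1 and clears the
                      :      * link (the pointer-to-self case above); the third call unlinks
                      :      * leafbuf itself.
                      :      */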
    2511             : 
    2512             :     /*
    2513             :      * And next write-lock the (current) right sibling.
    2514             :      */
    2515        5458 :     rightsib = opaque->btpo_next;
    2516        5458 :     rbuf = _bt_getbuf(rel, rightsib, BT_WRITE);
    2517        5458 :     page = BufferGetPage(rbuf);
    2518        5458 :     opaque = BTPageGetOpaque(page);
    2519             : 
    2520             :     /*
    2521             :      * Validate target's right sibling page.  Its left link must point back to
    2522             :      * the target page.
    2523             :      */
    2524        5458 :     if (opaque->btpo_prev != target)
    2525             :     {
    2526             :         /*
    2527             :          * This is known to fail in the field; sibling link corruption is
    2528             :          * relatively common.  Press on with vacuuming rather than just
    2529             :          * throwing an ERROR (same approach used for left-sibling's-right-link
    2530             :          * validation check a moment ago).
    2531             :          */
    2532           0 :         ereport(LOG,
    2533             :                 (errcode(ERRCODE_INDEX_CORRUPTED),
    2534             :                  errmsg_internal("right sibling's left-link doesn't match: "
    2535             :                                  "right sibling %u of target %u with leafblkno %u "
    2536             :                                  "and scanblkno %u spuriously links to non-target %u "
    2537             :                                  "on level %u of index \"%s\"",
    2538             :                                  rightsib, target, leafblkno,
    2539             :                                  scanblkno, opaque->btpo_prev,
    2540             :                                  targetlevel, RelationGetRelationName(rel))));
    2541             : 
    2542             :         /* Must release all pins and locks on failure exit */
    2543           0 :         if (BufferIsValid(lbuf))
    2544           0 :             _bt_relbuf(rel, lbuf);
    2545           0 :         _bt_relbuf(rel, rbuf);
    2546           0 :         _bt_relbuf(rel, buf);
    2547           0 :         if (target != leafblkno)
    2548           0 :             _bt_relbuf(rel, leafbuf);
    2549             : 
    2550           0 :         return false;
    2551             :     }
    2552             : 
    2553        5458 :     rightsib_is_rightmost = P_RIGHTMOST(opaque);
    2554        5458 :     *rightsib_empty = (P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page));
    2555             : 
    2556             :     /*
    2557             :      * If we are deleting the next-to-last page on the target's level, then
    2558             :      * the rightsib is a candidate to become the new fast root. (In theory, it
    2559             :      * might be possible to push the fast root even further down, but the odds
    2560             :      * of doing so are slim, and the locking considerations daunting.)
    2561             :      *
    2562             :      * We can safely acquire a lock on the metapage here --- see comments for
    2563             :      * _bt_newlevel().
    2564             :      */
    2565        5458 :     if (leftsib == P_NONE && rightsib_is_rightmost)
    2566             :     {
    2567          50 :         page = BufferGetPage(rbuf);
    2568          50 :         opaque = BTPageGetOpaque(page);
    2569          50 :         if (P_RIGHTMOST(opaque))
    2570             :         {
    2571             :             /* rightsib will be the only one left on the level */
    2572          50 :             metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
    2573          50 :             metapg = BufferGetPage(metabuf);
    2574          50 :             metad = BTPageGetMeta(metapg);
    2575             : 
    2576             :             /*
    2577             :              * The expected case here is btm_fastlevel == targetlevel+1; if
    2578             :              * the fastlevel is <= targetlevel, something is wrong, and we
    2579             :              * choose to overwrite it to fix it.
    2580             :              */
    2581          50 :             if (metad->btm_fastlevel > targetlevel + 1)
    2582             :             {
    2583             :                 /* no update wanted */
    2584           0 :                 _bt_relbuf(rel, metabuf);
    2585           0 :                 metabuf = InvalidBuffer;
    2586             :             }
    2587             :         }
    2588             :     }
    2589             : 
    2590             :     /*
    2591             :      * Here we begin doing the deletion.
    2592             :      */
    2593             : 
    2594             :     /* No ereport(ERROR) until changes are logged */
    2595        5458 :     START_CRIT_SECTION();
    2596             : 
    2597             :     /*
    2598             :      * Update siblings' side-links.  Note the target page's side-links will
    2599             :      * continue to point to the siblings.  Asserts here are just rechecking
    2600             :      * things we already verified above.
    2601             :      */
    2602        5458 :     if (BufferIsValid(lbuf))
    2603             :     {
    2604         996 :         page = BufferGetPage(lbuf);
    2605         996 :         opaque = BTPageGetOpaque(page);
    2606             :         Assert(opaque->btpo_next == target);
    2607         996 :         opaque->btpo_next = rightsib;
    2608             :     }
    2609        5458 :     page = BufferGetPage(rbuf);
    2610        5458 :     opaque = BTPageGetOpaque(page);
    2611             :     Assert(opaque->btpo_prev == target);
    2612        5458 :     opaque->btpo_prev = leftsib;
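                      : 
                      :     /*
                      :      * Illustrative before/after of the side-link surgery above, with
                      :      * T for the target (hypothetical layout, not from the source):
                      :      *
                      :      *     before:  L <-> T <-> R
                      :      *     after:   L <-> R  (T's own links still point at L and R)
                      :      *
                      :      * Keeping T's links intact lets a concurrent scan that is still
                      :      * "standing on" T step off to a live sibling afterwards.
                      :      */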
    2613             : 
    2614             :     /*
    2615             :      * If we deleted a parent of the targeted leaf page, instead of the leaf
    2616             :      * itself, update the leaf to point to the next remaining child in the
    2617             :      * subtree.
    2618             :      *
    2619             :      * Note: We rely on the fact that a buffer pin on the leaf page has been
    2620             :      * held since leafhikey was initialized.  This is safe, though only
    2621             :      * because the page was already half-dead at that point.  The leaf page
    2622             :      * cannot have been modified by any other backend during the period when
    2623             :      * no lock was held.
    2624             :      */
    2625        5458 :     if (target != leafblkno)
    2626         144 :         BTreeTupleSetTopParent(leafhikey, leaftopparent);
    2627             : 
    2628             :     /*
    2629             :      * Mark the page itself deleted.  It can be recycled when all current
    2630             :      * transactions are gone.  Storing GetTopTransactionId() would work, but
    2631             :      * we're in VACUUM and would not otherwise have an XID.  Having already
    2632             :      * updated links to the target, ReadNextFullTransactionId() suffices as an
    2633             :      * upper bound.  Any scan having retained a now-stale link is advertising
    2634             :      * in its PGPROC an xmin less than or equal to the value we read here.  It
    2635             :      * will continue to do so, holding back the xmin horizon, for the duration
    2636             :      * of that scan.
    2637             :      */
    2638        5458 :     page = BufferGetPage(buf);
    2639        5458 :     opaque = BTPageGetOpaque(page);
    2640             :     Assert(P_ISHALFDEAD(opaque) || !P_ISLEAF(opaque));
    2641             : 
    2642             :     /*
     2643             :      * Store an upper bound XID that's used to determine when the deleted
     2644             :      * page is no longer needed as a tombstone.
    2645             :      */
    2646        5458 :     safexid = ReadNextFullTransactionId();
    2647        5458 :     BTPageSetDeleted(page, safexid);
    2648        5458 :     opaque->btpo_cycleid = 0;
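                      : 
                      :     /*
                      :      * Illustrative sketch (not part of the original source): the page
                      :      * only becomes recyclable once every transaction that might still
                      :      * follow a stale link to it has ended.  Conceptually the later
                      :      * recyclability test is
                      :      *
                      :      *     FullTransactionIdPrecedes(safexid, oldest_running_fxid)
                      :      *
                      :      * where oldest_running_fxid stands in (hypothetical name) for the
                      :      * oldest transaction ID still visible to any running backend.
                      :      */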
    2649             : 
    2650             :     /* And update the metapage, if needed */
    2651        5458 :     if (BufferIsValid(metabuf))
    2652             :     {
    2653             :         /* upgrade metapage if needed */
    2654          50 :         if (metad->btm_version < BTREE_NOVAC_VERSION)
    2655           0 :             _bt_upgrademetapage(metapg);
    2656          50 :         metad->btm_fastroot = rightsib;
    2657          50 :         metad->btm_fastlevel = targetlevel;
    2658          50 :         MarkBufferDirty(metabuf);
    2659             :     }
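                      : 
                      :     /*
                      :      * Illustrative note (not in the original source): btm_fastroot is
                      :      * the page where index descents begin.  rightsib is now the only
                      :      * page left on targetlevel, so every level above it must also be a
                      :      * single page; pointing the fast root at rightsib lets searches
                      :      * skip those single-page levels without changing btm_root itself.
                      :      */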
    2660             : 
    2661             :     /* Must mark buffers dirty before XLogInsert */
    2662        5458 :     MarkBufferDirty(rbuf);
    2663        5458 :     MarkBufferDirty(buf);
    2664        5458 :     if (BufferIsValid(lbuf))
    2665         996 :         MarkBufferDirty(lbuf);
    2666        5458 :     if (target != leafblkno)
    2667         144 :         MarkBufferDirty(leafbuf);
    2668             : 
    2669             :     /* XLOG stuff */
    2670        5458 :     if (RelationNeedsWAL(rel))
    2671             :     {
    2672             :         xl_btree_unlink_page xlrec;
    2673             :         xl_btree_metadata xlmeta;
    2674             :         uint8       xlinfo;
    2675             :         XLogRecPtr  recptr;
    2676             : 
    2677        5458 :         XLogBeginInsert();
    2678             : 
    2679        5458 :         XLogRegisterBuffer(0, buf, REGBUF_WILL_INIT);
    2680        5458 :         if (BufferIsValid(lbuf))
    2681         996 :             XLogRegisterBuffer(1, lbuf, REGBUF_STANDARD);
    2682        5458 :         XLogRegisterBuffer(2, rbuf, REGBUF_STANDARD);
    2683        5458 :         if (target != leafblkno)
    2684         144 :             XLogRegisterBuffer(3, leafbuf, REGBUF_WILL_INIT);
    2685             : 
    2686             :         /* information stored on the target/to-be-unlinked block */
    2687        5458 :         xlrec.leftsib = leftsib;
    2688        5458 :         xlrec.rightsib = rightsib;
    2689        5458 :         xlrec.level = targetlevel;
    2690        5458 :         xlrec.safexid = safexid;
    2691             : 
    2692             :         /* information needed to recreate the leaf block (if not the target) */
    2693        5458 :         xlrec.leafleftsib = leafleftsib;
    2694        5458 :         xlrec.leafrightsib = leafrightsib;
    2695        5458 :         xlrec.leaftopparent = leaftopparent;
    2696             : 
    2697        5458 :         XLogRegisterData((char *) &xlrec, SizeOfBtreeUnlinkPage);
    2698             : 
    2699        5458 :         if (BufferIsValid(metabuf))
    2700             :         {
    2701          50 :             XLogRegisterBuffer(4, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
    2702             : 
    2703             :             Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
    2704          50 :             xlmeta.version = metad->btm_version;
    2705          50 :             xlmeta.root = metad->btm_root;
    2706          50 :             xlmeta.level = metad->btm_level;
    2707          50 :             xlmeta.fastroot = metad->btm_fastroot;
    2708          50 :             xlmeta.fastlevel = metad->btm_fastlevel;
    2709          50 :             xlmeta.last_cleanup_num_delpages = metad->btm_last_cleanup_num_delpages;
    2710          50 :             xlmeta.allequalimage = metad->btm_allequalimage;
    2711             : 
    2712          50 :             XLogRegisterBufData(4, (char *) &xlmeta, sizeof(xl_btree_metadata));
    2713          50 :             xlinfo = XLOG_BTREE_UNLINK_PAGE_META;
    2714             :         }
    2715             :         else
    2716        5408 :             xlinfo = XLOG_BTREE_UNLINK_PAGE;
    2717             : 
    2718        5458 :         recptr = XLogInsert(RM_BTREE_ID, xlinfo);
    2719             : 
    2720        5458 :         if (BufferIsValid(metabuf))
    2721             :         {
    2722          50 :             PageSetLSN(metapg, recptr);
    2723             :         }
    2724        5458 :         page = BufferGetPage(rbuf);
    2725        5458 :         PageSetLSN(page, recptr);
    2726        5458 :         page = BufferGetPage(buf);
    2727        5458 :         PageSetLSN(page, recptr);
    2728        5458 :         if (BufferIsValid(lbuf))
    2729             :         {
    2730         996 :             page = BufferGetPage(lbuf);
    2731         996 :             PageSetLSN(page, recptr);
    2732             :         }
    2733        5458 :         if (target != leafblkno)
    2734             :         {
    2735         144 :             page = BufferGetPage(leafbuf);
    2736         144 :             PageSetLSN(page, recptr);
    2737             :         }
    2738             :     }
    2739             : 
    2740        5458 :     END_CRIT_SECTION();
    2741             : 
    2742             :     /* release metapage */
    2743        5458 :     if (BufferIsValid(metabuf))
    2744          50 :         _bt_relbuf(rel, metabuf);
    2745             : 
    2746             :     /* release siblings */
    2747        5458 :     if (BufferIsValid(lbuf))
    2748         996 :         _bt_relbuf(rel, lbuf);
    2749        5458 :     _bt_relbuf(rel, rbuf);
    2750             : 
    2751             :     /* If the target is not leafbuf, we're done with it now -- release it */
    2752        5458 :     if (target != leafblkno)
    2753         144 :         _bt_relbuf(rel, buf);
    2754             : 
    2755             :     /*
    2756             :      * Maintain pages_newly_deleted, which is simply the number of pages
    2757             :      * deleted by the ongoing VACUUM operation.
    2758             :      *
    2759             :      * Maintain pages_deleted in a way that takes into account how
    2760             :      * btvacuumpage() will count deleted pages that have yet to become
     2761             :      * scanblkno -- only count the page when it's not going to get that
     2762             :      * treatment later on.
    2763             :      */
    2764        5458 :     stats->pages_newly_deleted++;
    2765        5458 :     if (target <= scanblkno)
    2766        5336 :         stats->pages_deleted++;
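                      : 
                      :     /*
                      :      * Worked example (illustrative, hypothetical block numbers): with
                      :      * scanblkno = 100, deleting target block 90 is counted here, since
                      :      * btvacuumpage() has already passed block 90.  Deleting target
                      :      * block 205 is not: the ongoing scan will reach block 205 later,
                      :      * see that it is deleted, and count it then.  Counting it in both
                      :      * places would double-count the page.
                      :      */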
    2767             : 
    2768             :     /*
    2769             :      * Remember information about the target page (now a newly deleted page)
    2770             :      * in dedicated vstate space for later.  The page will be considered as a
    2771             :      * candidate to place in the FSM at the end of the current btvacuumscan()
    2772             :      * call.
    2773             :      */
    2774        5458 :     _bt_pendingfsm_add(vstate, target, safexid);
    2775             : 
    2776             :     /* Success - hold on to lock on leafbuf (might also have been target) */
    2777        5458 :     return true;
    2778             : }
    2779             : 
    2780             : /*
    2781             :  * Establish how tall the to-be-deleted subtree will be during the first stage
    2782             :  * of page deletion.
    2783             :  *
    2784             :  * Caller's child argument is the block number of the page caller wants to
    2785             :  * delete (this is leafbuf's block number, except when we're called
    2786             :  * recursively).  stack is a search stack leading to it.  Note that we will
     2787             :  * update the stack entries to reflect current downlink positions --- this is
    2788             :  * similar to the corresponding point in page split handling.
    2789             :  *
    2790             :  * If "first stage" caller cannot go ahead with deleting _any_ pages, returns
    2791             :  * false.  Returns true on success, in which case caller can use certain
    2792             :  * details established here to perform the first stage of deletion.  This
    2793             :  * function is the last point at which page deletion may be deemed unsafe
    2794             :  * (barring index corruption, or unexpected concurrent page deletions).
    2795             :  *
    2796             :  * We write lock the parent of the root of the to-be-deleted subtree for
    2797             :  * caller on success (i.e. we leave our lock on the *subtreeparent buffer for
    2798             :  * caller).  Caller will have to remove a downlink from *subtreeparent.  We
    2799             :  * also set a *subtreeparent offset number in *poffset, to indicate the
    2800             :  * location of the pivot tuple that contains the relevant downlink.
    2801             :  *
    2802             :  * The root of the to-be-deleted subtree is called the "top parent".  Note
    2803             :  * that the leafbuf page is often the final "top parent" page (you can think
    2804             :  * of the leafbuf page as a degenerate single page subtree when that happens).
    2805             :  * Caller should initialize *topparent to the target leafbuf page block number
    2806             :  * (while *topparentrightsib should be set to leafbuf's right sibling block
    2807             :  * number).  We will update *topparent (and *topparentrightsib) for caller
    2808             :  * here, though only when it turns out that caller will delete at least one
    2809             :  * internal page (i.e. only when caller needs to store a valid link to the top
    2810             :  * parent block in the leafbuf page using BTreeTupleSetTopParent()).
    2811             :  */
    2812             : static bool
    2813        5478 : _bt_lock_subtree_parent(Relation rel, Relation heaprel, BlockNumber child,
    2814             :                         BTStack stack, Buffer *subtreeparent,
    2815             :                         OffsetNumber *poffset, BlockNumber *topparent,
    2816             :                         BlockNumber *topparentrightsib)
    2817             : {
    2818             :     BlockNumber parent,
    2819             :                 leftsibparent;
    2820             :     OffsetNumber parentoffset,
    2821             :                 maxoff;
    2822             :     Buffer      pbuf;
    2823             :     Page        page;
    2824             :     BTPageOpaque opaque;
    2825             : 
    2826             :     /*
    2827             :      * Locate the pivot tuple whose downlink points to "child".  Write lock
    2828             :      * the parent page itself.
    2829             :      */
    2830        5478 :     pbuf = _bt_getstackbuf(rel, heaprel, stack, child);
    2831        5478 :     if (pbuf == InvalidBuffer)
    2832             :     {
    2833             :         /*
    2834             :          * Failed to "re-find" a pivot tuple whose downlink matched our child
    2835             :          * block number on the parent level -- the index must be corrupt.
    2836             :          * Don't even try to delete the leafbuf subtree.  Just report the
    2837             :          * issue and press on with vacuuming the index.
    2838             :          *
    2839             :          * Note: _bt_getstackbuf() recovers from concurrent page splits that
    2840             :          * take place on the parent level.  Its approach is a near-exhaustive
    2841             :          * linear search.  This also gives it a surprisingly good chance of
    2842             :          * recovering in the event of a buggy or inconsistent opclass.  But we
    2843             :          * don't rely on that here.
    2844             :          */
    2845           0 :         ereport(LOG,
    2846             :                 (errcode(ERRCODE_INDEX_CORRUPTED),
    2847             :                  errmsg_internal("failed to re-find parent key in index \"%s\" for deletion target page %u",
    2848             :                                  RelationGetRelationName(rel), child)));
    2849             :         Assert(false);
    2850           0 :         return false;
    2851             :     }
    2852             : 
    2853        5478 :     parent = stack->bts_blkno;
    2854        5478 :     parentoffset = stack->bts_offset;
    2855             : 
    2856        5478 :     page = BufferGetPage(pbuf);
    2857        5478 :     opaque = BTPageGetOpaque(page);
    2858        5478 :     maxoff = PageGetMaxOffsetNumber(page);
    2859        5478 :     leftsibparent = opaque->btpo_prev;
    2860             : 
    2861             :     /*
    2862             :      * _bt_getstackbuf() completes page splits on returned parent buffer when
    2863             :      * required.
    2864             :      *
    2865             :      * In general it's a bad idea for VACUUM to use up more disk space, which
    2866             :      * is why page deletion does not finish incomplete page splits most of the
    2867             :      * time.  We allow this limited exception because the risk is much lower,
    2868             :      * and the potential downside of not proceeding is much higher:  A single
    2869             :      * internal page with the INCOMPLETE_SPLIT flag set might otherwise
    2870             :      * prevent us from deleting hundreds of empty leaf pages from one level
    2871             :      * down.
    2872             :      */
    2873             :     Assert(!P_INCOMPLETE_SPLIT(opaque));
    2874             : 
    2875        5478 :     if (parentoffset < maxoff)
    2876             :     {
    2877             :         /*
    2878             :          * Child is not the rightmost child in parent, so it's safe to delete
    2879             :          * the subtree whose root/topparent is child page
    2880             :          */
    2881        5314 :         *subtreeparent = pbuf;
    2882        5314 :         *poffset = parentoffset;
    2883        5314 :         return true;
    2884             :     }
    2885             : 
    2886             :     /*
    2887             :      * Child is the rightmost child of parent.
    2888             :      *
    2889             :      * Since it's the rightmost child of parent, deleting the child (or
    2890             :      * deleting the subtree whose root/topparent is the child page) is only
    2891             :      * safe when it's also possible to delete the parent.
    2892             :      */
    2893             :     Assert(parentoffset == maxoff);
    2894         164 :     if (parentoffset != P_FIRSTDATAKEY(opaque) || P_RIGHTMOST(opaque))
    2895             :     {
    2896             :         /*
    2897             :          * Child isn't parent's only child, or parent is rightmost on its
    2898             :          * entire level.  Definitely cannot delete any pages.
    2899             :          */
    2900           6 :         _bt_relbuf(rel, pbuf);
    2901           6 :         return false;
    2902             :     }
    2903             : 
    2904             :     /*
    2905             :      * Now make sure that the parent deletion is itself safe by examining the
    2906             :      * child's grandparent page.  Recurse, passing the parent page as the
    2907             :      * child page (child's grandparent is the parent on the next level up). If
    2908             :      * parent deletion is unsafe, then child deletion must also be unsafe (in
    2909             :      * which case caller cannot delete any pages at all).
    2910             :      */
    2911         158 :     *topparent = parent;
    2912         158 :     *topparentrightsib = opaque->btpo_next;
    2913             : 
    2914             :     /*
    2915             :      * Release lock on parent before recursing.
    2916             :      *
    2917             :      * It's OK to release page locks on parent before recursive call locks
    2918             :      * grandparent.  An internal page can only acquire an entry if the child
    2919             :      * is split, but that cannot happen as long as we still hold a lock on the
    2920             :      * leafbuf page.
    2921             :      */
    2922         158 :     _bt_relbuf(rel, pbuf);
    2923             : 
    2924             :     /*
    2925             :      * Before recursing, check that the left sibling of parent (if any) is
    2926             :      * not marked with the INCOMPLETE_SPLIT flag (this check must happen
    2927             :      * after we drop the parent lock).
    2928             :      *
    2929             :      * Note: We deliberately avoid completing incomplete splits here.
    2930             :      */
    2931         158 :     if (_bt_leftsib_splitflag(rel, leftsibparent, parent))
    2932           0 :         return false;
    2933             : 
    2934             :     /* Recurse to examine child page's grandparent page */
    2935         158 :     return _bt_lock_subtree_parent(rel, heaprel, parent, stack->bts_parent,
    2936             :                                    subtreeparent, poffset,
    2937             :                                    topparent, topparentrightsib);
    2938             : }
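
The decision rule implemented by the function above can be distilled into a
few lines.  The sketch below is hypothetical: the enum, the helper name, and
the flattened parameters are invented for illustration (the real code works
directly on the locked parent page and performs the recursion itself), and
PostgreSQL's OffsetNumber typedef is assumed to be in scope.

    /* Hypothetical distillation of the deletion-safety rule; not nbtree code */
    typedef enum
    {
        SUBTREE_SAFE,           /* OK to delete the subtree rooted at child */
        SUBTREE_UNSAFE,         /* cannot delete any pages at all */
        SUBTREE_CLIMB           /* parent must go too: check the grandparent */
    } SubtreeSafety;

    static SubtreeSafety
    subtree_deletion_safety(OffsetNumber parentoffset, OffsetNumber maxoff,
                            OffsetNumber firstdatakey, bool parent_rightmost)
    {
        if (parentoffset < maxoff)
            return SUBTREE_SAFE;    /* child is not parent's rightmost child */
        if (parentoffset != firstdatakey || parent_rightmost)
            return SUBTREE_UNSAFE;  /* siblings remain, or no level above helps */
        return SUBTREE_CLIMB;       /* parent's only child: recurse one level up */
    }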
    2939             : 
    2940             : /*
    2941             :  * Initialize local memory state used by VACUUM for _bt_pendingfsm_finalize
    2942             :  * optimization.
    2943             :  *
    2944             :  * Called at the start of a btvacuumscan().  Caller's cleanuponly argument
    2945             :  * indicates whether the ongoing VACUUM never calls btbulkdelete().
    2946             :  *
    2947             :  * We expect to allocate memory inside VACUUM's top-level memory context here.
    2948             :  * The working buffer is subject to a limit based on work_mem.  Our strategy
    2949             :  * when the array can no longer grow within the bounds of that limit is to
    2950             :  * stop saving additional newly deleted pages, while proceeding as usual with
    2951             :  * the pages that we can fit.
    2952             :  */
    2953             : void
    2954        2392 : _bt_pendingfsm_init(Relation rel, BTVacState *vstate, bool cleanuponly)
    2955             : {
    2956             :     int64       maxbufsize;
    2957             : 
    2958             :     /*
    2959             :      * Don't bother with optimization in cleanup-only case -- we don't expect
    2960             :      * any newly deleted pages.  Besides, cleanup-only calls to btvacuumscan()
    2961             :      * can only take place because this optimization didn't work out during
    2962             :      * the last VACUUM.
    2963             :      */
    2964        2392 :     if (cleanuponly)
    2965          14 :         return;
    2966             : 
    2967             :     /*
    2968             :      * Cap maximum size of array so that we always respect work_mem.  Avoid
    2969             :      * int overflow here.
    2970             :      */
    2971        2378 :     vstate->bufsize = 256;
    2972        2378 :     maxbufsize = (work_mem * (Size) 1024) / sizeof(BTPendingFSM);
    2973        2378 :     maxbufsize = Min(maxbufsize, INT_MAX);
    2974        2378 :     maxbufsize = Min(maxbufsize, MaxAllocSize / sizeof(BTPendingFSM));
    2975             :     /* Stay sane with small work_mem */
    2976        2378 :     maxbufsize = Max(maxbufsize, vstate->bufsize);
    2977        2378 :     vstate->maxbufsize = maxbufsize;
    2978             : 
    2979             :     /* Allocate buffer, indicate that there are currently 0 pending pages */
    2980        2378 :     vstate->pendingpages = palloc(sizeof(BTPendingFSM) * vstate->bufsize);
    2981        2378 :     vstate->npendingpages = 0;
    2982             : }
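
A worked example of the cap above (the struct size is an assumption for
illustration, not taken from a real build): with the default work_mem of
4MB (4096kB) and a 16-byte BTPendingFSM (a 4-byte BlockNumber plus an
8-byte FullTransactionId, padded to 16 bytes), the arithmetic comes out as

    maxbufsize = (4096 * 1024) / 16 = 262144 entries

which is far below both INT_MAX and MaxAllocSize / 16, so neither clamp
fires.  The array then starts at 256 entries and may double repeatedly up to
that ceiling as _bt_pendingfsm_add() fills it.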
    2983             : 
    2984             : /*
    2985             :  * Place any newly deleted pages (i.e. pages that _bt_pagedel() deleted during
    2986             :  * the ongoing VACUUM operation) into the free space map -- though only when
    2987             :  * it is actually safe to do so by now.
    2988             :  *
    2989             :  * Called at the end of a btvacuumscan(), just before free space map vacuuming
    2990             :  * takes place.
    2991             :  *
    2992             :  * Frees memory allocated by _bt_pendingfsm_init(), if any.
    2993             :  */
    2994             : void
    2995        2392 : _bt_pendingfsm_finalize(Relation rel, BTVacState *vstate)
    2996             : {
    2997        2392 :     IndexBulkDeleteResult *stats = vstate->stats;
    2998        2392 :     Relation    heaprel = vstate->info->heaprel;
    2999             : 
    3000             :     Assert(stats->pages_newly_deleted >= vstate->npendingpages);
    3001             :     Assert(heaprel != NULL);
    3002             : 
    3003        2392 :     if (vstate->npendingpages == 0)
    3004             :     {
    3005             :         /* Just free memory when nothing to do */
    3006        2290 :         if (vstate->pendingpages)
    3007        2276 :             pfree(vstate->pendingpages);
    3008             : 
    3009        2290 :         return;
    3010             :     }
    3011             : 
    3012             : #ifdef DEBUG_BTREE_PENDING_FSM
    3013             : 
    3014             :     /*
    3015             :      * Debugging aid: Sleep for 5 seconds to greatly increase the chances of
    3016             :      * placing pending pages in the FSM.  Note that the optimization will
    3017             :      * never be effective without some other backend concurrently consuming an
    3018             :      * XID.
    3019             :      */
    3020             :     pg_usleep(5000000L);
    3021             : #endif
    3022             : 
    3023             :     /*
    3024             :      * Recompute VACUUM XID boundaries.
    3025             :      *
    3026             :      * We don't actually care about the oldest non-removable XID.  Computing
    3027             :      * the oldest such XID has a useful side-effect that we rely on: it
    3028             :      * forcibly updates the XID horizon state for this backend.  This step is
    3029             :      * essential; GlobalVisCheckRemovableFullXid() will not reliably recognize
    3030             :      * that it is now safe to recycle newly deleted pages without this step.
    3031             :      */
    3032         102 :     GetOldestNonRemovableTransactionId(heaprel);
    3033             : 
    3034         102 :     for (int i = 0; i < vstate->npendingpages; i++)
    3035             :     {
    3036         102 :         BlockNumber target = vstate->pendingpages[i].target;
    3037         102 :         FullTransactionId safexid = vstate->pendingpages[i].safexid;
    3038             : 
    3039             :         /*
    3040             :          * Do the equivalent of checking BTPageIsRecyclable(), but without
    3041             :          * accessing the page a second time.
    3042             :          *
    3043             :          * Give up upon finding the first non-recyclable page -- all later pages
    3044             :          * must be non-recyclable too, since _bt_pendingfsm_add() adds pages
    3045             :          * to the array in safexid order.
    3046             :          */
    3047         102 :         if (!GlobalVisCheckRemovableFullXid(heaprel, safexid))
    3048         102 :             break;
    3049             : 
    3050           0 :         RecordFreeIndexPage(rel, target);
    3051           0 :         stats->pages_free++;
    3052             :     }
    3053             : 
    3054         102 :     pfree(vstate->pendingpages);
    3055             : }
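
The loop above depends only on the array having been appended in safexid
order.  A minimal standalone sketch of the same early-exit scan, assuming
the usual PostgreSQL headers for BTPendingFSM and FullTransactionId; the
helper name is invented, and the visibility test is abstracted into a
callback where the real code calls GlobalVisCheckRemovableFullXid() directly:

    /* Hypothetical restatement of the sorted early-exit scan */
    typedef bool (*recyclable_fn) (FullTransactionId safexid);

    static int
    count_recyclable_prefix(const BTPendingFSM *pending, int npending,
                            recyclable_fn is_recyclable)
    {
        int         nrecyclable = 0;

        for (int i = 0; i < npending; i++)
        {
            /* sorted by safexid: the first failure rules out every later entry */
            if (!is_recyclable(pending[i].safexid))
                break;
            nrecyclable++;
        }

        return nrecyclable;
    }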
    3056             : 
    3057             : /*
    3058             :  * Maintain array of pages that were deleted during current btvacuumscan()
    3059             :  * call, for use in _bt_pendingfsm_finalize()
    3060             :  */
    3061             : static void
    3062        5458 : _bt_pendingfsm_add(BTVacState *vstate,
    3063             :                    BlockNumber target,
    3064             :                    FullTransactionId safexid)
    3065             : {
    3066             :     Assert(vstate->npendingpages <= vstate->bufsize);
    3067             :     Assert(vstate->bufsize <= vstate->maxbufsize);
    3068             : 
    3069             : #ifdef USE_ASSERT_CHECKING
    3070             : 
    3071             :     /*
    3072             :      * Verify an assumption made by _bt_pendingfsm_finalize(): pages from the
    3073             :      * array will always be in safexid order (since that is the order that we
    3074             :      * save them in here).
    3075             :      */
    3076             :     if (vstate->npendingpages > 0)
    3077             :     {
    3078             :         FullTransactionId lastsafexid =
    3079             :             vstate->pendingpages[vstate->npendingpages - 1].safexid;
    3080             : 
    3081             :         Assert(FullTransactionIdFollowsOrEquals(safexid, lastsafexid));
    3082             :     }
    3083             : #endif
    3084             : 
    3085             :     /*
    3086             :      * If the temp buffer has reached its maxbufsize/work_mem capacity, we
    3087             :      * discard information about this page.
    3088             :      *
    3089             :      * Note that this also covers the case where we opted not to use the
    3090             :      * optimization in _bt_pendingfsm_init().
    3091             :      */
    3092        5458 :     if (vstate->npendingpages == vstate->maxbufsize)
    3093           0 :         return;
    3094             : 
    3095             :     /* Consider enlarging buffer */
    3096        5458 :     if (vstate->npendingpages == vstate->bufsize)
    3097             :     {
    3098           8 :         int         newbufsize = vstate->bufsize * 2;
    3099             : 
    3100             :         /* Respect work_mem */
    3101           8 :         if (newbufsize > vstate->maxbufsize)
    3102           0 :             newbufsize = vstate->maxbufsize;
    3103             : 
    3104           8 :         vstate->bufsize = newbufsize;
    3105           8 :         vstate->pendingpages =
    3106           8 :             repalloc(vstate->pendingpages,
    3107           8 :                      sizeof(BTPendingFSM) * vstate->bufsize);
    3108             :     }
    3109             : 
    3110             :     /* Save metadata for newly deleted page */
    3111        5458 :     vstate->pendingpages[vstate->npendingpages].target = target;
    3112        5458 :     vstate->pendingpages[vstate->npendingpages].safexid = safexid;
    3113        5458 :     vstate->npendingpages++;
    3114        5458 :     vstate->npendingpages++;
    3115             : }
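
The growth policy above -- geometric doubling clamped to the precomputed
work_mem ceiling -- keeps appends amortized O(1) while guaranteeing that the
array never exceeds maxbufsize entries.  A standalone sketch, with the
helper name invented for illustration:

    /* Hypothetical helper showing the capped-doubling policy */
    static int
    next_bufsize(int bufsize, int maxbufsize)
    {
        int         newbufsize = bufsize * 2;

        /* never outgrow the work_mem-derived ceiling */
        if (newbufsize > maxbufsize)
            newbufsize = maxbufsize;

        return newbufsize;
    }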

Generated by: LCOV version 1.14