LCOV - code coverage report
Current view: top level - src/backend/access/nbtree - nbtpage.c (source / functions) Coverage Total Hit
Test: PostgreSQL 19devel Lines: 88.9 % 848 754
Test Date: 2026-04-02 18:15:59 Functions: 97.0 % 33 32
Legend: Lines:     hit not hit

            Line data    Source code
       1              : /*-------------------------------------------------------------------------
       2              :  *
       3              :  * nbtpage.c
       4              :  *    BTree-specific page management code for the Postgres btree access
       5              :  *    method.
       6              :  *
       7              :  * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
       8              :  * Portions Copyright (c) 1994, Regents of the University of California
       9              :  *
      10              :  *
      11              :  * IDENTIFICATION
      12              :  *    src/backend/access/nbtree/nbtpage.c
      13              :  *
      14              :  *  NOTES
      15              :  *     Postgres btree pages look like ordinary relation pages.  The opaque
      16              :  *     data at high addresses includes pointers to left and right siblings
      17              :  *     and flag data describing page state.  The first page in a btree, page
      18              :  *     zero, is special -- it stores meta-information describing the tree.
      19              :  *     Pages one and higher store the actual tree data.
      20              :  *
      21              :  *-------------------------------------------------------------------------
      22              :  */
      23              : #include "postgres.h"
      24              : 
      25              : #include "access/nbtree.h"
      26              : #include "access/nbtxlog.h"
      27              : #include "access/tableam.h"
      28              : #include "access/transam.h"
      29              : #include "access/xlog.h"
      30              : #include "access/xloginsert.h"
      31              : #include "common/int.h"
      32              : #include "miscadmin.h"
      33              : #include "storage/indexfsm.h"
      34              : #include "storage/predicate.h"
      35              : #include "storage/procarray.h"
      36              : #include "utils/injection_point.h"
      37              : #include "utils/memdebug.h"
      38              : #include "utils/memutils.h"
      39              : #include "utils/snapmgr.h"
      40              : 
      41              : static BTMetaPageData *_bt_getmeta(Relation rel, Buffer metabuf);
      42              : static void _bt_delitems_delete(Relation rel, Buffer buf,
      43              :                                 TransactionId snapshotConflictHorizon,
      44              :                                 bool isCatalogRel,
      45              :                                 OffsetNumber *deletable, int ndeletable,
      46              :                                 BTVacuumPosting *updatable, int nupdatable);
      47              : static char *_bt_delitems_update(BTVacuumPosting *updatable, int nupdatable,
      48              :                                  OffsetNumber *updatedoffsets,
      49              :                                  Size *updatedbuflen, bool needswal);
      50              : static bool _bt_mark_page_halfdead(Relation rel, Relation heaprel,
      51              :                                    Buffer leafbuf, BTStack stack);
      52              : static bool _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf,
      53              :                                      BlockNumber scanblkno,
      54              :                                      bool *rightsib_empty,
      55              :                                      BTVacState *vstate);
      56              : static bool _bt_lock_subtree_parent(Relation rel, Relation heaprel,
      57              :                                     BlockNumber child, BTStack stack,
      58              :                                     Buffer *subtreeparent, OffsetNumber *poffset,
      59              :                                     BlockNumber *topparent,
      60              :                                     BlockNumber *topparentrightsib);
      61              : static void _bt_pendingfsm_add(BTVacState *vstate, BlockNumber target,
      62              :                                FullTransactionId safexid);
      63              : 
      64              : /*
      65              :  *  _bt_initmetapage() -- Fill a page buffer with a correct metapage image
      66              :  */
      67              : void
      68        31147 : _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level,
      69              :                  bool allequalimage)
      70              : {
      71              :     BTMetaPageData *metad;
      72              :     BTPageOpaque metaopaque;
      73              : 
      74        31147 :     _bt_pageinit(page, BLCKSZ);
      75              : 
      76        31147 :     metad = BTPageGetMeta(page);
      77        31147 :     metad->btm_magic = BTREE_MAGIC;
      78        31147 :     metad->btm_version = BTREE_VERSION;
      79        31147 :     metad->btm_root = rootbknum;
      80        31147 :     metad->btm_level = level;
      81        31147 :     metad->btm_fastroot = rootbknum;
      82        31147 :     metad->btm_fastlevel = level;
      83        31147 :     metad->btm_last_cleanup_num_delpages = 0;
      84        31147 :     metad->btm_last_cleanup_num_heap_tuples = -1.0;
      85        31147 :     metad->btm_allequalimage = allequalimage;
      86              : 
      87        31147 :     metaopaque = BTPageGetOpaque(page);
      88        31147 :     metaopaque->btpo_flags = BTP_META;
      89              : 
      90              :     /*
      91              :      * Set pd_lower just past the end of the metadata.  This is essential,
      92              :      * because without doing so, metadata will be lost if xlog.c compresses
      93              :      * the page.
      94              :      */
      95        31147 :     ((PageHeader) page)->pd_lower =
      96        31147 :         ((char *) metad + sizeof(BTMetaPageData)) - (char *) page;
      97        31147 : }
      98              : 
      99              : /*
     100              :  *  _bt_upgrademetapage() -- Upgrade a meta-page from an old format to version
     101              :  *      3, the last version that can be updated without broadly affecting
     102              :  *      on-disk compatibility.  (A REINDEX is required to upgrade to v4.)
     103              :  *
     104              :  *      This routine does purely in-memory image upgrade.  Caller is
     105              :  *      responsible for locking, WAL-logging etc.
     106              :  */
     107              : void
     108            0 : _bt_upgrademetapage(Page page)
     109              : {
     110              :     BTMetaPageData *metad;
     111              :     BTPageOpaque metaopaque PG_USED_FOR_ASSERTS_ONLY;
     112              : 
     113            0 :     metad = BTPageGetMeta(page);
     114            0 :     metaopaque = BTPageGetOpaque(page);
     115              : 
     116              :     /* It must be really a meta page of upgradable version */
     117              :     Assert(metaopaque->btpo_flags & BTP_META);
     118              :     Assert(metad->btm_version < BTREE_NOVAC_VERSION);
     119              :     Assert(metad->btm_version >= BTREE_MIN_VERSION);
     120              : 
     121              :     /* Set version number and fill extra fields added into version 3 */
     122            0 :     metad->btm_version = BTREE_NOVAC_VERSION;
     123            0 :     metad->btm_last_cleanup_num_delpages = 0;
     124            0 :     metad->btm_last_cleanup_num_heap_tuples = -1.0;
     125              :     /* Only a REINDEX can set this field */
     126              :     Assert(!metad->btm_allequalimage);
     127            0 :     metad->btm_allequalimage = false;
     128              : 
     129              :     /* Adjust pd_lower (see _bt_initmetapage() for details) */
     130            0 :     ((PageHeader) page)->pd_lower =
     131            0 :         ((char *) metad + sizeof(BTMetaPageData)) - (char *) page;
     132            0 : }
     133              : 
     134              : /*
     135              :  * Get metadata from share-locked buffer containing metapage, while performing
     136              :  * standard sanity checks.
     137              :  *
     138              :  * Callers that cache data returned here in local cache should note that an
     139              :  * on-the-fly upgrade using _bt_upgrademetapage() can change the version field
     140              :  * and BTREE_NOVAC_VERSION specific fields without invalidating local cache.
     141              :  */
     142              : static BTMetaPageData *
     143      1115688 : _bt_getmeta(Relation rel, Buffer metabuf)
     144              : {
     145              :     Page        metapg;
     146              :     BTPageOpaque metaopaque;
     147              :     BTMetaPageData *metad;
     148              : 
     149      1115688 :     metapg = BufferGetPage(metabuf);
     150      1115688 :     metaopaque = BTPageGetOpaque(metapg);
     151      1115688 :     metad = BTPageGetMeta(metapg);
     152              : 
     153              :     /* sanity-check the metapage */
     154      1115688 :     if (!P_ISMETA(metaopaque) ||
     155      1115688 :         metad->btm_magic != BTREE_MAGIC)
     156            0 :         ereport(ERROR,
     157              :                 (errcode(ERRCODE_INDEX_CORRUPTED),
     158              :                  errmsg("index \"%s\" is not a btree",
     159              :                         RelationGetRelationName(rel))));
     160              : 
     161      1115688 :     if (metad->btm_version < BTREE_MIN_VERSION ||
     162      1115688 :         metad->btm_version > BTREE_VERSION)
     163            0 :         ereport(ERROR,
     164              :                 (errcode(ERRCODE_INDEX_CORRUPTED),
     165              :                  errmsg("version mismatch in index \"%s\": file version %d, "
     166              :                         "current version %d, minimal supported version %d",
     167              :                         RelationGetRelationName(rel),
     168              :                         metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION)));
     169              : 
     170      1115688 :     return metad;
     171              : }
     172              : 
     173              : /*
     174              :  * _bt_vacuum_needs_cleanup() -- Checks if index needs cleanup
     175              :  *
     176              :  * Called by btvacuumcleanup when btbulkdelete was never called because no
     177              :  * index tuples needed to be deleted.
     178              :  */
     179              : bool
     180       139878 : _bt_vacuum_needs_cleanup(Relation rel)
     181              : {
     182              :     Buffer      metabuf;
     183              :     Page        metapg;
     184              :     BTMetaPageData *metad;
     185              :     uint32      btm_version;
     186              :     BlockNumber prev_num_delpages;
     187              : 
     188              :     /*
     189              :      * Copy details from metapage to local variables quickly.
     190              :      *
     191              :      * Note that we deliberately avoid using cached version of metapage here.
     192              :      */
     193       139878 :     metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
     194       139878 :     metapg = BufferGetPage(metabuf);
     195       139878 :     metad = BTPageGetMeta(metapg);
     196       139878 :     btm_version = metad->btm_version;
     197              : 
     198       139878 :     if (btm_version < BTREE_NOVAC_VERSION)
     199              :     {
     200              :         /*
     201              :          * Metapage needs to be dynamically upgraded to store fields that are
     202              :          * only present when btm_version >= BTREE_NOVAC_VERSION
     203              :          */
     204            0 :         _bt_relbuf(rel, metabuf);
     205            0 :         return true;
     206              :     }
     207              : 
     208       139878 :     prev_num_delpages = metad->btm_last_cleanup_num_delpages;
     209       139878 :     _bt_relbuf(rel, metabuf);
     210              : 
     211              :     /*
     212              :      * Trigger cleanup in rare cases where prev_num_delpages exceeds 5% of the
     213              :      * total size of the index.  We can reasonably expect (though are not
     214              :      * guaranteed) to be able to recycle this many pages if we decide to do a
     215              :      * btvacuumscan call during the ongoing btvacuumcleanup.  For further
     216              :      * details see the nbtree/README section on placing deleted pages in the
     217              :      * FSM.
     218              :      */
     219       139878 :     if (prev_num_delpages > 0 &&
     220            7 :         prev_num_delpages > RelationGetNumberOfBlocks(rel) / 20)
     221            7 :         return true;
     222              : 
     223       139871 :     return false;
     224              : }
     225              : 
/*
 * _bt_set_cleanup_info() -- Update metapage for btvacuumcleanup.
 *
 * Called at the end of btvacuumcleanup, when num_delpages value has been
 * finalized.
 */
void
_bt_set_cleanup_info(Relation rel, BlockNumber num_delpages)
{
	Buffer		metabuf;
	Page		metapg;
	BTMetaPageData *metad;
	XLogRecPtr	recptr;

	/*
	 * On-disk compatibility note: The btm_last_cleanup_num_delpages metapage
	 * field started out as a TransactionId field called btm_oldest_btpo_xact.
	 * Both "versions" are just uint32 fields.  It was convenient to repurpose
	 * the field when we began to use 64-bit XIDs in deleted pages.
	 *
	 * It's possible that a pg_upgrade'd database will contain an XID value in
	 * what is now recognized as the metapage's btm_last_cleanup_num_delpages
	 * field.  _bt_vacuum_needs_cleanup() may even believe that this value
	 * indicates that there are lots of pages that it needs to recycle, when
	 * in reality there are only one or two.  The worst that can happen is
	 * that there will be a call to btvacuumscan a little earlier, which will
	 * set btm_last_cleanup_num_delpages to a sane value when we're called.
	 *
	 * Note also that the metapage's btm_last_cleanup_num_heap_tuples field is
	 * no longer used as of PostgreSQL 14.  We set it to -1.0 on rewrite, just
	 * to be consistent.
	 */
	metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
	metapg = BufferGetPage(metabuf);
	metad = BTPageGetMeta(metapg);

	/* Don't miss chance to upgrade index/metapage when BTREE_MIN_VERSION */
	if (metad->btm_version >= BTREE_NOVAC_VERSION &&
		metad->btm_last_cleanup_num_delpages == num_delpages)
	{
		/* Usually means index continues to have num_delpages of 0 */
		_bt_relbuf(rel, metabuf);
		return;
	}

	/* trade in our read lock for a write lock */
	_bt_unlockbuf(rel, metabuf);
	_bt_lockbuf(rel, metabuf, BT_WRITE);

	/* No ereport(ERROR) from here until the change is WAL-logged */
	START_CRIT_SECTION();

	/* upgrade meta-page if needed */
	if (metad->btm_version < BTREE_NOVAC_VERSION)
		_bt_upgrademetapage(metapg);

	/* update cleanup-related information */
	metad->btm_last_cleanup_num_delpages = num_delpages;
	metad->btm_last_cleanup_num_heap_tuples = -1.0;
	MarkBufferDirty(metabuf);

	/* write wal record if needed */
	if (RelationNeedsWAL(rel))
	{
		xl_btree_metadata md;

		XLogBeginInsert();
		/* metapage registered WILL_INIT: logged payload rebuilds it on replay */
		XLogRegisterBuffer(0, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);

		Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
		md.version = metad->btm_version;
		md.root = metad->btm_root;
		md.level = metad->btm_level;
		md.fastroot = metad->btm_fastroot;
		md.fastlevel = metad->btm_fastlevel;
		md.last_cleanup_num_delpages = num_delpages;
		md.allequalimage = metad->btm_allequalimage;

		XLogRegisterBufData(0, &md, sizeof(xl_btree_metadata));

		recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_META_CLEANUP);
	}
	else
		/* relation isn't WAL-logged; still need an LSN to stamp the page */
		recptr = XLogGetFakeLSN(rel);

	PageSetLSN(metapg, recptr);

	END_CRIT_SECTION();

	_bt_relbuf(rel, metabuf);
}
     316              : 
     317              : /*
     318              :  *  _bt_getroot() -- Get the root page of the btree.
     319              :  *
     320              :  *      Since the root page can move around the btree file, we have to read
     321              :  *      its location from the metadata page, and then read the root page
     322              :  *      itself.  If no root page exists yet, we have to create one.
     323              :  *
     324              :  *      The access type parameter (BT_READ or BT_WRITE) controls whether
     325              :  *      a new root page will be created or not.  If access = BT_READ,
     326              :  *      and no root page exists, we just return InvalidBuffer.  For
     327              :  *      BT_WRITE, we try to create the root page if it doesn't exist.
     328              :  *      NOTE that the returned root page will have only a read lock set
     329              :  *      on it even if access = BT_WRITE!
     330              :  *
     331              :  *      If access = BT_WRITE, heaprel must be set; otherwise caller can just
     332              :  *      pass NULL.  See _bt_allocbuf for an explanation.
     333              :  *
     334              :  *      The returned page is not necessarily the true root --- it could be
     335              :  *      a "fast root" (a page that is alone in its level due to deletions).
     336              :  *      Also, if the root page is split while we are "in flight" to it,
     337              :  *      what we will return is the old root, which is now just the leftmost
     338              :  *      page on a probably-not-very-wide level.  For most purposes this is
     339              :  *      as good as or better than the true root, so we do not bother to
     340              :  *      insist on finding the true root.  We do, however, guarantee to
     341              :  *      return a live (not deleted or half-dead) page.
     342              :  *
     343              :  *      On successful return, the root page is pinned and read-locked.
     344              :  *      The metadata page is not locked or pinned on exit.
     345              :  */
     346              : Buffer
     347     15359277 : _bt_getroot(Relation rel, Relation heaprel, int access)
     348              : {
     349              :     Buffer      metabuf;
     350              :     Buffer      rootbuf;
     351              :     Page        rootpage;
     352              :     BTPageOpaque rootopaque;
     353              :     BlockNumber rootblkno;
     354              :     uint32      rootlevel;
     355              :     BTMetaPageData *metad;
     356              :     XLogRecPtr  recptr;
     357              : 
     358              :     Assert(access == BT_READ || heaprel != NULL);
     359              : 
     360              :     /*
     361              :      * Try to use previously-cached metapage data to find the root.  This
     362              :      * normally saves one buffer access per index search, which is a very
     363              :      * helpful savings in bufmgr traffic and hence contention.
     364              :      */
     365     15359277 :     if (rel->rd_amcache != NULL)
     366              :     {
     367     15024303 :         metad = (BTMetaPageData *) rel->rd_amcache;
     368              :         /* We shouldn't have cached it if any of these fail */
     369              :         Assert(metad->btm_magic == BTREE_MAGIC);
     370              :         Assert(metad->btm_version >= BTREE_MIN_VERSION);
     371              :         Assert(metad->btm_version <= BTREE_VERSION);
     372              :         Assert(!metad->btm_allequalimage ||
     373              :                metad->btm_version > BTREE_NOVAC_VERSION);
     374              :         Assert(metad->btm_root != P_NONE);
     375              : 
     376     15024303 :         rootblkno = metad->btm_fastroot;
     377              :         Assert(rootblkno != P_NONE);
     378     15024303 :         rootlevel = metad->btm_fastlevel;
     379              : 
     380     15024303 :         rootbuf = _bt_getbuf(rel, rootblkno, BT_READ);
     381     15024303 :         rootpage = BufferGetPage(rootbuf);
     382     15024303 :         rootopaque = BTPageGetOpaque(rootpage);
     383              : 
     384              :         /*
     385              :          * Since the cache might be stale, we check the page more carefully
     386              :          * here than normal.  We *must* check that it's not deleted. If it's
     387              :          * not alone on its level, then we reject too --- this may be overly
     388              :          * paranoid but better safe than sorry.  Note we don't check P_ISROOT,
     389              :          * because that's not set in a "fast root".
     390              :          */
     391     15024303 :         if (!P_IGNORE(rootopaque) &&
     392     15024303 :             rootopaque->btpo_level == rootlevel &&
     393     15024303 :             P_LEFTMOST(rootopaque) &&
     394     15024303 :             P_RIGHTMOST(rootopaque))
     395              :         {
     396              :             /* OK, accept cached page as the root */
     397     15023368 :             return rootbuf;
     398              :         }
     399          935 :         _bt_relbuf(rel, rootbuf);
     400              :         /* Cache is stale, throw it away */
     401          935 :         if (rel->rd_amcache)
     402          935 :             pfree(rel->rd_amcache);
     403          935 :         rel->rd_amcache = NULL;
     404              :     }
     405              : 
     406       335909 :     metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
     407       335909 :     metad = _bt_getmeta(rel, metabuf);
     408              : 
     409              :     /* if no root page initialized yet, do it */
     410       335909 :     if (metad->btm_root == P_NONE)
     411              :     {
     412              :         Page        metapg;
     413              : 
     414              :         /* If access = BT_READ, caller doesn't want us to create root yet */
     415       334691 :         if (access == BT_READ)
     416              :         {
     417       327453 :             _bt_relbuf(rel, metabuf);
     418       327453 :             return InvalidBuffer;
     419              :         }
     420              : 
     421              :         /* trade in our read lock for a write lock */
     422         7238 :         _bt_unlockbuf(rel, metabuf);
     423         7238 :         _bt_lockbuf(rel, metabuf, BT_WRITE);
     424              : 
     425              :         /*
     426              :          * Race condition:  if someone else initialized the metadata between
     427              :          * the time we released the read lock and acquired the write lock, we
     428              :          * must avoid doing it again.
     429              :          */
     430         7238 :         if (metad->btm_root != P_NONE)
     431              :         {
     432              :             /*
     433              :              * Metadata initialized by someone else.  In order to guarantee no
     434              :              * deadlocks, we have to release the metadata page and start all
     435              :              * over again.  (Is that really true? But it's hardly worth trying
     436              :              * to optimize this case.)
     437              :              */
     438            1 :             _bt_relbuf(rel, metabuf);
     439            1 :             return _bt_getroot(rel, heaprel, access);
     440              :         }
     441              : 
     442              :         /*
     443              :          * Get, initialize, write, and leave a lock of the appropriate type on
     444              :          * the new root page.  Since this is the first page in the tree, it's
     445              :          * a leaf as well as the root.
     446              :          */
     447         7237 :         rootbuf = _bt_allocbuf(rel, heaprel);
     448         7237 :         rootblkno = BufferGetBlockNumber(rootbuf);
     449         7237 :         rootpage = BufferGetPage(rootbuf);
     450         7237 :         rootopaque = BTPageGetOpaque(rootpage);
     451         7237 :         rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
     452         7237 :         rootopaque->btpo_flags = (BTP_LEAF | BTP_ROOT);
     453         7237 :         rootopaque->btpo_level = 0;
     454         7237 :         rootopaque->btpo_cycleid = 0;
     455              :         /* Get raw page pointer for metapage */
     456         7237 :         metapg = BufferGetPage(metabuf);
     457              : 
     458              :         /* NO ELOG(ERROR) till meta is updated */
     459         7237 :         START_CRIT_SECTION();
     460              : 
     461              :         /* upgrade metapage if needed */
     462         7237 :         if (metad->btm_version < BTREE_NOVAC_VERSION)
     463            0 :             _bt_upgrademetapage(metapg);
     464              : 
     465         7237 :         metad->btm_root = rootblkno;
     466         7237 :         metad->btm_level = 0;
     467         7237 :         metad->btm_fastroot = rootblkno;
     468         7237 :         metad->btm_fastlevel = 0;
     469         7237 :         metad->btm_last_cleanup_num_delpages = 0;
     470         7237 :         metad->btm_last_cleanup_num_heap_tuples = -1.0;
     471              : 
     472         7237 :         MarkBufferDirty(rootbuf);
     473         7237 :         MarkBufferDirty(metabuf);
     474              : 
     475              :         /* XLOG stuff */
     476         7237 :         if (RelationNeedsWAL(rel))
     477         6920 :         {
     478              :             xl_btree_newroot xlrec;
     479              :             xl_btree_metadata md;
     480              : 
     481         6920 :             XLogBeginInsert();
     482         6920 :             XLogRegisterBuffer(0, rootbuf, REGBUF_WILL_INIT);
     483         6920 :             XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
     484              : 
     485              :             Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
     486         6920 :             md.version = metad->btm_version;
     487         6920 :             md.root = rootblkno;
     488         6920 :             md.level = 0;
     489         6920 :             md.fastroot = rootblkno;
     490         6920 :             md.fastlevel = 0;
     491         6920 :             md.last_cleanup_num_delpages = 0;
     492         6920 :             md.allequalimage = metad->btm_allequalimage;
     493              : 
     494         6920 :             XLogRegisterBufData(2, &md, sizeof(xl_btree_metadata));
     495              : 
     496         6920 :             xlrec.rootblk = rootblkno;
     497         6920 :             xlrec.level = 0;
     498              : 
     499         6920 :             XLogRegisterData(&xlrec, SizeOfBtreeNewroot);
     500              : 
     501         6920 :             recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT);
     502              :         }
     503              :         else
     504          317 :             recptr = XLogGetFakeLSN(rel);
     505              : 
     506         7237 :         PageSetLSN(rootpage, recptr);
     507         7237 :         PageSetLSN(metapg, recptr);
     508              : 
     509         7237 :         END_CRIT_SECTION();
     510              : 
     511              :         /*
     512              :          * swap root write lock for read lock.  There is no danger of anyone
     513              :          * else accessing the new root page while it's unlocked, since no one
     514              :          * else knows where it is yet.
     515              :          */
     516         7237 :         _bt_unlockbuf(rel, rootbuf);
     517         7237 :         _bt_lockbuf(rel, rootbuf, BT_READ);
     518              : 
     519              :         /* okay, metadata is correct, release lock on it without caching */
     520         7237 :         _bt_relbuf(rel, metabuf);
     521              :     }
     522              :     else
     523              :     {
     524         1218 :         rootblkno = metad->btm_fastroot;
     525              :         Assert(rootblkno != P_NONE);
     526         1218 :         rootlevel = metad->btm_fastlevel;
     527              : 
     528              :         /*
     529              :          * Cache the metapage data for next time
     530              :          */
     531         1218 :         rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
     532              :                                              sizeof(BTMetaPageData));
     533         1218 :         memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
     534              : 
     535              :         /*
     536              :          * We are done with the metapage; arrange to release it via first
     537              :          * _bt_relandgetbuf call
     538              :          */
     539         1218 :         rootbuf = metabuf;
     540              : 
     541              :         for (;;)
     542              :         {
     543         1218 :             rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);
     544         1218 :             rootpage = BufferGetPage(rootbuf);
     545         1218 :             rootopaque = BTPageGetOpaque(rootpage);
     546              : 
     547         1218 :             if (!P_IGNORE(rootopaque))
     548         1218 :                 break;
     549              : 
     550              :             /* it's dead, Jim.  step right one page */
     551            0 :             if (P_RIGHTMOST(rootopaque))
     552            0 :                 elog(ERROR, "no live root page found in index \"%s\"",
     553              :                      RelationGetRelationName(rel));
     554            0 :             rootblkno = rootopaque->btpo_next;
     555              :         }
     556              : 
     557         1218 :         if (rootopaque->btpo_level != rootlevel)
     558            0 :             elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
     559              :                  rootblkno, RelationGetRelationName(rel),
     560              :                  rootopaque->btpo_level, rootlevel);
     561              :     }
     562              : 
     563              :     /*
     564              :      * By here, we have a pin and read lock on the root page, and no lock set
     565              :      * on the metadata page.  Return the root page's buffer.
     566              :      */
     567         8455 :     return rootbuf;
     568              : }
     569              : 
/*
 *  _bt_gettrueroot() -- Get the true root page of the btree.
 *
 *      This is the same as the BT_READ case of _bt_getroot(), except
 *      we follow the true-root link not the fast-root link.
 *
 * By the time we acquire lock on the root page, it might have been split and
 * not be the true root anymore.  This is okay for the present uses of this
 * routine; we only really need to be able to move up at least one tree level
 * from whatever non-root page we were at.  If we ever do need to lock the
 * one true root page, we could loop here, re-reading the metapage on each
 * failure.  (Note that it wouldn't do to hold the lock on the metapage while
 * moving to the root --- that'd deadlock against any concurrent root split.)
 *
 * Returns a pinned, read-locked buffer for the root page, or InvalidBuffer
 * if no root page has been initialized yet.  The metapage lock is always
 * released before returning.
 */
Buffer
_bt_gettrueroot(Relation rel)
{
    Buffer      metabuf;
    Page        metapg;
    BTPageOpaque metaopaque;
    Buffer      rootbuf;
    Page        rootpage;
    BTPageOpaque rootopaque;
    BlockNumber rootblkno;
    uint32      rootlevel;
    BTMetaPageData *metad;

    /*
     * We don't try to use cached metapage data here, since (a) this path is
     * not performance-critical, and (b) if we are here it suggests our cache
     * is out-of-date anyway.  In light of point (b), it's probably safest to
     * actively flush any cached metapage info.
     */
    if (rel->rd_amcache)
        pfree(rel->rd_amcache);
    rel->rd_amcache = NULL;

    /* Read and validate the metapage before trusting its root pointer */
    metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
    metapg = BufferGetPage(metabuf);
    metaopaque = BTPageGetOpaque(metapg);
    metad = BTPageGetMeta(metapg);

    if (!P_ISMETA(metaopaque) ||
        metad->btm_magic != BTREE_MAGIC)
        ereport(ERROR,
                (errcode(ERRCODE_INDEX_CORRUPTED),
                 errmsg("index \"%s\" is not a btree",
                        RelationGetRelationName(rel))));

    if (metad->btm_version < BTREE_MIN_VERSION ||
        metad->btm_version > BTREE_VERSION)
        ereport(ERROR,
                (errcode(ERRCODE_INDEX_CORRUPTED),
                 errmsg("version mismatch in index \"%s\": file version %d, "
                        "current version %d, minimal supported version %d",
                        RelationGetRelationName(rel),
                        metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION)));

    /* if no root page initialized yet, fail */
    if (metad->btm_root == P_NONE)
    {
        _bt_relbuf(rel, metabuf);
        return InvalidBuffer;
    }

    /* Note: true-root link, not btm_fastroot as in _bt_getroot() */
    rootblkno = metad->btm_root;
    rootlevel = metad->btm_level;

    /*
     * We are done with the metapage; arrange to release it via first
     * _bt_relandgetbuf call
     */
    rootbuf = metabuf;

    for (;;)
    {
        rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);
        rootpage = BufferGetPage(rootbuf);
        rootopaque = BTPageGetOpaque(rootpage);

        if (!P_IGNORE(rootopaque))
            break;

        /* it's dead, Jim.  step right one page */
        if (P_RIGHTMOST(rootopaque))
            elog(ERROR, "no live root page found in index \"%s\"",
                 RelationGetRelationName(rel));
        rootblkno = rootopaque->btpo_next;
    }

    /* The live page we landed on must be at the level the metapage claimed */
    if (rootopaque->btpo_level != rootlevel)
        elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
             rootblkno, RelationGetRelationName(rel),
             rootopaque->btpo_level, rootlevel);

    return rootbuf;
}
     667              : 
/*
 *  _bt_getrootheight() -- Get the height of the btree search tree.
 *
 *      We return the level (counting from zero) of the current fast root.
 *      This represents the number of tree levels we'd have to descend through
 *      to start any btree index search.
 *
 *      This is used by the planner for cost-estimation purposes.  Since it's
 *      only an estimate, slightly-stale data is fine, hence we don't worry
 *      about updating previously cached data.
 *
 * Returns 0 for an index with no root page yet; otherwise the cached
 * btm_fastlevel.  May populate rel->rd_amcache as a side effect.
 */
int
_bt_getrootheight(Relation rel)
{
    BTMetaPageData *metad;

    if (rel->rd_amcache == NULL)
    {
        Buffer      metabuf;

        /* No cached copy; read the metapage (read lock suffices) */
        metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
        metad = _bt_getmeta(rel, metabuf);

        /*
         * If there's no root page yet, _bt_getroot() doesn't expect a cache
         * to be made, so just stop here and report the index height is zero.
         * (XXX perhaps _bt_getroot() should be changed to allow this case.)
         */
        if (metad->btm_root == P_NONE)
        {
            _bt_relbuf(rel, metabuf);
            return 0;
        }

        /*
         * Cache the metapage data for next time
         */
        rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
                                             sizeof(BTMetaPageData));
        memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
        _bt_relbuf(rel, metabuf);
    }

    /* Get cached page */
    metad = (BTMetaPageData *) rel->rd_amcache;
    /* We shouldn't have cached it if any of these fail */
    Assert(metad->btm_magic == BTREE_MAGIC);
    Assert(metad->btm_version >= BTREE_MIN_VERSION);
    Assert(metad->btm_version <= BTREE_VERSION);
    Assert(!metad->btm_allequalimage ||
           metad->btm_version > BTREE_NOVAC_VERSION);
    Assert(metad->btm_fastroot != P_NONE);

    return metad->btm_fastlevel;
}
     723              : 
/*
 *  _bt_metaversion() -- Get version/status info from metapage.
 *
 *      Sets caller's *heapkeyspace and *allequalimage arguments using data
 *      from the B-Tree metapage (could be locally-cached version).  This
 *      information needs to be stashed in insertion scankey, so we provide a
 *      single function that fetches both at once.
 *
 *      This is used to determine the rules that must be used to descend a
 *      btree.  Version 4 indexes treat heap TID as a tiebreaker attribute.
 *      pg_upgrade'd version 3 indexes need extra steps to preserve reasonable
 *      performance when inserting a new BTScanInsert-wise duplicate tuple
 *      among many leaf pages already full of such duplicates.
 *
 *      Also sets allequalimage field, which indicates whether or not it is
 *      safe to apply deduplication.  We rely on the assumption that
 *      btm_allequalimage will be zero'ed on heapkeyspace indexes that were
 *      pg_upgrade'd from Postgres 12.
 *
 * May populate rel->rd_amcache as a side effect (but only once the index
 * actually has a root page).
 */
void
_bt_metaversion(Relation rel, bool *heapkeyspace, bool *allequalimage)
{
    BTMetaPageData *metad;

    if (rel->rd_amcache == NULL)
    {
        Buffer      metabuf;

        /* No cached copy; read the metapage (read lock suffices) */
        metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
        metad = _bt_getmeta(rel, metabuf);

        /*
         * If there's no root page yet, _bt_getroot() doesn't expect a cache
         * to be made, so just stop here.  (XXX perhaps _bt_getroot() should
         * be changed to allow this case.)
         */
        if (metad->btm_root == P_NONE)
        {
            *heapkeyspace = metad->btm_version > BTREE_NOVAC_VERSION;
            *allequalimage = metad->btm_allequalimage;

            _bt_relbuf(rel, metabuf);
            return;
        }

        /*
         * Cache the metapage data for next time
         *
         * An on-the-fly version upgrade performed by _bt_upgrademetapage()
         * can change the nbtree version for an index without invalidating any
         * local cache.  This is okay because it can only happen when moving
         * from version 2 to version 3, both of which are !heapkeyspace
         * versions.
         */
        rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
                                             sizeof(BTMetaPageData));
        memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
        _bt_relbuf(rel, metabuf);
    }

    /* Get cached page */
    metad = (BTMetaPageData *) rel->rd_amcache;
    /* We shouldn't have cached it if any of these fail */
    Assert(metad->btm_magic == BTREE_MAGIC);
    Assert(metad->btm_version >= BTREE_MIN_VERSION);
    Assert(metad->btm_version <= BTREE_VERSION);
    Assert(!metad->btm_allequalimage ||
           metad->btm_version > BTREE_NOVAC_VERSION);
    Assert(metad->btm_fastroot != P_NONE);

    *heapkeyspace = metad->btm_version > BTREE_NOVAC_VERSION;
    *allequalimage = metad->btm_allequalimage;
}
     797              : 
     798              : /*
     799              :  *  _bt_checkpage() -- Verify that a freshly-read page looks sane.
     800              :  */
     801              : void
     802     28692127 : _bt_checkpage(Relation rel, Buffer buf)
     803              : {
     804     28692127 :     Page        page = BufferGetPage(buf);
     805              : 
     806              :     /*
     807              :      * ReadBuffer verifies that every newly-read page passes
     808              :      * PageHeaderIsValid, which means it either contains a reasonably sane
     809              :      * page header or is all-zero.  We have to defend against the all-zero
     810              :      * case, however.
     811              :      */
     812     28692127 :     if (PageIsNew(page))
     813            0 :         ereport(ERROR,
     814              :                 (errcode(ERRCODE_INDEX_CORRUPTED),
     815              :                  errmsg("index \"%s\" contains unexpected zero page at block %u",
     816              :                         RelationGetRelationName(rel),
     817              :                         BufferGetBlockNumber(buf)),
     818              :                  errhint("Please REINDEX it.")));
     819              : 
     820              :     /*
     821              :      * Additionally check that the special area looks sane.
     822              :      */
     823     28692127 :     if (PageGetSpecialSize(page) != MAXALIGN(sizeof(BTPageOpaqueData)))
     824            0 :         ereport(ERROR,
     825              :                 (errcode(ERRCODE_INDEX_CORRUPTED),
     826              :                  errmsg("index \"%s\" contains corrupted page at block %u",
     827              :                         RelationGetRelationName(rel),
     828              :                         BufferGetBlockNumber(buf)),
     829              :                  errhint("Please REINDEX it.")));
     830     28692127 : }
     831              : 
     832              : /*
     833              :  *  _bt_getbuf() -- Get an existing block in a buffer, for read or write.
     834              :  *
     835              :  *      The general rule in nbtree is that it's never okay to access a
     836              :  *      page without holding both a buffer pin and a buffer lock on
     837              :  *      the page's buffer.
     838              :  *
     839              :  *      When this routine returns, the appropriate lock is set on the
     840              :  *      requested buffer and its reference count has been incremented
     841              :  *      (ie, the buffer is "locked and pinned").  Also, we apply
     842              :  *      _bt_checkpage to sanity-check the page, and perform Valgrind
     843              :  *      client requests that help Valgrind detect unsafe page accesses.
     844              :  *
     845              :  *      Note: raw LockBuffer() calls are disallowed in nbtree; all
     846              :  *      buffer lock requests need to go through wrapper functions such
     847              :  *      as _bt_lockbuf().
     848              :  */
     849              : Buffer
     850     16421948 : _bt_getbuf(Relation rel, BlockNumber blkno, int access)
     851              : {
     852              :     Buffer      buf;
     853              : 
     854              :     Assert(BlockNumberIsValid(blkno));
     855              : 
     856              :     /* Read an existing block of the relation */
     857     16421948 :     buf = ReadBuffer(rel, blkno);
     858     16421948 :     _bt_lockbuf(rel, buf, access);
     859     16421948 :     _bt_checkpage(rel, buf);
     860              : 
     861     16421948 :     return buf;
     862              : }
     863              : 
/*
 *  _bt_allocbuf() -- Allocate a new block/page.
 *
 * Returns a write-locked buffer containing an unallocated nbtree page.
 *
 * Callers are required to pass a valid heaprel.  We need heaprel so that we
 * can handle generating a snapshotConflictHorizon that makes reusing a page
 * from the FSM safe for queries that may be running on standbys.
 *
 * The returned page is always freshly (re)initialized by _bt_pageinit();
 * it comes either from a recycled page reported by the FSM or from
 * physically extending the relation by one block.
 */
Buffer
_bt_allocbuf(Relation rel, Relation heaprel)
{
    Buffer      buf;
    BlockNumber blkno;
    Page        page;

    Assert(heaprel != NULL);

    /*
     * First see if the FSM knows of any free pages.
     *
     * We can't trust the FSM's report unreservedly; we have to check that the
     * page is still free.  (For example, an already-free page could have been
     * re-used between the time the last VACUUM scanned it and the time the
     * VACUUM made its FSM updates.)
     *
     * In fact, it's worse than that: we can't even assume that it's safe to
     * take a lock on the reported page.  If somebody else has a lock on it,
     * or even worse our own caller does, we could deadlock.  (The own-caller
     * scenario is actually not improbable. Consider an index on a serial or
     * timestamp column.  Nearly all splits will be at the rightmost page, so
     * it's entirely likely that _bt_split will call us while holding a lock
     * on the page most recently acquired from FSM. A VACUUM running
     * concurrently with the previous split could well have placed that page
     * back in FSM.)
     *
     * To get around that, we ask for only a conditional lock on the reported
     * page.  If we fail, then someone else is using the page, and we may
     * reasonably assume it's not free.  (If we happen to be wrong, the worst
     * consequence is the page will be lost to use till the next VACUUM, which
     * is no big problem.)
     */
    for (;;)
    {
        blkno = GetFreeIndexPage(rel);
        if (blkno == InvalidBlockNumber)
            break;              /* FSM has nothing; fall through to extend */
        buf = ReadBuffer(rel, blkno);
        if (_bt_conditionallockbuf(rel, buf))
        {
            page = BufferGetPage(buf);

            /*
             * It's possible to find an all-zeroes page in an index.  For
             * example, a backend might successfully extend the relation one
             * page and then crash before it is able to make a WAL entry for
             * adding the page.  If we find a zeroed page then reclaim it
             * immediately.
             */
            if (PageIsNew(page))
            {
                /* Okay to use page.  Initialize and return it. */
                _bt_pageinit(page, BufferGetPageSize(buf));
                return buf;
            }

            if (BTPageIsRecyclable(page, heaprel))
            {
                /*
                 * If we are generating WAL for Hot Standby then create a WAL
                 * record that will allow us to conflict with queries running
                 * on standby, in case they have snapshots older than safexid
                 * value
                 */
                if (RelationNeedsWAL(rel) && XLogStandbyInfoActive())
                {
                    xl_btree_reuse_page xlrec_reuse;

                    /*
                     * Note that we don't register the buffer with the record,
                     * because this operation doesn't modify the page (that
                     * already happened, back when VACUUM deleted the page).
                     * This record only exists to provide a conflict point for
                     * Hot Standby.  See record REDO routine comments.
                     */
                    xlrec_reuse.locator = rel->rd_locator;
                    xlrec_reuse.block = blkno;
                    xlrec_reuse.snapshotConflictHorizon = BTPageGetDeleteXid(page);
                    xlrec_reuse.isCatalogRel =
                        RelationIsAccessibleInLogicalDecoding(heaprel);

                    XLogBeginInsert();
                    XLogRegisterData(&xlrec_reuse, SizeOfBtreeReusePage);

                    XLogInsert(RM_BTREE_ID, XLOG_BTREE_REUSE_PAGE);
                }

                /* Okay to use page.  Re-initialize and return it. */
                _bt_pageinit(page, BufferGetPageSize(buf));
                return buf;
            }
            /* Page in use after all; drop it and ask the FSM again */
            elog(DEBUG2, "FSM returned nonrecyclable page");
            _bt_relbuf(rel, buf);
        }
        else
        {
            elog(DEBUG2, "FSM returned nonlockable page");
            /* couldn't get lock, so just drop pin */
            ReleaseBuffer(buf);
        }
    }

    /*
     * Extend the relation by one page. Need to use RBM_ZERO_AND_LOCK or we
     * risk a race condition against btvacuumscan --- see comments therein.
     * This forces us to repeat the valgrind request that _bt_lockbuf()
     * otherwise would make, as we can't use _bt_lockbuf() without introducing
     * a race.
     */
    buf = ExtendBufferedRel(BMR_REL(rel), MAIN_FORKNUM, NULL, EB_LOCK_FIRST);
    if (!RelationUsesLocalBuffers(rel))
        VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ);

    /* Initialize the new page before returning it */
    page = BufferGetPage(buf);
    Assert(PageIsNew(page));
    _bt_pageinit(page, BufferGetPageSize(buf));

    return buf;
}
     994              : 
/*
 *  _bt_relandgetbuf() -- release a locked buffer and get another one.
 *
 * This is equivalent to _bt_relbuf followed by _bt_getbuf.  Also, if obuf is
 * InvalidBuffer then it reduces to just _bt_getbuf; allowing this case
 * simplifies some callers.
 *
 * The original motivation for using this was to avoid two entries to the
 * bufmgr when one would do.  However, now it's mainly just a notational
 * convenience.  The only case where it saves work over _bt_relbuf/_bt_getbuf
 * is when the target page is the same one already in the buffer.
 *
 * On return the caller holds a pin and an 'access'-mode lock on blkno's
 * buffer, and the page has passed _bt_checkpage().
 */
Buffer
_bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
{
    Buffer      buf;

    Assert(BlockNumberIsValid(blkno));
    if (BufferIsValid(obuf))
    {
        if (BufferGetBlockNumber(obuf) == blkno)
        {
            /* trade in old lock mode for new lock; pin is retained */
            _bt_unlockbuf(rel, obuf);
            buf = obuf;
        }
        else
        {
            /* release lock and pin at once, that's a bit more efficient */
            _bt_relbuf(rel, obuf);
            buf = ReadBuffer(rel, blkno);
        }
    }
    else
        buf = ReadBuffer(rel, blkno);

    _bt_lockbuf(rel, buf, access);
    _bt_checkpage(rel, buf);

    return buf;
}
    1036              : 
/*
 *  _bt_relbuf() -- release a locked buffer.
 *
 * Lock and pin (refcount) are both dropped. This is a bit more efficient than
 * doing the two operations separately.
 */
void
_bt_relbuf(Relation rel, Buffer buf)
{
    /*
     * Buffer is pinned and locked, which means that it is expected to be
     * defined and addressable.  Check that proactively.
     */
    VALGRIND_CHECK_MEM_IS_DEFINED(BufferGetPage(buf), BLCKSZ);
    /* Mark the page off-limits again once we no longer hold lock+pin */
    if (!RelationUsesLocalBuffers(rel))
        VALGRIND_MAKE_MEM_NOACCESS(BufferGetPage(buf), BLCKSZ);

    UnlockReleaseBuffer(buf);
}
    1056              : 
/*
 *  _bt_lockbuf() -- lock a pinned buffer.
 *
 * Lock is acquired without acquiring another pin.  This is like a raw
 * LockBuffer() call, but performs extra steps needed by Valgrind.
 *
 * Note: Caller may need to call _bt_checkpage() with buf when pin on buf
 * wasn't originally acquired in _bt_getbuf() or _bt_relandgetbuf().
 */
void
_bt_lockbuf(Relation rel, Buffer buf, int access)
{
    /* LockBuffer() asserts that pin is held by this backend */
    LockBuffer(buf, access);

    /*
     * It doesn't matter that _bt_unlockbuf() won't get called in the event of
     * an nbtree error (e.g. a unique violation error).  That won't cause
     * Valgrind false positives.
     *
     * The nbtree client requests are superimposed on top of the bufmgr.c
     * buffer pin client requests.  In the event of an nbtree error the buffer
     * will certainly get marked as defined when the backend once again
     * acquires its first pin on the buffer. (Of course, if the backend never
     * touches the buffer again then it doesn't matter that it remains
     * non-accessible to Valgrind.)
     *
     * Note: When an IndexTuple C pointer gets computed using an ItemId read
     * from a page while a lock was held, the C pointer becomes unsafe to
     * dereference forever as soon as the lock is released.  Valgrind can only
     * detect cases where the pointer gets dereferenced with no _current_
     * lock/pin held, though.
     */
    if (!RelationUsesLocalBuffers(rel))
        VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ);
}
    1093              : 
    1094              : /*
    1095              :  *  _bt_unlockbuf() -- unlock a pinned buffer.
    1096              :  */
    1097              : void
    1098      3324662 : _bt_unlockbuf(Relation rel, Buffer buf)
    1099              : {
    1100              :     /*
    1101              :      * Buffer is pinned and locked, which means that it is expected to be
    1102              :      * defined and addressable.  Check that proactively.
    1103              :      */
    1104              :     VALGRIND_CHECK_MEM_IS_DEFINED(BufferGetPage(buf), BLCKSZ);
    1105              : 
    1106              :     /* LockBuffer() asserts that pin is held by this backend */
    1107      3324662 :     LockBuffer(buf, BUFFER_LOCK_UNLOCK);
    1108              : 
    1109      3324662 :     if (!RelationUsesLocalBuffers(rel))
    1110              :         VALGRIND_MAKE_MEM_NOACCESS(BufferGetPage(buf), BLCKSZ);
    1111      3324662 : }
    1112              : 
    1113              : /*
    1114              :  *  _bt_conditionallockbuf() -- conditionally BT_WRITE lock pinned
    1115              :  *  buffer.
    1116              :  *
    1117              :  * Note: Caller may need to call _bt_checkpage() with buf when pin on buf
    1118              :  * wasn't originally acquired in _bt_getbuf() or _bt_relandgetbuf().
    1119              :  */
    1120              : bool
    1121        32746 : _bt_conditionallockbuf(Relation rel, Buffer buf)
    1122              : {
    1123              :     /* ConditionalLockBuffer() asserts that pin is held by this backend */
    1124        32746 :     if (!ConditionalLockBuffer(buf))
    1125         1431 :         return false;
    1126              : 
    1127        31315 :     if (!RelationUsesLocalBuffers(rel))
    1128              :         VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ);
    1129              : 
    1130        31315 :     return true;
    1131              : }
    1132              : 
    1133              : /*
    1134              :  *  _bt_upgradelockbufcleanup() -- upgrade lock to a full cleanup lock.
    1135              :  */
    1136              : void
    1137        12826 : _bt_upgradelockbufcleanup(Relation rel, Buffer buf)
    1138              : {
    1139              :     /*
    1140              :      * Buffer is pinned and locked, which means that it is expected to be
    1141              :      * defined and addressable.  Check that proactively.
    1142              :      */
    1143              :     VALGRIND_CHECK_MEM_IS_DEFINED(BufferGetPage(buf), BLCKSZ);
    1144              : 
    1145              :     /* LockBuffer() asserts that pin is held by this backend */
    1146        12826 :     LockBuffer(buf, BUFFER_LOCK_UNLOCK);
    1147        12826 :     LockBufferForCleanup(buf);
    1148        12826 : }
    1149              : 
    1150              : /*
    1151              :  *  _bt_pageinit() -- Initialize a new page.
    1152              :  *
    1153              :  * On return, the page header is initialized; data space is empty;
    1154              :  * special space is zeroed out.
    1155              :  */
    1156              : void
    1157       103635 : _bt_pageinit(Page page, Size size)
    1158              : {
    1159       103635 :     PageInit(page, size, sizeof(BTPageOpaqueData));
    1160       103635 : }
    1161              : 
    1162              : /*
    1163              :  * Delete item(s) from a btree leaf page during VACUUM.
    1164              :  *
    1165              :  * This routine assumes that the caller already has a full cleanup lock on
    1166              :  * the buffer.  Also, the given deletable and updatable arrays *must* be
    1167              :  * sorted in ascending order.
    1168              :  *
    1169              :  * Routine deals with deleting TIDs when some (but not all) of the heap TIDs
    1170              :  * in an existing posting list item are to be removed.  This works by
    1171              :  * updating/overwriting an existing item with caller's new version of the item
    1172              :  * (a version that lacks the TIDs that are to be deleted).
    1173              :  *
    1174              :  * We record VACUUMs and b-tree deletes differently in WAL.  Deletes must
    1175              :  * generate their own snapshotConflictHorizon directly from the tableam,
    1176              :  * whereas VACUUMs rely on the initial VACUUM table scan performing
    1177              :  * WAL-logging that takes care of the issue for the table's indexes
    1178              :  * indirectly.  Also, we remove the VACUUM cycle ID from pages, which b-tree
    1179              :  * deletes don't do.
    1180              :  */
    1181              : void
    1182         7906 : _bt_delitems_vacuum(Relation rel, Buffer buf,
    1183              :                     OffsetNumber *deletable, int ndeletable,
    1184              :                     BTVacuumPosting *updatable, int nupdatable)
    1185              : {
    1186         7906 :     Page        page = BufferGetPage(buf);
    1187              :     BTPageOpaque opaque;
    1188         7906 :     bool        needswal = RelationNeedsWAL(rel);
    1189         7906 :     char       *updatedbuf = NULL;
    1190         7906 :     Size        updatedbuflen = 0;
    1191              :     OffsetNumber updatedoffsets[MaxIndexTuplesPerPage];
    1192              :     XLogRecPtr  recptr;
    1193              : 
    1194              :     /* Shouldn't be called unless there's something to do */
    1195              :     Assert(ndeletable > 0 || nupdatable > 0);
    1196              : 
    1197              :     /* Generate new version of posting lists without deleted TIDs */
    1198         7906 :     if (nupdatable > 0)
    1199          825 :         updatedbuf = _bt_delitems_update(updatable, nupdatable,
    1200              :                                          updatedoffsets, &updatedbuflen,
    1201              :                                          needswal);
    1202              : 
    1203              :     /* No ereport(ERROR) until changes are logged */
    1204         7906 :     START_CRIT_SECTION();
    1205              : 
    1206              :     /*
    1207              :      * Handle posting tuple updates.
    1208              :      *
    1209              :      * Deliberately do this before handling simple deletes.  If we did it the
    1210              :      * other way around (i.e. WAL record order -- simple deletes before
    1211              :      * updates) then we'd have to make compensating changes to the 'updatable'
    1212              :      * array of offset numbers.
    1213              :      *
    1214              :      * PageIndexTupleOverwrite() won't unset each item's LP_DEAD bit when it
    1215              :      * happens to already be set.  It's important that we not interfere with
    1216              :      * any future simple index tuple deletion operations.
    1217              :      */
    1218        29045 :     for (int i = 0; i < nupdatable; i++)
    1219              :     {
    1220        21139 :         OffsetNumber updatedoffset = updatedoffsets[i];
    1221              :         IndexTuple  itup;
    1222              :         Size        itemsz;
    1223              : 
    1224        21139 :         itup = updatable[i]->itup;
    1225        21139 :         itemsz = MAXALIGN(IndexTupleSize(itup));
    1226        21139 :         if (!PageIndexTupleOverwrite(page, updatedoffset, itup, itemsz))
    1227            0 :             elog(PANIC, "failed to update partially dead item in block %u of index \"%s\"",
    1228              :                  BufferGetBlockNumber(buf), RelationGetRelationName(rel));
    1229              :     }
    1230              : 
    1231              :     /* Now handle simple deletes of entire tuples */
    1232         7906 :     if (ndeletable > 0)
    1233         7630 :         PageIndexMultiDelete(page, deletable, ndeletable);
    1234              : 
    1235              :     /*
    1236              :      * We can clear the vacuum cycle ID since this page has certainly been
    1237              :      * processed by the current vacuum scan.
    1238              :      */
    1239         7906 :     opaque = BTPageGetOpaque(page);
    1240         7906 :     opaque->btpo_cycleid = 0;
    1241              : 
    1242              :     /*
    1243              :      * Clear the BTP_HAS_GARBAGE page flag.
    1244              :      *
    1245              :      * This flag indicates the presence of LP_DEAD items on the page (though
    1246              :      * not reliably).  Note that we only rely on it with pg_upgrade'd
    1247              :      * !heapkeyspace indexes.  That's why clearing it here won't usually
    1248              :      * interfere with simple index tuple deletion.
    1249              :      */
    1250         7906 :     opaque->btpo_flags &= ~BTP_HAS_GARBAGE;
    1251              : 
    1252         7906 :     MarkBufferDirty(buf);
    1253              : 
    1254              :     /* XLOG stuff */
    1255         7906 :     if (needswal)
    1256              :     {
    1257              :         xl_btree_vacuum xlrec_vacuum;
    1258              : 
    1259         7905 :         xlrec_vacuum.ndeleted = ndeletable;
    1260         7905 :         xlrec_vacuum.nupdated = nupdatable;
    1261              : 
    1262         7905 :         XLogBeginInsert();
    1263         7905 :         XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
    1264         7905 :         XLogRegisterData(&xlrec_vacuum, SizeOfBtreeVacuum);
    1265              : 
    1266         7905 :         if (ndeletable > 0)
    1267         7629 :             XLogRegisterBufData(0, deletable,
    1268              :                                 ndeletable * sizeof(OffsetNumber));
    1269              : 
    1270         7905 :         if (nupdatable > 0)
    1271              :         {
    1272          825 :             XLogRegisterBufData(0, updatedoffsets,
    1273              :                                 nupdatable * sizeof(OffsetNumber));
    1274          825 :             XLogRegisterBufData(0, updatedbuf, updatedbuflen);
    1275              :         }
    1276              : 
    1277         7905 :         recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_VACUUM);
    1278              :     }
    1279              :     else
    1280            1 :         recptr = XLogGetFakeLSN(rel);
    1281              : 
    1282         7906 :     PageSetLSN(page, recptr);
    1283              : 
    1284         7906 :     END_CRIT_SECTION();
    1285              : 
    1286              :     /* can't leak memory here */
    1287         7906 :     if (updatedbuf != NULL)
    1288          825 :         pfree(updatedbuf);
    1289              :     /* free tuples allocated within _bt_delitems_update() */
    1290        29045 :     for (int i = 0; i < nupdatable; i++)
    1291        21139 :         pfree(updatable[i]->itup);
    1292         7906 : }
    1293              : 
    1294              : /*
    1295              :  * Delete item(s) from a btree leaf page during single-page cleanup.
    1296              :  *
    1297              :  * This routine assumes that the caller has pinned and write locked the
    1298              :  * buffer.  Also, the given deletable and updatable arrays *must* be sorted in
    1299              :  * ascending order.
    1300              :  *
    1301              :  * Routine deals with deleting TIDs when some (but not all) of the heap TIDs
    1302              :  * in an existing posting list item are to be removed.  This works by
    1303              :  * updating/overwriting an existing item with caller's new version of the item
    1304              :  * (a version that lacks the TIDs that are to be deleted).
    1305              :  *
    1306              :  * This is nearly the same as _bt_delitems_vacuum as far as what it does to
    1307              :  * the page, but it needs its own snapshotConflictHorizon and isCatalogRel
    1308              :  * (from the tableam).  This is used by the REDO routine to generate recovery
    1309              :  * conflicts.  The other difference is that only _bt_delitems_vacuum will
    1310              :  * clear page's VACUUM cycle ID.
    1311              :  */
    1312              : static void
    1313         5655 : _bt_delitems_delete(Relation rel, Buffer buf,
    1314              :                     TransactionId snapshotConflictHorizon, bool isCatalogRel,
    1315              :                     OffsetNumber *deletable, int ndeletable,
    1316              :                     BTVacuumPosting *updatable, int nupdatable)
    1317              : {
    1318         5655 :     Page        page = BufferGetPage(buf);
    1319              :     BTPageOpaque opaque;
    1320         5655 :     bool        needswal = RelationNeedsWAL(rel);
    1321         5655 :     char       *updatedbuf = NULL;
    1322         5655 :     Size        updatedbuflen = 0;
    1323              :     OffsetNumber updatedoffsets[MaxIndexTuplesPerPage];
    1324              :     XLogRecPtr  recptr;
    1325              : 
    1326              :     /* Shouldn't be called unless there's something to do */
    1327              :     Assert(ndeletable > 0 || nupdatable > 0);
    1328              : 
    1329              :     /* Generate new versions of posting lists without deleted TIDs */
    1330         5655 :     if (nupdatable > 0)
    1331          508 :         updatedbuf = _bt_delitems_update(updatable, nupdatable,
    1332              :                                          updatedoffsets, &updatedbuflen,
    1333              :                                          needswal);
    1334              : 
    1335              :     /* No ereport(ERROR) until changes are logged */
    1336         5655 :     START_CRIT_SECTION();
    1337              : 
    1338              :     /* Handle updates and deletes just like _bt_delitems_vacuum */
    1339        12363 :     for (int i = 0; i < nupdatable; i++)
    1340              :     {
    1341         6708 :         OffsetNumber updatedoffset = updatedoffsets[i];
    1342              :         IndexTuple  itup;
    1343              :         Size        itemsz;
    1344              : 
    1345         6708 :         itup = updatable[i]->itup;
    1346         6708 :         itemsz = MAXALIGN(IndexTupleSize(itup));
    1347         6708 :         if (!PageIndexTupleOverwrite(page, updatedoffset, itup, itemsz))
    1348            0 :             elog(PANIC, "failed to update partially dead item in block %u of index \"%s\"",
    1349              :                  BufferGetBlockNumber(buf), RelationGetRelationName(rel));
    1350              :     }
    1351              : 
    1352         5655 :     if (ndeletable > 0)
    1353         5597 :         PageIndexMultiDelete(page, deletable, ndeletable);
    1354              : 
    1355              :     /*
    1356              :      * Unlike _bt_delitems_vacuum, we *must not* clear the vacuum cycle ID at
    1357              :      * this point.  The VACUUM command alone controls vacuum cycle IDs.
    1358              :      */
    1359         5655 :     opaque = BTPageGetOpaque(page);
    1360              : 
    1361              :     /*
    1362              :      * Clear the BTP_HAS_GARBAGE page flag.
    1363              :      *
    1364              :      * This flag indicates the presence of LP_DEAD items on the page (though
    1365              :      * not reliably).  Note that we only rely on it with pg_upgrade'd
    1366              :      * !heapkeyspace indexes.
    1367              :      */
    1368         5655 :     opaque->btpo_flags &= ~BTP_HAS_GARBAGE;
    1369              : 
    1370         5655 :     MarkBufferDirty(buf);
    1371              : 
    1372              :     /* XLOG stuff */
    1373         5655 :     if (needswal)
    1374              :     {
    1375              :         xl_btree_delete xlrec_delete;
    1376              : 
    1377         5631 :         xlrec_delete.snapshotConflictHorizon = snapshotConflictHorizon;
    1378         5631 :         xlrec_delete.ndeleted = ndeletable;
    1379         5631 :         xlrec_delete.nupdated = nupdatable;
    1380         5631 :         xlrec_delete.isCatalogRel = isCatalogRel;
    1381              : 
    1382         5631 :         XLogBeginInsert();
    1383         5631 :         XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
    1384         5631 :         XLogRegisterData(&xlrec_delete, SizeOfBtreeDelete);
    1385              : 
    1386         5631 :         if (ndeletable > 0)
    1387         5573 :             XLogRegisterBufData(0, deletable,
    1388              :                                 ndeletable * sizeof(OffsetNumber));
    1389              : 
    1390         5631 :         if (nupdatable > 0)
    1391              :         {
    1392          508 :             XLogRegisterBufData(0, updatedoffsets,
    1393              :                                 nupdatable * sizeof(OffsetNumber));
    1394          508 :             XLogRegisterBufData(0, updatedbuf, updatedbuflen);
    1395              :         }
    1396              : 
    1397         5631 :         recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DELETE);
    1398              :     }
    1399              :     else
    1400           24 :         recptr = XLogGetFakeLSN(rel);
    1401              : 
    1402         5655 :     PageSetLSN(page, recptr);
    1403              : 
    1404         5655 :     END_CRIT_SECTION();
    1405              : 
    1406              :     /* can't leak memory here */
    1407         5655 :     if (updatedbuf != NULL)
    1408          508 :         pfree(updatedbuf);
    1409              :     /* free tuples allocated within _bt_delitems_update() */
    1410        12363 :     for (int i = 0; i < nupdatable; i++)
    1411         6708 :         pfree(updatable[i]->itup);
    1412         5655 : }
    1413              : 
    1414              : /*
    1415              :  * Set up state needed to delete TIDs from posting list tuples via "updating"
    1416              :  * the tuple.  Performs steps common to both _bt_delitems_vacuum and
    1417              :  * _bt_delitems_delete.  These steps must take place before each function's
    1418              :  * critical section begins.
    1419              :  *
    1420              :  * updatable and nupdatable are inputs, though note that we will use
    1421              :  * _bt_update_posting() to replace the original itup with a pointer to a final
    1422              :  * version in palloc()'d memory.  Caller should free the tuples when its done.
    1423              :  *
    1424              :  * The first nupdatable entries from updatedoffsets are set to the page offset
    1425              :  * number for posting list tuples that caller updates.  This is mostly useful
    1426              :  * because caller may need to WAL-log the page offsets (though we always do
    1427              :  * this for caller out of convenience).
    1428              :  *
    1429              :  * Returns buffer consisting of an array of xl_btree_update structs that
    1430              :  * describe the steps we perform here for caller (though only when needswal is
    1431              :  * true).  Also sets *updatedbuflen to the final size of the buffer.  This
    1432              :  * buffer is used by caller when WAL logging is required.
    1433              :  */
    1434              : static char *
    1435         1333 : _bt_delitems_update(BTVacuumPosting *updatable, int nupdatable,
    1436              :                     OffsetNumber *updatedoffsets, Size *updatedbuflen,
    1437              :                     bool needswal)
    1438              : {
    1439         1333 :     char       *updatedbuf = NULL;
    1440         1333 :     Size        buflen = 0;
    1441              : 
    1442              :     /* Shouldn't be called unless there's something to do */
    1443              :     Assert(nupdatable > 0);
    1444              : 
    1445        29180 :     for (int i = 0; i < nupdatable; i++)
    1446              :     {
    1447        27847 :         BTVacuumPosting vacposting = updatable[i];
    1448              :         Size        itemsz;
    1449              : 
    1450              :         /* Replace work area IndexTuple with updated version */
    1451        27847 :         _bt_update_posting(vacposting);
    1452              : 
    1453              :         /* Keep track of size of xl_btree_update for updatedbuf in passing */
    1454        27847 :         itemsz = SizeOfBtreeUpdate + vacposting->ndeletedtids * sizeof(uint16);
    1455        27847 :         buflen += itemsz;
    1456              : 
    1457              :         /* Build updatedoffsets buffer in passing */
    1458        27847 :         updatedoffsets[i] = vacposting->updatedoffset;
    1459              :     }
    1460              : 
    1461              :     /* XLOG stuff */
    1462         1333 :     if (needswal)
    1463              :     {
    1464         1333 :         Size        offset = 0;
    1465              : 
    1466              :         /* Allocate, set final size for caller */
    1467         1333 :         updatedbuf = palloc(buflen);
    1468         1333 :         *updatedbuflen = buflen;
    1469        29180 :         for (int i = 0; i < nupdatable; i++)
    1470              :         {
    1471        27847 :             BTVacuumPosting vacposting = updatable[i];
    1472              :             Size        itemsz;
    1473              :             xl_btree_update update;
    1474              : 
    1475        27847 :             update.ndeletedtids = vacposting->ndeletedtids;
    1476        27847 :             memcpy(updatedbuf + offset, &update.ndeletedtids,
    1477              :                    SizeOfBtreeUpdate);
    1478        27847 :             offset += SizeOfBtreeUpdate;
    1479              : 
    1480        27847 :             itemsz = update.ndeletedtids * sizeof(uint16);
    1481        27847 :             memcpy(updatedbuf + offset, vacposting->deletetids, itemsz);
    1482        27847 :             offset += itemsz;
    1483              :         }
    1484              :     }
    1485              : 
    1486         1333 :     return updatedbuf;
    1487              : }
    1488              : 
    1489              : /*
    1490              :  * Comparator used by _bt_delitems_delete_check() to restore deltids array
    1491              :  * back to its original leaf-page-wise sort order
    1492              :  */
    1493              : static int
    1494      3297424 : _bt_delitems_cmp(const void *a, const void *b)
    1495              : {
    1496      3297424 :     const TM_IndexDelete *indexdelete1 = a;
    1497      3297424 :     const TM_IndexDelete *indexdelete2 = b;
    1498              : 
    1499              :     Assert(indexdelete1->id != indexdelete2->id);
    1500              : 
    1501      3297424 :     return pg_cmp_s16(indexdelete1->id, indexdelete2->id);
    1502              : }
    1503              : 
    1504              : /*
    1505              :  * Try to delete item(s) from a btree leaf page during single-page cleanup.
    1506              :  *
    1507              :  * nbtree interface to table_index_delete_tuples().  Deletes a subset of index
    1508              :  * tuples from caller's deltids array: those whose TIDs are found safe to
    1509              :  * delete by the tableam (or already marked LP_DEAD in index, and so already
    1510              :  * known to be deletable by our simple index deletion caller).  We physically
    1511              :  * delete index tuples from buf leaf page last of all (for index tuples where
    1512              :  * that is known to be safe following our table_index_delete_tuples() call).
    1513              :  *
    1514              :  * Simple index deletion caller only includes TIDs from index tuples marked
    1515              :  * LP_DEAD, as well as extra TIDs it found on the same leaf page that can be
    1516              :  * included without increasing the total number of distinct table blocks for
    1517              :  * the deletion operation as a whole.  This approach often allows us to delete
    1518              :  * some extra index tuples that were practically free for tableam to check in
    1519              :  * passing (when they actually turn out to be safe to delete).  It probably
    1520              :  * only makes sense for the tableam to go ahead with these extra checks when
    1521              :  * it is block-oriented (otherwise the checks probably won't be practically
    1522              :  * free, which we rely on).  The tableam interface requires the tableam side
    1523              :  * to handle the problem, though, so this is okay (we as an index AM are free
    1524              :  * to make the simplifying assumption that all tableams must be block-based).
    1525              :  *
    1526              :  * Bottom-up index deletion caller provides all the TIDs from the leaf page,
    1527              :  * without expecting that tableam will check most of them.  The tableam has
    1528              :  * considerable discretion around which entries/blocks it checks.  Our role in
    1529              :  * costing the bottom-up deletion operation is strictly advisory.
    1530              :  *
    1531              :  * Note: Caller must have added deltids entries (i.e. entries that go in
    1532              :  * delstate's main array) in leaf-page-wise order: page offset number order,
    1533              :  * TID order among entries taken from the same posting list tuple (tiebreak on
    1534              :  * TID).  This order is convenient to work with here.
    1535              :  *
    1536              :  * Note: We also rely on the id field of each deltids element "capturing" this
    1537              :  * original leaf-page-wise order.  That is, we expect to be able to get back
    1538              :  * to the original leaf-page-wise order just by sorting deltids on the id
    1539              :  * field (tableam will sort deltids for its own reasons, so we'll need to put
    1540              :  * it back in leaf-page-wise order afterwards).
    1541              :  */
    1542              : void
    1543         7526 : _bt_delitems_delete_check(Relation rel, Buffer buf, Relation heapRel,
    1544              :                           TM_IndexDeleteOp *delstate)
    1545              : {
    1546         7526 :     Page        page = BufferGetPage(buf);
    1547              :     TransactionId snapshotConflictHorizon;
    1548              :     bool        isCatalogRel;
    1549         7526 :     OffsetNumber postingidxoffnum = InvalidOffsetNumber;
    1550         7526 :     int         ndeletable = 0,
    1551         7526 :                 nupdatable = 0;
    1552              :     OffsetNumber deletable[MaxIndexTuplesPerPage];
    1553              :     BTVacuumPosting updatable[MaxIndexTuplesPerPage];
    1554              : 
    1555              :     /* Use tableam interface to determine which tuples to delete first */
    1556         7526 :     snapshotConflictHorizon = table_index_delete_tuples(heapRel, delstate);
    1557         7526 :     isCatalogRel = RelationIsAccessibleInLogicalDecoding(heapRel);
    1558              : 
    1559              :     /* Should not WAL-log snapshotConflictHorizon unless it's required */
    1560         7526 :     if (!XLogStandbyInfoActive())
    1561         2035 :         snapshotConflictHorizon = InvalidTransactionId;
    1562              : 
    1563              :     /*
    1564              :      * Construct a leaf-page-wise description of what _bt_delitems_delete()
    1565              :      * needs to do to physically delete index tuples from the page.
    1566              :      *
    1567              :      * Must sort deltids array to restore leaf-page-wise order (original order
    1568              :      * before call to tableam).  This is the order that the loop expects.
    1569              :      *
    1570              :      * Note that deltids array might be a lot smaller now.  It might even have
    1571              :      * no entries at all (with bottom-up deletion caller), in which case there
    1572              :      * is nothing left to do.
    1573              :      */
    1574         7526 :     qsort(delstate->deltids, delstate->ndeltids, sizeof(TM_IndexDelete),
    1575              :           _bt_delitems_cmp);
    1576         7526 :     if (delstate->ndeltids == 0)
    1577              :     {
    1578              :         Assert(delstate->bottomup);
    1579         1871 :         return;
    1580              :     }
    1581              : 
    1582              :     /* We definitely have to delete at least one index tuple (or one TID) */
    1583       484879 :     for (int i = 0; i < delstate->ndeltids; i++)
    1584              :     {
    1585       479224 :         TM_IndexStatus *dstatus = delstate->status + delstate->deltids[i].id;
    1586       479224 :         OffsetNumber idxoffnum = dstatus->idxoffnum;
    1587       479224 :         ItemId      itemid = PageGetItemId(page, idxoffnum);
    1588       479224 :         IndexTuple  itup = (IndexTuple) PageGetItem(page, itemid);
    1589              :         int         nestedi,
    1590              :                     nitem;
    1591              :         BTVacuumPosting vacposting;
    1592              : 
    1593              :         Assert(OffsetNumberIsValid(idxoffnum));
    1594              : 
    1595       479224 :         if (idxoffnum == postingidxoffnum)
    1596              :         {
    1597              :             /*
    1598              :              * This deltid entry is a TID from a posting list tuple that has
    1599              :              * already been completely processed
    1600              :              */
    1601              :             Assert(BTreeTupleIsPosting(itup));
    1602              :             Assert(ItemPointerCompare(BTreeTupleGetHeapTID(itup),
    1603              :                                       &delstate->deltids[i].tid) < 0);
    1604              :             Assert(ItemPointerCompare(BTreeTupleGetMaxHeapTID(itup),
    1605              :                                       &delstate->deltids[i].tid) >= 0);
    1606        18840 :             continue;
    1607              :         }
    1608              : 
    1609       460384 :         if (!BTreeTupleIsPosting(itup))
    1610              :         {
    1611              :             /* Plain non-pivot tuple */
    1612              :             Assert(ItemPointerEquals(&itup->t_tid, &delstate->deltids[i].tid));
    1613       443853 :             if (dstatus->knowndeletable)
    1614       357634 :                 deletable[ndeletable++] = idxoffnum;
    1615       443853 :             continue;
    1616              :         }
    1617              : 
    1618              :         /*
    1619              :          * itup is a posting list tuple whose lowest deltids entry (which may
    1620              :          * or may not be for the first TID from itup) is considered here now.
    1621              :          * We should process all of the deltids entries for the posting list
    1622              :          * together now, though (not just the lowest).  Remember to skip over
    1623              :          * later itup-related entries during later iterations of outermost
    1624              :          * loop.
    1625              :          */
    1626        16531 :         postingidxoffnum = idxoffnum;   /* Remember work in outermost loop */
    1627        16531 :         nestedi = i;            /* Initialize for first itup deltids entry */
    1628        16531 :         vacposting = NULL;      /* Describes final action for itup */
    1629        16531 :         nitem = BTreeTupleGetNPosting(itup);
    1630        74268 :         for (int p = 0; p < nitem; p++)
    1631              :         {
    1632        57737 :             ItemPointer ptid = BTreeTupleGetPostingN(itup, p);
    1633        57737 :             int         ptidcmp = -1;
    1634              : 
    1635              :             /*
    1636              :              * This nested loop reuses work across ptid TIDs taken from itup.
    1637              :              * We take advantage of the fact that both itup's TIDs and deltids
    1638              :              * entries (within a single itup/posting list grouping) must both
    1639              :              * be in ascending TID order.
    1640              :              */
    1641        83502 :             for (; nestedi < delstate->ndeltids; nestedi++)
    1642              :             {
    1643        80427 :                 TM_IndexDelete *tcdeltid = &delstate->deltids[nestedi];
    1644        80427 :                 TM_IndexStatus *tdstatus = (delstate->status + tcdeltid->id);
    1645              : 
    1646              :                 /* Stop once we get past all itup related deltids entries */
    1647              :                 Assert(tdstatus->idxoffnum >= idxoffnum);
    1648        80427 :                 if (tdstatus->idxoffnum != idxoffnum)
    1649        14921 :                     break;
    1650              : 
    1651              :                 /* Skip past non-deletable itup related entries up front */
    1652        65506 :                 if (!tdstatus->knowndeletable)
    1653         5397 :                     continue;
    1654              : 
    1655              :                 /* Entry is first partial ptid match (or an exact match)? */
    1656        60109 :                 ptidcmp = ItemPointerCompare(&tcdeltid->tid, ptid);
    1657        60109 :                 if (ptidcmp >= 0)
    1658              :                 {
    1659              :                     /* Greater than or equal (partial or exact) match... */
    1660        39741 :                     break;
    1661              :                 }
    1662              :             }
    1663              : 
    1664              :             /* ...exact ptid match to a deletable deltids entry? */
    1665        57737 :             if (ptidcmp != 0)
    1666        27763 :                 continue;
    1667              : 
    1668              :             /* Exact match for deletable deltids entry -- ptid gets deleted */
    1669        29974 :             if (vacposting == NULL)
    1670              :             {
    1671        14757 :                 vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) +
    1672              :                                     nitem * sizeof(uint16));
    1673        14757 :                 vacposting->itup = itup;
    1674        14757 :                 vacposting->updatedoffset = idxoffnum;
    1675        14757 :                 vacposting->ndeletedtids = 0;
    1676              :             }
    1677        29974 :             vacposting->deletetids[vacposting->ndeletedtids++] = p;
    1678              :         }
    1679              : 
    1680              :         /* Final decision on itup, a posting list tuple */
    1681              : 
    1682        16531 :         if (vacposting == NULL)
    1683              :         {
    1684              :             /* No TIDs to delete from itup -- do nothing */
    1685              :         }
    1686        14757 :         else if (vacposting->ndeletedtids == nitem)
    1687              :         {
    1688              :             /* Straight delete of itup (to delete all TIDs) */
    1689         8049 :             deletable[ndeletable++] = idxoffnum;
    1690              :             /* Turns out we won't need granular information */
    1691         8049 :             pfree(vacposting);
    1692              :         }
    1693              :         else
    1694              :         {
    1695              :             /* Delete some (but not all) TIDs from itup */
    1696              :             Assert(vacposting->ndeletedtids > 0 &&
    1697              :                    vacposting->ndeletedtids < nitem);
    1698         6708 :             updatable[nupdatable++] = vacposting;
    1699              :         }
    1700              :     }
    1701              : 
    1702              :     /* Physically delete tuples (or TIDs) using deletable (or updatable) */
    1703         5655 :     _bt_delitems_delete(rel, buf, snapshotConflictHorizon, isCatalogRel,
    1704              :                         deletable, ndeletable, updatable, nupdatable);
    1705              : 
    1706              :     /* be tidy */
    1707        12363 :     for (int i = 0; i < nupdatable; i++)
    1708         6708 :         pfree(updatable[i]);
    1709              : }
    1710              : 
    1711              : /*
    1712              :  * Check that leftsib page (the btpo_prev of target page) is not marked with
    1713              :  * INCOMPLETE_SPLIT flag.  Used during page deletion.
    1714              :  *
    1715              :  * Returning true indicates that page flag is set in leftsib (which is
    1716              :  * definitely still the left sibling of target).  When that happens, the
    1717              :  * target doesn't have a downlink in parent, and the page deletion algorithm
    1718              :  * isn't prepared to handle that.  Deletion of the target page (or the whole
    1719              :  * subtree that contains the target page) cannot take place.
    1720              :  *
    1721              :  * Caller should not have a lock on the target page itself, since pages on the
    1722              :  * same level must always be locked left to right to avoid deadlocks.
    1723              :  */
    1724              : static bool
    1725         3634 : _bt_leftsib_splitflag(Relation rel, BlockNumber leftsib, BlockNumber target)
    1726              : {
    1727              :     Buffer      buf;
    1728              :     Page        page;
    1729              :     BTPageOpaque opaque;
    1730              :     bool        result;
    1731              : 
    1732              :     /* Easy case: No left sibling */
    1733         3634 :     if (leftsib == P_NONE)
    1734         2926 :         return false;
    1735              : 
    1736          708 :     buf = _bt_getbuf(rel, leftsib, BT_READ);
    1737          708 :     page = BufferGetPage(buf);
    1738          708 :     opaque = BTPageGetOpaque(page);
    1739              : 
    1740              :     /*
    1741              :      * If the left sibling was concurrently split, so that its next-pointer
    1742              :      * doesn't point to the current page anymore, the split that created
    1743              :      * target must be completed.  Caller can reasonably expect that there will
    1744              :      * be a downlink to the target page that it can relocate using its stack.
    1745              :      * (We don't allow splitting an incompletely split page again until the
    1746              :      * previous split has been completed.)
    1747              :      */
    1748          708 :     result = (opaque->btpo_next == target && P_INCOMPLETE_SPLIT(opaque));
    1749          708 :     _bt_relbuf(rel, buf);
    1750              : 
    1751          708 :     return result;
    1752              : }
    1753              : 
    1754              : /*
    1755              :  * Check that leafrightsib page (the btpo_next of target leaf page) is not
    1756              :  * marked with ISHALFDEAD flag.  Used during page deletion.
    1757              :  *
    1758              :  * Returning true indicates that page flag is set in leafrightsib, so page
    1759              :  * deletion cannot go ahead.  Our caller is not prepared to deal with the case
     1760              :  * where the parent page does not have a pivot tuple whose downlink points to
    1761              :  * leafrightsib (due to an earlier interrupted VACUUM operation).  It doesn't
    1762              :  * seem worth going to the trouble of teaching our caller to deal with it.
    1763              :  * The situation will be resolved after VACUUM finishes the deletion of the
    1764              :  * half-dead page (when a future VACUUM operation reaches the target page
    1765              :  * again).
    1766              :  *
    1767              :  * _bt_leftsib_splitflag() is called for both leaf pages and internal pages.
    1768              :  * _bt_rightsib_halfdeadflag() is only called for leaf pages, though.  This is
    1769              :  * okay because of the restriction on deleting pages that are the rightmost
    1770              :  * page of their parent (i.e. that such deletions can only take place when the
    1771              :  * entire subtree must be deleted).  The leaf level check made here will apply
    1772              :  * to a right "cousin" leaf page rather than a simple right sibling leaf page
    1773              :  * in cases where caller actually goes on to attempt deleting pages that are
    1774              :  * above the leaf page.  The right cousin leaf page is representative of the
    1775              :  * left edge of the subtree to the right of the to-be-deleted subtree as a
    1776              :  * whole, which is exactly the condition that our caller cares about.
    1777              :  * (Besides, internal pages are never marked half-dead, so it isn't even
    1778              :  * possible to _directly_ assess if an internal page is part of some other
    1779              :  * to-be-deleted subtree.)
    1780              :  */
    1781              : static bool
    1782         3435 : _bt_rightsib_halfdeadflag(Relation rel, BlockNumber leafrightsib)
    1783              : {
    1784              :     Buffer      buf;
    1785              :     Page        page;
    1786              :     BTPageOpaque opaque;
    1787              :     bool        result;
    1788              : 
    1789              :     Assert(leafrightsib != P_NONE);
    1790              : 
    1791         3435 :     buf = _bt_getbuf(rel, leafrightsib, BT_READ);
    1792         3435 :     page = BufferGetPage(buf);
    1793         3435 :     opaque = BTPageGetOpaque(page);
    1794              : 
    1795              :     Assert(P_ISLEAF(opaque) && !P_ISDELETED(opaque));
    1796         3435 :     result = P_ISHALFDEAD(opaque);
    1797         3435 :     _bt_relbuf(rel, buf);
    1798              : 
    1799         3435 :     return result;
    1800              : }
    1801              : 
    1802              : /*
    1803              :  * _bt_pagedel() -- Delete a leaf page from the b-tree, if legal to do so.
    1804              :  *
    1805              :  * This action unlinks the leaf page from the b-tree structure, removing all
    1806              :  * pointers leading to it --- but not touching its own left and right links.
    1807              :  * The page cannot be physically reclaimed right away, since other processes
    1808              :  * may currently be trying to follow links leading to the page; they have to
    1809              :  * be allowed to use its right-link to recover.  See nbtree/README.
    1810              :  *
    1811              :  * On entry, the target buffer must be pinned and locked (either read or write
    1812              :  * lock is OK).  The page must be an empty leaf page, which may be half-dead
    1813              :  * already (a half-dead page should only be passed to us when an earlier
    1814              :  * VACUUM operation was interrupted, though).  Note in particular that caller
    1815              :  * should never pass a buffer containing an existing deleted page here.  The
    1816              :  * lock and pin on caller's buffer will be dropped before we return.
    1817              :  *
    1818              :  * Maintains bulk delete stats for caller, which are taken from vstate.  We
    1819              :  * need to cooperate closely with caller here so that whole VACUUM operation
    1820              :  * reliably avoids any double counting of subsidiary-to-leafbuf pages that we
    1821              :  * delete in passing.  If such pages happen to be from a block number that is
    1822              :  * ahead of the current scanblkno position, then caller is expected to count
    1823              :  * them directly later on.  It's simpler for us to understand caller's
    1824              :  * requirements than it would be for caller to understand when or how a
    1825              :  * deleted page became deleted after the fact.
    1826              :  *
    1827              :  * NOTE: this leaks memory.  Rather than trying to clean up everything
    1828              :  * carefully, it's better to run it in a temp context that can be reset
    1829              :  * frequently.
    1830              :  */
    1831              : void
    1832         3542 : _bt_pagedel(Relation rel, Buffer leafbuf, BTVacState *vstate)
    1833              : {
    1834              :     BlockNumber rightsib;
    1835              :     bool        rightsib_empty;
    1836              :     Page        page;
    1837              :     BTPageOpaque opaque;
    1838              : 
    1839              :     /*
    1840              :      * Save original leafbuf block number from caller.  Only deleted blocks
    1841              :      * that are <= scanblkno are added to bulk delete stat's pages_deleted
    1842              :      * count.
    1843              :      */
    1844         3542 :     BlockNumber scanblkno = BufferGetBlockNumber(leafbuf);
    1845              : 
    1846              :     /*
    1847              :      * "stack" is a search stack leading (approximately) to the target page.
    1848              :      * It is initially NULL, but when iterating, we keep it to avoid
    1849              :      * duplicated search effort.
    1850              :      *
    1851              :      * Also, when "stack" is not NULL, we have already checked that the
    1852              :      * current page is not the right half of an incomplete split, i.e. the
    1853              :      * left sibling does not have its INCOMPLETE_SPLIT flag set, including
    1854              :      * when the current target page is to the right of caller's initial page
    1855              :      * (the scanblkno page).
    1856              :      */
    1857         3542 :     BTStack     stack = NULL;
    1858              : 
    1859              :     for (;;)
    1860              :     {
    1861         6979 :         page = BufferGetPage(leafbuf);
    1862         6979 :         opaque = BTPageGetOpaque(page);
    1863              : 
    1864              :         /*
    1865              :          * Internal pages are never deleted directly, only as part of deleting
    1866              :          * the whole subtree all the way down to leaf level.
    1867              :          *
    1868              :          * Also check for deleted pages here.  Caller never passes us a fully
    1869              :          * deleted page.  Only VACUUM can delete pages, so there can't have
    1870              :          * been a concurrent deletion.  Assume that we reached any deleted
    1871              :          * page encountered here by following a sibling link, and that the
    1872              :          * index is corrupt.
    1873              :          */
    1874              :         Assert(!P_ISDELETED(opaque));
    1875         6979 :         if (!P_ISLEAF(opaque) || P_ISDELETED(opaque))
    1876              :         {
    1877              :             /*
    1878              :              * Pre-9.4 page deletion only marked internal pages as half-dead,
    1879              :              * but now we only use that flag on leaf pages. The old algorithm
    1880              :              * was never supposed to leave half-dead pages in the tree, it was
    1881              :              * just a transient state, but it was nevertheless possible in
    1882              :              * error scenarios. We don't know how to deal with them here. They
    1883              :              * are harmless as far as searches are considered, but inserts
    1884              :              * into the deleted keyspace could add out-of-order downlinks in
    1885              :              * the upper levels. Log a notice, hopefully the admin will notice
    1886              :              * and reindex.
    1887              :              */
    1888            0 :             if (P_ISHALFDEAD(opaque))
    1889            0 :                 ereport(LOG,
    1890              :                         (errcode(ERRCODE_INDEX_CORRUPTED),
    1891              :                          errmsg("index \"%s\" contains a half-dead internal page",
    1892              :                                 RelationGetRelationName(rel)),
    1893              :                          errhint("This can be caused by an interrupted VACUUM in version 9.3 or older, before upgrade. Please REINDEX it.")));
    1894              : 
    1895            0 :             if (P_ISDELETED(opaque))
    1896            0 :                 ereport(LOG,
    1897              :                         (errcode(ERRCODE_INDEX_CORRUPTED),
    1898              :                          errmsg_internal("found deleted block %u while following right link from block %u in index \"%s\"",
    1899              :                                          BufferGetBlockNumber(leafbuf),
    1900              :                                          scanblkno,
    1901              :                                          RelationGetRelationName(rel))));
    1902              : 
    1903            0 :             _bt_relbuf(rel, leafbuf);
    1904          125 :             return;
    1905              :         }
    1906              : 
    1907              :         /*
    1908              :          * We can never delete rightmost pages nor root pages.  While at it,
    1909              :          * check that page is empty, since it's possible that the leafbuf page
    1910              :          * was empty a moment ago, but has since had some inserts.
    1911              :          *
    1912              :          * To keep the algorithm simple, we also never delete an incompletely
    1913              :          * split page (they should be rare enough that this doesn't make any
    1914              :          * meaningful difference to disk usage):
    1915              :          *
    1916              :          * The INCOMPLETE_SPLIT flag on the page tells us if the page is the
    1917              :          * left half of an incomplete split, but ensuring that it's not the
    1918              :          * right half is more complicated.  For that, we have to check that
    1919              :          * the left sibling doesn't have its INCOMPLETE_SPLIT flag set using
    1920              :          * _bt_leftsib_splitflag().  On the first iteration, we temporarily
    1921              :          * release the lock on scanblkno/leafbuf, check the left sibling, and
    1922              :          * construct a search stack to scanblkno.  On subsequent iterations,
    1923              :          * we know we stepped right from a page that passed these tests, so
    1924              :          * it's OK.
    1925              :          */
    1926         6979 :         if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) ||
    1927         6862 :             P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page) ||
    1928         6862 :             P_INCOMPLETE_SPLIT(opaque))
    1929              :         {
    1930              :             /* Should never fail to delete a half-dead page */
    1931              :             Assert(!P_ISHALFDEAD(opaque));
    1932              : 
    1933          117 :             _bt_relbuf(rel, leafbuf);
    1934          117 :             return;
    1935              :         }
    1936              : 
    1937              :         /*
    1938              :          * First, remove downlink pointing to the page (or a parent of the
    1939              :          * page, if we are going to delete a taller subtree), and mark the
    1940              :          * leafbuf page half-dead
    1941              :          */
    1942         6862 :         if (!P_ISHALFDEAD(opaque))
    1943              :         {
    1944              :             /*
    1945              :              * We need an approximate pointer to the page's parent page.  We
    1946              :              * use a variant of the standard search mechanism to search for
    1947              :              * the page's high key; this will give us a link to either the
    1948              :              * current parent or someplace to its left (if there are multiple
    1949              :              * equal high keys, which is possible with !heapkeyspace indexes).
    1950              :              *
    1951              :              * Also check if this is the right-half of an incomplete split
    1952              :              * (see comment above).
    1953              :              */
    1954         6862 :             if (!stack)
    1955         3427 :             {
    1956              :                 BTScanInsert itup_key;
    1957              :                 ItemId      itemid;
    1958              :                 IndexTuple  targetkey;
    1959              :                 BlockNumber leftsib,
    1960              :                             leafblkno;
    1961              :                 Buffer      sleafbuf;
    1962              : 
    1963         3427 :                 itemid = PageGetItemId(page, P_HIKEY);
    1964         3427 :                 targetkey = CopyIndexTuple((IndexTuple) PageGetItem(page, itemid));
    1965              : 
    1966         3427 :                 leftsib = opaque->btpo_prev;
    1967         3427 :                 leafblkno = BufferGetBlockNumber(leafbuf);
    1968              : 
    1969              :                 /*
    1970              :                  * To avoid deadlocks, we'd better drop the leaf page lock
    1971              :                  * before going further.
    1972              :                  */
    1973         3427 :                 _bt_unlockbuf(rel, leafbuf);
    1974              : 
    1975              :                 /*
    1976              :                  * Check that the left sibling of leafbuf (if any) is not
    1977              :                  * marked with INCOMPLETE_SPLIT flag before proceeding
    1978              :                  */
    1979              :                 Assert(leafblkno == scanblkno);
    1980         3427 :                 if (_bt_leftsib_splitflag(rel, leftsib, leafblkno))
    1981              :                 {
    1982            0 :                     ReleaseBuffer(leafbuf);
    1983            0 :                     return;
    1984              :                 }
    1985              : 
    1986              :                 /*
    1987              :                  * We need an insertion scan key, so build one.
    1988              :                  *
    1989              :                  * _bt_search searches for the leaf page that contains any
    1990              :                  * matching non-pivot tuples, but we need it to "search" for
    1991              :                  * the high key pivot from the page that we're set to delete.
    1992              :                  * Compensate for the mismatch by having _bt_search locate the
    1993              :                  * last position < equal-to-untruncated-prefix non-pivots.
    1994              :                  */
    1995         3427 :                 itup_key = _bt_mkscankey(rel, targetkey);
    1996              : 
    1997              :                 /* Set up a BTLessStrategyNumber-like insertion scan key */
    1998         3427 :                 itup_key->nextkey = false;
    1999         3427 :                 itup_key->backward = true;
    2000         3427 :                 stack = _bt_search(rel, NULL, itup_key, &sleafbuf, BT_READ, true);
    2001              :                 /* won't need a second lock or pin on leafbuf */
    2002         3427 :                 _bt_relbuf(rel, sleafbuf);
    2003              : 
    2004              :                 /*
    2005              :                  * Re-lock the leaf page, and start over to use our stack
    2006              :                  * within _bt_mark_page_halfdead.  We must do it that way
    2007              :                  * because it's possible that leafbuf can no longer be
    2008              :                  * deleted.  We need to recheck.
    2009              :                  *
    2010              :                  * Note: We can't simply hold on to the sleafbuf lock instead,
    2011              :                  * because it's barely possible that sleafbuf is not the same
    2012              :                  * page as leafbuf.  This happens when leafbuf split after our
    2013              :                  * original lock was dropped, but before _bt_search finished
    2014              :                  * its descent.  We rely on the assumption that we'll find
    2015              :                  * leafbuf isn't safe to delete anymore in this scenario.
    2016              :                  * (Page deletion can cope with the stack being to the left of
    2017              :                  * leafbuf, but not to the right of leafbuf.)
    2018              :                  */
    2019         3427 :                 _bt_lockbuf(rel, leafbuf, BT_WRITE);
    2020         3427 :                 continue;
    2021              :             }
    2022              : 
    2023              :             /*
    2024              :              * See if it's safe to delete the leaf page, and determine how
    2025              :              * many parent/internal pages above the leaf level will be
    2026              :              * deleted.  If it's safe then _bt_mark_page_halfdead will also
    2027              :              * perform the first phase of deletion, which includes marking the
    2028              :              * leafbuf page half-dead.
    2029              :              */
    2030              :             Assert(P_ISLEAF(opaque) && !P_IGNORE(opaque));
    2031         3435 :             if (!_bt_mark_page_halfdead(rel, vstate->info->heaprel, leafbuf,
    2032              :                                         stack))
    2033              :             {
    2034            8 :                 _bt_relbuf(rel, leafbuf);
    2035            8 :                 return;
    2036              :             }
    2037              :         }
    2038              :         else
    2039              :         {
    2040            0 :             INJECTION_POINT("nbtree-finish-half-dead-page-vacuum", NULL);
    2041              :         }
    2042              : 
    2043              :         /*
    2044              :          * Then unlink it from its siblings.  Each call to
    2045              :          * _bt_unlink_halfdead_page unlinks the topmost page from the subtree,
    2046              :          * making it shallower.  Iterate until the leafbuf page is deleted.
    2047              :          */
    2048         3427 :         rightsib_empty = false;
    2049              :         Assert(P_ISLEAF(opaque) && P_ISHALFDEAD(opaque));
    2050         7043 :         while (P_ISHALFDEAD(opaque))
    2051              :         {
    2052              :             /* Check for interrupts in _bt_unlink_halfdead_page */
    2053         3616 :             if (!_bt_unlink_halfdead_page(rel, leafbuf, scanblkno,
    2054              :                                           &rightsib_empty, vstate))
    2055              :             {
    2056              :                 /*
    2057              :                  * _bt_unlink_halfdead_page should never fail, since we
    2058              :                  * established that deletion is generally safe in
    2059              :                  * _bt_mark_page_halfdead -- index must be corrupt.
    2060              :                  *
    2061              :                  * Note that _bt_unlink_halfdead_page already released the
    2062              :                  * lock and pin on leafbuf for us.
    2063              :                  */
    2064              :                 Assert(false);
    2065            0 :                 return;
    2066              :             }
    2067              :         }
    2068              : 
    2069              :         Assert(P_ISLEAF(opaque) && P_ISDELETED(opaque));
    2070              : 
    2071         3427 :         rightsib = opaque->btpo_next;
    2072              : 
    2073         3427 :         _bt_relbuf(rel, leafbuf);
    2074              : 
    2075              :         /*
    2076              :          * Check here, as calling loops will have locks held, preventing
    2077              :          * interrupts from being processed.
    2078              :          */
    2079         3427 :         CHECK_FOR_INTERRUPTS();
    2080              : 
    2081              :         /*
    2082              :          * The page has now been deleted. If its right sibling is completely
    2083              :          * empty, it's possible that the reason we haven't deleted it earlier
    2084              :          * is that it was the rightmost child of the parent. Now that we
    2085              :          * removed the downlink for this page, the right sibling might now be
    2086              :          * the only child of the parent, and could be removed. It would be
    2087              :          * picked up by the next vacuum anyway, but might as well try to
    2088              :          * remove it now, so loop back to process the right sibling.
    2089              :          *
    2090              :          * Note: This relies on the assumption that _bt_getstackbuf() will be
    2091              :          * able to reuse our original descent stack with a different child
    2092              :          * block (provided that the child block is to the right of the
    2093              :          * original leaf page reached by _bt_search()). It will even update
    2094              :          * the descent stack each time we loop around, avoiding repeated work.
    2095              :          */
    2096         3427 :         if (!rightsib_empty)
    2097         3417 :             break;
    2098              : 
    2099           10 :         leafbuf = _bt_getbuf(rel, rightsib, BT_WRITE);
    2100              :     }
    2101              : }
    2102              : 
    2103              : /*
    2104              :  * First stage of page deletion.
    2105              :  *
    2106              :  * Establish the height of the to-be-deleted subtree with leafbuf at its
    2107              :  * lowest level, remove the downlink to the subtree, and mark leafbuf
    2108              :  * half-dead.  The final to-be-deleted subtree is usually just leafbuf itself,
    2109              :  * but may include additional internal pages (at most one per level of the
    2110              :  * tree below the root).
    2111              :  *
    2112              :  * Caller must pass a valid heaprel, since it's just about possible that our
    2113              :  * call to _bt_lock_subtree_parent will need to allocate a new index page to
    2114              :  * complete a page split.  Every call to _bt_allocbuf needs to pass a heaprel.
    2115              :  *
    2116              :  * Returns 'false' if leafbuf is unsafe to delete, usually because leafbuf is
    2117              :  * the rightmost child of its parent (and parent has more than one downlink).
    2118              :  * Returns 'true' when the first stage of page deletion completed
    2119              :  * successfully.
    2120              :  */
    2121              : static bool
    2122         3435 : _bt_mark_page_halfdead(Relation rel, Relation heaprel, Buffer leafbuf,
    2123              :                        BTStack stack)
    2124              : {
    2125              :     BlockNumber leafblkno;
    2126              :     BlockNumber leafrightsib;
    2127              :     BlockNumber topparent;
    2128              :     BlockNumber topparentrightsib;
    2129              :     ItemId      itemid;
    2130              :     Page        page;
    2131              :     BTPageOpaque opaque;
    2132              :     Buffer      subtreeparent;
    2133              :     OffsetNumber poffset;
    2134              :     OffsetNumber nextoffset;
    2135              :     IndexTuple  itup;
    2136              :     IndexTupleData trunctuple;
    2137              :     XLogRecPtr  recptr;
    2138              : 
    2139         3435 :     page = BufferGetPage(leafbuf);
    2140         3435 :     opaque = BTPageGetOpaque(page);
    2141              : 
    2142              :     Assert(!P_RIGHTMOST(opaque) && !P_ISROOT(opaque) &&
    2143              :            P_ISLEAF(opaque) && !P_IGNORE(opaque) &&
    2144              :            P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page));
    2145              :     Assert(heaprel != NULL);
    2146              : 
    2147              :     /*
    2148              :      * Save info about the leaf page.
    2149              :      */
    2150         3435 :     leafblkno = BufferGetBlockNumber(leafbuf);
    2151         3435 :     leafrightsib = opaque->btpo_next;
    2152              : 
    2153              :     /*
    2154              :      * Before attempting to lock the parent page, check that the right sibling
    2155              :      * is not in half-dead state.  A half-dead right sibling would have no
    2156              :      * downlink in the parent, which would be highly confusing later when we
    2157              :      * delete the downlink.  It would fail the "right sibling of target page
    2158              :      * is also the next child in parent page" cross-check below.
    2159              :      */
    2160         3435 :     if (_bt_rightsib_halfdeadflag(rel, leafrightsib))
    2161              :     {
    2162            0 :         elog(DEBUG1, "could not delete page %u because its right sibling %u is half-dead",
    2163              :              leafblkno, leafrightsib);
    2164            0 :         return false;
    2165              :     }
    2166              : 
    2167              :     /*
    2168              :      * We cannot delete a page that is the rightmost child of its immediate
    2169              :      * parent, unless it is the only child --- in which case the parent has to
    2170              :      * be deleted too, and the same condition applies recursively to it. We
    2171              :      * have to check this condition all the way up before trying to delete,
    2172              :      * and lock the parent of the root of the to-be-deleted subtree (the
    2173              :      * "subtree parent").  _bt_lock_subtree_parent() locks the subtree parent
    2174              :      * for us.  We remove the downlink to the "top parent" page (subtree root
    2175              :      * page) from the subtree parent page below.
    2176              :      *
    2177              :      * Initialize topparent to be leafbuf page now.  The final to-be-deleted
    2178              :      * subtree is often a degenerate one page subtree consisting only of the
    2179              :      * leafbuf page.  When that happens, the leafbuf page is the final subtree
    2180              :      * root page/top parent page.
    2181              :      */
    2182         3435 :     topparent = leafblkno;
    2183         3435 :     topparentrightsib = leafrightsib;
    2184         3435 :     if (!_bt_lock_subtree_parent(rel, heaprel, leafblkno, stack,
    2185              :                                  &subtreeparent, &poffset,
    2186              :                                  &topparent, &topparentrightsib))
    2187            8 :         return false;
    2188              : 
    2189         3427 :     page = BufferGetPage(subtreeparent);
    2190         3427 :     opaque = BTPageGetOpaque(page);
    2191              : 
    2192              : #ifdef USE_ASSERT_CHECKING
    2193              : 
    2194              :     /*
    2195              :      * This is just an assertion because _bt_lock_subtree_parent should have
    2196              :      * guaranteed tuple has the expected contents
    2197              :      */
    2198              :     itemid = PageGetItemId(page, poffset);
    2199              :     itup = (IndexTuple) PageGetItem(page, itemid);
    2200              :     Assert(BTreeTupleGetDownLink(itup) == topparent);
    2201              : #endif
    2202              : 
    2203         3427 :     nextoffset = OffsetNumberNext(poffset);
    2204         3427 :     itemid = PageGetItemId(page, nextoffset);
    2205         3427 :     itup = (IndexTuple) PageGetItem(page, itemid);
    2206              : 
    2207              :     /*
    2208              :      * Check that the parent-page index items we're about to delete/overwrite
    2209              :      * in subtree parent page contain what we expect.  This can fail if the
    2210              :      * index has become corrupt for some reason.  When that happens we back
    2211              :      * out of deletion of the leafbuf subtree.  (This is just like the case
    2212              :      * where _bt_lock_subtree_parent() cannot "re-find" leafbuf's downlink.)
    2213              :      */
    2214         3427 :     if (BTreeTupleGetDownLink(itup) != topparentrightsib)
    2215              :     {
    2216            0 :         ereport(LOG,
    2217              :                 (errcode(ERRCODE_INDEX_CORRUPTED),
    2218              :                  errmsg_internal("right sibling %u of block %u is not next child %u of block %u in index \"%s\"",
    2219              :                                  topparentrightsib, topparent,
    2220              :                                  BTreeTupleGetDownLink(itup),
    2221              :                                  BufferGetBlockNumber(subtreeparent),
    2222              :                                  RelationGetRelationName(rel))));
    2223              : 
    2224            0 :         _bt_relbuf(rel, subtreeparent);
    2225              :         Assert(false);
    2226            0 :         return false;
    2227              :     }
    2228              : 
    2229              :     /*
    2230              :      * Any insert which would have gone on the leaf block will now go to its
    2231              :      * right sibling.  In other words, the key space moves right.
    2232              :      */
    2233         3427 :     PredicateLockPageCombine(rel, leafblkno, leafrightsib);
    2234              : 
    2235              :     /* No ereport(ERROR) until changes are logged */
    2236         3427 :     START_CRIT_SECTION();
    2237              : 
    2238              :     /*
    2239              :      * Update parent of subtree.  We want to delete the downlink to the top
    2240              :      * parent page/root of the subtree, and the *following* key.  Easiest way
    2241              :      * is to copy the right sibling's downlink over the downlink that points
    2242              :      * to top parent page, and then delete the right sibling's original pivot
    2243              :      * tuple.
    2244              :      *
    2245              :      * Lanin and Shasha make the key space move left when deleting a page,
    2246              :      * whereas the key space moves right here.  That's why we cannot simply
    2247              :      * delete the pivot tuple with the downlink to the top parent page.  See
    2248              :      * nbtree/README.
    2249              :      */
    2250         3427 :     page = BufferGetPage(subtreeparent);
    2251         3427 :     opaque = BTPageGetOpaque(page);
    2252              : 
    2253         3427 :     itemid = PageGetItemId(page, poffset);
    2254         3427 :     itup = (IndexTuple) PageGetItem(page, itemid);
    2255         3427 :     BTreeTupleSetDownLink(itup, topparentrightsib);
    2256              : 
    2257         3427 :     nextoffset = OffsetNumberNext(poffset);
    2258         3427 :     PageIndexTupleDelete(page, nextoffset);
    2259              : 
    2260              :     /*
    2261              :      * Mark the leaf page as half-dead, and stamp it with a link to the top
    2262              :      * parent page.  When the leaf page is also the top parent page, the link
    2263              :      * is set to InvalidBlockNumber.
    2264              :      */
    2265         3427 :     page = BufferGetPage(leafbuf);
    2266         3427 :     opaque = BTPageGetOpaque(page);
    2267         3427 :     opaque->btpo_flags |= BTP_HALF_DEAD;
    2268              : 
    2269              :     Assert(PageGetMaxOffsetNumber(page) == P_HIKEY);
    2270         3427 :     MemSet(&trunctuple, 0, sizeof(IndexTupleData));
    2271         3427 :     trunctuple.t_info = sizeof(IndexTupleData);
    2272         3427 :     if (topparent != leafblkno)
    2273           85 :         BTreeTupleSetTopParent(&trunctuple, topparent);
    2274              :     else
    2275         3342 :         BTreeTupleSetTopParent(&trunctuple, InvalidBlockNumber);
    2276              : 
    2277         3427 :     if (!PageIndexTupleOverwrite(page, P_HIKEY, &trunctuple, IndexTupleSize(&trunctuple)))
    2278            0 :         elog(ERROR, "could not overwrite high key in half-dead page");
    2279              : 
    2280              :     /* Must mark buffers dirty before XLogInsert */
    2281         3427 :     MarkBufferDirty(subtreeparent);
    2282         3427 :     MarkBufferDirty(leafbuf);
    2283              : 
    2284              :     /* XLOG stuff */
    2285         3427 :     if (RelationNeedsWAL(rel))
    2286         3427 :     {
    2287              :         xl_btree_mark_page_halfdead xlrec;
    2288              : 
    2289         3427 :         xlrec.poffset = poffset;
    2290         3427 :         xlrec.leafblk = leafblkno;
    2291         3427 :         if (topparent != leafblkno)
    2292           85 :             xlrec.topparent = topparent;
    2293              :         else
    2294         3342 :             xlrec.topparent = InvalidBlockNumber;
    2295              : 
    2296         3427 :         XLogBeginInsert();
    2297         3427 :         XLogRegisterBuffer(0, leafbuf, REGBUF_WILL_INIT);
    2298         3427 :         XLogRegisterBuffer(1, subtreeparent, REGBUF_STANDARD);
    2299              : 
    2300         3427 :         page = BufferGetPage(leafbuf);
    2301         3427 :         opaque = BTPageGetOpaque(page);
    2302         3427 :         xlrec.leftblk = opaque->btpo_prev;
    2303         3427 :         xlrec.rightblk = opaque->btpo_next;
    2304              : 
    2305         3427 :         XLogRegisterData(&xlrec, SizeOfBtreeMarkPageHalfDead);
    2306              : 
    2307         3427 :         recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_MARK_PAGE_HALFDEAD);
    2308              :     }
    2309              :     else
    2310            0 :         recptr = XLogGetFakeLSN(rel);
    2311              : 
    2312         3427 :     page = BufferGetPage(subtreeparent);
    2313         3427 :     PageSetLSN(page, recptr);
    2314         3427 :     page = BufferGetPage(leafbuf);
    2315         3427 :     PageSetLSN(page, recptr);
    2316              : 
    2317         3427 :     END_CRIT_SECTION();
    2318              : 
    2319         3427 :     _bt_relbuf(rel, subtreeparent);
    2320         3427 :     return true;
    2321              : }
    2322              : 
    2323              : /*
    2324              :  * Second stage of page deletion.
    2325              :  *
    2326              :  * Unlinks a single page (in the subtree undergoing deletion) from its
    2327              :  * siblings.  Also marks the page deleted.
    2328              :  *
    2329              :  * To get rid of the whole subtree, including the leaf page itself, call here
    2330              :  * until the leaf page is deleted.  The original "top parent" established in
    2331              :  * the first stage of deletion is deleted in the first call here, while the
    2332              :  * leaf page is deleted in the last call here.  Note that the leaf page itself
    2333              :  * is often the initial top parent page.
    2334              :  *
    2335              :  * Returns 'false' if the page could not be unlinked (shouldn't happen).  If
    2336              :  * the right sibling of the current target page is empty, *rightsib_empty is
    2337              :  * set to true, allowing caller to delete the target's right sibling page in
    2338              :  * passing.  Note that *rightsib_empty is only actually used by caller when
    2339              :  * target page is leafbuf, following last call here for leafbuf/the subtree
    2340              :  * containing leafbuf.  (We always set *rightsib_empty for caller, just to be
    2341              :  * consistent.)
    2342              :  *
    2343              :  * Must hold pin and lock on leafbuf at entry (read or write doesn't matter).
    2344              :  * On success exit, we'll be holding pin and write lock.  On failure exit,
    2345              :  * we'll release both pin and lock before returning (we define it that way
    2346              :  * to avoid having to reacquire a lock we already released).
    2347              :  */
    2348              : static bool
    2349         3616 : _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
    2350              :                          bool *rightsib_empty, BTVacState *vstate)
    2351              : {
    2352         3616 :     BlockNumber leafblkno = BufferGetBlockNumber(leafbuf);
    2353         3616 :     IndexBulkDeleteResult *stats = vstate->stats;
    2354              :     BlockNumber leafleftsib;
    2355              :     BlockNumber leafrightsib;
    2356              :     BlockNumber target;
    2357              :     BlockNumber leftsib;
    2358              :     BlockNumber rightsib;
    2359         3616 :     Buffer      lbuf = InvalidBuffer;
    2360              :     Buffer      buf;
    2361              :     Buffer      rbuf;
    2362         3616 :     Buffer      metabuf = InvalidBuffer;
    2363         3616 :     Page        metapg = NULL;
    2364         3616 :     BTMetaPageData *metad = NULL;
    2365              :     ItemId      itemid;
    2366              :     Page        page;
    2367              :     BTPageOpaque opaque;
    2368              :     FullTransactionId safexid;
    2369              :     bool        rightsib_is_rightmost;
    2370              :     uint32      targetlevel;
    2371              :     IndexTuple  leafhikey;
    2372              :     BlockNumber leaftopparent;
    2373              :     XLogRecPtr  recptr;
    2374              : 
    2375         3616 :     page = BufferGetPage(leafbuf);
    2376         3616 :     opaque = BTPageGetOpaque(page);
    2377              : 
    2378              :     Assert(P_ISLEAF(opaque) && !P_ISDELETED(opaque) && P_ISHALFDEAD(opaque));
    2379              : 
    2380              :     /*
    2381              :      * Remember some information about the leaf page.
    2382              :      */
    2383         3616 :     itemid = PageGetItemId(page, P_HIKEY);
    2384         3616 :     leafhikey = (IndexTuple) PageGetItem(page, itemid);
    2385         3616 :     target = BTreeTupleGetTopParent(leafhikey);
    2386         3616 :     leafleftsib = opaque->btpo_prev;
    2387         3616 :     leafrightsib = opaque->btpo_next;
    2388              : 
    2389         3616 :     _bt_unlockbuf(rel, leafbuf);
    2390              : 
    2391         3616 :     INJECTION_POINT("nbtree-leave-page-half-dead", NULL);
    2392              : 
    2393              :     /*
    2394              :      * Check here, as calling loops will have locks held, preventing
    2395              :      * interrupts from being processed.
    2396              :      */
    2397         3616 :     CHECK_FOR_INTERRUPTS();
    2398              : 
    2399              :     /* Unlink the current top parent of the subtree */
    2400         3616 :     if (!BlockNumberIsValid(target))
    2401              :     {
    2402              :         /* Target is leaf page (or leaf page is top parent, if you prefer) */
    2403         3427 :         target = leafblkno;
    2404              : 
    2405         3427 :         buf = leafbuf;
    2406         3427 :         leftsib = leafleftsib;
    2407         3427 :         targetlevel = 0;
    2408              :     }
    2409              :     else
    2410              :     {
    2411              :         /* Target is the internal page taken from leaf's top parent link */
    2412              :         Assert(target != leafblkno);
    2413              : 
    2414              :         /* Fetch the block number of the target's left sibling */
    2415          189 :         buf = _bt_getbuf(rel, target, BT_READ);
    2416          189 :         page = BufferGetPage(buf);
    2417          189 :         opaque = BTPageGetOpaque(page);
    2418          189 :         leftsib = opaque->btpo_prev;
    2419          189 :         targetlevel = opaque->btpo_level;
    2420              :         Assert(targetlevel > 0);
    2421              : 
    2422              :         /*
    2423              :          * To avoid deadlocks, we'd better drop the target page lock before
    2424              :          * going further.
    2425              :          */
    2426          189 :         _bt_unlockbuf(rel, buf);
    2427              :     }
    2428              : 
    2429              :     /*
    2430              :      * We have to lock the pages we need to modify in the standard order:
    2431              :      * moving right, then up.  Else we will deadlock against other writers.
    2432              :      *
    2433              :      * So, first lock the leaf page, if it's not the target.  Then find and
    2434              :      * write-lock the current left sibling of the target page.  The sibling
    2435              :      * that was current a moment ago could have split, so we may have to move
    2436              :      * right.
    2437              :      */
    2438         3616 :     if (target != leafblkno)
    2439          189 :         _bt_lockbuf(rel, leafbuf, BT_WRITE);
    2440         3616 :     if (leftsib != P_NONE)
    2441              :     {
    2442          682 :         lbuf = _bt_getbuf(rel, leftsib, BT_WRITE);
    2443          682 :         page = BufferGetPage(lbuf);
    2444          682 :         opaque = BTPageGetOpaque(page);
    2445          682 :         while (P_ISDELETED(opaque) || opaque->btpo_next != target)
    2446              :         {
    2447            0 :             bool        leftsibvalid = true;
    2448              : 
    2449              :             /*
    2450              :              * Before we follow the link from the page that was the left
    2451              :              * sibling mere moments ago, validate its right link.  This
    2452              :              * reduces the opportunities for loop to fail to ever make any
    2453              :              * progress in the presence of index corruption.
    2454              :              *
    2455              :              * Note: we rely on the assumption that there can only be one
    2456              :              * vacuum process running at a time (against the same index).
    2457              :              */
    2458            0 :             if (P_RIGHTMOST(opaque) || P_ISDELETED(opaque) ||
    2459            0 :                 leftsib == opaque->btpo_next)
    2460            0 :                 leftsibvalid = false;
    2461              : 
    2462            0 :             leftsib = opaque->btpo_next;
    2463            0 :             _bt_relbuf(rel, lbuf);
    2464              : 
    2465            0 :             if (!leftsibvalid)
    2466              :             {
    2467              :                 /*
    2468              :                  * This is known to fail in the field; sibling link corruption
    2469              :                  * is relatively common.  Press on with vacuuming rather than
    2470              :                  * just throwing an ERROR.
    2471              :                  */
    2472            0 :                 ereport(LOG,
    2473              :                         (errcode(ERRCODE_INDEX_CORRUPTED),
    2474              :                          errmsg_internal("valid left sibling for deletion target could not be located: "
    2475              :                                          "left sibling %u of target %u with leafblkno %u and scanblkno %u on level %u of index \"%s\"",
    2476              :                                          leftsib, target, leafblkno, scanblkno,
    2477              :                                          targetlevel, RelationGetRelationName(rel))));
    2478              : 
    2479              :                 /* Must release all pins and locks on failure exit */
    2480            0 :                 ReleaseBuffer(buf);
    2481            0 :                 if (target != leafblkno)
    2482            0 :                     _bt_relbuf(rel, leafbuf);
    2483              : 
    2484            0 :                 return false;
    2485              :             }
    2486              : 
    2487            0 :             CHECK_FOR_INTERRUPTS();
    2488              : 
    2489              :             /* step right one page */
    2490            0 :             lbuf = _bt_getbuf(rel, leftsib, BT_WRITE);
    2491            0 :             page = BufferGetPage(lbuf);
    2492            0 :             opaque = BTPageGetOpaque(page);
    2493              :         }
    2494              :     }
    2495              :     else
    2496         2934 :         lbuf = InvalidBuffer;
    2497              : 
    2498              :     /* Next write-lock the target page itself */
    2499         3616 :     _bt_lockbuf(rel, buf, BT_WRITE);
    2500         3616 :     page = BufferGetPage(buf);
    2501         3616 :     opaque = BTPageGetOpaque(page);
    2502              : 
    2503              :     /*
    2504              :      * Check page is still empty etc, else abandon deletion.  This is just for
    2505              :      * paranoia's sake; a half-dead page cannot resurrect because there can be
    2506              :      * only one vacuum process running at a time.
    2507              :      */
    2508         3616 :     if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque))
    2509            0 :         elog(ERROR, "target page changed status unexpectedly in block %u of index \"%s\"",
    2510              :              target, RelationGetRelationName(rel));
    2511              : 
    2512         3616 :     if (opaque->btpo_prev != leftsib)
    2513            0 :         ereport(ERROR,
    2514              :                 (errcode(ERRCODE_INDEX_CORRUPTED),
    2515              :                  errmsg_internal("target page left link unexpectedly changed from %u to %u in block %u of index \"%s\"",
    2516              :                                  leftsib, opaque->btpo_prev, target,
    2517              :                                  RelationGetRelationName(rel))));
    2518              : 
    2519         3616 :     if (target == leafblkno)
    2520              :     {
    2521         3427 :         if (P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page) ||
    2522         3427 :             !P_ISLEAF(opaque) || !P_ISHALFDEAD(opaque))
    2523            0 :             elog(ERROR, "target leaf page changed status unexpectedly in block %u of index \"%s\"",
    2524              :                  target, RelationGetRelationName(rel));
    2525              : 
    2526              :         /* Leaf page is also target page: don't set leaftopparent */
    2527         3427 :         leaftopparent = InvalidBlockNumber;
    2528              :     }
    2529              :     else
    2530              :     {
    2531              :         IndexTuple  finaldataitem;
    2532              : 
    2533          189 :         if (P_FIRSTDATAKEY(opaque) != PageGetMaxOffsetNumber(page) ||
    2534          189 :             P_ISLEAF(opaque))
    2535            0 :             elog(ERROR, "target internal page on level %u changed status unexpectedly in block %u of index \"%s\"",
    2536              :                  targetlevel, target, RelationGetRelationName(rel));
    2537              : 
    2538              :         /* Target is internal: set leaftopparent for next call here...  */
    2539          189 :         itemid = PageGetItemId(page, P_FIRSTDATAKEY(opaque));
    2540          189 :         finaldataitem = (IndexTuple) PageGetItem(page, itemid);
    2541          189 :         leaftopparent = BTreeTupleGetDownLink(finaldataitem);
    2542              :         /* ...except when it would be a redundant pointer-to-self */
    2543          189 :         if (leaftopparent == leafblkno)
    2544           85 :             leaftopparent = InvalidBlockNumber;
    2545              :     }
    2546              : 
    2547              :     /* No leaftopparent for level 0 (leaf page) or level 1 target */
    2548              :     Assert(!BlockNumberIsValid(leaftopparent) || targetlevel > 1);
    2549              : 
    2550              :     /*
    2551              :      * And next write-lock the (current) right sibling.
    2552              :      */
    2553         3616 :     rightsib = opaque->btpo_next;
    2554         3616 :     rbuf = _bt_getbuf(rel, rightsib, BT_WRITE);
    2555         3616 :     page = BufferGetPage(rbuf);
    2556         3616 :     opaque = BTPageGetOpaque(page);
    2557              : 
    2558              :     /*
    2559              :      * Validate target's right sibling page.  Its left link must point back to
    2560              :      * the target page.
    2561              :      */
    2562         3616 :     if (opaque->btpo_prev != target)
    2563              :     {
    2564              :         /*
    2565              :          * This is known to fail in the field; sibling link corruption is
    2566              :          * relatively common.  Press on with vacuuming rather than just
    2567              :          * throwing an ERROR (same approach used for left-sibling's-right-link
    2568              :          * validation check a moment ago).
    2569              :          */
    2570            0 :         ereport(LOG,
    2571              :                 (errcode(ERRCODE_INDEX_CORRUPTED),
    2572              :                  errmsg_internal("right sibling's left-link doesn't match: "
    2573              :                                  "right sibling %u of target %u with leafblkno %u "
    2574              :                                  "and scanblkno %u spuriously links to non-target %u "
    2575              :                                  "on level %u of index \"%s\"",
    2576              :                                  rightsib, target, leafblkno,
    2577              :                                  scanblkno, opaque->btpo_prev,
    2578              :                                  targetlevel, RelationGetRelationName(rel))));
    2579              : 
    2580              :         /* Must release all pins and locks on failure exit */
    2581            0 :         if (BufferIsValid(lbuf))
    2582            0 :             _bt_relbuf(rel, lbuf);
    2583            0 :         _bt_relbuf(rel, rbuf);
    2584            0 :         _bt_relbuf(rel, buf);
    2585            0 :         if (target != leafblkno)
    2586            0 :             _bt_relbuf(rel, leafbuf);
    2587              : 
    2588            0 :         return false;
    2589              :     }
    2590              : 
    2591         3616 :     rightsib_is_rightmost = P_RIGHTMOST(opaque);
    2592         3616 :     *rightsib_empty = (P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page));
    2593              : 
    2594              :     /*
    2595              :      * If we are deleting the next-to-last page on the target's level, then
    2596              :      * the rightsib is a candidate to become the new fast root. (In theory, it
    2597              :      * might be possible to push the fast root even further down, but the odds
    2598              :      * of doing so are slim, and the locking considerations daunting.)
    2599              :      *
    2600              :      * We can safely acquire a lock on the metapage here --- see comments for
    2601              :      * _bt_newlevel().
    2602              :      */
    2603         3616 :     if (leftsib == P_NONE && rightsib_is_rightmost)
    2604              :     {
    2605           45 :         page = BufferGetPage(rbuf);
    2606           45 :         opaque = BTPageGetOpaque(page);
    2607           45 :         if (P_RIGHTMOST(opaque))
    2608              :         {
    2609              :             /* rightsib will be the only one left on the level */
    2610           45 :             metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
    2611           45 :             metapg = BufferGetPage(metabuf);
    2612           45 :             metad = BTPageGetMeta(metapg);
    2613              : 
    2614              :             /*
    2615              :              * The expected case here is btm_fastlevel == targetlevel+1; if
    2616              :              * the fastlevel is <= targetlevel, something is wrong, and we
    2617              :              * choose to overwrite it to fix it.
    2618              :              */
    2619           45 :             if (metad->btm_fastlevel > targetlevel + 1)
    2620              :             {
    2621              :                 /* no update wanted */
    2622            0 :                 _bt_relbuf(rel, metabuf);
    2623            0 :                 metabuf = InvalidBuffer;
    2624              :             }
    2625              :         }
    2626              :     }
    2627              : 
    2628              :     /*
    2629              :      * Here we begin doing the deletion.
    2630              :      */
    2631              : 
    2632              :     /* No ereport(ERROR) until changes are logged */
    2633         3616 :     START_CRIT_SECTION();
    2634              : 
    2635              :     /*
    2636              :      * Update siblings' side-links.  Note the target page's side-links will
    2637              :      * continue to point to the siblings.  Asserts here are just rechecking
    2638              :      * things we already verified above.
    2639              :      */
    2640         3616 :     if (BufferIsValid(lbuf))
    2641              :     {
    2642          682 :         page = BufferGetPage(lbuf);
    2643          682 :         opaque = BTPageGetOpaque(page);
    2644              :         Assert(opaque->btpo_next == target);
    2645          682 :         opaque->btpo_next = rightsib;
    2646              :     }
    2647         3616 :     page = BufferGetPage(rbuf);
    2648         3616 :     opaque = BTPageGetOpaque(page);
    2649              :     Assert(opaque->btpo_prev == target);
    2650         3616 :     opaque->btpo_prev = leftsib;
    2651              : 
    2652              :     /*
    2653              :      * If we deleted a parent of the targeted leaf page, instead of the leaf
    2654              :      * itself, update the leaf to point to the next remaining child in the
    2655              :      * subtree.
    2656              :      *
    2657              :      * Note: We rely on the fact that a buffer pin on the leaf page has been
    2658              :      * held since leafhikey was initialized.  This is safe, though only
    2659              :      * because the page was already half-dead at that point.  The leaf page
    2660              :      * cannot have been modified by any other backend during the period when
    2661              :      * no lock was held.
    2662              :      */
    2663         3616 :     if (target != leafblkno)
    2664          189 :         BTreeTupleSetTopParent(leafhikey, leaftopparent);
    2665              : 
    2666              :     /*
    2667              :      * Mark the page itself deleted.  It can be recycled when all current
    2668              :      * transactions are gone.  Storing GetTopTransactionId() would work, but
    2669              :      * we're in VACUUM and would not otherwise have an XID.  Having already
    2670              :      * updated links to the target, ReadNextFullTransactionId() suffices as an
    2671              :      * upper bound.  Any scan having retained a now-stale link is advertising
    2672              :      * in its PGPROC an xmin less than or equal to the value we read here.  It
    2673              :      * will continue to do so, holding back the xmin horizon, for the duration
    2674              :      * of that scan.
    2675              :      */
    2676         3616 :     page = BufferGetPage(buf);
    2677         3616 :     opaque = BTPageGetOpaque(page);
    2678              :     Assert(P_ISHALFDEAD(opaque) || !P_ISLEAF(opaque));
    2679              : 
    2680              :     /*
    2681              :      * Store upper bound XID that's used to determine when deleted page is no
    2682              :      * longer needed as a tombstone
    2683              :      */
    2684         3616 :     safexid = ReadNextFullTransactionId();
    2685         3616 :     BTPageSetDeleted(page, safexid);
    2686         3616 :     opaque->btpo_cycleid = 0;
    2687              : 
    2688              :     /* And update the metapage, if needed */
    2689         3616 :     if (BufferIsValid(metabuf))
    2690              :     {
    2691              :         /* upgrade metapage if needed */
    2692           45 :         if (metad->btm_version < BTREE_NOVAC_VERSION)
    2693            0 :             _bt_upgrademetapage(metapg);
    2694           45 :         metad->btm_fastroot = rightsib;
    2695           45 :         metad->btm_fastlevel = targetlevel;
    2696           45 :         MarkBufferDirty(metabuf);
    2697              :     }
    2698              : 
    2699              :     /* Must mark buffers dirty before XLogInsert */
    2700         3616 :     MarkBufferDirty(rbuf);
    2701         3616 :     MarkBufferDirty(buf);
    2702         3616 :     if (BufferIsValid(lbuf))
    2703          682 :         MarkBufferDirty(lbuf);
    2704         3616 :     if (target != leafblkno)
    2705          189 :         MarkBufferDirty(leafbuf);
    2706              : 
    2707              :     /* XLOG stuff */
    2708         3616 :     if (RelationNeedsWAL(rel))
    2709         3616 :     {
    2710              :         xl_btree_unlink_page xlrec;
    2711              :         xl_btree_metadata xlmeta;
    2712              :         uint8       xlinfo;
    2713              : 
    2714         3616 :         XLogBeginInsert();
    2715              : 
    2716         3616 :         XLogRegisterBuffer(0, buf, REGBUF_WILL_INIT);
    2717         3616 :         if (BufferIsValid(lbuf))
    2718          682 :             XLogRegisterBuffer(1, lbuf, REGBUF_STANDARD);
    2719         3616 :         XLogRegisterBuffer(2, rbuf, REGBUF_STANDARD);
    2720         3616 :         if (target != leafblkno)
    2721          189 :             XLogRegisterBuffer(3, leafbuf, REGBUF_WILL_INIT);
    2722              : 
    2723              :         /* information stored on the target/to-be-unlinked block */
    2724         3616 :         xlrec.leftsib = leftsib;
    2725         3616 :         xlrec.rightsib = rightsib;
    2726         3616 :         xlrec.level = targetlevel;
    2727         3616 :         xlrec.safexid = safexid;
    2728              : 
    2729              :         /* information needed to recreate the leaf block (if not the target) */
    2730         3616 :         xlrec.leafleftsib = leafleftsib;
    2731         3616 :         xlrec.leafrightsib = leafrightsib;
    2732         3616 :         xlrec.leaftopparent = leaftopparent;
    2733              : 
    2734         3616 :         XLogRegisterData(&xlrec, SizeOfBtreeUnlinkPage);
    2735              : 
    2736         3616 :         if (BufferIsValid(metabuf))
    2737              :         {
    2738           45 :             XLogRegisterBuffer(4, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
    2739              : 
    2740              :             Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
    2741           45 :             xlmeta.version = metad->btm_version;
    2742           45 :             xlmeta.root = metad->btm_root;
    2743           45 :             xlmeta.level = metad->btm_level;
    2744           45 :             xlmeta.fastroot = metad->btm_fastroot;
    2745           45 :             xlmeta.fastlevel = metad->btm_fastlevel;
    2746           45 :             xlmeta.last_cleanup_num_delpages = metad->btm_last_cleanup_num_delpages;
    2747           45 :             xlmeta.allequalimage = metad->btm_allequalimage;
    2748              : 
    2749           45 :             XLogRegisterBufData(4, &xlmeta, sizeof(xl_btree_metadata));
    2750           45 :             xlinfo = XLOG_BTREE_UNLINK_PAGE_META;
    2751              :         }
    2752              :         else
    2753         3571 :             xlinfo = XLOG_BTREE_UNLINK_PAGE;
    2754              : 
    2755         3616 :         recptr = XLogInsert(RM_BTREE_ID, xlinfo);
    2756              :     }
    2757              :     else
    2758            0 :         recptr = XLogGetFakeLSN(rel);
    2759              : 
    2760         3616 :     if (BufferIsValid(metabuf))
    2761           45 :         PageSetLSN(metapg, recptr);
    2762         3616 :     page = BufferGetPage(rbuf);
    2763         3616 :     PageSetLSN(page, recptr);
    2764         3616 :     page = BufferGetPage(buf);
    2765         3616 :     PageSetLSN(page, recptr);
    2766         3616 :     if (BufferIsValid(lbuf))
    2767              :     {
    2768          682 :         page = BufferGetPage(lbuf);
    2769          682 :         PageSetLSN(page, recptr);
    2770              :     }
    2771         3616 :     if (target != leafblkno)
    2772              :     {
    2773          189 :         page = BufferGetPage(leafbuf);
    2774          189 :         PageSetLSN(page, recptr);
    2775              :     }
    2776              : 
    2777         3616 :     END_CRIT_SECTION();
    2778              : 
    2779              :     /* release metapage */
    2780         3616 :     if (BufferIsValid(metabuf))
    2781           45 :         _bt_relbuf(rel, metabuf);
    2782              : 
    2783              :     /* release siblings */
    2784         3616 :     if (BufferIsValid(lbuf))
    2785          682 :         _bt_relbuf(rel, lbuf);
    2786         3616 :     _bt_relbuf(rel, rbuf);
    2787              : 
    2788              :     /* If the target is not leafbuf, we're done with it now -- release it */
    2789         3616 :     if (target != leafblkno)
    2790          189 :         _bt_relbuf(rel, buf);
    2791              : 
    2792              :     /*
    2793              :      * Maintain pages_newly_deleted, which is simply the number of pages
    2794              :      * deleted by the ongoing VACUUM operation.
    2795              :      *
    2796              :      * Maintain pages_deleted in a way that takes into account how
    2797              :      * btvacuumpage() will count deleted pages that have yet to become
    2798              :      * scanblkno -- only count page when it's not going to get that treatment
    2799              :      * later on.
    2800              :      */
    2801         3616 :     stats->pages_newly_deleted++;
    2802         3616 :     if (target <= scanblkno)
    2803         3452 :         stats->pages_deleted++;
    2804              : 
    2805              :     /*
    2806              :      * Remember information about the target page (now a newly deleted page)
    2807              :      * in dedicated vstate space for later.  The page will be considered as a
    2808              :      * candidate to place in the FSM at the end of the current btvacuumscan()
    2809              :      * call.
    2810              :      */
    2811         3616 :     _bt_pendingfsm_add(vstate, target, safexid);
    2812              : 
    2813              :     /* Success - hold on to lock on leafbuf (might also have been target) */
    2814         3616 :     return true;
    2815              : }
    2816              : 
    2817              : /*
    2818              :  * Establish how tall the to-be-deleted subtree will be during the first stage
    2819              :  * of page deletion.
    2820              :  *
    2821              :  * Caller's child argument is the block number of the page caller wants to
    2822              :  * delete (this is leafbuf's block number, except when we're called
    2823              :  * recursively).  stack is a search stack leading to it.  Note that we will
    2824              :  * update the stack entry(s) to reflect current downlink positions --- this is
    2825              :  * similar to the corresponding point in page split handling.
    2826              :  *
    2827              :  * If "first stage" caller cannot go ahead with deleting _any_ pages, returns
    2828              :  * false.  Returns true on success, in which case caller can use certain
    2829              :  * details established here to perform the first stage of deletion.  This
    2830              :  * function is the last point at which page deletion may be deemed unsafe
    2831              :  * (barring index corruption, or unexpected concurrent page deletions).
    2832              :  *
    2833              :  * We write lock the parent of the root of the to-be-deleted subtree for
    2834              :  * caller on success (i.e. we leave our lock on the *subtreeparent buffer for
    2835              :  * caller).  Caller will have to remove a downlink from *subtreeparent.  We
    2836              :  * also set a *subtreeparent offset number in *poffset, to indicate the
    2837              :  * location of the pivot tuple that contains the relevant downlink.
    2838              :  *
    2839              :  * The root of the to-be-deleted subtree is called the "top parent".  Note
    2840              :  * that the leafbuf page is often the final "top parent" page (you can think
    2841              :  * of the leafbuf page as a degenerate single page subtree when that happens).
    2842              :  * Caller should initialize *topparent to the target leafbuf page block number
    2843              :  * (while *topparentrightsib should be set to leafbuf's right sibling block
    2844              :  * number).  We will update *topparent (and *topparentrightsib) for caller
    2845              :  * here, though only when it turns out that caller will delete at least one
    2846              :  * internal page (i.e. only when caller needs to store a valid link to the top
    2847              :  * parent block in the leafbuf page using BTreeTupleSetTopParent()).
    2848              :  */
static bool
_bt_lock_subtree_parent(Relation rel, Relation heaprel, BlockNumber child,
                        BTStack stack, Buffer *subtreeparent,
                        OffsetNumber *poffset, BlockNumber *topparent,
                        BlockNumber *topparentrightsib)
{
    BlockNumber parent,
                leftsibparent;
    OffsetNumber parentoffset,
                maxoff;
    Buffer      pbuf;
    Page        page;
    BTPageOpaque opaque;

    /*
     * Locate the pivot tuple whose downlink points to "child".  Write lock
     * the parent page itself.
     */
    pbuf = _bt_getstackbuf(rel, heaprel, stack, child);
    if (pbuf == InvalidBuffer)
    {
        /*
         * Failed to "re-find" a pivot tuple whose downlink matched our child
         * block number on the parent level -- the index must be corrupt.
         * Don't even try to delete the leafbuf subtree.  Just report the
         * issue and press on with vacuuming the index.
         *
         * Note: _bt_getstackbuf() recovers from concurrent page splits that
         * take place on the parent level.  Its approach is a near-exhaustive
         * linear search.  This also gives it a surprisingly good chance of
         * recovering in the event of a buggy or inconsistent opclass.  But we
         * don't rely on that here.
         */
        ereport(LOG,
                (errcode(ERRCODE_INDEX_CORRUPTED),
                 errmsg_internal("failed to re-find parent key in index \"%s\" for deletion target page %u",
                                 RelationGetRelationName(rel), child)));
        Assert(false);
        return false;
    }

    /*
     * _bt_getstackbuf() refreshed the stack entry, so bts_blkno/bts_offset
     * now reflect the pivot tuple's current location on the parent level
     */
    parent = stack->bts_blkno;
    parentoffset = stack->bts_offset;

    page = BufferGetPage(pbuf);
    opaque = BTPageGetOpaque(page);
    maxoff = PageGetMaxOffsetNumber(page);
    /* remember parent's left sibling for the split-flag check further down */
    leftsibparent = opaque->btpo_prev;

    /*
     * _bt_getstackbuf() completes page splits on returned parent buffer when
     * required.
     *
     * In general it's a bad idea for VACUUM to use up more disk space, which
     * is why page deletion does not finish incomplete page splits most of the
     * time.  We allow this limited exception because the risk is much lower,
     * and the potential downside of not proceeding is much higher:  A single
     * internal page with the INCOMPLETE_SPLIT flag set might otherwise
     * prevent us from deleting hundreds of empty leaf pages from one level
     * down.
     */
    Assert(!P_INCOMPLETE_SPLIT(opaque));

    if (parentoffset < maxoff)
    {
        /*
         * Child is not the rightmost child in parent, so it's safe to delete
         * the subtree whose root/topparent is child page
         */
        *subtreeparent = pbuf;
        *poffset = parentoffset;
        return true;
    }

    /*
     * Child is the rightmost child of parent.
     *
     * Since it's the rightmost child of parent, deleting the child (or
     * deleting the subtree whose root/topparent is the child page) is only
     * safe when it's also possible to delete the parent.
     */
    Assert(parentoffset == maxoff);
    if (parentoffset != P_FIRSTDATAKEY(opaque) || P_RIGHTMOST(opaque))
    {
        /*
         * Child isn't parent's only child, or parent is rightmost on its
         * entire level.  Definitely cannot delete any pages.
         */
        _bt_relbuf(rel, pbuf);
        return false;
    }

    /*
     * Now make sure that the parent deletion is itself safe by examining the
     * child's grandparent page.  Recurse, passing the parent page as the
     * child page (child's grandparent is the parent on the next level up). If
     * parent deletion is unsafe, then child deletion must also be unsafe (in
     * which case caller cannot delete any pages at all).
     */
    *topparent = parent;
    *topparentrightsib = opaque->btpo_next;

    /*
     * Release lock on parent before recursing.
     *
     * It's OK to release page locks on parent before recursive call locks
     * grandparent.  An internal page can only acquire an entry if the child
     * is split, but that cannot happen as long as we still hold a lock on the
     * leafbuf page.
     */
    _bt_relbuf(rel, pbuf);

    /*
     * Before recursing, check that the left sibling of parent (if any) is not
     * marked with INCOMPLETE_SPLIT flag first (must do so after we drop the
     * parent lock).
     *
     * Note: We deliberately avoid completing incomplete splits here.
     */
    if (_bt_leftsib_splitflag(rel, leftsibparent, parent))
        return false;

    /* Recurse to examine child page's grandparent page */
    return _bt_lock_subtree_parent(rel, heaprel, parent, stack->bts_parent,
                                   subtreeparent, poffset,
                                   topparent, topparentrightsib);
}
    2976              : 
    2977              : /*
    2978              :  * Initialize local memory state used by VACUUM for _bt_pendingfsm_finalize
    2979              :  * optimization.
    2980              :  *
    2981              :  * Called at the start of a btvacuumscan().  Caller's cleanuponly argument
    2982              :  * indicates if ongoing VACUUM has not (and will not) call btbulkdelete().
    2983              :  *
    2984              :  * We expect to allocate memory inside VACUUM's top-level memory context here.
    2985              :  * The working buffer is subject to a limit based on work_mem.  Our strategy
    2986              :  * when the array can no longer grow within the bounds of that limit is to
    2987              :  * stop saving additional newly deleted pages, while proceeding as usual with
    2988              :  * the pages that we can fit.
    2989              :  */
    2990              : void
    2991         1741 : _bt_pendingfsm_init(Relation rel, BTVacState *vstate, bool cleanuponly)
    2992              : {
    2993              :     Size        maxbufsize;
    2994              : 
    2995              :     /*
    2996              :      * Don't bother with optimization in cleanup-only case -- we don't expect
    2997              :      * any newly deleted pages.  Besides, cleanup-only calls to btvacuumscan()
    2998              :      * can only take place because this optimization didn't work out during
    2999              :      * the last VACUUM.
    3000              :      */
    3001         1741 :     if (cleanuponly)
    3002            7 :         return;
    3003              : 
    3004              :     /*
    3005              :      * Cap maximum size of array so that we always respect work_mem.  Avoid
    3006              :      * int overflow here.
    3007              :      */
    3008         1734 :     vstate->bufsize = 256;
    3009         1734 :     maxbufsize = (work_mem * (Size) 1024) / sizeof(BTPendingFSM);
    3010         1734 :     maxbufsize = Min(maxbufsize, MaxAllocSize / sizeof(BTPendingFSM));
    3011              :     /* BTVacState.maxbufsize has type int */
    3012         1734 :     maxbufsize = Min(maxbufsize, INT_MAX);
    3013              :     /* Stay sane with small work_mem */
    3014         1734 :     maxbufsize = Max(maxbufsize, vstate->bufsize);
    3015         1734 :     vstate->maxbufsize = (int) maxbufsize;
    3016              : 
    3017              :     /* Allocate buffer, indicate that there are currently 0 pending pages */
    3018         1734 :     vstate->pendingpages = palloc_array(BTPendingFSM, vstate->bufsize);
    3019         1734 :     vstate->npendingpages = 0;
    3020              : }
    3021              : 
    3022              : /*
    3023              :  * Place any newly deleted pages (i.e. pages that _bt_pagedel() deleted during
    3024              :  * the ongoing VACUUM operation) into the free space map -- though only when
    3025              :  * it is actually safe to do so by now.
    3026              :  *
    3027              :  * Called at the end of a btvacuumscan(), just before free space map vacuuming
    3028              :  * takes place.
    3029              :  *
    3030              :  * Frees memory allocated by _bt_pendingfsm_init(), if any.
    3031              :  */
    3032              : void
    3033         1741 : _bt_pendingfsm_finalize(Relation rel, BTVacState *vstate)
    3034              : {
    3035         1741 :     IndexBulkDeleteResult *stats = vstate->stats;
    3036         1741 :     Relation    heaprel = vstate->info->heaprel;
    3037              : 
    3038              :     Assert(stats->pages_newly_deleted >= vstate->npendingpages);
    3039              :     Assert(heaprel != NULL);
    3040              : 
    3041         1741 :     if (vstate->npendingpages == 0)
    3042              :     {
    3043              :         /* Just free memory when nothing to do */
    3044         1652 :         if (vstate->pendingpages)
    3045         1645 :             pfree(vstate->pendingpages);
    3046              : 
    3047         1652 :         return;
    3048              :     }
    3049              : 
    3050              : #ifdef DEBUG_BTREE_PENDING_FSM
    3051              : 
    3052              :     /*
    3053              :      * Debugging aid: Sleep for 5 seconds to greatly increase the chances of
    3054              :      * placing pending pages in the FSM.  Note that the optimization will
    3055              :      * never be effective without some other backend concurrently consuming an
    3056              :      * XID.
    3057              :      */
    3058              :     pg_usleep(5000000L);
    3059              : #endif
    3060              : 
    3061              :     /*
    3062              :      * Recompute VACUUM XID boundaries.
    3063              :      *
    3064              :      * We don't actually care about the oldest non-removable XID.  Computing
    3065              :      * the oldest such XID has a useful side-effect that we rely on: it
    3066              :      * forcibly updates the XID horizon state for this backend.  This step is
    3067              :      * essential; GlobalVisCheckRemovableFullXid() will not reliably recognize
    3068              :      * that it is now safe to recycle newly deleted pages without this step.
    3069              :      */
    3070           89 :     GetOldestNonRemovableTransactionId(heaprel);
    3071              : 
    3072          145 :     for (int i = 0; i < vstate->npendingpages; i++)
    3073              :     {
    3074          145 :         BlockNumber target = vstate->pendingpages[i].target;
    3075          145 :         FullTransactionId safexid = vstate->pendingpages[i].safexid;
    3076              : 
    3077              :         /*
    3078              :          * Do the equivalent of checking BTPageIsRecyclable(), but without
    3079              :          * accessing the page again a second time.
    3080              :          *
    3081              :          * Give up on finding the first non-recyclable page -- all later pages
    3082              :          * must be non-recyclable too, since _bt_pendingfsm_add() adds pages
    3083              :          * to the array in safexid order.
    3084              :          */
    3085          145 :         if (!GlobalVisCheckRemovableFullXid(heaprel, safexid))
    3086           89 :             break;
    3087              : 
    3088           56 :         RecordFreeIndexPage(rel, target);
    3089           56 :         stats->pages_free++;
    3090              :     }
    3091              : 
    3092           89 :     pfree(vstate->pendingpages);
    3093              : }
    3094              : 
    3095              : /*
    3096              :  * Maintain array of pages that were deleted during current btvacuumscan()
    3097              :  * call, for use in _bt_pendingfsm_finalize()
    3098              :  */
    3099              : static void
    3100         3616 : _bt_pendingfsm_add(BTVacState *vstate,
    3101              :                    BlockNumber target,
    3102              :                    FullTransactionId safexid)
    3103              : {
    3104              :     Assert(vstate->npendingpages <= vstate->bufsize);
    3105              :     Assert(vstate->bufsize <= vstate->maxbufsize);
    3106              : 
    3107              : #ifdef USE_ASSERT_CHECKING
    3108              : 
    3109              :     /*
    3110              :      * Verify an assumption made by _bt_pendingfsm_finalize(): pages from the
    3111              :      * array will always be in safexid order (since that is the order that we
    3112              :      * save them in here)
    3113              :      */
    3114              :     if (vstate->npendingpages > 0)
    3115              :     {
    3116              :         FullTransactionId lastsafexid =
    3117              :             vstate->pendingpages[vstate->npendingpages - 1].safexid;
    3118              : 
    3119              :         Assert(FullTransactionIdFollowsOrEquals(safexid, lastsafexid));
    3120              :     }
    3121              : #endif
    3122              : 
    3123              :     /*
    3124              :      * If temp buffer reaches maxbufsize/work_mem capacity then we discard
    3125              :      * information about this page.
    3126              :      *
    3127              :      * Note that this also covers the case where we opted to not use the
    3128              :      * optimization in _bt_pendingfsm_init().
    3129              :      */
    3130         3616 :     if (vstate->npendingpages == vstate->maxbufsize)
    3131            0 :         return;
    3132              : 
    3133              :     /* Consider enlarging buffer */
    3134         3616 :     if (vstate->npendingpages == vstate->bufsize)
    3135              :     {
    3136            5 :         int         newbufsize = vstate->bufsize * 2;
    3137              : 
    3138              :         /* Respect work_mem */
    3139            5 :         if (newbufsize > vstate->maxbufsize)
    3140            0 :             newbufsize = vstate->maxbufsize;
    3141              : 
    3142            5 :         vstate->bufsize = newbufsize;
    3143            5 :         vstate->pendingpages =
    3144            5 :             repalloc(vstate->pendingpages,
    3145            5 :                      sizeof(BTPendingFSM) * vstate->bufsize);
    3146              :     }
    3147              : 
    3148              :     /* Save metadata for newly deleted page */
    3149         3616 :     vstate->pendingpages[vstate->npendingpages].target = target;
    3150         3616 :     vstate->pendingpages[vstate->npendingpages].safexid = safexid;
    3151         3616 :     vstate->npendingpages++;
    3152              : }
        

Generated by: LCOV version 2.0-1