LCOV - code coverage report
Current view:  top level - src/backend/access/nbtree - nbtpage.c (source / functions)
Test:          PostgreSQL 13beta1
Date:          2020-06-01 09:07:10
Lines:         627 of 704 hit (89.1 %)
Functions:     23 of 24 hit (95.8 %)
Legend:        Lines: hit / not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * nbtpage.c
       4             :  *    BTree-specific page management code for the Postgres btree access
       5             :  *    method.
       6             :  *
       7             :  * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
       8             :  * Portions Copyright (c) 1994, Regents of the University of California
       9             :  *
      10             :  *
      11             :  * IDENTIFICATION
      12             :  *    src/backend/access/nbtree/nbtpage.c
      13             :  *
      14             :  *  NOTES
      15             :  *     Postgres btree pages look like ordinary relation pages.  The opaque
      16             :  *     data at high addresses includes pointers to left and right siblings
      17             :  *     and flag data describing page state.  The first page in a btree, page
      18             :  *     zero, is special -- it stores meta-information describing the tree.
      19             :  *     Pages one and higher store the actual tree data.
      20             :  *
      21             :  *-------------------------------------------------------------------------
      22             :  */
      23             : #include "postgres.h"
      24             : 
      25             : #include "access/nbtree.h"
      26             : #include "access/nbtxlog.h"
      27             : #include "access/tableam.h"
      28             : #include "access/transam.h"
      29             : #include "access/xlog.h"
      30             : #include "access/xloginsert.h"
      31             : #include "miscadmin.h"
      32             : #include "storage/indexfsm.h"
      33             : #include "storage/lmgr.h"
      34             : #include "storage/predicate.h"
      35             : #include "utils/snapmgr.h"
      36             : 
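To make the page layout described in the header NOTES concrete, here is a minimal sketch (relying only on the headers already included above) of how the btree-specific opaque data in a page's special space is reached; the function name is purely illustrative.

/*
 * Sketch: the special space at the high end of every btree page holds a
 * BTPageOpaqueData with sibling links and status flags.
 */
static void
inspect_opaque_sketch(Page page)
{
    BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);

    if (P_ISLEAF(opaque) && P_RIGHTMOST(opaque))
        elog(DEBUG1, "rightmost leaf; left sibling is block %u",
             opaque->btpo_prev);
    else if (opaque->btpo_next != P_NONE)
        elog(DEBUG1, "right sibling is block %u", opaque->btpo_next);
}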
      37             : static BTMetaPageData *_bt_getmeta(Relation rel, Buffer metabuf);
      38             : static void _bt_log_reuse_page(Relation rel, BlockNumber blkno,
      39             :                                TransactionId latestRemovedXid);
      40             : static TransactionId _bt_xid_horizon(Relation rel, Relation heapRel, Page page,
      41             :                                      OffsetNumber *deletable, int ndeletable);
      42             : static bool _bt_mark_page_halfdead(Relation rel, Buffer leafbuf,
      43             :                                    BTStack stack);
      44             : static bool _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf,
      45             :                                      BlockNumber scanblkno,
      46             :                                      bool *rightsib_empty,
      47             :                                      TransactionId *oldestBtpoXact,
      48             :                                      uint32 *ndeleted);
      49             : static bool _bt_lock_subtree_parent(Relation rel, BlockNumber child,
      50             :                                     BTStack stack,
      51             :                                     Buffer *subtreeparent,
      52             :                                     OffsetNumber *poffset,
      53             :                                     BlockNumber *topparent,
      54             :                                     BlockNumber *topparentrightsib);
      55             : 
      56             : /*
      57             :  *  _bt_initmetapage() -- Fill a page buffer with a correct metapage image
      58             :  */
      59             : void
      60       76564 : _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level,
      61             :                  bool allequalimage)
      62             : {
      63             :     BTMetaPageData *metad;
      64             :     BTPageOpaque metaopaque;
      65             : 
      66       76564 :     _bt_pageinit(page, BLCKSZ);
      67             : 
      68       76564 :     metad = BTPageGetMeta(page);
      69       76564 :     metad->btm_magic = BTREE_MAGIC;
      70       76564 :     metad->btm_version = BTREE_VERSION;
      71       76564 :     metad->btm_root = rootbknum;
      72       76564 :     metad->btm_level = level;
      73       76564 :     metad->btm_fastroot = rootbknum;
      74       76564 :     metad->btm_fastlevel = level;
      75       76564 :     metad->btm_oldest_btpo_xact = InvalidTransactionId;
      76       76564 :     metad->btm_last_cleanup_num_heap_tuples = -1.0;
      77       76564 :     metad->btm_allequalimage = allequalimage;
      78             : 
      79       76564 :     metaopaque = (BTPageOpaque) PageGetSpecialPointer(page);
      80       76564 :     metaopaque->btpo_flags = BTP_META;
      81             : 
      82             :     /*
      83             :      * Set pd_lower just past the end of the metadata.  This is essential,
      84             :      * because without doing so, metadata will be lost if xlog.c compresses
      85             :      * the page.
      86             :      */
      87       76564 :     ((PageHeader) page)->pd_lower =
      88       76564 :         ((char *) metad + sizeof(BTMetaPageData)) - (char *) page;
      89       76564 : }
      90             : 
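For context, a caller typically builds the metapage image in local memory and only then hands it to the storage layer.  A hedged sketch of that pattern follows, loosely modeled on how an empty btree index is initialized; the function name is illustrative, "index" is assumed to be an already-open Relation, and WAL-logging plus the actual page write are elided.

static void
build_empty_metapage_sketch(Relation index)
{
    Page        metapage = (Page) palloc(BLCKSZ);

    /* No root exists yet: root block is P_NONE and the tree level is 0 */
    _bt_initmetapage(metapage, P_NONE, 0, _bt_allequalimage(index, false));

    /* ... caller would WAL-log and write the page image here ... */
    pfree(metapage);
}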
      91             : /*
      92             :  *  _bt_upgrademetapage() -- Upgrade a meta-page from an old format to version
      93             :  *      3, the last version that can be updated without broadly affecting
      94             :  *      on-disk compatibility.  (A REINDEX is required to upgrade to v4.)
      95             :  *
      96             :  *      This routine does purely in-memory image upgrade.  Caller is
      97             :  *      responsible for locking, WAL-logging etc.
      98             :  */
      99             : void
     100           0 : _bt_upgrademetapage(Page page)
     101             : {
     102             :     BTMetaPageData *metad;
     103             :     BTPageOpaque metaopaque PG_USED_FOR_ASSERTS_ONLY;
     104             : 
     105           0 :     metad = BTPageGetMeta(page);
     106           0 :     metaopaque = (BTPageOpaque) PageGetSpecialPointer(page);
     107             : 
      108             :     /* It must really be a metapage of an upgradable version */
     109             :     Assert(metaopaque->btpo_flags & BTP_META);
     110             :     Assert(metad->btm_version < BTREE_NOVAC_VERSION);
     111             :     Assert(metad->btm_version >= BTREE_MIN_VERSION);
     112             : 
      113             :     /* Set version number and fill extra fields added in version 3 */
     114           0 :     metad->btm_version = BTREE_NOVAC_VERSION;
     115           0 :     metad->btm_oldest_btpo_xact = InvalidTransactionId;
     116           0 :     metad->btm_last_cleanup_num_heap_tuples = -1.0;
     117             :     /* Only a REINDEX can set this field */
     118             :     Assert(!metad->btm_allequalimage);
     119           0 :     metad->btm_allequalimage = false;
     120             : 
     121             :     /* Adjust pd_lower (see _bt_initmetapage() for details) */
     122           0 :     ((PageHeader) page)->pd_lower =
     123           0 :         ((char *) metad + sizeof(BTMetaPageData)) - (char *) page;
     124           0 : }
     125             : 
     126             : /*
     127             :  * Get metadata from share-locked buffer containing metapage, while performing
     128             :  * standard sanity checks.
     129             :  *
     130             :  * Callers that cache data returned here in local cache should note that an
     131             :  * on-the-fly upgrade using _bt_upgrademetapage() can change the version field
     132             :  * and BTREE_NOVAC_VERSION specific fields without invalidating local cache.
     133             :  */
     134             : static BTMetaPageData *
     135     1042146 : _bt_getmeta(Relation rel, Buffer metabuf)
     136             : {
     137             :     Page        metapg;
     138             :     BTPageOpaque metaopaque;
     139             :     BTMetaPageData *metad;
     140             : 
     141     1042146 :     metapg = BufferGetPage(metabuf);
     142     1042146 :     metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
     143     1042146 :     metad = BTPageGetMeta(metapg);
     144             : 
     145             :     /* sanity-check the metapage */
     146     1042146 :     if (!P_ISMETA(metaopaque) ||
     147     1042146 :         metad->btm_magic != BTREE_MAGIC)
     148           0 :         ereport(ERROR,
     149             :                 (errcode(ERRCODE_INDEX_CORRUPTED),
     150             :                  errmsg("index \"%s\" is not a btree",
     151             :                         RelationGetRelationName(rel))));
     152             : 
     153     1042146 :     if (metad->btm_version < BTREE_MIN_VERSION ||
     154     1042146 :         metad->btm_version > BTREE_VERSION)
     155           0 :         ereport(ERROR,
     156             :                 (errcode(ERRCODE_INDEX_CORRUPTED),
     157             :                  errmsg("version mismatch in index \"%s\": file version %d, "
     158             :                         "current version %d, minimal supported version %d",
     159             :                         RelationGetRelationName(rel),
     160             :                         metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION)));
     161             : 
     162     1042146 :     return metad;
     163             : }
     164             : 
     165             : /*
     166             :  *  _bt_update_meta_cleanup_info() -- Update cleanup-related information in
     167             :  *                                    the metapage.
     168             :  *
      169             :  *      This routine checks whether the provided cleanup-related information
      170             :  *      matches what is stored in the metapage.  On mismatch, the metapage is overwritten.
     171             :  */
     172             : void
     173       61788 : _bt_update_meta_cleanup_info(Relation rel, TransactionId oldestBtpoXact,
     174             :                              float8 numHeapTuples)
     175             : {
     176             :     Buffer      metabuf;
     177             :     Page        metapg;
     178             :     BTMetaPageData *metad;
     179       61788 :     bool        needsRewrite = false;
     180             :     XLogRecPtr  recptr;
     181             : 
     182             :     /* read the metapage and check if it needs rewrite */
     183       61788 :     metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
     184       61788 :     metapg = BufferGetPage(metabuf);
     185       61788 :     metad = BTPageGetMeta(metapg);
     186             : 
     187             :     /* outdated version of metapage always needs rewrite */
     188       61788 :     if (metad->btm_version < BTREE_NOVAC_VERSION)
     189           0 :         needsRewrite = true;
     190       61788 :     else if (metad->btm_oldest_btpo_xact != oldestBtpoXact ||
     191       61626 :              metad->btm_last_cleanup_num_heap_tuples != numHeapTuples)
     192       58764 :         needsRewrite = true;
     193             : 
     194       61788 :     if (!needsRewrite)
     195             :     {
     196        3024 :         _bt_relbuf(rel, metabuf);
     197        3024 :         return;
     198             :     }
     199             : 
     200             :     /* trade in our read lock for a write lock */
     201       58764 :     LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
     202       58764 :     LockBuffer(metabuf, BT_WRITE);
     203             : 
     204       58764 :     START_CRIT_SECTION();
     205             : 
     206             :     /* upgrade meta-page if needed */
     207       58764 :     if (metad->btm_version < BTREE_NOVAC_VERSION)
     208           0 :         _bt_upgrademetapage(metapg);
     209             : 
     210             :     /* update cleanup-related information */
     211       58764 :     metad->btm_oldest_btpo_xact = oldestBtpoXact;
     212       58764 :     metad->btm_last_cleanup_num_heap_tuples = numHeapTuples;
     213       58764 :     MarkBufferDirty(metabuf);
     214             : 
     215             :     /* write wal record if needed */
     216       58764 :     if (RelationNeedsWAL(rel))
     217             :     {
     218             :         xl_btree_metadata md;
     219             : 
     220       58744 :         XLogBeginInsert();
     221       58744 :         XLogRegisterBuffer(0, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
     222             : 
     223             :         Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
     224       58744 :         md.version = metad->btm_version;
     225       58744 :         md.root = metad->btm_root;
     226       58744 :         md.level = metad->btm_level;
     227       58744 :         md.fastroot = metad->btm_fastroot;
     228       58744 :         md.fastlevel = metad->btm_fastlevel;
     229       58744 :         md.oldest_btpo_xact = oldestBtpoXact;
     230       58744 :         md.last_cleanup_num_heap_tuples = numHeapTuples;
     231       58744 :         md.allequalimage = metad->btm_allequalimage;
     232             : 
     233       58744 :         XLogRegisterBufData(0, (char *) &md, sizeof(xl_btree_metadata));
     234             : 
     235       58744 :         recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_META_CLEANUP);
     236             : 
     237       58744 :         PageSetLSN(metapg, recptr);
     238             :     }
     239             : 
     240       58764 :     END_CRIT_SECTION();
     241       58764 :     _bt_relbuf(rel, metabuf);
     242             : }
     243             : 
     244             : /*
     245             :  *  _bt_getroot() -- Get the root page of the btree.
     246             :  *
     247             :  *      Since the root page can move around the btree file, we have to read
     248             :  *      its location from the metadata page, and then read the root page
     249             :  *      itself.  If no root page exists yet, we have to create one.
     250             :  *
     251             :  *      The access type parameter (BT_READ or BT_WRITE) controls whether
     252             :  *      a new root page will be created or not.  If access = BT_READ,
     253             :  *      and no root page exists, we just return InvalidBuffer.  For
     254             :  *      BT_WRITE, we try to create the root page if it doesn't exist.
     255             :  *      NOTE that the returned root page will have only a read lock set
     256             :  *      on it even if access = BT_WRITE!
     257             :  *
     258             :  *      The returned page is not necessarily the true root --- it could be
     259             :  *      a "fast root" (a page that is alone in its level due to deletions).
     260             :  *      Also, if the root page is split while we are "in flight" to it,
     261             :  *      what we will return is the old root, which is now just the leftmost
     262             :  *      page on a probably-not-very-wide level.  For most purposes this is
     263             :  *      as good as or better than the true root, so we do not bother to
     264             :  *      insist on finding the true root.  We do, however, guarantee to
     265             :  *      return a live (not deleted or half-dead) page.
     266             :  *
     267             :  *      On successful return, the root page is pinned and read-locked.
     268             :  *      The metadata page is not locked or pinned on exit.
     269             :  */
     270             : Buffer
     271    22615868 : _bt_getroot(Relation rel, int access)
     272             : {
     273             :     Buffer      metabuf;
     274             :     Buffer      rootbuf;
     275             :     Page        rootpage;
     276             :     BTPageOpaque rootopaque;
     277             :     BlockNumber rootblkno;
     278             :     uint32      rootlevel;
     279             :     BTMetaPageData *metad;
     280             : 
     281             :     /*
     282             :      * Try to use previously-cached metapage data to find the root.  This
     283             :      * normally saves one buffer access per index search, which is a very
     284             :      * helpful savings in bufmgr traffic and hence contention.
     285             :      */
     286    22615868 :     if (rel->rd_amcache != NULL)
     287             :     {
     288    22243644 :         metad = (BTMetaPageData *) rel->rd_amcache;
     289             :         /* We shouldn't have cached it if any of these fail */
     290             :         Assert(metad->btm_magic == BTREE_MAGIC);
     291             :         Assert(metad->btm_version >= BTREE_MIN_VERSION);
     292             :         Assert(metad->btm_version <= BTREE_VERSION);
     293             :         Assert(!metad->btm_allequalimage ||
     294             :                metad->btm_version > BTREE_NOVAC_VERSION);
     295             :         Assert(metad->btm_root != P_NONE);
     296             : 
     297    22243644 :         rootblkno = metad->btm_fastroot;
     298             :         Assert(rootblkno != P_NONE);
     299    22243644 :         rootlevel = metad->btm_fastlevel;
     300             : 
     301    22243644 :         rootbuf = _bt_getbuf(rel, rootblkno, BT_READ);
     302    22243644 :         rootpage = BufferGetPage(rootbuf);
     303    22243644 :         rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
     304             : 
     305             :         /*
     306             :          * Since the cache might be stale, we check the page more carefully
     307             :          * here than normal.  We *must* check that it's not deleted. If it's
     308             :          * not alone on its level, then we reject too --- this may be overly
     309             :          * paranoid but better safe than sorry.  Note we don't check P_ISROOT,
     310             :          * because that's not set in a "fast root".
     311             :          */
     312    22243644 :         if (!P_IGNORE(rootopaque) &&
     313    22243644 :             rootopaque->btpo.level == rootlevel &&
     314    22243644 :             P_LEFTMOST(rootopaque) &&
     315    22243644 :             P_RIGHTMOST(rootopaque))
     316             :         {
     317             :             /* OK, accept cached page as the root */
     318    22241146 :             return rootbuf;
     319             :         }
     320        2498 :         _bt_relbuf(rel, rootbuf);
     321             :         /* Cache is stale, throw it away */
     322        2498 :         if (rel->rd_amcache)
     323        2498 :             pfree(rel->rd_amcache);
     324        2498 :         rel->rd_amcache = NULL;
     325             :     }
     326             : 
     327      374722 :     metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
     328      374722 :     metad = _bt_getmeta(rel, metabuf);
     329             : 
     330             :     /* if no root page initialized yet, do it */
     331      374722 :     if (metad->btm_root == P_NONE)
     332             :     {
     333             :         Page        metapg;
     334             : 
     335             :         /* If access = BT_READ, caller doesn't want us to create root yet */
     336      372116 :         if (access == BT_READ)
     337             :         {
     338      359470 :             _bt_relbuf(rel, metabuf);
     339      359470 :             return InvalidBuffer;
     340             :         }
     341             : 
     342             :         /* trade in our read lock for a write lock */
     343       12646 :         LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
     344       12646 :         LockBuffer(metabuf, BT_WRITE);
     345             : 
     346             :         /*
     347             :          * Race condition:  if someone else initialized the metadata between
     348             :          * the time we released the read lock and acquired the write lock, we
     349             :          * must avoid doing it again.
     350             :          */
     351       12646 :         if (metad->btm_root != P_NONE)
     352             :         {
     353             :             /*
     354             :              * Metadata initialized by someone else.  In order to guarantee no
     355             :              * deadlocks, we have to release the metadata page and start all
     356             :              * over again.  (Is that really true? But it's hardly worth trying
     357             :              * to optimize this case.)
     358             :              */
     359           0 :             _bt_relbuf(rel, metabuf);
     360           0 :             return _bt_getroot(rel, access);
     361             :         }
     362             : 
     363             :         /*
     364             :          * Get, initialize, write, and leave a lock of the appropriate type on
     365             :          * the new root page.  Since this is the first page in the tree, it's
     366             :          * a leaf as well as the root.
     367             :          */
     368       12646 :         rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
     369       12646 :         rootblkno = BufferGetBlockNumber(rootbuf);
     370       12646 :         rootpage = BufferGetPage(rootbuf);
     371       12646 :         rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
     372       12646 :         rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
     373       12646 :         rootopaque->btpo_flags = (BTP_LEAF | BTP_ROOT);
     374       12646 :         rootopaque->btpo.level = 0;
     375       12646 :         rootopaque->btpo_cycleid = 0;
     376             :         /* Get raw page pointer for metapage */
     377       12646 :         metapg = BufferGetPage(metabuf);
     378             : 
     379             :         /* NO ELOG(ERROR) till meta is updated */
     380       12646 :         START_CRIT_SECTION();
     381             : 
     382             :         /* upgrade metapage if needed */
     383       12646 :         if (metad->btm_version < BTREE_NOVAC_VERSION)
     384           0 :             _bt_upgrademetapage(metapg);
     385             : 
     386       12646 :         metad->btm_root = rootblkno;
     387       12646 :         metad->btm_level = 0;
     388       12646 :         metad->btm_fastroot = rootblkno;
     389       12646 :         metad->btm_fastlevel = 0;
     390       12646 :         metad->btm_oldest_btpo_xact = InvalidTransactionId;
     391       12646 :         metad->btm_last_cleanup_num_heap_tuples = -1.0;
     392             : 
     393       12646 :         MarkBufferDirty(rootbuf);
     394       12646 :         MarkBufferDirty(metabuf);
     395             : 
     396             :         /* XLOG stuff */
     397       12646 :         if (RelationNeedsWAL(rel))
     398             :         {
     399             :             xl_btree_newroot xlrec;
     400             :             XLogRecPtr  recptr;
     401             :             xl_btree_metadata md;
     402             : 
     403       12418 :             XLogBeginInsert();
     404       12418 :             XLogRegisterBuffer(0, rootbuf, REGBUF_WILL_INIT);
     405       12418 :             XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
     406             : 
     407             :             Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
     408       12418 :             md.version = metad->btm_version;
     409       12418 :             md.root = rootblkno;
     410       12418 :             md.level = 0;
     411       12418 :             md.fastroot = rootblkno;
     412       12418 :             md.fastlevel = 0;
     413       12418 :             md.oldest_btpo_xact = InvalidTransactionId;
     414       12418 :             md.last_cleanup_num_heap_tuples = -1.0;
     415       12418 :             md.allequalimage = metad->btm_allequalimage;
     416             : 
     417       12418 :             XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata));
     418             : 
     419       12418 :             xlrec.rootblk = rootblkno;
     420       12418 :             xlrec.level = 0;
     421             : 
     422       12418 :             XLogRegisterData((char *) &xlrec, SizeOfBtreeNewroot);
     423             : 
     424       12418 :             recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT);
     425             : 
     426       12418 :             PageSetLSN(rootpage, recptr);
     427       12418 :             PageSetLSN(metapg, recptr);
     428             :         }
     429             : 
     430       12646 :         END_CRIT_SECTION();
     431             : 
     432             :         /*
     433             :          * swap root write lock for read lock.  There is no danger of anyone
     434             :          * else accessing the new root page while it's unlocked, since no one
     435             :          * else knows where it is yet.
     436             :          */
     437       12646 :         LockBuffer(rootbuf, BUFFER_LOCK_UNLOCK);
     438       12646 :         LockBuffer(rootbuf, BT_READ);
     439             : 
     440             :         /* okay, metadata is correct, release lock on it without caching */
     441       12646 :         _bt_relbuf(rel, metabuf);
     442             :     }
     443             :     else
     444             :     {
     445        2606 :         rootblkno = metad->btm_fastroot;
     446             :         Assert(rootblkno != P_NONE);
     447        2606 :         rootlevel = metad->btm_fastlevel;
     448             : 
     449             :         /*
     450             :          * Cache the metapage data for next time
     451             :          */
     452        2606 :         rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
     453             :                                              sizeof(BTMetaPageData));
     454        2606 :         memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
     455             : 
     456             :         /*
     457             :          * We are done with the metapage; arrange to release it via first
     458             :          * _bt_relandgetbuf call
     459             :          */
     460        2606 :         rootbuf = metabuf;
     461             : 
     462             :         for (;;)
     463             :         {
     464        2606 :             rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);
     465        2606 :             rootpage = BufferGetPage(rootbuf);
     466        2606 :             rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
     467             : 
     468        2606 :             if (!P_IGNORE(rootopaque))
     469        2606 :                 break;
     470             : 
     471             :             /* it's dead, Jim.  step right one page */
     472           0 :             if (P_RIGHTMOST(rootopaque))
     473           0 :                 elog(ERROR, "no live root page found in index \"%s\"",
     474             :                      RelationGetRelationName(rel));
     475           0 :             rootblkno = rootopaque->btpo_next;
     476             :         }
     477             : 
     478             :         /* Note: can't check btpo.level on deleted pages */
     479        2606 :         if (rootopaque->btpo.level != rootlevel)
     480           0 :             elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
     481             :                  rootblkno, RelationGetRelationName(rel),
     482             :                  rootopaque->btpo.level, rootlevel);
     483             :     }
     484             : 
     485             :     /*
     486             :      * By here, we have a pin and read lock on the root page, and no lock set
     487             :      * on the metadata page.  Return the root page's buffer.
     488             :      */
     489       15252 :     return rootbuf;
     490             : }
     491             : 
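A typical descent therefore begins by asking for the (fast) root and treating InvalidBuffer as "index is empty".  A minimal caller-side sketch for a read-only descent follows; the function name is illustrative and the actual descent logic is elided.

static bool
descend_from_root_sketch(Relation rel)
{
    Buffer      rootbuf = _bt_getroot(rel, BT_READ);

    if (!BufferIsValid(rootbuf))
        return false;           /* empty index: nothing to descend into */

    /* rootbuf is pinned and read-locked here; descend, then release it */
    _bt_relbuf(rel, rootbuf);
    return true;
}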
     492             : /*
     493             :  *  _bt_gettrueroot() -- Get the true root page of the btree.
     494             :  *
     495             :  *      This is the same as the BT_READ case of _bt_getroot(), except
     496             :  *      we follow the true-root link not the fast-root link.
     497             :  *
     498             :  * By the time we acquire lock on the root page, it might have been split and
     499             :  * not be the true root anymore.  This is okay for the present uses of this
     500             :  * routine; we only really need to be able to move up at least one tree level
     501             :  * from whatever non-root page we were at.  If we ever do need to lock the
     502             :  * one true root page, we could loop here, re-reading the metapage on each
     503             :  * failure.  (Note that it wouldn't do to hold the lock on the metapage while
     504             :  * moving to the root --- that'd deadlock against any concurrent root split.)
     505             :  */
     506             : Buffer
     507           6 : _bt_gettrueroot(Relation rel)
     508             : {
     509             :     Buffer      metabuf;
     510             :     Page        metapg;
     511             :     BTPageOpaque metaopaque;
     512             :     Buffer      rootbuf;
     513             :     Page        rootpage;
     514             :     BTPageOpaque rootopaque;
     515             :     BlockNumber rootblkno;
     516             :     uint32      rootlevel;
     517             :     BTMetaPageData *metad;
     518             : 
     519             :     /*
     520             :      * We don't try to use cached metapage data here, since (a) this path is
     521             :      * not performance-critical, and (b) if we are here it suggests our cache
     522             :      * is out-of-date anyway.  In light of point (b), it's probably safest to
     523             :      * actively flush any cached metapage info.
     524             :      */
     525           6 :     if (rel->rd_amcache)
     526           6 :         pfree(rel->rd_amcache);
     527           6 :     rel->rd_amcache = NULL;
     528             : 
     529           6 :     metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
     530           6 :     metapg = BufferGetPage(metabuf);
     531           6 :     metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
     532           6 :     metad = BTPageGetMeta(metapg);
     533             : 
     534           6 :     if (!P_ISMETA(metaopaque) ||
     535           6 :         metad->btm_magic != BTREE_MAGIC)
     536           0 :         ereport(ERROR,
     537             :                 (errcode(ERRCODE_INDEX_CORRUPTED),
     538             :                  errmsg("index \"%s\" is not a btree",
     539             :                         RelationGetRelationName(rel))));
     540             : 
     541           6 :     if (metad->btm_version < BTREE_MIN_VERSION ||
     542           6 :         metad->btm_version > BTREE_VERSION)
     543           0 :         ereport(ERROR,
     544             :                 (errcode(ERRCODE_INDEX_CORRUPTED),
     545             :                  errmsg("version mismatch in index \"%s\": file version %d, "
     546             :                         "current version %d, minimal supported version %d",
     547             :                         RelationGetRelationName(rel),
     548             :                         metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION)));
     549             : 
     550             :     /* if no root page initialized yet, fail */
     551           6 :     if (metad->btm_root == P_NONE)
     552             :     {
     553           0 :         _bt_relbuf(rel, metabuf);
     554           0 :         return InvalidBuffer;
     555             :     }
     556             : 
     557           6 :     rootblkno = metad->btm_root;
     558           6 :     rootlevel = metad->btm_level;
     559             : 
     560             :     /*
     561             :      * We are done with the metapage; arrange to release it via first
     562             :      * _bt_relandgetbuf call
     563             :      */
     564           6 :     rootbuf = metabuf;
     565             : 
     566             :     for (;;)
     567             :     {
     568           6 :         rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);
     569           6 :         rootpage = BufferGetPage(rootbuf);
     570           6 :         rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
     571             : 
     572           6 :         if (!P_IGNORE(rootopaque))
     573           6 :             break;
     574             : 
     575             :         /* it's dead, Jim.  step right one page */
     576           0 :         if (P_RIGHTMOST(rootopaque))
     577           0 :             elog(ERROR, "no live root page found in index \"%s\"",
     578             :                  RelationGetRelationName(rel));
     579           0 :         rootblkno = rootopaque->btpo_next;
     580             :     }
     581             : 
     582             :     /* Note: can't check btpo.level on deleted pages */
     583           6 :     if (rootopaque->btpo.level != rootlevel)
     584           0 :         elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
     585             :              rootblkno, RelationGetRelationName(rel),
     586             :              rootopaque->btpo.level, rootlevel);
     587             : 
     588           6 :     return rootbuf;
     589             : }
     590             : 
     591             : /*
     592             :  *  _bt_getrootheight() -- Get the height of the btree search tree.
     593             :  *
     594             :  *      We return the level (counting from zero) of the current fast root.
     595             :  *      This represents the number of tree levels we'd have to descend through
     596             :  *      to start any btree index search.
     597             :  *
     598             :  *      This is used by the planner for cost-estimation purposes.  Since it's
     599             :  *      only an estimate, slightly-stale data is fine, hence we don't worry
     600             :  *      about updating previously cached data.
     601             :  */
     602             : int
     603     6034822 : _bt_getrootheight(Relation rel)
     604             : {
     605             :     BTMetaPageData *metad;
     606             : 
     607     6034822 :     if (rel->rd_amcache == NULL)
     608             :     {
     609             :         Buffer      metabuf;
     610             : 
     611       59322 :         metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
     612       59322 :         metad = _bt_getmeta(rel, metabuf);
     613             : 
     614             :         /*
     615             :          * If there's no root page yet, _bt_getroot() doesn't expect a cache
     616             :          * to be made, so just stop here and report the index height is zero.
     617             :          * (XXX perhaps _bt_getroot() should be changed to allow this case.)
     618             :          */
     619       59322 :         if (metad->btm_root == P_NONE)
     620             :         {
     621       27470 :             _bt_relbuf(rel, metabuf);
     622       27470 :             return 0;
     623             :         }
     624             : 
     625             :         /*
     626             :          * Cache the metapage data for next time
     627             :          */
     628       31852 :         rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
     629             :                                              sizeof(BTMetaPageData));
     630       31852 :         memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
     631       31852 :         _bt_relbuf(rel, metabuf);
     632             :     }
     633             : 
     634             :     /* Get cached page */
     635     6007352 :     metad = (BTMetaPageData *) rel->rd_amcache;
     636             :     /* We shouldn't have cached it if any of these fail */
     637             :     Assert(metad->btm_magic == BTREE_MAGIC);
     638             :     Assert(metad->btm_version >= BTREE_MIN_VERSION);
     639             :     Assert(metad->btm_version <= BTREE_VERSION);
     640             :     Assert(!metad->btm_allequalimage ||
     641             :            metad->btm_version > BTREE_NOVAC_VERSION);
     642             :     Assert(metad->btm_fastroot != P_NONE);
     643             : 
     644     6007352 :     return metad->btm_fastlevel;
     645             : }
     646             : 
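The planner is the main consumer: while it has the index relation open it caches the tree height for cost estimation.  A hedged fragment of that usage is sketched below; "info" is assumed to be an IndexOptInfo being filled in and "indexRelation" an open Relation, with -1 meaning "height unknown".

if (indexRelation->rd_rel->relam == BTREE_AM_OID)
    info->tree_height = _bt_getrootheight(indexRelation);
else
    info->tree_height = -1;    /* non-btree AM: height not applicable */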
     647             : /*
     648             :  *  _bt_metaversion() -- Get version/status info from metapage.
     649             :  *
     650             :  *      Sets caller's *heapkeyspace and *allequalimage arguments using data
     651             :  *      from the B-Tree metapage (could be locally-cached version).  This
     652             :  *      information needs to be stashed in insertion scankey, so we provide a
     653             :  *      single function that fetches both at once.
     654             :  *
     655             :  *      This is used to determine the rules that must be used to descend a
     656             :  *      btree.  Version 4 indexes treat heap TID as a tiebreaker attribute.
     657             :  *      pg_upgrade'd version 3 indexes need extra steps to preserve reasonable
     658             :  *      performance when inserting a new BTScanInsert-wise duplicate tuple
     659             :  *      among many leaf pages already full of such duplicates.
     660             :  *
     661             :  *      Also sets allequalimage field, which indicates whether or not it is
     662             :  *      safe to apply deduplication.  We rely on the assumption that
     663             :  *      btm_allequalimage will be zero'ed on heapkeyspace indexes that were
     664             :  *      pg_upgrade'd from Postgres 12.
     665             :  */
     666             : void
     667    24647020 : _bt_metaversion(Relation rel, bool *heapkeyspace, bool *allequalimage)
     668             : {
     669             :     BTMetaPageData *metad;
     670             : 
     671    24647020 :     if (rel->rd_amcache == NULL)
     672             :     {
     673             :         Buffer      metabuf;
     674             : 
     675      608102 :         metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
     676      608102 :         metad = _bt_getmeta(rel, metabuf);
     677             : 
     678             :         /*
     679             :          * If there's no root page yet, _bt_getroot() doesn't expect a cache
     680             :          * to be made, so just stop here.  (XXX perhaps _bt_getroot() should
     681             :          * be changed to allow this case.)
     682             :          */
     683      608102 :         if (metad->btm_root == P_NONE)
     684             :         {
     685      368816 :             *heapkeyspace = metad->btm_version > BTREE_NOVAC_VERSION;
     686      368816 :             *allequalimage = metad->btm_allequalimage;
     687             : 
     688      368816 :             _bt_relbuf(rel, metabuf);
     689      368816 :             return;
     690             :         }
     691             : 
     692             :         /*
     693             :          * Cache the metapage data for next time
     694             :          *
     695             :          * An on-the-fly version upgrade performed by _bt_upgrademetapage()
     696             :          * can change the nbtree version for an index without invalidating any
     697             :          * local cache.  This is okay because it can only happen when moving
     698             :          * from version 2 to version 3, both of which are !heapkeyspace
     699             :          * versions.
     700             :          */
     701      239286 :         rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
     702             :                                              sizeof(BTMetaPageData));
     703      239286 :         memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
     704      239286 :         _bt_relbuf(rel, metabuf);
     705             :     }
     706             : 
     707             :     /* Get cached page */
     708    24278204 :     metad = (BTMetaPageData *) rel->rd_amcache;
     709             :     /* We shouldn't have cached it if any of these fail */
     710             :     Assert(metad->btm_magic == BTREE_MAGIC);
     711             :     Assert(metad->btm_version >= BTREE_MIN_VERSION);
     712             :     Assert(metad->btm_version <= BTREE_VERSION);
     713             :     Assert(!metad->btm_allequalimage ||
     714             :            metad->btm_version > BTREE_NOVAC_VERSION);
     715             :     Assert(metad->btm_fastroot != P_NONE);
     716             : 
     717    24278204 :     *heapkeyspace = metad->btm_version > BTREE_NOVAC_VERSION;
     718    24278204 :     *allequalimage = metad->btm_allequalimage;
     719             : }
     720             : 
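Callers stash both flags in the insertion scankey before descending the tree.  A minimal sketch of that pattern follows; the function name is illustrative and "key" is assumed to be a palloc'd BTScanInsert whose other fields are filled in elsewhere.

static void
stash_metaversion_sketch(Relation rel, BTScanInsert key)
{
    _bt_metaversion(rel, &key->heapkeyspace, &key->allequalimage);

    if (!key->heapkeyspace)
    {
        /* pg_upgrade'd v2/v3 index: heap TID is not a tiebreaker attribute */
    }
}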
     721             : /*
     722             :  *  _bt_checkpage() -- Verify that a freshly-read page looks sane.
     723             :  */
     724             : void
     725    43009650 : _bt_checkpage(Relation rel, Buffer buf)
     726             : {
     727    43009650 :     Page        page = BufferGetPage(buf);
     728             : 
     729             :     /*
     730             :      * ReadBuffer verifies that every newly-read page passes
     731             :      * PageHeaderIsValid, which means it either contains a reasonably sane
     732             :      * page header or is all-zero.  We have to defend against the all-zero
     733             :      * case, however.
     734             :      */
     735    43009650 :     if (PageIsNew(page))
     736           0 :         ereport(ERROR,
     737             :                 (errcode(ERRCODE_INDEX_CORRUPTED),
     738             :                  errmsg("index \"%s\" contains unexpected zero page at block %u",
     739             :                         RelationGetRelationName(rel),
     740             :                         BufferGetBlockNumber(buf)),
     741             :                  errhint("Please REINDEX it.")));
     742             : 
     743             :     /*
     744             :      * Additionally check that the special area looks sane.
     745             :      */
     746    43009650 :     if (PageGetSpecialSize(page) != MAXALIGN(sizeof(BTPageOpaqueData)))
     747           0 :         ereport(ERROR,
     748             :                 (errcode(ERRCODE_INDEX_CORRUPTED),
     749             :                  errmsg("index \"%s\" contains corrupted page at block %u",
     750             :                         RelationGetRelationName(rel),
     751             :                         BufferGetBlockNumber(buf)),
     752             :                  errhint("Please REINDEX it.")));
     753    43009650 : }
     754             : 
     755             : /*
     756             :  * Log the reuse of a page from the FSM.
     757             :  */
     758             : static void
     759         408 : _bt_log_reuse_page(Relation rel, BlockNumber blkno, TransactionId latestRemovedXid)
     760             : {
     761             :     xl_btree_reuse_page xlrec_reuse;
     762             : 
     763             :     /*
     764             :      * Note that we don't register the buffer with the record, because this
     765             :      * operation doesn't modify the page. This record only exists to provide a
     766             :      * conflict point for Hot Standby.
     767             :      */
     768             : 
     769             :     /* XLOG stuff */
     770         408 :     xlrec_reuse.node = rel->rd_node;
     771         408 :     xlrec_reuse.block = blkno;
     772         408 :     xlrec_reuse.latestRemovedXid = latestRemovedXid;
     773             : 
     774         408 :     XLogBeginInsert();
     775         408 :     XLogRegisterData((char *) &xlrec_reuse, SizeOfBtreeReusePage);
     776             : 
     777         408 :     XLogInsert(RM_BTREE_ID, XLOG_BTREE_REUSE_PAGE);
     778         408 : }
     779             : 
     780             : /*
     781             :  *  _bt_getbuf() -- Get a buffer by block number for read or write.
     782             :  *
     783             :  *      blkno == P_NEW means to get an unallocated index page.  The page
     784             :  *      will be initialized before returning it.
     785             :  *
     786             :  *      When this routine returns, the appropriate lock is set on the
     787             :  *      requested buffer and its reference count has been incremented
     788             :  *      (ie, the buffer is "locked and pinned").  Also, we apply
     789             :  *      _bt_checkpage to sanity-check the page (except in P_NEW case).
     790             :  */
     791             : Buffer
     792    23668022 : _bt_getbuf(Relation rel, BlockNumber blkno, int access)
     793             : {
     794             :     Buffer      buf;
     795             : 
     796    23668022 :     if (blkno != P_NEW)
     797             :     {
     798             :         /* Read an existing block of the relation */
     799    23606618 :         buf = ReadBuffer(rel, blkno);
     800    23606618 :         LockBuffer(buf, access);
     801    23606618 :         _bt_checkpage(rel, buf);
     802             :     }
     803             :     else
     804             :     {
     805             :         bool        needLock;
     806             :         Page        page;
     807             : 
     808             :         Assert(access == BT_WRITE);
     809             : 
     810             :         /*
     811             :          * First see if the FSM knows of any free pages.
     812             :          *
     813             :          * We can't trust the FSM's report unreservedly; we have to check that
     814             :          * the page is still free.  (For example, an already-free page could
     815             :          * have been re-used between the time the last VACUUM scanned it and
     816             :          * the time the VACUUM made its FSM updates.)
     817             :          *
     818             :          * In fact, it's worse than that: we can't even assume that it's safe
     819             :          * to take a lock on the reported page.  If somebody else has a lock
     820             :          * on it, or even worse our own caller does, we could deadlock.  (The
     821             :          * own-caller scenario is actually not improbable. Consider an index
     822             :          * on a serial or timestamp column.  Nearly all splits will be at the
     823             :          * rightmost page, so it's entirely likely that _bt_split will call us
     824             :          * while holding a lock on the page most recently acquired from FSM. A
     825             :          * VACUUM running concurrently with the previous split could well have
     826             :          * placed that page back in FSM.)
     827             :          *
     828             :          * To get around that, we ask for only a conditional lock on the
     829             :          * reported page.  If we fail, then someone else is using the page,
     830             :          * and we may reasonably assume it's not free.  (If we happen to be
     831             :          * wrong, the worst consequence is the page will be lost to use till
     832             :          * the next VACUUM, which is no big problem.)
     833             :          */
     834             :         for (;;)
     835             :         {
     836       61404 :             blkno = GetFreeIndexPage(rel);
     837       61404 :             if (blkno == InvalidBlockNumber)
     838       60996 :                 break;
     839         408 :             buf = ReadBuffer(rel, blkno);
     840         408 :             if (ConditionalLockBuffer(buf))
     841             :             {
     842         408 :                 page = BufferGetPage(buf);
     843         408 :                 if (_bt_page_recyclable(page))
     844             :                 {
     845             :                     /*
     846             :                      * If we are generating WAL for Hot Standby then create a
     847             :                      * WAL record that will allow us to conflict with queries
     848             :                      * running on standby, in case they have snapshots older
     849             :                      * than btpo.xact.  This can only apply if the page does
     850             :                      * have a valid btpo.xact value, ie not if it's new.  (We
     851             :                      * must check that because an all-zero page has no special
     852             :                      * space.)
     853             :                      */
     854         408 :                     if (XLogStandbyInfoActive() && RelationNeedsWAL(rel) &&
     855         408 :                         !PageIsNew(page))
     856             :                     {
     857         408 :                         BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
     858             : 
     859         408 :                         _bt_log_reuse_page(rel, blkno, opaque->btpo.xact);
     860             :                     }
     861             : 
     862             :                     /* Okay to use page.  Re-initialize and return it */
     863         408 :                     _bt_pageinit(page, BufferGetPageSize(buf));
     864         408 :                     return buf;
     865             :                 }
     866           0 :                 elog(DEBUG2, "FSM returned nonrecyclable page");
     867           0 :                 _bt_relbuf(rel, buf);
     868             :             }
     869             :             else
     870             :             {
     871           0 :                 elog(DEBUG2, "FSM returned nonlockable page");
     872             :                 /* couldn't get lock, so just drop pin */
     873           0 :                 ReleaseBuffer(buf);
     874             :             }
     875             :         }
     876             : 
     877             :         /*
     878             :          * Extend the relation by one page.
     879             :          *
     880             :          * We have to use a lock to ensure no one else is extending the rel at
     881             :          * the same time, else we will both try to initialize the same new
     882             :          * page.  We can skip locking for new or temp relations, however,
     883             :          * since no one else could be accessing them.
     884             :          */
     885       60996 :         needLock = !RELATION_IS_LOCAL(rel);
     886             : 
     887       60996 :         if (needLock)
     888       58218 :             LockRelationForExtension(rel, ExclusiveLock);
     889             : 
     890       60996 :         buf = ReadBuffer(rel, P_NEW);
     891             : 
     892             :         /* Acquire buffer lock on new page */
     893       60996 :         LockBuffer(buf, BT_WRITE);
     894             : 
     895             :         /*
     896             :          * Release the file-extension lock; it's now OK for someone else to
     897             :          * extend the relation some more.  Note that we cannot release this
     898             :          * lock before we have buffer lock on the new page, or we risk a race
     899             :          * condition against btvacuumscan --- see comments therein.
     900             :          */
     901       60996 :         if (needLock)
     902       58218 :             UnlockRelationForExtension(rel, ExclusiveLock);
     903             : 
     904             :         /* Initialize the new page before returning it */
     905       60996 :         page = BufferGetPage(buf);
     906             :         Assert(PageIsNew(page));
     907       60996 :         _bt_pageinit(page, BufferGetPageSize(buf));
     908             :     }
     909             : 
     910             :     /* ref count and lock type are correct */
     911    23667614 :     return buf;
     912             : }
     913             : 
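The usual discipline, then, is pin-plus-lock via _bt_getbuf() and release of both via _bt_relbuf().  A minimal read-path sketch follows; the function name is illustrative and "blkno" is assumed to be a known, existing block of the index.

static void
peek_at_page_sketch(Relation rel, BlockNumber blkno)
{
    Buffer      buf = _bt_getbuf(rel, blkno, BT_READ);
    Page        page = BufferGetPage(buf);
    BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);

    if (P_ISDELETED(opaque))
        elog(DEBUG2, "block %u is a deleted page", blkno);

    _bt_relbuf(rel, buf);       /* drops both the buffer lock and the pin */
}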
     914             : /*
     915             :  *  _bt_relandgetbuf() -- release a locked buffer and get another one.
     916             :  *
     917             :  * This is equivalent to _bt_relbuf followed by _bt_getbuf, with the
     918             :  * exception that blkno may not be P_NEW.  Also, if obuf is InvalidBuffer
     919             :  * then it reduces to just _bt_getbuf; allowing this case simplifies some
     920             :  * callers.
     921             :  *
     922             :  * The original motivation for using this was to avoid two entries to the
     923             :  * bufmgr when one would do.  However, now it's mainly just a notational
     924             :  * convenience.  The only case where it saves work over _bt_relbuf/_bt_getbuf
     925             :  * is when the target page is the same one already in the buffer.
     926             :  */
     927             : Buffer
     928    19210218 : _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
     929             : {
     930             :     Buffer      buf;
     931             : 
     932             :     Assert(blkno != P_NEW);
     933    19210218 :     if (BufferIsValid(obuf))
     934    19199682 :         LockBuffer(obuf, BUFFER_LOCK_UNLOCK);
     935    19210218 :     buf = ReleaseAndReadBuffer(obuf, rel, blkno);
     936    19210218 :     LockBuffer(buf, access);
     937    19210218 :     _bt_checkpage(rel, buf);
     938    19210218 :     return buf;
     939             : }
     940             : 
     941             : /*
     942             :  *  _bt_relbuf() -- release a locked buffer.
     943             :  *
     944             :  * Lock and pin (refcount) are both dropped.
     945             :  */
     946             : void
     947    12901406 : _bt_relbuf(Relation rel, Buffer buf)
     948             : {
     949    12901406 :     UnlockReleaseBuffer(buf);
     950    12901406 : }
     951             : 
     952             : /*
     953             :  *  _bt_pageinit() -- Initialize a new page.
     954             :  *
     955             :  * On return, the page header is initialized; data space is empty;
     956             :  * special space is zeroed out.
     957             :  */
     958             : void
     959      265952 : _bt_pageinit(Page page, Size size)
     960             : {
     961      265952 :     PageInit(page, size, sizeof(BTPageOpaqueData));
     962      265952 : }
     963             : 
     964             : /*
     965             :  *  _bt_page_recyclable() -- Is an existing page recyclable?
     966             :  *
     967             :  * This exists to make sure _bt_getbuf and btvacuumscan have the same
     968             :  * policy about whether a page is safe to re-use.  But note that _bt_getbuf
     969             :  * knows enough to distinguish the PageIsNew condition from the other one.
     970             :  * At some point it might be appropriate to redesign this to have a three-way
     971             :  * result value.
     972             :  */
     973             : bool
     974      110344 : _bt_page_recyclable(Page page)
     975             : {
     976             :     BTPageOpaque opaque;
     977             : 
     978             :     /*
     979             :      * It's possible to find an all-zeroes page in an index --- for example, a
     980             :      * backend might successfully extend the relation one page and then crash
     981             :      * before it is able to make a WAL entry for adding the page. If we find a
     982             :      * zeroed page then reclaim it.
     983             :      */
     984      110344 :     if (PageIsNew(page))
     985           0 :         return true;
     986             : 
     987             :     /*
     988             :      * Otherwise, recycle if deleted and too old to have any processes
     989             :      * interested in it.
     990             :      */
     991      110344 :     opaque = (BTPageOpaque) PageGetSpecialPointer(page);
     992      113312 :     if (P_ISDELETED(opaque) &&
     993        2968 :         TransactionIdPrecedes(opaque->btpo.xact, RecentGlobalXmin))
     994        2924 :         return true;
     995      107420 :     return false;
     996             : }
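
The comment above mentions a possible three-way result value.  A hypothetical sketch of that redesign (the enum name and values are invented here for illustration; no such type exists in PostgreSQL):

    typedef enum BTPageRecycleStatus
    {
        BTPAGE_NOT_RECYCLABLE,      /* live page, or deleted too recently */
        BTPAGE_RECYCLABLE_NEW,      /* all-zeroes page left behind by a crash */
        BTPAGE_RECYCLABLE_DELETED   /* deleted, and btpo.xact is old enough */
    } BTPageRecycleStatus;

With a result like this, _bt_getbuf would no longer have to re-derive the PageIsNew distinction that the comment above says it currently makes on its own.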
     997             : 
     998             : /*
     999             :  * Delete item(s) from a btree leaf page during VACUUM.
    1000             :  *
    1001             :  * This routine assumes that the caller has a super-exclusive write lock on
    1002             :  * the buffer.  Also, the given deletable and updatable arrays *must* be
    1003             :  * sorted in ascending order.
    1004             :  *
    1005             :  * Routine deals with deleting TIDs when some (but not all) of the heap TIDs
    1006             :  * in an existing posting list item are to be removed by VACUUM.  This works
    1007             :  * by updating/overwriting an existing item with caller's new version of the
    1008             :  * item (a version that lacks the TIDs that are to be deleted).
    1009             :  *
    1010             :  * We record VACUUMs and b-tree deletes differently in WAL.  Deletes must
    1011             :  * generate their own latestRemovedXid by accessing the heap directly, whereas
    1012             :  * VACUUMs rely on the initial heap scan taking care of it indirectly.  Also,
    1013             :  * only VACUUM can perform granular deletes of individual TIDs in posting list
    1014             :  * tuples.
    1015             :  */
    1016             : void
    1017       18272 : _bt_delitems_vacuum(Relation rel, Buffer buf,
    1018             :                     OffsetNumber *deletable, int ndeletable,
    1019             :                     BTVacuumPosting *updatable, int nupdatable)
    1020             : {
    1021       18272 :     Page        page = BufferGetPage(buf);
    1022             :     BTPageOpaque opaque;
    1023             :     Size        itemsz;
    1024       18272 :     char       *updatedbuf = NULL;
    1025       18272 :     Size        updatedbuflen = 0;
    1026             :     OffsetNumber updatedoffsets[MaxIndexTuplesPerPage];
    1027             : 
    1028             :     /* Shouldn't be called unless there's something to do */
    1029             :     Assert(ndeletable > 0 || nupdatable > 0);
    1030             : 
    1031       30202 :     for (int i = 0; i < nupdatable; i++)
    1032             :     {
    1033             :         /* Replace work area IndexTuple with updated version */
    1034       11930 :         _bt_update_posting(updatable[i]);
    1035             : 
    1036             :         /* Maintain array of updatable page offsets for WAL record */
    1037       11930 :         updatedoffsets[i] = updatable[i]->updatedoffset;
    1038             :     }
    1039             : 
    1040             :     /* XLOG stuff -- allocate and fill buffer before critical section */
    1041       18272 :     if (nupdatable > 0 && RelationNeedsWAL(rel))
    1042             :     {
    1043         116 :         Size        offset = 0;
    1044             : 
    1045       12046 :         for (int i = 0; i < nupdatable; i++)
    1046             :         {
    1047       11930 :             BTVacuumPosting vacposting = updatable[i];
    1048             : 
    1049       11930 :             itemsz = SizeOfBtreeUpdate +
    1050       11930 :                 vacposting->ndeletedtids * sizeof(uint16);
    1051       11930 :             updatedbuflen += itemsz;
    1052             :         }
    1053             : 
    1054         116 :         updatedbuf = palloc(updatedbuflen);
    1055       12046 :         for (int i = 0; i < nupdatable; i++)
    1056             :         {
    1057       11930 :             BTVacuumPosting vacposting = updatable[i];
    1058             :             xl_btree_update update;
    1059             : 
    1060       11930 :             update.ndeletedtids = vacposting->ndeletedtids;
    1061       11930 :             memcpy(updatedbuf + offset, &update.ndeletedtids,
    1062             :                    SizeOfBtreeUpdate);
    1063       11930 :             offset += SizeOfBtreeUpdate;
    1064             : 
    1065       11930 :             itemsz = update.ndeletedtids * sizeof(uint16);
    1066       11930 :             memcpy(updatedbuf + offset, vacposting->deletetids, itemsz);
    1067       11930 :             offset += itemsz;
    1068             :         }
    1069             :     }
    1070             : 
    1071             :     /* No ereport(ERROR) until changes are logged */
    1072       18272 :     START_CRIT_SECTION();
    1073             : 
    1074             :     /*
    1075             :      * Handle posting tuple updates.
    1076             :      *
    1077             :      * Deliberately do this before handling simple deletes.  If we did it the
    1078             :      * other way around (i.e. WAL record order -- simple deletes before
    1079             :      * updates) then we'd have to make compensating changes to the 'updatable'
    1080             :      * array of offset numbers.
    1081             :      *
    1082             :      * PageIndexTupleOverwrite() won't unset each item's LP_DEAD bit when it
    1083             :      * happens to already be set.  Although we unset the BTP_HAS_GARBAGE page
    1084             :      * level flag, unsetting individual LP_DEAD bits should still be avoided.
    1085             :      */
    1086       30202 :     for (int i = 0; i < nupdatable; i++)
    1087             :     {
    1088       11930 :         OffsetNumber updatedoffset = updatedoffsets[i];
    1089             :         IndexTuple  itup;
    1090             : 
    1091       11930 :         itup = updatable[i]->itup;
    1092       11930 :         itemsz = MAXALIGN(IndexTupleSize(itup));
    1093       11930 :         if (!PageIndexTupleOverwrite(page, updatedoffset, (Item) itup,
    1094             :                                      itemsz))
    1095           0 :             elog(PANIC, "failed to update partially dead item in block %u of index \"%s\"",
    1096             :                  BufferGetBlockNumber(buf), RelationGetRelationName(rel));
    1097             :     }
    1098             : 
    1099             :     /* Now handle simple deletes of entire tuples */
    1100       18272 :     if (ndeletable > 0)
    1101       18256 :         PageIndexMultiDelete(page, deletable, ndeletable);
    1102             : 
    1103             :     /*
    1104             :      * We can clear the vacuum cycle ID since this page has certainly been
    1105             :      * processed by the current vacuum scan.
    1106             :      */
    1107       18272 :     opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    1108       18272 :     opaque->btpo_cycleid = 0;
    1109             : 
    1110             :     /*
    1111             :      * Mark the page as not containing any LP_DEAD items.  This is not
    1112             :      * necessarily true (there might be some that have recently been marked, but
    1113             :      * weren't targeted by VACUUM's heap scan), but it will be true often
    1114             :      * enough.  VACUUM does not delete items purely because they have their
    1115             :      * LP_DEAD bit set, since doing so would necessitate explicitly logging a
    1116             :      * latestRemovedXid cutoff (this is how _bt_delitems_delete works).
    1117             :      *
    1118             :      * The consequences of falsely unsetting BTP_HAS_GARBAGE should be fairly
    1119             :      * limited, since we never falsely unset an LP_DEAD bit.  Workloads that
    1120             :      * are particularly dependent on LP_DEAD bits being set quickly will
    1121             :      * usually manage to set the BTP_HAS_GARBAGE flag before the page fills up
    1122             :      * again anyway.  Furthermore, attempting a deduplication pass will remove
    1123             :      * all LP_DEAD items, regardless of whether the BTP_HAS_GARBAGE hint bit
    1124             :      * is set or not.
    1125             :      */
    1126       18272 :     opaque->btpo_flags &= ~BTP_HAS_GARBAGE;
    1127             : 
    1128       18272 :     MarkBufferDirty(buf);
    1129             : 
    1130             :     /* XLOG stuff */
    1131       18272 :     if (RelationNeedsWAL(rel))
    1132             :     {
    1133             :         XLogRecPtr  recptr;
    1134             :         xl_btree_vacuum xlrec_vacuum;
    1135             : 
    1136       18272 :         xlrec_vacuum.ndeleted = ndeletable;
    1137       18272 :         xlrec_vacuum.nupdated = nupdatable;
    1138             : 
    1139       18272 :         XLogBeginInsert();
    1140       18272 :         XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
    1141       18272 :         XLogRegisterData((char *) &xlrec_vacuum, SizeOfBtreeVacuum);
    1142             : 
    1143       18272 :         if (ndeletable > 0)
    1144       18256 :             XLogRegisterBufData(0, (char *) deletable,
    1145             :                                 ndeletable * sizeof(OffsetNumber));
    1146             : 
    1147       18272 :         if (nupdatable > 0)
    1148             :         {
    1149         116 :             XLogRegisterBufData(0, (char *) updatedoffsets,
    1150             :                                 nupdatable * sizeof(OffsetNumber));
    1151         116 :             XLogRegisterBufData(0, updatedbuf, updatedbuflen);
    1152             :         }
    1153             : 
    1154       18272 :         recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_VACUUM);
    1155             : 
    1156       18272 :         PageSetLSN(page, recptr);
    1157             :     }
    1158             : 
    1159       18272 :     END_CRIT_SECTION();
    1160             : 
    1161             :     /* can't leak memory here */
    1162       18272 :     if (updatedbuf != NULL)
    1163         116 :         pfree(updatedbuf);
    1164             :     /* free tuples generated by calling _bt_update_posting() */
    1165       30202 :     for (int i = 0; i < nupdatable; i++)
    1166       11930 :         pfree(updatable[i]->itup);
    1167       18272 : }
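
A minimal sketch of a VACUUM-style caller that exercises only the simple-delete path, assuming buf is already pinned and super-exclusively locked and that the page has no posting list tuples (tid_is_dead() is a hypothetical stand-in for VACUUM's real dead-TID check):

    Page        page = BufferGetPage(buf);
    BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    OffsetNumber deletable[MaxIndexTuplesPerPage];
    int         ndeletable = 0;
    OffsetNumber offnum;
    OffsetNumber maxoff = PageGetMaxOffsetNumber(page);

    /* scanning offsets in ascending order keeps deletable[] sorted, as required */
    for (offnum = P_FIRSTDATAKEY(opaque);
         offnum <= maxoff;
         offnum = OffsetNumberNext(offnum))
    {
        ItemId      itemid = PageGetItemId(page, offnum);
        IndexTuple  itup = (IndexTuple) PageGetItem(page, itemid);

        if (tid_is_dead(&itup->t_tid))          /* hypothetical predicate */
            deletable[ndeletable++] = offnum;
    }

    if (ndeletable > 0)
        _bt_delitems_vacuum(rel, buf, deletable, ndeletable, NULL, 0);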
    1168             : 
    1169             : /*
    1170             :  * Delete item(s) from a btree leaf page during single-page cleanup.
    1171             :  *
    1172             :  * This routine assumes that the caller has pinned and write locked the
    1173             :  * buffer.  Also, the given deletable array *must* be sorted in ascending
    1174             :  * order.
    1175             :  *
    1176             :  * This is nearly the same as _bt_delitems_vacuum as far as what it does to
    1177             :  * the page, but it needs to generate its own latestRemovedXid by accessing
    1178             :  * the heap.  This is used by the REDO routine to generate recovery conflicts.
    1179             :  * the heap.  That value is used by the REDO routine to generate recovery conflicts.
    1180             :  * deleted as a whole (since there is only one LP_DEAD bit per line pointer).
    1181             :  */
    1182             : void
    1183        3692 : _bt_delitems_delete(Relation rel, Buffer buf,
    1184             :                     OffsetNumber *deletable, int ndeletable,
    1185             :                     Relation heapRel)
    1186             : {
    1187        3692 :     Page        page = BufferGetPage(buf);
    1188             :     BTPageOpaque opaque;
    1189        3692 :     TransactionId latestRemovedXid = InvalidTransactionId;
    1190             : 
    1191             :     /* Shouldn't be called unless there's something to do */
    1192             :     Assert(ndeletable > 0);
    1193             : 
    1194        3692 :     if (XLogStandbyInfoActive() && RelationNeedsWAL(rel))
    1195             :         latestRemovedXid =
    1196        3642 :             _bt_xid_horizon(rel, heapRel, page, deletable, ndeletable);
    1197             : 
    1198             :     /* No ereport(ERROR) until changes are logged */
    1199        3692 :     START_CRIT_SECTION();
    1200             : 
    1201             :     /* Fix the page */
    1202        3692 :     PageIndexMultiDelete(page, deletable, ndeletable);
    1203             : 
    1204             :     /*
    1205             :      * Unlike _bt_delitems_vacuum, we *must not* clear the vacuum cycle ID,
    1206             :      * because this is not called by VACUUM.  Just clear the BTP_HAS_GARBAGE
    1207             :      * page flag, since we deleted all items with their LP_DEAD bit set.
    1208             :      */
    1209        3692 :     opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    1210        3692 :     opaque->btpo_flags &= ~BTP_HAS_GARBAGE;
    1211             : 
    1212        3692 :     MarkBufferDirty(buf);
    1213             : 
    1214             :     /* XLOG stuff */
    1215        3692 :     if (RelationNeedsWAL(rel))
    1216             :     {
    1217             :         XLogRecPtr  recptr;
    1218             :         xl_btree_delete xlrec_delete;
    1219             : 
    1220        3692 :         xlrec_delete.latestRemovedXid = latestRemovedXid;
    1221        3692 :         xlrec_delete.ndeleted = ndeletable;
    1222             : 
    1223        3692 :         XLogBeginInsert();
    1224        3692 :         XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
    1225        3692 :         XLogRegisterData((char *) &xlrec_delete, SizeOfBtreeDelete);
    1226             : 
    1227             :         /*
    1228             :          * The deletable array is not in the buffer, but pretend that it is.
    1229             :          * When XLogInsert stores the whole buffer, the array need not be
    1230             :          * stored too.
    1231             :          */
    1232        3692 :         XLogRegisterBufData(0, (char *) deletable,
    1233             :                             ndeletable * sizeof(OffsetNumber));
    1234             : 
    1235        3692 :         recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DELETE);
    1236             : 
    1237        3692 :         PageSetLSN(page, recptr);
    1238             :     }
    1239             : 
    1240        3692 :     END_CRIT_SECTION();
    1241        3692 : }
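
Single-page cleanup callers instead gather the items whose LP_DEAD bit is already set.  A simplified sketch of that pattern (close to what _bt_vacuum_one_page() in nbtinsert.c does; rel, buf, and heapRel come from the caller):

    Page        page = BufferGetPage(buf);
    BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    OffsetNumber deletable[MaxIndexTuplesPerPage];
    int         ndeletable = 0;
    OffsetNumber offnum;
    OffsetNumber maxoff = PageGetMaxOffsetNumber(page);

    for (offnum = P_FIRSTDATAKEY(opaque);
         offnum <= maxoff;
         offnum = OffsetNumberNext(offnum))
    {
        ItemId      itemid = PageGetItemId(page, offnum);

        if (ItemIdIsDead(itemid))
            deletable[ndeletable++] = offnum;   /* ascending by construction */
    }

    if (ndeletable > 0)
        _bt_delitems_delete(rel, buf, deletable, ndeletable, heapRel);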
    1242             : 
    1243             : /*
    1244             :  * Get the latestRemovedXid from the table entries pointed to by the non-pivot
    1245             :  * tuples being deleted.
    1246             :  *
    1247             :  * This is a specialized version of index_compute_xid_horizon_for_tuples().
    1248             :  * It's needed because btree tuples don't always store table TID using the
    1249             :  * standard index tuple header field.
    1250             :  */
    1251             : static TransactionId
    1252        3642 : _bt_xid_horizon(Relation rel, Relation heapRel, Page page,
    1253             :                 OffsetNumber *deletable, int ndeletable)
    1254             : {
    1255        3642 :     TransactionId latestRemovedXid = InvalidTransactionId;
    1256             :     int         spacenhtids;
    1257             :     int         nhtids;
    1258             :     ItemPointer htids;
    1259             : 
    1260             :     /* Array will grow iff there are posting list tuples to consider */
    1261        3642 :     spacenhtids = ndeletable;
    1262        3642 :     nhtids = 0;
    1263        3642 :     htids = (ItemPointer) palloc(sizeof(ItemPointerData) * spacenhtids);
    1264       55108 :     for (int i = 0; i < ndeletable; i++)
    1265             :     {
    1266             :         ItemId      itemid;
    1267             :         IndexTuple  itup;
    1268             : 
    1269       51466 :         itemid = PageGetItemId(page, deletable[i]);
    1270       51466 :         itup = (IndexTuple) PageGetItem(page, itemid);
    1271             : 
    1272             :         Assert(ItemIdIsDead(itemid));
    1273             :         Assert(!BTreeTupleIsPivot(itup));
    1274             : 
    1275       51466 :         if (!BTreeTupleIsPosting(itup))
    1276             :         {
    1277       51462 :             if (nhtids + 1 > spacenhtids)
    1278             :             {
    1279           4 :                 spacenhtids *= 2;
    1280             :                 htids = (ItemPointer)
    1281           4 :                     repalloc(htids, sizeof(ItemPointerData) * spacenhtids);
    1282             :             }
    1283             : 
    1284             :             Assert(ItemPointerIsValid(&itup->t_tid));
    1285       51462 :             ItemPointerCopy(&itup->t_tid, &htids[nhtids]);
    1286       51462 :             nhtids++;
    1287             :         }
    1288             :         else
    1289             :         {
    1290           4 :             int         nposting = BTreeTupleGetNPosting(itup);
    1291             : 
    1292           4 :             if (nhtids + nposting > spacenhtids)
    1293             :             {
    1294           4 :                 spacenhtids = Max(spacenhtids * 2, nhtids + nposting);
    1295             :                 htids = (ItemPointer)
    1296           4 :                     repalloc(htids, sizeof(ItemPointerData) * spacenhtids);
    1297             :             }
    1298             : 
    1299         892 :             for (int j = 0; j < nposting; j++)
    1300             :             {
    1301         888 :                 ItemPointer htid = BTreeTupleGetPostingN(itup, j);
    1302             : 
    1303             :                 Assert(ItemPointerIsValid(htid));
    1304         888 :                 ItemPointerCopy(htid, &htids[nhtids]);
    1305         888 :                 nhtids++;
    1306             :             }
    1307             :         }
    1308             :     }
    1309             : 
    1310             :     Assert(nhtids >= ndeletable);
    1311             : 
    1312             :     latestRemovedXid =
    1313        3642 :         table_compute_xid_horizon_for_tuples(heapRel, htids, nhtids);
    1314             : 
    1315        3642 :     pfree(htids);
    1316             : 
    1317        3642 :     return latestRemovedXid;
    1318             : }
    1319             : 
    1320             : /*
    1321             :  * Check that leftsib page (the btpo_prev of target page) is not marked with
    1322             :  * INCOMPLETE_SPLIT flag.  Used during page deletion.
    1323             :  *
    1324             :  * Returning true indicates that the flag is set in leftsib (which is
    1325             :  * definitely still the left sibling of target).  When that happens, the
    1326             :  * target doesn't have a downlink in parent, and the page deletion algorithm
    1327             :  * isn't prepared to handle that.  Deletion of the target page (or the whole
    1328             :  * subtree that contains the target page) cannot take place.
    1329             :  *
    1330             :  * Caller should not have a lock on the target page itself, since pages on the
    1331             :  * same level must always be locked left to right to avoid deadlocks.
    1332             :  */
    1333             : static bool
    1334        4134 : _bt_leftsib_splitflag(Relation rel, BlockNumber leftsib, BlockNumber target)
    1335             : {
    1336             :     Buffer      buf;
    1337             :     Page        page;
    1338             :     BTPageOpaque opaque;
    1339             :     bool        result;
    1340             : 
    1341             :     /* Easy case: No left sibling */
    1342        4134 :     if (leftsib == P_NONE)
    1343        3026 :         return false;
    1344             : 
    1345        1108 :     buf = _bt_getbuf(rel, leftsib, BT_READ);
    1346        1108 :     page = BufferGetPage(buf);
    1347        1108 :     opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    1348             : 
    1349             :     /*
    1350             :      * If the left sibling was concurrently split, so that its next-pointer
    1351             :      * doesn't point to the current page anymore, the split that created
    1352             :      * target must be completed.  Caller can reasonably expect that there will
    1353             :      * be a downlink to the target page that it can relocate using its stack.
    1354             :      * (We don't allow splitting an incompletely split page again until the
    1355             :      * previous split has been completed.)
    1356             :      */
    1357        1108 :     result = (opaque->btpo_next == target && P_INCOMPLETE_SPLIT(opaque));
    1358        1108 :     _bt_relbuf(rel, buf);
    1359             : 
    1360        1108 :     return result;
    1361             : }
    1362             : 
    1363             : /*
    1364             :  * Check that leafrightsib page (the btpo_next of target leaf page) is not
    1365             :  * marked with ISHALFDEAD flag.  Used during page deletion.
    1366             :  *
    1367             :  * Returning true indicates that the flag is set in leafrightsib, so page
    1368             :  * deletion cannot go ahead.  Our caller is not prepared to deal with the case
    1369             :  * where the parent page does not have a pivot tuple whose downlink points to
    1370             :  * leafrightsib (due to an earlier interrupted VACUUM operation).  It doesn't
    1371             :  * seem worth going to the trouble of teaching our caller to deal with it.
    1372             :  * The situation will be resolved after VACUUM finishes the deletion of the
    1373             :  * half-dead page (when a future VACUUM operation reaches the target page
    1374             :  * again).
    1375             :  *
    1376             :  * _bt_leftsib_splitflag() is called for both leaf pages and internal pages.
    1377             :  * _bt_rightsib_halfdeadflag() is only called for leaf pages, though.  This is
    1378             :  * okay because of the restriction on deleting pages that are the rightmost
    1379             :  * page of their parent (i.e. that such deletions can only take place when the
    1380             :  * entire subtree must be deleted).  The leaf level check made here will apply
    1381             :  * to a right "cousin" leaf page rather than a simple right sibling leaf page
    1382             :  * in cases where caller actually goes on to attempt deleting pages that are
    1383             :  * above the leaf page.  The right cousin leaf page is representative of the
    1384             :  * left edge of the subtree to the right of the to-be-deleted subtree as a
    1385             :  * whole, which is exactly the condition that our caller cares about.
    1386             :  * (Besides, internal pages are never marked half-dead, so it isn't even
    1387             :  * possible to _directly_ assess if an internal page is part of some other
    1388             :  * to-be-deleted subtree.)
    1389             :  */
    1390             : static bool
    1391        4084 : _bt_rightsib_halfdeadflag(Relation rel, BlockNumber leafrightsib)
    1392             : {
    1393             :     Buffer      buf;
    1394             :     Page        page;
    1395             :     BTPageOpaque opaque;
    1396             :     bool        result;
    1397             : 
    1398             :     Assert(leafrightsib != P_NONE);
    1399             : 
    1400        4084 :     buf = _bt_getbuf(rel, leafrightsib, BT_READ);
    1401        4084 :     page = BufferGetPage(buf);
    1402        4084 :     opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    1403             : 
    1404             :     Assert(P_ISLEAF(opaque) && !P_ISDELETED(opaque));
    1405        4084 :     result = P_ISHALFDEAD(opaque);
    1406        4084 :     _bt_relbuf(rel, buf);
    1407             : 
    1408        4084 :     return result;
    1409             : }
    1410             : 
    1411             : /*
    1412             :  * _bt_pagedel() -- Delete a leaf page from the b-tree, if legal to do so.
    1413             :  *
    1414             :  * This action unlinks the leaf page from the b-tree structure, removing all
    1415             :  * pointers leading to it --- but not touching its own left and right links.
    1416             :  * The page cannot be physically reclaimed right away, since other processes
    1417             :  * may currently be trying to follow links leading to the page; they have to
    1418             :  * be allowed to use its right-link to recover.  See nbtree/README.
    1419             :  *
    1420             :  * On entry, the target buffer must be pinned and locked (either read or write
    1421             :  * lock is OK).  The page must be an empty leaf page, which may be half-dead
    1422             :  * already (a half-dead page should only be passed to us when an earlier
    1423             :  * VACUUM operation was interrupted, though).  Note in particular that caller
    1424             :  * should never pass a buffer containing an existing deleted page here.  The
    1425             :  * lock and pin on caller's buffer will be dropped before we return.
    1426             :  *
    1427             :  * Returns the number of pages successfully deleted (zero if page cannot
    1428             :  * be deleted now; could be more than one if parent or right sibling pages
    1429             :  * were deleted too).  Note that this does not include pages that we delete
    1430             :  * that the btvacuumscan scan has yet to reach; they'll get counted later
    1431             :  * instead.
    1432             :  *
    1433             :  * Maintains *oldestBtpoXact for any pages that get deleted.  Caller is
    1434             :  * responsible for maintaining *oldestBtpoXact in the case of pages that were
    1435             :  * deleted by a previous VACUUM.
    1436             :  *
    1437             :  * NOTE: this leaks memory.  Rather than trying to clean up everything
    1438             :  * carefully, it's better to run it in a temp context that can be reset
    1439             :  * frequently.
    1440             :  */
    1441             : uint32
    1442        4172 : _bt_pagedel(Relation rel, Buffer leafbuf, TransactionId *oldestBtpoXact)
    1443             : {
    1444        4172 :     uint32      ndeleted = 0;
    1445             :     BlockNumber rightsib;
    1446             :     bool        rightsib_empty;
    1447             :     Page        page;
    1448             :     BTPageOpaque opaque;
    1449             : 
    1450             :     /*
    1451             :      * Save original leafbuf block number from caller.  Only deleted blocks
    1452             :      * that are <= scanblkno get counted in ndeleted return value.
    1453             :      */
    1454        4172 :     BlockNumber scanblkno = BufferGetBlockNumber(leafbuf);
    1455             : 
    1456             :     /*
    1457             :      * "stack" is a search stack leading (approximately) to the target page.
    1458             :      * It is initially NULL, but when iterating, we keep it to avoid
    1459             :      * duplicated search effort.
    1460             :      *
    1461             :      * Also, when "stack" is not NULL, we have already checked that the
    1462             :      * current page is not the right half of an incomplete split, i.e. the
    1463             :      * left sibling does not have its INCOMPLETE_SPLIT flag set, including
    1464             :      * when the current target page is to the right of caller's initial page
    1465             :      * (the scanblkno page).
    1466             :      */
    1467        4172 :     BTStack     stack = NULL;
    1468             : 
    1469             :     for (;;)
    1470             :     {
    1471        8260 :         page = BufferGetPage(leafbuf);
    1472        8260 :         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    1473             : 
    1474             :         /*
    1475             :          * Internal pages are never deleted directly, only as part of deleting
    1476             :          * the whole subtree all the way down to leaf level.
    1477             :          *
    1478             :          * Also check for deleted pages here.  Caller never passes us a fully
    1479             :          * deleted page.  Only VACUUM can delete pages, so there can't have
    1480             :          * been a concurrent deletion.  Assume that we reached any deleted
    1481             :          * page encountered here by following a sibling link, and that the
    1482             :          * index is corrupt.
    1483             :          */
    1484             :         Assert(!P_ISDELETED(opaque));
    1485        8260 :         if (!P_ISLEAF(opaque) || P_ISDELETED(opaque))
    1486             :         {
    1487             :             /*
    1488             :              * Pre-9.4 page deletion only marked internal pages as half-dead,
    1489             :              * but now we only use that flag on leaf pages. The old algorithm
    1490             :              * was never supposed to leave half-dead pages in the tree; it was
    1491             :              * just a transient state, but it was nevertheless possible in
    1492             :              * error scenarios. We don't know how to deal with them here. They
    1493             :              * are harmless as far as searches are concerned, but inserts
    1494             :              * into the deleted keyspace could add out-of-order downlinks in
    1495             :              * the upper levels. Log a notice, hopefully the admin will notice
    1496             :              * and reindex.
    1497             :              */
    1498           0 :             if (P_ISHALFDEAD(opaque))
    1499           0 :                 ereport(LOG,
    1500             :                         (errcode(ERRCODE_INDEX_CORRUPTED),
    1501             :                          errmsg("index \"%s\" contains a half-dead internal page",
    1502             :                                 RelationGetRelationName(rel)),
    1503             :                          errhint("This can be caused by an interrupted VACUUM in version 9.3 or older, before upgrade. Please REINDEX it.")));
    1504             : 
    1505           0 :             if (P_ISDELETED(opaque))
    1506           0 :                 ereport(LOG,
    1507             :                         (errcode(ERRCODE_INDEX_CORRUPTED),
    1508             :                          errmsg_internal("found deleted block %u while following right link from block %u in index \"%s\"",
    1509             :                                          BufferGetBlockNumber(leafbuf),
    1510             :                                          scanblkno,
    1511             :                                          RelationGetRelationName(rel))));
    1512             : 
    1513           0 :             _bt_relbuf(rel, leafbuf);
    1514           0 :             return ndeleted;
    1515             :         }
    1516             : 
    1517             :         /*
    1518             :          * We can never delete rightmost pages nor root pages.  While at it,
    1519             :          * check that page is empty, since it's possible that the leafbuf page
    1520             :          * was empty a moment ago, but has since had some inserts.
    1521             :          *
    1522             :          * To keep the algorithm simple, we also never delete an incompletely
    1523             :          * split page (they should be rare enough that this doesn't make any
    1524             :          * meaningful difference to disk usage):
    1525             :          *
    1526             :          * The INCOMPLETE_SPLIT flag on the page tells us if the page is the
    1527             :          * left half of an incomplete split, but ensuring that it's not the
    1528             :          * right half is more complicated.  For that, we have to check that
    1529             :          * the left sibling doesn't have its INCOMPLETE_SPLIT flag set using
    1530             :          * _bt_leftsib_splitflag().  On the first iteration, we temporarily
    1531             :          * release the lock on scanblkno/leafbuf, check the left sibling, and
    1532             :          * construct a search stack to scanblkno.  On subsequent iterations,
    1533             :          * we know we stepped right from a page that passed these tests, so
    1534             :          * it's OK.
    1535             :          */
    1536       16428 :         if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) ||
    1537        8168 :             P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page) ||
    1538        8168 :             P_INCOMPLETE_SPLIT(opaque))
    1539             :         {
    1540             :             /* Should never fail to delete a half-dead page */
    1541             :             Assert(!P_ISHALFDEAD(opaque));
    1542             : 
    1543          92 :             _bt_relbuf(rel, leafbuf);
    1544          92 :             return ndeleted;
    1545             :         }
    1546             : 
    1547             :         /*
    1548             :          * First, remove downlink pointing to the page (or a parent of the
    1549             :          * First, remove the downlink pointing to the page (or a parent of the
    1550             :          * leafbuf page half-dead
    1551             :          */
    1552        8168 :         if (!P_ISHALFDEAD(opaque))
    1553             :         {
    1554             :             /*
    1555             :              * We need an approximate pointer to the page's parent page.  We
    1556             :              * use a variant of the standard search mechanism to search for
    1557             :              * the page's high key; this will give us a link to either the
    1558             :              * current parent or someplace to its left (if there are multiple
    1559             :              * equal high keys, which is possible with !heapkeyspace indexes).
    1560             :              *
    1561             :              * Also check if this is the right-half of an incomplete split
    1562             :              * (see comment above).
    1563             :              */
    1564        8168 :             if (!stack)
    1565             :             {
    1566             :                 BTScanInsert itup_key;
    1567             :                 ItemId      itemid;
    1568             :                 IndexTuple  targetkey;
    1569             :                 BlockNumber leftsib,
    1570             :                             leafblkno;
    1571             :                 Buffer      sleafbuf;
    1572             : 
    1573        4084 :                 itemid = PageGetItemId(page, P_HIKEY);
    1574        4084 :                 targetkey = CopyIndexTuple((IndexTuple) PageGetItem(page, itemid));
    1575             : 
    1576        4084 :                 leftsib = opaque->btpo_prev;
    1577        4084 :                 leafblkno = BufferGetBlockNumber(leafbuf);
    1578             : 
    1579             :                 /*
    1580             :                  * To avoid deadlocks, we'd better drop the leaf page lock
    1581             :                  * before going further.
    1582             :                  */
    1583        4084 :                 LockBuffer(leafbuf, BUFFER_LOCK_UNLOCK);
    1584             : 
    1585             :                 /*
    1586             :                  * Check that the left sibling of leafbuf (if any) is not
    1587             :                  * marked with INCOMPLETE_SPLIT flag before proceeding
    1588             :                  */
    1589             :                 Assert(leafblkno == scanblkno);
    1590        4084 :                 if (_bt_leftsib_splitflag(rel, leftsib, leafblkno))
    1591             :                 {
    1592           0 :                     ReleaseBuffer(leafbuf);
    1593           0 :                     return ndeleted;
    1594             :                 }
    1595             : 
    1596             :                 /* we need an insertion scan key for the search, so build one */
    1597        4084 :                 itup_key = _bt_mkscankey(rel, targetkey);
    1598             :                 /* find the leftmost leaf page with matching pivot/high key */
    1599        4084 :                 itup_key->pivotsearch = true;
    1600        4084 :                 stack = _bt_search(rel, itup_key, &sleafbuf, BT_READ, NULL);
    1601             :                 /* won't need a second lock or pin on leafbuf */
    1602        4084 :                 _bt_relbuf(rel, sleafbuf);
    1603             : 
    1604             :                 /*
    1605             :                  * Re-lock the leaf page, and start over to use our stack
    1606             :                  * within _bt_mark_page_halfdead.  We must do it that way
    1607             :                  * because it's possible that leafbuf can no longer be
    1608             :                  * deleted.  We need to recheck.
    1609             :                  *
    1610             :                  * Note: We can't simply hold on to the sleafbuf lock instead,
    1611             :                  * because it's barely possible that sleafbuf is not the same
    1612             :                  * page as leafbuf.  This happens when leafbuf split after our
    1613             :                  * original lock was dropped, but before _bt_search finished
    1614             :                  * its descent.  We rely on the assumption that we'll find
    1615             :                  * leafbuf isn't safe to delete anymore in this scenario.
    1616             :                  * (Page deletion can cope with the stack being to the left of
    1617             :                  * leafbuf, but not to the right of leafbuf.)
    1618             :                  */
    1619        4084 :                 LockBuffer(leafbuf, BT_WRITE);
    1620        4084 :                 continue;
    1621             :             }
    1622             : 
    1623             :             /*
    1624             :              * See if it's safe to delete the leaf page, and determine how
    1625             :              * many parent/internal pages above the leaf level will be
    1626             :              * deleted.  If it's safe then _bt_mark_page_halfdead will also
    1627             :              * perform the first phase of deletion, which includes marking the
    1628             :              * leafbuf page half-dead.
    1629             :              */
    1630             :             Assert(P_ISLEAF(opaque) && !P_IGNORE(opaque));
    1631        4084 :             if (!_bt_mark_page_halfdead(rel, leafbuf, stack))
    1632             :             {
    1633           0 :                 _bt_relbuf(rel, leafbuf);
    1634           0 :                 return ndeleted;
    1635             :             }
    1636             :         }
    1637             : 
    1638             :         /*
    1639             :          * Then unlink it from its siblings.  Each call to
    1640             :          * _bt_unlink_halfdead_page unlinks the topmost page from the subtree,
    1641             :          * making it shallower.  Iterate until the leafbuf page is deleted.
    1642             :          *
    1643             :          * _bt_unlink_halfdead_page should never fail, since we established
    1644             :          * that deletion is generally safe in _bt_mark_page_halfdead.
    1645             :          */
    1646        4084 :         rightsib_empty = false;
    1647             :         Assert(P_ISLEAF(opaque) && P_ISHALFDEAD(opaque));
    1648        8218 :         while (P_ISHALFDEAD(opaque))
    1649             :         {
    1650             :             /* Check for interrupts in _bt_unlink_halfdead_page */
    1651        4134 :             if (!_bt_unlink_halfdead_page(rel, leafbuf, scanblkno,
    1652             :                                           &rightsib_empty, oldestBtpoXact,
    1653             :                                           &ndeleted))
    1654             :             {
    1655             :                 /* _bt_unlink_halfdead_page failed, released buffer */
    1656           0 :                 return ndeleted;
    1657             :             }
    1658             :         }
    1659             : 
    1660             :         Assert(P_ISLEAF(opaque) && P_ISDELETED(opaque));
    1661             :         Assert(TransactionIdFollowsOrEquals(opaque->btpo.xact,
    1662             :                                             *oldestBtpoXact));
    1663             : 
    1664        4084 :         rightsib = opaque->btpo_next;
    1665             : 
    1666        4084 :         _bt_relbuf(rel, leafbuf);
    1667             : 
    1668             :         /*
    1669             :          * Check here, as calling loops will have locks held, preventing
    1670             :          * interrupts from being processed.
    1671             :          */
    1672        4084 :         CHECK_FOR_INTERRUPTS();
    1673             : 
    1674             :         /*
    1675             :          * The page has now been deleted. If its right sibling is completely
    1676             :          * empty, it's possible that the reason we haven't deleted it earlier
    1677             :          * is that it was the rightmost child of the parent. Now that we
    1678             :          * removed the downlink for this page, the right sibling might now be
    1679             :          * the only child of the parent, and could be removed. It would be
    1680             :          * picked up by the next vacuum anyway, but might as well try to
    1681             :          * remove it now, so loop back to process the right sibling.
    1682             :          *
    1683             :          * Note: This relies on the assumption that _bt_getstackbuf() will be
    1684             :          * able to reuse our original descent stack with a different child
    1685             :          * block (provided that the child block is to the right of the
    1686             :          * original leaf page reached by _bt_search()). It will even update
    1687             :          * the descent stack each time we loop around, avoiding repeated work.
    1688             :          */
    1689        4084 :         if (!rightsib_empty)
    1690        4080 :             break;
    1691             : 
    1692           4 :         leafbuf = _bt_getbuf(rel, rightsib, BT_WRITE);
    1693             :     }
    1694             : 
    1695        4080 :     return ndeleted;
    1696             : }
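
Because of the memory-leak NOTE above, callers are expected to run each call inside a short-lived memory context.  A simplified sketch of that arrangement (roughly how btvacuumscan sets things up; the context name here is made up):

    MemoryContext pagedelcontext;
    MemoryContext oldcontext;
    TransactionId oldestBtpoXact = InvalidTransactionId;
    uint32      ndeleted;

    pagedelcontext = AllocSetContextCreate(CurrentMemoryContext,
                                           "_bt_pagedel work context",
                                           ALLOCSET_DEFAULT_SIZES);

    oldcontext = MemoryContextSwitchTo(pagedelcontext);
    ndeleted = _bt_pagedel(rel, leafbuf, &oldestBtpoXact);
    MemoryContextSwitchTo(oldcontext);
    MemoryContextReset(pagedelcontext);     /* reclaim whatever _bt_pagedel leaked */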
    1697             : 
    1698             : /*
    1699             :  * First stage of page deletion.
    1700             :  *
    1701             :  * Establish the height of the to-be-deleted subtree with leafbuf at its
    1702             :  * lowest level, remove the downlink to the subtree, and mark leafbuf
    1703             :  * half-dead.  The final to-be-deleted subtree is usually just leafbuf itself,
    1704             :  * but may include additional internal pages (at most one per level of the
    1705             :  * tree below the root).
    1706             :  *
    1707             :  * Returns 'false' if leafbuf is unsafe to delete, usually because leafbuf is
    1708             :  * the rightmost child of its parent (and parent has more than one downlink).
    1709             :  * Returns 'true' when the first stage of page deletion completed
    1710             :  * successfully.
    1711             :  */
    1712             : static bool
    1713        4084 : _bt_mark_page_halfdead(Relation rel, Buffer leafbuf, BTStack stack)
    1714             : {
    1715             :     BlockNumber leafblkno;
    1716             :     BlockNumber leafrightsib;
    1717             :     BlockNumber topparent;
    1718             :     BlockNumber topparentrightsib;
    1719             :     ItemId      itemid;
    1720             :     Page        page;
    1721             :     BTPageOpaque opaque;
    1722             :     Buffer      subtreeparent;
    1723             :     OffsetNumber poffset;
    1724             :     OffsetNumber nextoffset;
    1725             :     IndexTuple  itup;
    1726             :     IndexTupleData trunctuple;
    1727             : 
    1728        4084 :     page = BufferGetPage(leafbuf);
    1729        4084 :     opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    1730             : 
    1731             :     Assert(!P_RIGHTMOST(opaque) && !P_ISROOT(opaque) &&
    1732             :            P_ISLEAF(opaque) && !P_IGNORE(opaque) &&
    1733             :            P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page));
    1734             : 
    1735             :     /*
    1736             :      * Save info about the leaf page.
    1737             :      */
    1738        4084 :     leafblkno = BufferGetBlockNumber(leafbuf);
    1739        4084 :     leafrightsib = opaque->btpo_next;
    1740             : 
    1741             :     /*
    1742             :      * Before attempting to lock the parent page, check that the right sibling
    1743             :      * is not in half-dead state.  A half-dead right sibling would have no
    1744             :      * downlink in the parent, which would be highly confusing later when we
    1745             :      * delete the downlink.  It would fail the "right sibling of target page
    1746             :      * is also the next child in parent page" cross-check below.
    1747             :      */
    1748        4084 :     if (_bt_rightsib_halfdeadflag(rel, leafrightsib))
    1749             :     {
    1750           0 :         elog(DEBUG1, "could not delete page %u because its right sibling %u is half-dead",
    1751             :              leafblkno, leafrightsib);
    1752           0 :         return false;
    1753             :     }
    1754             : 
    1755             :     /*
    1756             :      * We cannot delete a page that is the rightmost child of its immediate
    1757             :      * parent, unless it is the only child --- in which case the parent has to
    1758             :      * be deleted too, and the same condition applies recursively to it. We
    1759             :      * have to check this condition all the way up before trying to delete,
    1760             :      * and lock the parent of the root of the to-be-deleted subtree (the
    1761             :      * "subtree parent").  _bt_lock_subtree_parent() locks the subtree parent
    1762             :      * for us.  We remove the downlink to the "top parent" page (subtree root
    1763             :      * page) from the subtree parent page below.
    1764             :      *
    1765             :      * Initialize topparent to be leafbuf page now.  The final to-be-deleted
    1766             :      * subtree is often a degenerate one page subtree consisting only of the
    1767             :      * leafbuf page.  When that happens, the leafbuf page is the final subtree
    1768             :      * root page/top parent page.
    1769             :      */
    1770        4084 :     topparent = leafblkno;
    1771        4084 :     topparentrightsib = leafrightsib;
    1772        4084 :     if (!_bt_lock_subtree_parent(rel, leafblkno, stack,
    1773             :                                  &subtreeparent, &poffset,
    1774             :                                  &topparent, &topparentrightsib))
    1775           0 :         return false;
    1776             : 
    1777             :     /*
    1778             :      * Check that the parent-page index items we're about to delete/overwrite
    1779             :      * in subtree parent page contain what we expect.  This can fail if the
    1780             :      * index has become corrupt for some reason.  We want to throw any error
    1781             :      * before entering the critical section --- otherwise it'd be a PANIC.
    1782             :      */
    1783        4084 :     page = BufferGetPage(subtreeparent);
    1784        4084 :     opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    1785             : 
    1786             : #ifdef USE_ASSERT_CHECKING
    1787             : 
    1788             :     /*
    1789             :      * This is just an assertion because _bt_lock_subtree_parent should have
    1790             :      * guaranteed the tuple has the expected contents
    1791             :      */
    1792             :     itemid = PageGetItemId(page, poffset);
    1793             :     itup = (IndexTuple) PageGetItem(page, itemid);
    1794             :     Assert(BTreeTupleGetDownLink(itup) == topparent);
    1795             : #endif
    1796             : 
    1797        4084 :     nextoffset = OffsetNumberNext(poffset);
    1798        4084 :     itemid = PageGetItemId(page, nextoffset);
    1799        4084 :     itup = (IndexTuple) PageGetItem(page, itemid);
    1800        4084 :     if (BTreeTupleGetDownLink(itup) != topparentrightsib)
    1801           0 :         ereport(ERROR,
    1802             :                 (errcode(ERRCODE_INDEX_CORRUPTED),
    1803             :                  errmsg_internal("right sibling %u of block %u is not next child %u of block %u in index \"%s\"",
    1804             :                                  topparentrightsib, topparent,
    1805             :                                  BTreeTupleGetDownLink(itup),
    1806             :                                  BufferGetBlockNumber(subtreeparent),
    1807             :                                  RelationGetRelationName(rel))));
    1808             : 
    1809             :     /*
    1810             :      * Any insert which would have gone on the leaf block will now go to its
    1811             :      * right sibling.  In other words, the key space moves right.
    1812             :      */
    1813        4084 :     PredicateLockPageCombine(rel, leafblkno, leafrightsib);
    1814             : 
    1815             :     /* No ereport(ERROR) until changes are logged */
    1816        4084 :     START_CRIT_SECTION();
    1817             : 
    1818             :     /*
    1819             :      * Update parent of subtree.  We want to delete the downlink to the top
    1820             :      * parent page/root of the subtree, and the *following* key.  Easiest way
    1821             :      * is to copy the right sibling's downlink over the downlink that points
    1822             :      * to top parent page, and then delete the right sibling's original pivot
    1823             :      * tuple.
    1824             :      *
    1825             :      * Lanin and Shasha make the key space move left when deleting a page,
    1826             :      * whereas the key space moves right here.  That's why we cannot simply
    1827             :      * delete the pivot tuple with the downlink to the top parent page.  See
    1828             :      * nbtree/README.
    1829             :      */
    1830        4084 :     page = BufferGetPage(subtreeparent);
    1831        4084 :     opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    1832             : 
    1833        4084 :     itemid = PageGetItemId(page, poffset);
    1834        4084 :     itup = (IndexTuple) PageGetItem(page, itemid);
    1835        4084 :     BTreeTupleSetDownLink(itup, topparentrightsib);
    1836             : 
    1837        4084 :     nextoffset = OffsetNumberNext(poffset);
    1838        4084 :     PageIndexTupleDelete(page, nextoffset);
    1839             : 
    1840             :     /*
    1841             :      * Mark the leaf page as half-dead, and stamp it with a link to the top
    1842             :      * parent page.  When the leaf page is also the top parent page, the link
    1843             :      * is set to InvalidBlockNumber.
    1844             :      */
    1845        4084 :     page = BufferGetPage(leafbuf);
    1846        4084 :     opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    1847        4084 :     opaque->btpo_flags |= BTP_HALF_DEAD;
    1848             : 
    1849             :     Assert(PageGetMaxOffsetNumber(page) == P_HIKEY);
    1850        4084 :     MemSet(&trunctuple, 0, sizeof(IndexTupleData));
    1851        4084 :     trunctuple.t_info = sizeof(IndexTupleData);
    1852        4084 :     if (topparent != leafblkno)
    1853          34 :         BTreeTupleSetTopParent(&trunctuple, topparent);
    1854             :     else
    1855        4050 :         BTreeTupleSetTopParent(&trunctuple, InvalidBlockNumber);
    1856             : 
    1857        4084 :     if (!PageIndexTupleOverwrite(page, P_HIKEY, (Item) &trunctuple,
    1858        4084 :                                  IndexTupleSize(&trunctuple)))
    1859           0 :         elog(ERROR, "could not overwrite high key in half-dead page");
    1860             : 
    1861             :     /* Must mark buffers dirty before XLogInsert */
    1862        4084 :     MarkBufferDirty(subtreeparent);
    1863        4084 :     MarkBufferDirty(leafbuf);
    1864             : 
    1865             :     /* XLOG stuff */
    1866        4084 :     if (RelationNeedsWAL(rel))
    1867             :     {
    1868             :         xl_btree_mark_page_halfdead xlrec;
    1869             :         XLogRecPtr  recptr;
    1870             : 
    1871        4084 :         xlrec.poffset = poffset;
    1872        4084 :         xlrec.leafblk = leafblkno;
    1873        4084 :         if (topparent != leafblkno)
    1874          34 :             xlrec.topparent = topparent;
    1875             :         else
    1876        4050 :             xlrec.topparent = InvalidBlockNumber;
    1877             : 
    1878        4084 :         XLogBeginInsert();
    1879        4084 :         XLogRegisterBuffer(0, leafbuf, REGBUF_WILL_INIT);
    1880        4084 :         XLogRegisterBuffer(1, subtreeparent, REGBUF_STANDARD);
    1881             : 
    1882        4084 :         page = BufferGetPage(leafbuf);
    1883        4084 :         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    1884        4084 :         xlrec.leftblk = opaque->btpo_prev;
    1885        4084 :         xlrec.rightblk = opaque->btpo_next;
    1886             : 
    1887        4084 :         XLogRegisterData((char *) &xlrec, SizeOfBtreeMarkPageHalfDead);
    1888             : 
    1889        4084 :         recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_MARK_PAGE_HALFDEAD);
    1890             : 
    1891        4084 :         page = BufferGetPage(subtreeparent);
    1892        4084 :         PageSetLSN(page, recptr);
    1893        4084 :         page = BufferGetPage(leafbuf);
    1894        4084 :         PageSetLSN(page, recptr);
    1895             :     }
    1896             : 
    1897        4084 :     END_CRIT_SECTION();
    1898             : 
    1899        4084 :     _bt_relbuf(rel, subtreeparent);
    1900        4084 :     return true;
    1901             : }
    1902             : 
    1903             : /*
    1904             :  * Second stage of page deletion.
    1905             :  *
    1906             :  * Unlinks a single page (in the subtree undergoing deletion) from its
    1907             :  * siblings.  Also marks the page deleted.
    1908             :  *
    1909             :  * To get rid of the whole subtree, including the leaf page itself, call here
    1910             :  * until the leaf page is deleted.  The original "top parent" established in
    1911             :  * the first stage of deletion is deleted in the first call here, while the
    1912             :  * leaf page is deleted in the last call here.  Note that the leaf page itself
    1913             :  * is often the initial top parent page.
    1914             :  *
    1915             :  * Returns 'false' if the page could not be unlinked (shouldn't happen).  If
    1916             :  * the right sibling of the current target page is empty, *rightsib_empty is
    1917             :  * set to true, allowing caller to delete the target's right sibling page in
    1918             :  * passing.  Note that *rightsib_empty is only actually used by caller when
    1919             :  * target page is leafbuf, following last call here for leafbuf/the subtree
    1920             :  * containing leafbuf.  (We always set *rightsib_empty for caller, just to be
    1921             :  * consistent.)
    1922             :  *
    1923             :  * We maintain *oldestBtpoXact for pages that are deleted by the current
    1924             :  * VACUUM operation here.  This must be handled here because we conservatively
    1925             :  * assume that there needs to be a new call to ReadNewTransactionId() each
    1926             :  * time a page gets deleted.  See comments about the underlying assumption
    1927             :  * below.
    1928             :  *
    1929             :  * Must hold pin and lock on leafbuf at entry (read or write doesn't matter).
    1930             :  * On success exit, we'll be holding pin and write lock.  On failure exit,
    1931             :  * we'll release both pin and lock before returning (we define it that way
    1932             :  * to avoid having to reacquire a lock we already released).
    1933             :  */
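                      : /*
                      :  * Editor's sketch (not part of the original source): the caller
                      :  * (_bt_pagedel) is assumed to drive unlinking with a loop along these
                      :  * lines, repeating until the half-dead leaf itself has been deleted
                      :  * (a false return should never happen in practice):
                      :  *
                      :  *    while (P_ISHALFDEAD(opaque))
                      :  *    {
                      :  *        if (!_bt_unlink_halfdead_page(rel, leafbuf, scanblkno,
                      :  *                                      &rightsib_empty, oldestBtpoXact,
                      :  *                                      &ndeleted))
                      :  *            return ndeleted;
                      :  *    }
                      :  */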
    1934             : static bool
    1935        4134 : _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
    1936             :                          bool *rightsib_empty, TransactionId *oldestBtpoXact,
    1937             :                          uint32 *ndeleted)
    1938             : {
    1939        4134 :     BlockNumber leafblkno = BufferGetBlockNumber(leafbuf);
    1940             :     BlockNumber leafleftsib;
    1941             :     BlockNumber leafrightsib;
    1942             :     BlockNumber target;
    1943             :     BlockNumber leftsib;
    1944             :     BlockNumber rightsib;
    1945        4134 :     Buffer      lbuf = InvalidBuffer;
    1946             :     Buffer      buf;
    1947             :     Buffer      rbuf;
    1948        4134 :     Buffer      metabuf = InvalidBuffer;
    1949        4134 :     Page        metapg = NULL;
    1950        4134 :     BTMetaPageData *metad = NULL;
    1951             :     ItemId      itemid;
    1952             :     Page        page;
    1953             :     BTPageOpaque opaque;
    1954             :     bool        rightsib_is_rightmost;
    1955             :     int         targetlevel;
    1956             :     IndexTuple  leafhikey;
    1957             :     BlockNumber nextchild;
    1958             : 
    1959        4134 :     page = BufferGetPage(leafbuf);
    1960        4134 :     opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    1961             : 
    1962             :     Assert(P_ISLEAF(opaque) && !P_ISDELETED(opaque) && P_ISHALFDEAD(opaque));
    1963             : 
    1964             :     /*
    1965             :      * Remember some information about the leaf page.
    1966             :      */
    1967        4134 :     itemid = PageGetItemId(page, P_HIKEY);
    1968        4134 :     leafhikey = (IndexTuple) PageGetItem(page, itemid);
    1969        4134 :     target = BTreeTupleGetTopParent(leafhikey);
    1970        4134 :     leafleftsib = opaque->btpo_prev;
    1971        4134 :     leafrightsib = opaque->btpo_next;
    1972             : 
    1973        4134 :     LockBuffer(leafbuf, BUFFER_LOCK_UNLOCK);
    1974             : 
    1975             :     /*
    1976             :      * Check here, as calling loops will have locks held, preventing
    1977             :      * interrupts from being processed.
    1978             :      */
    1979        4134 :     CHECK_FOR_INTERRUPTS();
    1980             : 
    1981             :     /* Unlink the current top parent of the subtree */
    1982        4134 :     if (!BlockNumberIsValid(target))
    1983             :     {
    1984             :         /* Target is leaf page (or leaf page is top parent, if you prefer) */
    1985        4084 :         target = leafblkno;
    1986             : 
    1987        4084 :         buf = leafbuf;
    1988        4084 :         leftsib = leafleftsib;
    1989        4084 :         targetlevel = 0;
    1990             :     }
    1991             :     else
    1992             :     {
    1993             :         /* Target is the internal page taken from leaf's top parent link */
    1994             :         Assert(target != leafblkno);
    1995             : 
    1996             :         /* Fetch the block number of the target's left sibling */
    1997          50 :         buf = _bt_getbuf(rel, target, BT_READ);
    1998          50 :         page = BufferGetPage(buf);
    1999          50 :         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    2000          50 :         leftsib = opaque->btpo_prev;
    2001          50 :         targetlevel = opaque->btpo.level;
    2002             :         Assert(targetlevel > 0);
    2003             : 
    2004             :         /*
    2005             :          * To avoid deadlocks, we'd better drop the target page lock before
    2006             :          * going further.
    2007             :          */
    2008          50 :         LockBuffer(buf, BUFFER_LOCK_UNLOCK);
    2009             :     }
    2010             : 
    2011             :     /*
    2012             :      * We have to lock the pages we need to modify in the standard order:
    2013             :      * moving right, then up.  Else we will deadlock against other writers.
    2014             :      *
    2015             :      * So, first lock the leaf page, if it's not the target.  Then find and
    2016             :      * write-lock the current left sibling of the target page.  The sibling
    2017             :      * that was current a moment ago could have split, so we may have to move
    2018             :      * right.  This search could fail if either the sibling or the target page
    2019             :      * was deleted by someone else meanwhile; if so, give up.  (Right now,
    2020             :      * that should never happen, since page deletion is only done in VACUUM
    2021             :      * and there shouldn't be multiple VACUUMs concurrently on the same
    2022             :      * table.)
    2023             :      */
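                      :     /*
                      :      * Editorial summary (derived from the code below): the write locks are
                      :      * taken as leafbuf (only when it is not itself the target), then the
                      :      * target's left sibling (moving right if it has split), then the target,
                      :      * then the target's right sibling, and finally the metapage when the
                      :      * fast root may need to move.
                      :      */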
    2024        4134 :     if (target != leafblkno)
    2025          50 :         LockBuffer(leafbuf, BT_WRITE);
    2026        4134 :     if (leftsib != P_NONE)
    2027             :     {
    2028        1108 :         lbuf = _bt_getbuf(rel, leftsib, BT_WRITE);
    2029        1108 :         page = BufferGetPage(lbuf);
    2030        1108 :         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    2031        1108 :         while (P_ISDELETED(opaque) || opaque->btpo_next != target)
    2032             :         {
    2033             :             /* step right one page */
    2034           0 :             leftsib = opaque->btpo_next;
    2035           0 :             _bt_relbuf(rel, lbuf);
    2036             : 
    2037             :             /*
    2038             :              * It'd be good to check for interrupts here, but it's not easy to
    2039             :              * do so because a lock is always held. This block isn't
    2040             :              * frequently reached, so hopefully the consequences of not
    2041             :              * checking interrupts aren't too bad.
    2042             :              */
    2043             : 
    2044           0 :             if (leftsib == P_NONE)
    2045             :             {
    2046           0 :                 elog(LOG, "no left sibling (concurrent deletion?) of block %u in \"%s\"",
    2047             :                      target,
    2048             :                      RelationGetRelationName(rel));
    2049           0 :                 if (target != leafblkno)
    2050             :                 {
    2051             :                     /* we have only a pin on target, but pin+lock on leafbuf */
    2052           0 :                     ReleaseBuffer(buf);
    2053           0 :                     _bt_relbuf(rel, leafbuf);
    2054             :                 }
    2055             :                 else
    2056             :                 {
    2057             :                     /* we have only a pin on leafbuf */
    2058           0 :                     ReleaseBuffer(leafbuf);
    2059             :                 }
    2060           0 :                 return false;
    2061             :             }
    2062           0 :             lbuf = _bt_getbuf(rel, leftsib, BT_WRITE);
    2063           0 :             page = BufferGetPage(lbuf);
    2064           0 :             opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    2065             :         }
    2066             :     }
    2067             :     else
    2068        3026 :         lbuf = InvalidBuffer;
    2069             : 
    2070             :     /*
    2071             :      * Next write-lock the target page itself.  It's okay to take a write lock
    2072             :      * rather than a superexclusive lock, since no scan will stop on an empty
    2073             :      * page.
    2074             :      */
    2075        4134 :     LockBuffer(buf, BT_WRITE);
    2076        4134 :     page = BufferGetPage(buf);
    2077        4134 :     opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    2078             : 
    2079             :     /*
    2080             :      * Check page is still empty etc, else abandon deletion.  This is just for
    2081             :      * paranoia's sake; a half-dead page cannot resurrect because there can be
    2082             :      * only one vacuum process running at a time.
    2083             :      */
    2084        4134 :     if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque))
    2085           0 :         elog(ERROR, "half-dead page changed status unexpectedly in block %u of index \"%s\"",
    2086             :              target, RelationGetRelationName(rel));
    2087             : 
    2088        4134 :     if (opaque->btpo_prev != leftsib)
    2089           0 :         ereport(ERROR,
    2090             :                 (errcode(ERRCODE_INDEX_CORRUPTED),
    2091             :                  errmsg_internal("left link changed unexpectedly in block %u of index \"%s\"",
    2092             :                                  target, RelationGetRelationName(rel))));
    2093             : 
    2094        4134 :     if (target == leafblkno)
    2095             :     {
    2096        4084 :         if (P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page) ||
    2097        4084 :             !P_ISLEAF(opaque) || !P_ISHALFDEAD(opaque))
    2098           0 :             elog(ERROR, "half-dead page changed status unexpectedly in block %u of index \"%s\"",
    2099             :                  target, RelationGetRelationName(rel));
    2100        4084 :         nextchild = InvalidBlockNumber;
    2101             :     }
    2102             :     else
    2103             :     {
    2104          50 :         if (P_FIRSTDATAKEY(opaque) != PageGetMaxOffsetNumber(page) ||
    2105          50 :             P_ISLEAF(opaque))
    2106           0 :             elog(ERROR, "half-dead page changed status unexpectedly in block %u of index \"%s\"",
    2107             :                  target, RelationGetRelationName(rel));
    2108             : 
    2109             :         /* Remember the next non-leaf child down in the subtree */
    2110          50 :         itemid = PageGetItemId(page, P_FIRSTDATAKEY(opaque));
    2111          50 :         nextchild = BTreeTupleGetDownLink((IndexTuple) PageGetItem(page, itemid));
    2112          50 :         if (nextchild == leafblkno)
    2113          34 :             nextchild = InvalidBlockNumber;
    2114             :     }
    2115             : 
    2116             :     /*
    2117             :      * And next write-lock the (current) right sibling.
    2118             :      */
    2119        4134 :     rightsib = opaque->btpo_next;
    2120        4134 :     rbuf = _bt_getbuf(rel, rightsib, BT_WRITE);
    2121        4134 :     page = BufferGetPage(rbuf);
    2122        4134 :     opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    2123        4134 :     if (opaque->btpo_prev != target)
    2124           0 :         ereport(ERROR,
    2125             :                 (errcode(ERRCODE_INDEX_CORRUPTED),
    2126             :                  errmsg_internal("right sibling's left-link doesn't match: "
    2127             :                                  "block %u links to %u instead of expected %u in index \"%s\"",
    2128             :                                  rightsib, opaque->btpo_prev, target,
    2129             :                                  RelationGetRelationName(rel))));
    2130        4134 :     rightsib_is_rightmost = P_RIGHTMOST(opaque);
    2131        4134 :     *rightsib_empty = (P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page));
    2132             : 
    2133             :     /*
    2134             :      * If we are deleting the next-to-last page on the target's level, then
    2135             :      * the rightsib is a candidate to become the new fast root. (In theory, it
    2136             :      * might be possible to push the fast root even further down, but the odds
    2137             :      * of doing so are slim, and the locking considerations daunting.)
    2138             :      *
    2139             :      * We can safely acquire a lock on the metapage here --- see comments for
    2140             :      * _bt_newroot().
    2141             :      */
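                      :     /*
                      :      * Editor's note (see nbtree/README): the "fast root" recorded in the
                      :      * metapage is the page at which index searches start.  It can sit below
                      :      * the true root when deletions leave upper levels with only a single
                      :      * page each, which is why the sole remaining page on this level becomes
                      :      * a candidate here.
                      :      */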
    2142        4134 :     if (leftsib == P_NONE && rightsib_is_rightmost)
    2143             :     {
    2144          22 :         page = BufferGetPage(rbuf);
    2145          22 :         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    2146          22 :         if (P_RIGHTMOST(opaque))
    2147             :         {
    2148             :             /* rightsib will be the only one left on the level */
    2149          22 :             metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
    2150          22 :             metapg = BufferGetPage(metabuf);
    2151          22 :             metad = BTPageGetMeta(metapg);
    2152             : 
    2153             :             /*
    2154             :              * The expected case here is btm_fastlevel == targetlevel+1; if
    2155             :              * the fastlevel is <= targetlevel, something is wrong, and we
    2156             :              * choose to overwrite it to fix it.
    2157             :              */
    2158          22 :             if (metad->btm_fastlevel > targetlevel + 1)
    2159             :             {
    2160             :                 /* no update wanted */
    2161           0 :                 _bt_relbuf(rel, metabuf);
    2162           0 :                 metabuf = InvalidBuffer;
    2163             :             }
    2164             :         }
    2165             :     }
    2166             : 
    2167             :     /*
    2168             :      * Here we begin doing the deletion.
    2169             :      */
    2170             : 
    2171             :     /* No ereport(ERROR) until changes are logged */
    2172        4134 :     START_CRIT_SECTION();
    2173             : 
    2174             :     /*
    2175             :      * Update siblings' side-links.  Note the target page's side-links will
    2176             :      * continue to point to the siblings.  Asserts here are just rechecking
    2177             :      * things we already verified above.
    2178             :      */
    2179        4134 :     if (BufferIsValid(lbuf))
    2180             :     {
    2181        1108 :         page = BufferGetPage(lbuf);
    2182        1108 :         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    2183             :         Assert(opaque->btpo_next == target);
    2184        1108 :         opaque->btpo_next = rightsib;
    2185             :     }
    2186        4134 :     page = BufferGetPage(rbuf);
    2187        4134 :     opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    2188             :     Assert(opaque->btpo_prev == target);
    2189        4134 :     opaque->btpo_prev = leftsib;
    2190             : 
    2191             :     /*
    2192             :      * If we deleted a parent of the targeted leaf page, instead of the leaf
    2193             :      * itself, update the leaf to point to the next remaining child in the
    2194             :      * subtree.
    2195             :      *
    2196             :      * Note: We rely on the fact that a buffer pin on the leaf page has been
    2197             :      * held since leafhikey was initialized.  This is safe, though only
    2198             :      * because the page was already half-dead at that point.  The leaf page
    2199             :      * cannot have been modified by any other backend during the period when
    2200             :      * no lock was held.
    2201             :      */
    2202        4134 :     if (target != leafblkno)
    2203          50 :         BTreeTupleSetTopParent(leafhikey, nextchild);
    2204             : 
    2205             :     /*
    2206             :      * Mark the page itself deleted.  It can be recycled when all current
    2207             :      * transactions are gone.  Storing GetTopTransactionId() would work, but
    2208             :      * we're in VACUUM and would not otherwise have an XID.  Having already
    2209             :      * updated links to the target, ReadNewTransactionId() suffices as an
    2210             :      * upper bound.  Any scan having retained a now-stale link is advertising
    2211             :      * in its PGXACT an xmin less than or equal to the value we read here.  It
    2212             :      * will continue to do so, holding back RecentGlobalXmin, for the duration
    2213             :      * of that scan.
    2214             :      */
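                      :     /*
                      :      * Editor's note: the XID stamped here is what eventually lets the page
                      :      * be recycled.  _bt_page_recyclable() (earlier in this file) is assumed
                      :      * to treat the page as reusable only once
                      :      *
                      :      *    P_ISDELETED(opaque) &&
                      :      *        TransactionIdPrecedes(opaque->btpo.xact, RecentGlobalXmin)
                      :      *
                      :      * holds, i.e. once no live snapshot can still follow a stale link here.
                      :      */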
    2215        4134 :     page = BufferGetPage(buf);
    2216        4134 :     opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    2217             :     Assert(P_ISHALFDEAD(opaque) || !P_ISLEAF(opaque));
    2218        4134 :     opaque->btpo_flags &= ~BTP_HALF_DEAD;
    2219        4134 :     opaque->btpo_flags |= BTP_DELETED;
    2220        4134 :     opaque->btpo.xact = ReadNewTransactionId();
    2221             : 
    2222             :     /* And update the metapage, if needed */
    2223        4134 :     if (BufferIsValid(metabuf))
    2224             :     {
    2225             :         /* upgrade metapage if needed */
    2226          22 :         if (metad->btm_version < BTREE_NOVAC_VERSION)
    2227           0 :             _bt_upgrademetapage(metapg);
    2228          22 :         metad->btm_fastroot = rightsib;
    2229          22 :         metad->btm_fastlevel = targetlevel;
    2230          22 :         MarkBufferDirty(metabuf);
    2231             :     }
    2232             : 
    2233             :     /* Must mark buffers dirty before XLogInsert */
    2234        4134 :     MarkBufferDirty(rbuf);
    2235        4134 :     MarkBufferDirty(buf);
    2236        4134 :     if (BufferIsValid(lbuf))
    2237        1108 :         MarkBufferDirty(lbuf);
    2238        4134 :     if (target != leafblkno)
    2239          50 :         MarkBufferDirty(leafbuf);
    2240             : 
    2241             :     /* XLOG stuff */
    2242        4134 :     if (RelationNeedsWAL(rel))
    2243             :     {
    2244             :         xl_btree_unlink_page xlrec;
    2245             :         xl_btree_metadata xlmeta;
    2246             :         uint8       xlinfo;
    2247             :         XLogRecPtr  recptr;
    2248             : 
    2249        4134 :         XLogBeginInsert();
    2250             : 
    2251        4134 :         XLogRegisterBuffer(0, buf, REGBUF_WILL_INIT);
    2252        4134 :         if (BufferIsValid(lbuf))
    2253        1108 :             XLogRegisterBuffer(1, lbuf, REGBUF_STANDARD);
    2254        4134 :         XLogRegisterBuffer(2, rbuf, REGBUF_STANDARD);
    2255        4134 :         if (target != leafblkno)
    2256          50 :             XLogRegisterBuffer(3, leafbuf, REGBUF_WILL_INIT);
    2257             : 
    2258             :         /* information on the unlinked block */
    2259        4134 :         xlrec.leftsib = leftsib;
    2260        4134 :         xlrec.rightsib = rightsib;
    2261        4134 :         xlrec.btpo_xact = opaque->btpo.xact;
    2262             : 
    2263             :         /* information needed to recreate the leaf block (if not the target) */
    2264        4134 :         xlrec.leafleftsib = leafleftsib;
    2265        4134 :         xlrec.leafrightsib = leafrightsib;
    2266        4134 :         xlrec.topparent = nextchild;
    2267             : 
    2268        4134 :         XLogRegisterData((char *) &xlrec, SizeOfBtreeUnlinkPage);
    2269             : 
    2270        4134 :         if (BufferIsValid(metabuf))
    2271             :         {
    2272          22 :             XLogRegisterBuffer(4, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
    2273             : 
    2274             :             Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
    2275          22 :             xlmeta.version = metad->btm_version;
    2276          22 :             xlmeta.root = metad->btm_root;
    2277          22 :             xlmeta.level = metad->btm_level;
    2278          22 :             xlmeta.fastroot = metad->btm_fastroot;
    2279          22 :             xlmeta.fastlevel = metad->btm_fastlevel;
    2280          22 :             xlmeta.oldest_btpo_xact = metad->btm_oldest_btpo_xact;
    2281          22 :             xlmeta.last_cleanup_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples;
    2282          22 :             xlmeta.allequalimage = metad->btm_allequalimage;
    2283             : 
    2284          22 :             XLogRegisterBufData(4, (char *) &xlmeta, sizeof(xl_btree_metadata));
    2285          22 :             xlinfo = XLOG_BTREE_UNLINK_PAGE_META;
    2286             :         }
    2287             :         else
    2288        4112 :             xlinfo = XLOG_BTREE_UNLINK_PAGE;
    2289             : 
    2290        4134 :         recptr = XLogInsert(RM_BTREE_ID, xlinfo);
    2291             : 
    2292        4134 :         if (BufferIsValid(metabuf))
    2293             :         {
    2294          22 :             PageSetLSN(metapg, recptr);
    2295             :         }
    2296        4134 :         page = BufferGetPage(rbuf);
    2297        4134 :         PageSetLSN(page, recptr);
    2298        4134 :         page = BufferGetPage(buf);
    2299        4134 :         PageSetLSN(page, recptr);
    2300        4134 :         if (BufferIsValid(lbuf))
    2301             :         {
    2302        1108 :             page = BufferGetPage(lbuf);
    2303        1108 :             PageSetLSN(page, recptr);
    2304             :         }
    2305        4134 :         if (target != leafblkno)
    2306             :         {
    2307          50 :             page = BufferGetPage(leafbuf);
    2308          50 :             PageSetLSN(page, recptr);
    2309             :         }
    2310             :     }
    2311             : 
    2312        4134 :     END_CRIT_SECTION();
    2313             : 
    2314             :     /* release metapage */
    2315        4134 :     if (BufferIsValid(metabuf))
    2316          22 :         _bt_relbuf(rel, metabuf);
    2317             : 
    2318             :     /* release siblings */
    2319        4134 :     if (BufferIsValid(lbuf))
    2320        1108 :         _bt_relbuf(rel, lbuf);
    2321        4134 :     _bt_relbuf(rel, rbuf);
    2322             : 
    2323        8132 :     if (!TransactionIdIsValid(*oldestBtpoXact) ||
    2324        3998 :         TransactionIdPrecedes(opaque->btpo.xact, *oldestBtpoXact))
    2325         136 :         *oldestBtpoXact = opaque->btpo.xact;
    2326             : 
    2327             :     /*
    2328             :      * If btvacuumscan won't revisit this page in a future btvacuumpage call
    2329             :      * (and so won't count it as deleted then), count it as deleted in the
    2330             :      * current btvacuumpage call instead.
    2331             :      */
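                      :     /*
                      :      * Editorial example (not in the original source): with scanblkno 10, a
                      :      * deleted page at block 7 is counted now, while a deleted internal page
                      :      * at block 15 is left for the later btvacuumpage call that reaches it.
                      :      */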
    2332        4134 :     if (target <= scanblkno)
    2333        4090 :         (*ndeleted)++;
    2334             : 
    2335             :     /* If the target is not leafbuf, we're done with it now -- release it */
    2336        4134 :     if (target != leafblkno)
    2337          50 :         _bt_relbuf(rel, buf);
    2338             : 
    2339        4134 :     return true;
    2340             : }
    2341             : 
    2342             : /*
    2343             :  * Establish how tall the to-be-deleted subtree will be during the first stage
    2344             :  * of page deletion.
    2345             :  *
    2346             :  * Caller's child argument is the block number of the page caller wants to
    2347             :  * delete (this is leafbuf's block number, except when we're called
    2348             :  * recursively).  stack is a search stack leading to it.  Note that we will
    2349             :  * update the stack entry(s) to reflect current downlink positions --- this is
    2350             :  * similar to the corresponding point in page split handling.
    2351             :  *
    2352             :  * If "first stage" caller cannot go ahead with deleting _any_ pages, returns
    2353             :  * false.  Returns true on success, in which case caller can use certain
    2354             :  * details established here to perform the first stage of deletion.  This
    2355             :  * function is the last point at which page deletion may be deemed unsafe
    2356             :  * (barring index corruption, or unexpected concurrent page deletions).
    2357             :  *
    2358             :  * We write lock the parent of the root of the to-be-deleted subtree for
    2359             :  * caller on success (i.e. we leave our lock on the *subtreeparent buffer for
    2360             :  * caller).  Caller will have to remove a downlink from *subtreeparent.  We
    2361             :  * also set a *subtreeparent offset number in *poffset, to indicate the
    2362             :  * location of the pivot tuple that contains the relevant downlink.
    2363             :  *
    2364             :  * The root of the to-be-deleted subtree is called the "top parent".  Note
    2365             :  * that the leafbuf page is often the final "top parent" page (you can think
    2366             :  * of the leafbuf page as a degenerate single page subtree when that happens).
    2367             :  * Caller should initialize *topparent to the target leafbuf page block number
    2368             :  * (while *topparentrightsib should be set to leafbuf's right sibling block
    2369             :  * number).  We will update *topparent (and *topparentrightsib) for caller
    2370             :  * here, though only when it turns out that caller will delete at least one
    2371             :  * internal page (i.e. only when caller needs to store a valid link to the top
    2372             :  * parent block in the leafbuf page using BTreeTupleSetTopParent()).
    2373             :  */
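                      : /*
                      :  * Editor's sketch (not part of the original source): per the contract
                      :  * above, the expected call from _bt_mark_page_halfdead looks roughly like
                      :  *
                      :  *    topparent = leafblkno;
                      :  *    topparentrightsib = leafrightsib;
                      :  *    if (!_bt_lock_subtree_parent(rel, leafblkno, stack,
                      :  *                                 &subtreeparent, &poffset,
                      :  *                                 &topparent, &topparentrightsib))
                      :  *        return false;
                      :  */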
    2374             : static bool
    2375        4134 : _bt_lock_subtree_parent(Relation rel, BlockNumber child, BTStack stack,
    2376             :                         Buffer *subtreeparent, OffsetNumber *poffset,
    2377             :                         BlockNumber *topparent, BlockNumber *topparentrightsib)
    2378             : {
    2379             :     BlockNumber parent,
    2380             :                 leftsibparent;
    2381             :     OffsetNumber parentoffset,
    2382             :                 maxoff;
    2383             :     Buffer      pbuf;
    2384             :     Page        page;
    2385             :     BTPageOpaque opaque;
    2386             : 
    2387             :     /*
    2388             :      * Locate the pivot tuple whose downlink points to "child".  Write lock
    2389             :      * the parent page itself.
    2390             :      */
    2391        4134 :     pbuf = _bt_getstackbuf(rel, stack, child);
    2392        4134 :     if (pbuf == InvalidBuffer)
    2393           0 :         ereport(ERROR,
    2394             :                 (errcode(ERRCODE_INDEX_CORRUPTED),
    2395             :                  errmsg_internal("failed to re-find parent key in index \"%s\" for deletion target page %u",
    2396             :                                  RelationGetRelationName(rel), child)));
    2397        4134 :     parent = stack->bts_blkno;
    2398        4134 :     parentoffset = stack->bts_offset;
    2399             : 
    2400        4134 :     page = BufferGetPage(pbuf);
    2401        4134 :     opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    2402        4134 :     maxoff = PageGetMaxOffsetNumber(page);
    2403        4134 :     leftsibparent = opaque->btpo_prev;
    2404             : 
    2405             :     /*
    2406             :      * _bt_getstackbuf() completes page splits on returned parent buffer when
    2407             :      * required.
    2408             :      *
    2409             :      * In general it's a bad idea for VACUUM to use up more disk space, which
    2410             :      * is why page deletion does not finish incomplete page splits most of the
    2411             :      * time.  We allow this limited exception because the risk is much lower,
    2412             :      * and the potential downside of not proceeding is much higher:  A single
    2413             :      * internal page with the INCOMPLETE_SPLIT flag set might otherwise
    2414             :      * prevent us from deleting hundreds of empty leaf pages from one level
    2415             :      * down.
    2416             :      */
    2417             :     Assert(!P_INCOMPLETE_SPLIT(opaque));
    2418             : 
    2419        4134 :     if (parentoffset < maxoff)
    2420             :     {
    2421             :         /*
    2422             :          * Child is not the rightmost child in parent, so it's safe to delete
    2423             :          * the subtree whose root/topparent is child page
    2424             :          */
    2425        4084 :         *subtreeparent = pbuf;
    2426        4084 :         *poffset = parentoffset;
    2427        4084 :         return true;
    2428             :     }
    2429             : 
    2430             :     /*
    2431             :      * Child is the rightmost child of parent.
    2432             :      *
    2433             :      * Since it's the rightmost child of parent, deleting the child (or
    2434             :      * deleting the subtree whose root/topparent is the child page) is only
    2435             :      * safe when it's also possible to delete the parent.
    2436             :      */
    2437             :     Assert(parentoffset == maxoff);
    2438          50 :     if (parentoffset != P_FIRSTDATAKEY(opaque) || P_RIGHTMOST(opaque))
    2439             :     {
    2440             :         /*
    2441             :          * Child isn't parent's only child, or parent is rightmost on its
    2442             :          * entire level.  Definitely cannot delete any pages.
    2443             :          */
    2444           0 :         _bt_relbuf(rel, pbuf);
    2445           0 :         return false;
    2446             :     }
    2447             : 
    2448             :     /*
    2449             :      * Now make sure that the parent deletion is itself safe by examining the
    2450             :      * child's grandparent page.  Recurse, passing the parent page as the
    2451             :      * child page (child's grandparent is the parent on the next level up). If
    2452             :      * parent deletion is unsafe, then child deletion must also be unsafe (in
    2453             :      * which case caller cannot delete any pages at all).
    2454             :      */
    2455          50 :     *topparent = parent;
    2456          50 :     *topparentrightsib = opaque->btpo_next;
    2457             : 
    2458             :     /*
    2459             :      * Release lock on parent before recursing.
    2460             :      *
    2461             :      * It's OK to release page locks on parent before recursive call locks
    2462             :      * grandparent.  An internal page can only acquire an entry if the child
    2463             :      * is split, but that cannot happen as long as we still hold a lock on the
    2464             :      * leafbuf page.
    2465             :      */
    2466          50 :     _bt_relbuf(rel, pbuf);
    2467             : 
    2468             :     /*
    2469             :      * Before recursing, check that the left sibling of parent (if any) is not
    2470             :      * marked with INCOMPLETE_SPLIT flag first (must do so after we drop the
    2471             :      * parent lock).
    2472             :      *
    2473             :      * Note: We deliberately avoid completing incomplete splits here.
    2474             :      */
    2475          50 :     if (_bt_leftsib_splitflag(rel, leftsibparent, parent))
    2476           0 :         return false;
    2477             : 
    2478             :     /* Recurse to examine child page's grandparent page */
    2479          50 :     return _bt_lock_subtree_parent(rel, parent, stack->bts_parent,
    2480             :                                    subtreeparent, poffset,
    2481             :                                    topparent, topparentrightsib);
    2482             : }

Generated by: LCOV version 1.13