LCOV - PostgreSQL 19devel - src/backend/access/nbtree/nbtinsert.c

LCOV - code coverage report

Current view:	top level - src/backend/access/nbtree - nbtinsert.c (source / functions)		Hit	Total	Coverage
Test:	PostgreSQL 19devel	Lines:	684	766	89.3 %
Date:	2025-08-09 08:18:06	Functions:	15	16	93.8 %
Legend:	Lines: hit not hit

          Line data    Source code

       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * nbtinsert.c
       4             :  *    Item insertion in Lehman and Yao btrees for Postgres.
       5             :  *
       6             :  * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
       7             :  * Portions Copyright (c) 1994, Regents of the University of California
       8             :  *
       9             :  *
      10             :  * IDENTIFICATION
      11             :  *    src/backend/access/nbtree/nbtinsert.c
      12             :  *
      13             :  *-------------------------------------------------------------------------
      14             :  */
      15             : 
      16             : #include "postgres.h"
      17             : 
      18             : #include "access/nbtree.h"
      19             : #include "access/nbtxlog.h"
      20             : #include "access/transam.h"
      21             : #include "access/xloginsert.h"
      22             : #include "common/int.h"
      23             : #include "common/pg_prng.h"
      24             : #include "lib/qunique.h"
      25             : #include "miscadmin.h"
      26             : #include "storage/lmgr.h"
      27             : #include "storage/predicate.h"
      28             : 
      29             : /* Minimum tree height for application of fastpath optimization */
      30             : #define BTREE_FASTPATH_MIN_LEVEL    2
      31             : 
      32             : 
      33             : static BTStack _bt_search_insert(Relation rel, Relation heaprel,
      34             :                                  BTInsertState insertstate);
      35             : static TransactionId _bt_check_unique(Relation rel, BTInsertState insertstate,
      36             :                                       Relation heapRel,
      37             :                                       IndexUniqueCheck checkUnique, bool *is_unique,
      38             :                                       uint32 *speculativeToken);
      39             : static OffsetNumber _bt_findinsertloc(Relation rel,
      40             :                                       BTInsertState insertstate,
      41             :                                       bool checkingunique,
      42             :                                       bool indexUnchanged,
      43             :                                       BTStack stack,
      44             :                                       Relation heapRel);
      45             : static void _bt_stepright(Relation rel, Relation heaprel,
      46             :                           BTInsertState insertstate, BTStack stack);
      47             : static void _bt_insertonpg(Relation rel, Relation heaprel, BTScanInsert itup_key,
      48             :                            Buffer buf,
      49             :                            Buffer cbuf,
      50             :                            BTStack stack,
      51             :                            IndexTuple itup,
      52             :                            Size itemsz,
      53             :                            OffsetNumber newitemoff,
      54             :                            int postingoff,
      55             :                            bool split_only_page);
      56             : static Buffer _bt_split(Relation rel, Relation heaprel, BTScanInsert itup_key,
      57             :                         Buffer buf, Buffer cbuf, OffsetNumber newitemoff,
      58             :                         Size newitemsz, IndexTuple newitem, IndexTuple orignewitem,
      59             :                         IndexTuple nposting, uint16 postingoff);
      60             : static void _bt_insert_parent(Relation rel, Relation heaprel, Buffer buf,
      61             :                               Buffer rbuf, BTStack stack, bool isroot, bool isonly);
      62             : static Buffer _bt_newlevel(Relation rel, Relation heaprel, Buffer lbuf, Buffer rbuf);
      63             : static inline bool _bt_pgaddtup(Page page, Size itemsize, IndexTuple itup,
      64             :                                 OffsetNumber itup_off, bool newfirstdataitem);
      65             : static void _bt_delete_or_dedup_one_page(Relation rel, Relation heapRel,
      66             :                                          BTInsertState insertstate,
      67             :                                          bool simpleonly, bool checkingunique,
      68             :                                          bool uniquedup, bool indexUnchanged);
      69             : static void _bt_simpledel_pass(Relation rel, Buffer buffer, Relation heapRel,
      70             :                                OffsetNumber *deletable, int ndeletable,
      71             :                                IndexTuple newitem, OffsetNumber minoff,
      72             :                                OffsetNumber maxoff);
      73             : static BlockNumber *_bt_deadblocks(Page page, OffsetNumber *deletable,
      74             :                                    int ndeletable, IndexTuple newitem,
      75             :                                    int *nblocks);
      76             : static inline int _bt_blk_cmp(const void *arg1, const void *arg2);
      77             : 
      78             : /*
      79             :  *  _bt_doinsert() -- Handle insertion of a single index tuple in the tree.
      80             :  *
      81             :  *      This routine is called by the public interface routine, btinsert.
      82             :  *      By here, itup is filled in, including the TID.
      83             :  *
      84             :  *      If checkUnique is UNIQUE_CHECK_NO or UNIQUE_CHECK_PARTIAL, this
      85             :  *      will allow duplicates.  Otherwise (UNIQUE_CHECK_YES or
      86             :  *      UNIQUE_CHECK_EXISTING) it will throw error for a duplicate.
      87             :  *      For UNIQUE_CHECK_EXISTING we merely run the duplicate check, and
      88             :  *      don't actually insert.
      89             :  *
      90             :  *      indexUnchanged executor hint indicates if itup is from an
      91             :  *      UPDATE that didn't logically change the indexed value, but
      92             :  *      must nevertheless have a new entry to point to a successor
      93             :  *      version.
      94             :  *
      95             :  *      The result value is only significant for UNIQUE_CHECK_PARTIAL:
      96             :  *      it must be true if the entry is known unique, else false.
      97             :  *      (In the current implementation we'll also return true after a
      98             :  *      successful UNIQUE_CHECK_YES or UNIQUE_CHECK_EXISTING call, but
      99             :  *      that's just a coding artifact.)
     100             :  */
     101             : bool
     102     7162200 : _bt_doinsert(Relation rel, IndexTuple itup,
     103             :              IndexUniqueCheck checkUnique, bool indexUnchanged,
     104             :              Relation heapRel)
     105             : {
     106     7162200 :     bool        is_unique = false;
     107             :     BTInsertStateData insertstate;
     108             :     BTScanInsert itup_key;
     109             :     BTStack     stack;
     110     7162200 :     bool        checkingunique = (checkUnique != UNIQUE_CHECK_NO);
     111             : 
     112             :     /* we need an insertion scan key to do our search, so build one */
     113     7162200 :     itup_key = _bt_mkscankey(rel, itup);
     114             : 
     115     7162200 :     if (checkingunique)
     116             :     {
     117     5142550 :         if (!itup_key->anynullkeys)
     118             :         {
     119             :             /* No (heapkeyspace) scantid until uniqueness established */
     120     5122376 :             itup_key->scantid = NULL;
     121             :         }
     122             :         else
     123             :         {
     124             :             /*
     125             :              * Scan key for new tuple contains NULL key values.  Bypass
     126             :              * checkingunique steps.  They are unnecessary because core code
     127             :              * considers NULL unequal to every value, including NULL.
     128             :              *
     129             :              * This optimization avoids O(N^2) behavior within the
     130             :              * _bt_findinsertloc() heapkeyspace path when a unique index has a
     131             :              * large number of "duplicates" with NULL key values.
     132             :              */
     133       20174 :             checkingunique = false;
     134             :             /* Tuple is unique in the sense that core code cares about */
     135             :             Assert(checkUnique != UNIQUE_CHECK_EXISTING);
     136       20174 :             is_unique = true;
     137             :         }
     138             :     }
     139             : 
     140             :     /*
     141             :      * Fill in the BTInsertState working area, to track the current page and
     142             :      * position within the page to insert on.
     143             :      *
     144             :      * Note that itemsz is passed down to lower level code that deals with
     145             :      * inserting the item.  It must be MAXALIGN()'d.  This ensures that space
     146             :      * accounting code consistently considers the alignment overhead that we
     147             :      * expect PageAddItem() will add later.  (Actually, index_form_tuple() is
     148             :      * already conservative about alignment, but we don't rely on that from
     149             :      * this distance.  Besides, preserving the "true" tuple size in index
     150             :      * tuple headers for the benefit of nbtsplitloc.c might happen someday.
     151             :      * Note that heapam does not MAXALIGN() each heap tuple's lp_len field.)
     152             :      */
     153     7162200 :     insertstate.itup = itup;
     154     7162200 :     insertstate.itemsz = MAXALIGN(IndexTupleSize(itup));
     155     7162200 :     insertstate.itup_key = itup_key;
     156     7162200 :     insertstate.bounds_valid = false;
     157     7162200 :     insertstate.buf = InvalidBuffer;
     158     7162200 :     insertstate.postingoff = 0;
     159             : 
     160     7162224 : search:
     161             : 
     162             :     /*
     163             :      * Find and lock the leaf page that the tuple should be added to by
     164             :      * searching from the root page.  insertstate.buf will hold a buffer that
     165             :      * is locked in exclusive mode afterwards.
     166             :      */
     167     7162224 :     stack = _bt_search_insert(rel, heapRel, &insertstate);
     168             : 
     169             :     /*
     170             :      * checkingunique inserts are not allowed to go ahead when two tuples with
     171             :      * equal key attribute values would be visible to new MVCC snapshots once
     172             :      * the xact commits.  Check for conflicts in the locked page/buffer (if
     173             :      * needed) here.
     174             :      *
     175             :      * It might be necessary to check a page to the right in _bt_check_unique,
     176             :      * though that should be very rare.  In practice the first page the value
     177             :      * could be on (with scantid omitted) is almost always also the only page
     178             :      * that a matching tuple might be found on.  This is due to the behavior
     179             :      * of _bt_findsplitloc with duplicate tuples -- a group of duplicates can
     180             :      * only be allowed to cross a page boundary when there is no candidate
     181             :      * leaf page split point that avoids it.  Also, _bt_check_unique can use
     182             :      * the leaf page high key to determine that there will be no duplicates on
     183             :      * the right sibling without actually visiting it (it uses the high key in
     184             :      * cases where the new item happens to belong at the far right of the leaf
     185             :      * page).
     186             :      *
     187             :      * NOTE: obviously, _bt_check_unique can only detect keys that are already
     188             :      * in the index; so it cannot defend against concurrent insertions of the
     189             :      * same key.  We protect against that by means of holding a write lock on
     190             :      * the first page the value could be on, with omitted/-inf value for the
     191             :      * implicit heap TID tiebreaker attribute.  Any other would-be inserter of
     192             :      * the same key must acquire a write lock on the same page, so only one
     193             :      * would-be inserter can be making the check at one time.  Furthermore,
     194             :      * once we are past the check we hold write locks continuously until we
     195             :      * have performed our insertion, so no later inserter can fail to see our
     196             :      * insertion.  (This requires some care in _bt_findinsertloc.)
     197             :      *
     198             :      * If we must wait for another xact, we release the lock while waiting,
     199             :      * and then must perform a new search.
     200             :      *
     201             :      * For a partial uniqueness check, we don't wait for the other xact. Just
     202             :      * let the tuple in and return false for possibly non-unique, or true for
     203             :      * definitely unique.
     204             :      */
     205     7162224 :     if (checkingunique)
     206             :     {
     207             :         TransactionId xwait;
     208             :         uint32      speculativeToken;
     209             : 
     210     5122400 :         xwait = _bt_check_unique(rel, &insertstate, heapRel, checkUnique,
     211             :                                  &is_unique, &speculativeToken);
     212             : 
     213     5121894 :         if (unlikely(TransactionIdIsValid(xwait)))
     214             :         {
     215             :             /* Have to wait for the other guy ... */
     216          24 :             _bt_relbuf(rel, insertstate.buf);
     217          24 :             insertstate.buf = InvalidBuffer;
     218             : 
     219             :             /*
     220             :              * If it's a speculative insertion, wait for it to finish (ie. to
     221             :              * go ahead with the insertion, or kill the tuple).  Otherwise
     222             :              * wait for the transaction to finish as usual.
     223             :              */
     224          24 :             if (speculativeToken)
     225           0 :                 SpeculativeInsertionWait(xwait, speculativeToken);
     226             :             else
     227          24 :                 XactLockTableWait(xwait, rel, &itup->t_tid, XLTW_InsertIndex);
     228             : 
     229             :             /* start over... */
     230          24 :             if (stack)
     231           0 :                 _bt_freestack(stack);
     232          24 :             goto search;
     233             :         }
     234             : 
     235             :         /* Uniqueness is established -- restore heap tid as scantid */
     236     5121870 :         if (itup_key->heapkeyspace)
     237     5121870 :             itup_key->scantid = &itup->t_tid;
     238             :     }
     239             : 
     240     7161694 :     if (checkUnique != UNIQUE_CHECK_EXISTING)
     241             :     {
     242             :         OffsetNumber newitemoff;
     243             : 
     244             :         /*
     245             :          * The only conflict predicate locking cares about for indexes is when
     246             :          * an index tuple insert conflicts with an existing lock.  We don't
     247             :          * know the actual page we're going to insert on for sure just yet in
     248             :          * checkingunique and !heapkeyspace cases, but it's okay to use the
     249             :          * first page the value could be on (with scantid omitted) instead.
     250             :          */
     251     7161640 :         CheckForSerializableConflictIn(rel, NULL, BufferGetBlockNumber(insertstate.buf));
     252             : 
     253             :         /*
     254             :          * Do the insertion.  Note that insertstate contains cached binary
     255             :          * search bounds established within _bt_check_unique when insertion is
     256             :          * checkingunique.
     257             :          */
     258     7161634 :         newitemoff = _bt_findinsertloc(rel, &insertstate, checkingunique,
     259             :                                        indexUnchanged, stack, heapRel);
     260     7161634 :         _bt_insertonpg(rel, heapRel, itup_key, insertstate.buf, InvalidBuffer,
     261             :                        stack, itup, insertstate.itemsz, newitemoff,
     262             :                        insertstate.postingoff, false);
     263             :     }
     264             :     else
     265             :     {
     266             :         /* just release the buffer */
     267          54 :         _bt_relbuf(rel, insertstate.buf);
     268             :     }
     269             : 
     270             :     /* be tidy */
     271     7161688 :     if (stack)
     272     6210560 :         _bt_freestack(stack);
     273     7161688 :     pfree(itup_key);
     274             : 
     275     7161688 :     return is_unique;
     276             : }
     277             : 
     278             : /*
     279             :  *  _bt_search_insert() -- _bt_search() wrapper for inserts
     280             :  *
     281             :  * Search the tree for a particular scankey, or more precisely for the first
     282             :  * leaf page it could be on.  Try to make use of the fastpath optimization's
     283             :  * rightmost leaf page cache before actually searching the tree from the root
     284             :  * page, though.
     285             :  *
     286             :  * Return value is a stack of parent-page pointers (though see notes about
     287             :  * fastpath optimization and page splits below).  insertstate->buf is set to
     288             :  * the address of the leaf-page buffer, which is write-locked and pinned in
     289             :  * all cases (if necessary by creating a new empty root page for caller).
     290             :  *
     291             :  * The fastpath optimization avoids most of the work of searching the tree
     292             :  * repeatedly when a single backend inserts successive new tuples on the
     293             :  * rightmost leaf page of an index.  A backend cache of the rightmost leaf
     294             :  * page is maintained within _bt_insertonpg(), and used here.  The cache is
     295             :  * invalidated here when an insert of a non-pivot tuple must take place on a
     296             :  * non-rightmost leaf page.
     297             :  *
     298             :  * The optimization helps with indexes on an auto-incremented field.  It also
     299             :  * helps with indexes on datetime columns, as well as indexes with lots of
     300             :  * NULL values.  (NULLs usually get inserted in the rightmost page for single
     301             :  * column indexes, since they usually get treated as coming after everything
     302             :  * else in the key space.  Individual NULL tuples will generally be placed on
     303             :  * the rightmost leaf page due to the influence of the heap TID column.)
     304             :  *
     305             :  * Note that we avoid applying the optimization when there is insufficient
     306             :  * space on the rightmost page to fit caller's new item.  This is necessary
     307             :  * because we'll need to return a real descent stack when a page split is
     308             :  * expected (actually, caller can cope with a leaf page split that uses a NULL
     309             :  * stack, but that's very slow and so must be avoided).  Note also that the
     310             :  * fastpath optimization acquires the lock on the page conditionally as a way
     311             :  * of reducing extra contention when there are concurrent insertions into the
     312             :  * rightmost page (we give up if we'd have to wait for the lock).  We assume
     313             :  * that it isn't useful to apply the optimization when there is contention,
     314             :  * since each per-backend cache won't stay valid for long.
     315             :  */
     316             : static BTStack
     317     7162224 : _bt_search_insert(Relation rel, Relation heaprel, BTInsertState insertstate)
     318             : {
     319             :     Assert(insertstate->buf == InvalidBuffer);
     320             :     Assert(!insertstate->bounds_valid);
     321             :     Assert(insertstate->postingoff == 0);
     322             : 
     323     7162224 :     if (RelationGetTargetBlock(rel) != InvalidBlockNumber)
     324             :     {
     325             :         /* Simulate a _bt_getbuf() call with conditional locking */
     326       62730 :         insertstate->buf = ReadBuffer(rel, RelationGetTargetBlock(rel));
     327       62730 :         if (_bt_conditionallockbuf(rel, insertstate->buf))
     328             :         {
     329             :             Page        page;
     330             :             BTPageOpaque opaque;
     331             : 
     332       61520 :             _bt_checkpage(rel, insertstate->buf);
     333       61520 :             page = BufferGetPage(insertstate->buf);
     334       61520 :             opaque = BTPageGetOpaque(page);
     335             : 
     336             :             /*
     337             :              * Check if the page is still the rightmost leaf page and has
     338             :              * enough free space to accommodate the new tuple.  Also check
     339             :              * that the insertion scan key is strictly greater than the first
     340             :              * non-pivot tuple on the page.  (Note that we expect itup_key's
     341             :              * scantid to be unset when our caller is a checkingunique
     342             :              * inserter.)
     343             :              */
     344       61520 :             if (P_RIGHTMOST(opaque) &&
     345       61456 :                 P_ISLEAF(opaque) &&
     346       61456 :                 !P_IGNORE(opaque) &&
     347      122510 :                 PageGetFreeSpace(page) > insertstate->itemsz &&
     348      122108 :                 PageGetMaxOffsetNumber(page) >= P_HIKEY &&
     349       61054 :                 _bt_compare(rel, insertstate->itup_key, page, P_HIKEY) > 0)
     350             :             {
     351             :                 /*
     352             :                  * Caller can use the fastpath optimization because cached
     353             :                  * block is still rightmost leaf page, which can fit caller's
     354             :                  * new tuple without splitting.  Keep block in local cache for
     355             :                  * next insert, and have caller use NULL stack.
     356             :                  *
     357             :                  * Note that _bt_insert_parent() has an assertion that catches
     358             :                  * leaf page splits that somehow follow from a fastpath insert
     359             :                  * (it should only be passed a NULL stack when it must deal
     360             :                  * with a concurrent root page split, and never because a NULL
     361             :                  * stack was returned here).
     362             :                  */
     363       61012 :                 return NULL;
     364             :             }
     365             : 
     366             :             /* Page unsuitable for caller, drop lock and pin */
     367         508 :             _bt_relbuf(rel, insertstate->buf);
     368             :         }
     369             :         else
     370             :         {
     371             :             /* Lock unavailable, drop pin */
     372        1210 :             ReleaseBuffer(insertstate->buf);
     373             :         }
     374             : 
     375             :         /* Forget block, since cache doesn't appear to be useful */
     376        1718 :         RelationSetTargetBlock(rel, InvalidBlockNumber);
     377             :     }
     378             : 
     379             :     /* Cannot use optimization -- descend tree, return proper descent stack */
     380     7101212 :     return _bt_search(rel, heaprel, insertstate->itup_key, &insertstate->buf,
     381             :                       BT_WRITE);
     382             : }
     383             : 
     384             : /*
     385             :  *  _bt_check_unique() -- Check for violation of unique index constraint
     386             :  *
     387             :  * Returns InvalidTransactionId if there is no conflict, else an xact ID
     388             :  * we must wait for to see if it commits a conflicting tuple.   If an actual
     389             :  * conflict is detected, no return --- just ereport().  If an xact ID is
     390             :  * returned, and the conflicting tuple still has a speculative insertion in
     391             :  * progress, *speculativeToken is set to non-zero, and the caller can wait for
     392             :  * the verdict on the insertion using SpeculativeInsertionWait().
     393             :  *
     394             :  * However, if checkUnique == UNIQUE_CHECK_PARTIAL, we always return
     395             :  * InvalidTransactionId because we don't want to wait.  In this case we
     396             :  * set *is_unique to false if there is a potential conflict, and the
     397             :  * core code must redo the uniqueness check later.
     398             :  *
     399             :  * As a side-effect, sets state in insertstate that can later be used by
     400             :  * _bt_findinsertloc() to reuse most of the binary search work we do
     401             :  * here.
     402             :  *
     403             :  * This code treats NULLs as equal, unlike the default semantics for unique
     404             :  * indexes.  So do not call here when there are NULL values in scan key and
     405             :  * the index uses the default NULLS DISTINCT mode.
     406             :  */
     407             : static TransactionId
     408     5122400 : _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
     409             :                  IndexUniqueCheck checkUnique, bool *is_unique,
     410             :                  uint32 *speculativeToken)
     411             : {
     412     5122400 :     IndexTuple  itup = insertstate->itup;
     413     5122400 :     IndexTuple  curitup = NULL;
     414     5122400 :     ItemId      curitemid = NULL;
     415     5122400 :     BTScanInsert itup_key = insertstate->itup_key;
     416             :     SnapshotData SnapshotDirty;
     417             :     OffsetNumber offset;
     418             :     OffsetNumber maxoff;
     419             :     Page        page;
     420             :     BTPageOpaque opaque;
     421     5122400 :     Buffer      nbuf = InvalidBuffer;
     422     5122400 :     bool        found = false;
     423     5122400 :     bool        inposting = false;
     424     5122400 :     bool        prevalldead = true;
     425     5122400 :     int         curposti = 0;
     426             : 
     427             :     /* Assume unique until we find a duplicate */
     428     5122400 :     *is_unique = true;
     429             : 
     430     5122400 :     InitDirtySnapshot(SnapshotDirty);
     431             : 
     432     5122400 :     page = BufferGetPage(insertstate->buf);
     433     5122400 :     opaque = BTPageGetOpaque(page);
     434     5122400 :     maxoff = PageGetMaxOffsetNumber(page);
     435             : 
     436             :     /*
     437             :      * Find the first tuple with the same key.
     438             :      *
     439             :      * This also saves the binary search bounds in insertstate.  We use them
     440             :      * in the fastpath below, but also in the _bt_findinsertloc() call later.
     441             :      */
     442             :     Assert(!insertstate->bounds_valid);
     443     5122400 :     offset = _bt_binsrch_insert(rel, insertstate);
     444             : 
     445             :     /*
     446             :      * Scan over all equal tuples, looking for live conflicts.
     447             :      */
     448             :     Assert(!insertstate->bounds_valid || insertstate->low == offset);
     449             :     Assert(!itup_key->anynullkeys);
     450             :     Assert(itup_key->scantid == NULL);
     451             :     for (;;)
     452             :     {
     453             :         /*
     454             :          * Each iteration of the loop processes one heap TID, not one index
     455             :          * tuple.  Current offset number for page isn't usually advanced on
     456             :          * iterations that process heap TIDs from posting list tuples.
     457             :          *
     458             :          * "inposting" state is set when _inside_ a posting list --- not when
     459             :          * we're at the start (or end) of a posting list.  We advance curposti
     460             :          * at the end of the iteration when inside a posting list tuple.  In
     461             :          * general, every loop iteration either advances the page offset or
     462             :          * advances curposti --- an iteration that handles the rightmost/max
     463             :          * heap TID in a posting list finally advances the page offset (and
     464             :          * unsets "inposting").
     465             :          *
     466             :          * Make sure the offset points to an actual index tuple before trying
     467             :          * to examine it...
     468             :          */
     469    17091394 :         if (offset <= maxoff)
     470             :         {
     471             :             /*
     472             :              * Fastpath: In most cases, we can use cached search bounds to
     473             :              * limit our consideration to items that are definitely
     474             :              * duplicates.  This fastpath doesn't apply when the original page
     475             :              * is empty, or when initial offset is past the end of the
     476             :              * original page, which may indicate that we need to examine a
     477             :              * second or subsequent page.
     478             :              *
     479             :              * Note that this optimization allows us to avoid calling
     480             :              * _bt_compare() directly when there are no duplicates, as long as
     481             :              * the offset where the key will go is not at the end of the page.
     482             :              */
     483    14237764 :             if (nbuf == InvalidBuffer && offset == insertstate->stricthigh)
     484             :             {
     485             :                 Assert(insertstate->bounds_valid);
     486             :                 Assert(insertstate->low >= P_FIRSTDATAKEY(opaque));
     487             :                 Assert(insertstate->low <= insertstate->stricthigh);
     488             :                 Assert(_bt_compare(rel, itup_key, page, offset) < 0);
     489     2040736 :                 break;
     490             :             }
     491             : 
     492             :             /*
     493             :              * We can skip items that are already marked killed.
     494             :              *
     495             :              * In the presence of heavy update activity an index may contain
     496             :              * many killed items with the same key; running _bt_compare() on
     497             :              * each killed item gets expensive.  Just advance over killed
     498             :              * items as quickly as we can.  We only apply _bt_compare() when
     499             :              * we get to a non-killed item.  We could reuse the bounds to
     500             :              * avoid _bt_compare() calls for known equal tuples, but it
     501             :              * doesn't seem worth it.
     502             :              */
     503    12197028 :             if (!inposting)
     504     7684808 :                 curitemid = PageGetItemId(page, offset);
     505    12197028 :             if (inposting || !ItemIdIsDead(curitemid))
     506             :             {
     507             :                 ItemPointerData htid;
     508    11642882 :                 bool        all_dead = false;
     509             : 
     510    11642882 :                 if (!inposting)
     511             :                 {
     512             :                     /* Plain tuple, or first TID in posting list tuple */
     513     7130662 :                     if (_bt_compare(rel, itup_key, page, offset) != 0)
     514      202002 :                         break;  /* we're past all the equal tuples */
     515             : 
     516             :                     /* Advanced curitup */
     517     6928660 :                     curitup = (IndexTuple) PageGetItem(page, curitemid);
     518             :                     Assert(!BTreeTupleIsPivot(curitup));
     519             :                 }
     520             : 
     521             :                 /* okay, we gotta fetch the heap tuple using htid ... */
     522    11440880 :                 if (!BTreeTupleIsPosting(curitup))
     523             :                 {
     524             :                     /* ... htid is from simple non-pivot tuple */
     525             :                     Assert(!inposting);
     526     6882286 :                     htid = curitup->t_tid;
     527             :                 }
     528     4558594 :                 else if (!inposting)
     529             :                 {
     530             :                     /* ... htid is first TID in new posting list */
     531       46374 :                     inposting = true;
     532       46374 :                     prevalldead = true;
     533       46374 :                     curposti = 0;
     534       46374 :                     htid = *BTreeTupleGetPostingN(curitup, 0);
     535             :                 }
     536             :                 else
     537             :                 {
     538             :                     /* ... htid is second or subsequent TID in posting list */
     539             :                     Assert(curposti > 0);
     540     4512220 :                     htid = *BTreeTupleGetPostingN(curitup, curposti);
     541             :                 }
     542             : 
     543             :                 /*
     544             :                  * If we are doing a recheck, we expect to find the tuple we
     545             :                  * are rechecking.  It's not a duplicate, but we have to keep
     546             :                  * scanning.
     547             :                  */
     548    11441116 :                 if (checkUnique == UNIQUE_CHECK_EXISTING &&
     549         236 :                     ItemPointerCompare(&htid, &itup->t_tid) == 0)
     550             :                 {
     551          54 :                     found = true;
     552             :                 }
     553             : 
     554             :                 /*
     555             :                  * Check if there's any table tuples for this index entry
     556             :                  * satisfying SnapshotDirty. This is necessary because for AMs
     557             :                  * with optimizations like heap's HOT, we have just a single
     558             :                  * index entry for the entire chain.
     559             :                  */
     560    11440826 :                 else if (table_index_fetch_tuple_check(heapRel, &htid,
     561             :                                                        &SnapshotDirty,
     562             :                                                        &all_dead))
     563             :                 {
     564             :                     TransactionId xwait;
     565             : 
     566             :                     /*
     567             :                      * It is a duplicate. If we are only doing a partial
     568             :                      * check, then don't bother checking if the tuple is being
     569             :                      * updated in another transaction. Just return the fact
     570             :                      * that it is a potential conflict and leave the full
     571             :                      * check till later. Don't invalidate binary search
     572             :                      * bounds.
     573             :                      */
     574         724 :                     if (checkUnique == UNIQUE_CHECK_PARTIAL)
     575             :                     {
     576         194 :                         if (nbuf != InvalidBuffer)
     577           0 :                             _bt_relbuf(rel, nbuf);
     578         194 :                         *is_unique = false;
     579         218 :                         return InvalidTransactionId;
     580             :                     }
     581             : 
     582             :                     /*
     583             :                      * If this tuple is being updated by other transaction
     584             :                      * then we have to wait for its commit/abort.
     585             :                      */
     586        1060 :                     xwait = (TransactionIdIsValid(SnapshotDirty.xmin)) ?
     587         530 :                         SnapshotDirty.xmin : SnapshotDirty.xmax;
     588             : 
     589         530 :                     if (TransactionIdIsValid(xwait))
     590             :                     {
     591          24 :                         if (nbuf != InvalidBuffer)
     592           0 :                             _bt_relbuf(rel, nbuf);
     593             :                         /* Tell _bt_doinsert to wait... */
     594          24 :                         *speculativeToken = SnapshotDirty.speculativeToken;
     595             :                         /* Caller releases lock on buf immediately */
     596          24 :                         insertstate->bounds_valid = false;
     597          24 :                         return xwait;
     598             :                     }
     599             : 
     600             :                     /*
     601             :                      * Otherwise we have a definite conflict.  But before
     602             :                      * complaining, look to see if the tuple we want to insert
     603             :                      * is itself now committed dead --- if so, don't complain.
     604             :                      * This is a waste of time in normal scenarios but we must
     605             :                      * do it to support CREATE INDEX CONCURRENTLY.
     606             :                      *
     607             :                      * We must follow HOT-chains here because during
     608             :                      * concurrent index build, we insert the root TID though
     609             :                      * the actual tuple may be somewhere in the HOT-chain.
     610             :                      * While following the chain we might not stop at the
     611             :                      * exact tuple which triggered the insert, but that's OK
     612             :                      * because if we find a live tuple anywhere in this chain,
     613             :                      * we have a unique key conflict.  The other live tuple is
     614             :                      * not part of this chain because it had a different index
     615             :                      * entry.
     616             :                      */
     617         506 :                     htid = itup->t_tid;
     618         506 :                     if (table_index_fetch_tuple_check(heapRel, &htid,
     619             :                                                       SnapshotSelf, NULL))
     620             :                     {
     621             :                         /* Normal case --- it's still live */
     622             :                     }
     623             :                     else
     624             :                     {
     625             :                         /*
     626             :                          * It's been deleted, so no error, and no need to
     627             :                          * continue searching
     628             :                          */
     629           0 :                         break;
     630             :                     }
     631             : 
     632             :                     /*
     633             :                      * Check for a conflict-in as we would if we were going to
     634             :                      * write to this page.  We aren't actually going to write,
     635             :                      * but we want a chance to report SSI conflicts that would
     636             :                      * otherwise be masked by this unique constraint
     637             :                      * violation.
     638             :                      */
     639         506 :                     CheckForSerializableConflictIn(rel, NULL, BufferGetBlockNumber(insertstate->buf));
     640             : 
     641             :                     /*
     642             :                      * This is a definite conflict.  Break the tuple down into
     643             :                      * datums and report the error.  But first, make sure we
     644             :                      * release the buffer locks we're holding ---
     645             :                      * BuildIndexValueDescription could make catalog accesses,
     646             :                      * which in the worst case might touch this same index and
     647             :                      * cause deadlocks.
     648             :                      */
     649         498 :                     if (nbuf != InvalidBuffer)
     650           0 :                         _bt_relbuf(rel, nbuf);
     651         498 :                     _bt_relbuf(rel, insertstate->buf);
     652         498 :                     insertstate->buf = InvalidBuffer;
     653         498 :                     insertstate->bounds_valid = false;
     654             : 
     655             :                     {
     656             :                         Datum       values[INDEX_MAX_KEYS];
     657             :                         bool        isnull[INDEX_MAX_KEYS];
     658             :                         char       *key_desc;
     659             : 
     660         498 :                         index_deform_tuple(itup, RelationGetDescr(rel),
     661             :                                            values, isnull);
     662             : 
     663         498 :                         key_desc = BuildIndexValueDescription(rel, values,
     664             :                                                               isnull);
     665             : 
     666         498 :                         ereport(ERROR,
     667             :                                 (errcode(ERRCODE_UNIQUE_VIOLATION),
     668             :                                  errmsg("duplicate key value violates unique constraint \"%s\"",
     669             :                                         RelationGetRelationName(rel)),
     670             :                                  key_desc ? errdetail("Key %s already exists.",
     671             :                                                       key_desc) : 0,
     672             :                                  errtableconstraint(heapRel,
     673             :                                                     RelationGetRelationName(rel))));
     674             :                     }
     675             :                 }
     676    11440102 :                 else if (all_dead && (!inposting ||
     677       37198 :                                       (prevalldead &&
     678       37198 :                                        curposti == BTreeTupleGetNPosting(curitup) - 1)))
     679             :                 {
     680             :                     /*
     681             :                      * The conflicting tuple (or all HOT chains pointed to by
     682             :                      * all posting list TIDs) is dead to everyone, so mark the
     683             :                      * index entry killed.
     684             :                      */
     685      105778 :                     ItemIdMarkDead(curitemid);
     686      105778 :                     opaque->btpo_flags |= BTP_HAS_GARBAGE;
     687             : 
     688             :                     /*
     689             :                      * Mark buffer with a dirty hint, since state is not
     690             :                      * crucial. Be sure to mark the proper buffer dirty.
     691             :                      */
     692      105778 :                     if (nbuf != InvalidBuffer)
     693           6 :                         MarkBufferDirtyHint(nbuf, true);
     694             :                     else
     695      105772 :                         MarkBufferDirtyHint(insertstate->buf, true);
     696             :                 }
     697             : 
     698             :                 /*
     699             :                  * Remember if posting list tuple has even a single HOT chain
     700             :                  * whose members are not all dead
     701             :                  */
     702    11440156 :                 if (!all_dead && inposting)
     703     4520680 :                     prevalldead = false;
     704             :             }
     705             :         }
     706             : 
     707    14847932 :         if (inposting && curposti < BTreeTupleGetNPosting(curitup) - 1)
     708             :         {
     709             :             /* Advance to next TID in same posting list */
     710     4512220 :             curposti++;
     711     4512220 :             continue;
     712             :         }
     713    10335712 :         else if (offset < maxoff)
     714             :         {
     715             :             /* Advance to next tuple */
     716     7446730 :             curposti = 0;
     717     7446730 :             inposting = false;
     718     7446730 :             offset = OffsetNumberNext(offset);
     719             :         }
     720             :         else
     721             :         {
     722             :             int         highkeycmp;
     723             : 
     724             :             /* If scankey == hikey we gotta check the next page too */
     725     2888982 :             if (P_RIGHTMOST(opaque))
     726     2725944 :                 break;
     727      163038 :             highkeycmp = _bt_compare(rel, itup_key, page, P_HIKEY);
     728             :             Assert(highkeycmp <= 0);
     729      163038 :             if (highkeycmp != 0)
     730      152994 :                 break;
     731             :             /* Advance to next non-dead page --- there must be one */
     732             :             for (;;)
     733           0 :             {
     734       10044 :                 BlockNumber nblkno = opaque->btpo_next;
     735             : 
     736       10044 :                 nbuf = _bt_relandgetbuf(rel, nbuf, nblkno, BT_READ);
     737       10044 :                 page = BufferGetPage(nbuf);
     738       10044 :                 opaque = BTPageGetOpaque(page);
     739       10044 :                 if (!P_IGNORE(opaque))
     740       10044 :                     break;
     741           0 :                 if (P_RIGHTMOST(opaque))
     742           0 :                     elog(ERROR, "fell off the end of index \"%s\"",
     743             :                          RelationGetRelationName(rel));
     744             :             }
     745             :             /* Will also advance to next tuple */
     746       10044 :             curposti = 0;
     747       10044 :             inposting = false;
     748       10044 :             maxoff = PageGetMaxOffsetNumber(page);
     749       10044 :             offset = P_FIRSTDATAKEY(opaque);
     750             :             /* Don't invalidate binary search bounds */
     751             :         }
     752             :     }
     753             : 
     754             :     /*
     755             :      * If we are doing a recheck then we should have found the tuple we are
     756             :      * checking.  Otherwise there's something very wrong --- probably, the
     757             :      * index is on a non-immutable expression.
     758             :      */
     759     5121676 :     if (checkUnique == UNIQUE_CHECK_EXISTING && !found)
     760           0 :         ereport(ERROR,
     761             :                 (errcode(ERRCODE_INTERNAL_ERROR),
     762             :                  errmsg("failed to re-find tuple within index \"%s\"",
     763             :                         RelationGetRelationName(rel)),
     764             :                  errhint("This may be because of a non-immutable index expression."),
     765             :                  errtableconstraint(heapRel,
     766             :                                     RelationGetRelationName(rel))));
     767             : 
     768     5121676 :     if (nbuf != InvalidBuffer)
     769        5760 :         _bt_relbuf(rel, nbuf);
     770             : 
     771     5121676 :     return InvalidTransactionId;
     772             : }
     773             : 
     774             : 
     775             : /*
     776             :  *  _bt_findinsertloc() -- Finds an insert location for a tuple
     777             :  *
     778             :  *      On entry, insertstate buffer contains the page the new tuple belongs
     779             :  *      on.  It is exclusive-locked and pinned by the caller.
     780             :  *
     781             :  *      If 'checkingunique' is true, the buffer on entry is the first page
     782             :  *      that contains duplicates of the new key.  If there are duplicates on
     783             :  *      multiple pages, the correct insertion position might be some page to
     784             :  *      the right, rather than the first page.  In that case, this function
     785             :  *      moves right to the correct target page.
     786             :  *
     787             :  *      (In a !heapkeyspace index, there can be multiple pages with the same
     788             :  *      high key, where the new tuple could legitimately be placed on.  In
     789             :  *      that case, the caller passes the first page containing duplicates,
     790             :  *      just like when checkingunique=true.  If that page doesn't have enough
     791             :  *      room for the new tuple, this function moves right, trying to find a
     792             :  *      legal page that does.)
     793             :  *
     794             :  *      If 'indexUnchanged' is true, this is for an UPDATE that didn't
     795             :  *      logically change the indexed value, but must nevertheless have a new
     796             :  *      entry to point to a successor version.  This hint from the executor
     797             :  *      will influence our behavior when the page might have to be split and
     798             :  *      we must consider our options.  Bottom-up index deletion can avoid
     799             :  *      pathological version-driven page splits, but we only want to go to the
     800             :  *      trouble of trying it when we already have moderate confidence that
     801             :  *      it's appropriate.  The hint should not significantly affect our
     802             :  *      behavior over time unless practically all inserts on to the leaf page
     803             :  *      get the hint.
     804             :  *
     805             :  *      On exit, insertstate buffer contains the chosen insertion page, and
     806             :  *      the offset within that page is returned.  If _bt_findinsertloc needed
     807             :  *      to move right, the lock and pin on the original page are released, and
     808             :  *      the new buffer is exclusively locked and pinned instead.
     809             :  *
     810             :  *      If insertstate contains cached binary search bounds, we will take
     811             :  *      advantage of them.  This avoids repeating comparisons that we made in
     812             :  *      _bt_check_unique() already.
     813             :  */
     814             : static OffsetNumber
     815     7161634 : _bt_findinsertloc(Relation rel,
     816             :                   BTInsertState insertstate,
     817             :                   bool checkingunique,
     818             :                   bool indexUnchanged,
     819             :                   BTStack stack,
     820             :                   Relation heapRel)
     821             : {
     822     7161634 :     BTScanInsert itup_key = insertstate->itup_key;
     823     7161634 :     Page        page = BufferGetPage(insertstate->buf);
     824             :     BTPageOpaque opaque;
     825             :     OffsetNumber newitemoff;
     826             : 
     827     7161634 :     opaque = BTPageGetOpaque(page);
     828             : 
     829             :     /* Check 1/3 of a page restriction */
     830     7161634 :     if (unlikely(insertstate->itemsz > BTMaxItemSize))
     831           0 :         _bt_check_third_page(rel, heapRel, itup_key->heapkeyspace, page,
     832             :                              insertstate->itup);
     833             : 
     834             :     Assert(P_ISLEAF(opaque) && !P_INCOMPLETE_SPLIT(opaque));
     835             :     Assert(!insertstate->bounds_valid || checkingunique);
     836             :     Assert(!itup_key->heapkeyspace || itup_key->scantid != NULL);
     837             :     Assert(itup_key->heapkeyspace || itup_key->scantid == NULL);
     838             :     Assert(!itup_key->allequalimage || itup_key->heapkeyspace);
     839             : 
     840     7161634 :     if (itup_key->heapkeyspace)
     841             :     {
     842             :         /* Keep track of whether checkingunique duplicate seen */
     843     7161634 :         bool        uniquedup = indexUnchanged;
     844             : 
     845             :         /*
     846             :          * If we're inserting into a unique index, we may have to walk right
     847             :          * through leaf pages to find the one leaf page that we must insert on
     848             :          * to.
     849             :          *
     850             :          * This is needed for checkingunique callers because a scantid was not
     851             :          * used when we called _bt_search().  scantid can only be set after
     852             :          * _bt_check_unique() has checked for duplicates.  The buffer
     853             :          * initially stored in insertstate->buf has the page where the first
     854             :          * duplicate key might be found, which isn't always the page that new
     855             :          * tuple belongs on.  The heap TID attribute for new tuple (scantid)
     856             :          * could force us to insert on a sibling page, though that should be
     857             :          * very rare in practice.
     858             :          */
     859     7161634 :         if (checkingunique)
     860             :         {
     861     5121810 :             if (insertstate->low < insertstate->stricthigh)
     862             :             {
     863             :                 /* Encountered a duplicate in _bt_check_unique() */
     864             :                 Assert(insertstate->bounds_valid);
     865      439002 :                 uniquedup = true;
     866             :             }
     867             : 
     868             :             for (;;)
     869             :             {
     870             :                 /*
     871             :                  * Does the new tuple belong on this page?
     872             :                  *
     873             :                  * The earlier _bt_check_unique() call may well have
     874             :                  * established a strict upper bound on the offset for the new
     875             :                  * item.  If it's not the last item of the page (i.e. if there
     876             :                  * is at least one tuple on the page that goes after the tuple
     877             :                  * we're inserting) then we know that the tuple belongs on
     878             :                  * this page.  We can skip the high key check.
     879             :                  */
     880     5131854 :                 if (insertstate->bounds_valid &&
     881    10222436 :                     insertstate->low <= insertstate->stricthigh &&
     882     5111218 :                     insertstate->stricthigh <= PageGetMaxOffsetNumber(page))
     883     2217442 :                     break;
     884             : 
     885             :                 /* Test '<=', not '!=', since scantid is set now */
     886     3092006 :                 if (P_RIGHTMOST(opaque) ||
     887      177594 :                     _bt_compare(rel, itup_key, page, P_HIKEY) <= 0)
     888             :                     break;
     889             : 
     890       10044 :                 _bt_stepright(rel, heapRel, insertstate, stack);
     891             :                 /* Update local state after stepping right */
     892       10044 :                 page = BufferGetPage(insertstate->buf);
     893       10044 :                 opaque = BTPageGetOpaque(page);
     894             :                 /* Assume duplicates (if checkingunique) */
     895       10044 :                 uniquedup = true;
     896             :             }
     897             :         }
     898             : 
     899             :         /*
     900             :          * If the target page cannot fit newitem, try to avoid splitting the
     901             :          * page on insert by performing deletion or deduplication now
     902             :          */
     903     7161634 :         if (PageGetFreeSpace(page) < insertstate->itemsz)
     904       50058 :             _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, false,
     905             :                                          checkingunique, uniquedup,
     906             :                                          indexUnchanged);
     907             :     }
     908             :     else
     909             :     {
     910             :         /*----------
     911             :          * This is a !heapkeyspace (version 2 or 3) index.  The current page
     912             :          * is the first page that we could insert the new tuple to, but there
     913             :          * may be other pages to the right that we could opt to use instead.
     914             :          *
     915             :          * If the new key is equal to one or more existing keys, we can
     916             :          * legitimately place it anywhere in the series of equal keys.  In
     917             :          * fact, if the new key is equal to the page's "high key" we can place
     918             :          * it on the next page.  If it is equal to the high key, and there's
     919             :          * not room to insert the new tuple on the current page without
     920             :          * splitting, then we move right hoping to find more free space and
     921             :          * avoid a split.
     922             :          *
     923             :          * Keep scanning right until we
     924             :          *      (a) find a page with enough free space,
     925             :          *      (b) reach the last page where the tuple can legally go, or
     926             :          *      (c) get tired of searching.
     927             :          * (c) is not flippant; it is important because if there are many
     928             :          * pages' worth of equal keys, it's better to split one of the early
     929             :          * pages than to scan all the way to the end of the run of equal keys
     930             :          * on every insert.  We implement "get tired" as a random choice,
     931             :          * since stopping after scanning a fixed number of pages wouldn't work
     932             :          * well (we'd never reach the right-hand side of previously split
     933             :          * pages).  The probability of moving right is set at 0.99, which may
     934             :          * seem too high to change the behavior much, but it does an excellent
     935             :          * job of preventing O(N^2) behavior with many equal keys.
     936             :          *----------
     937             :          */
     938           0 :         while (PageGetFreeSpace(page) < insertstate->itemsz)
     939             :         {
     940             :             /*
     941             :              * Before considering moving right, see if we can obtain enough
     942             :              * space by erasing LP_DEAD items
     943             :              */
     944           0 :             if (P_HAS_GARBAGE(opaque))
     945             :             {
     946             :                 /* Perform simple deletion */
     947           0 :                 _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, true,
     948             :                                              false, false, false);
     949             : 
     950           0 :                 if (PageGetFreeSpace(page) >= insertstate->itemsz)
     951           0 :                     break;      /* OK, now we have enough space */
     952             :             }
     953             : 
     954             :             /*
     955             :              * Nope, so check conditions (b) and (c) enumerated above
     956             :              *
     957             :              * The earlier _bt_check_unique() call may well have established a
     958             :              * strict upper bound on the offset for the new item.  If it's not
     959             :              * the last item of the page (i.e. if there is at least one tuple
     960             :              * on the page that's greater than the tuple we're inserting to)
     961             :              * then we know that the tuple belongs on this page.  We can skip
     962             :              * the high key check.
     963             :              */
     964           0 :             if (insertstate->bounds_valid &&
     965           0 :                 insertstate->low <= insertstate->stricthigh &&
     966           0 :                 insertstate->stricthigh <= PageGetMaxOffsetNumber(page))
     967           0 :                 break;
     968             : 
     969           0 :             if (P_RIGHTMOST(opaque) ||
     970           0 :                 _bt_compare(rel, itup_key, page, P_HIKEY) != 0 ||
     971           0 :                 pg_prng_uint32(&pg_global_prng_state) <= (PG_UINT32_MAX / 100))
     972             :                 break;
     973             : 
     974           0 :             _bt_stepright(rel, heapRel, insertstate, stack);
     975             :             /* Update local state after stepping right */
     976           0 :             page = BufferGetPage(insertstate->buf);
     977           0 :             opaque = BTPageGetOpaque(page);
     978             :         }
     979             :     }
     980             : 
     981             :     /*
     982             :      * We should now be on the correct page.  Find the offset within the page
     983             :      * for the new tuple. (Possibly reusing earlier search bounds.)
     984             :      */
     985             :     Assert(P_RIGHTMOST(opaque) ||
     986             :            _bt_compare(rel, itup_key, page, P_HIKEY) <= 0);
     987             : 
     988     7161634 :     newitemoff = _bt_binsrch_insert(rel, insertstate);
     989             : 
     990     7161634 :     if (insertstate->postingoff == -1)
     991             :     {
     992             :         /*
     993             :          * There is an overlapping posting list tuple with its LP_DEAD bit
     994             :          * set.  We don't want to unnecessarily unset its LP_DEAD bit while
     995             :          * performing a posting list split, so perform simple index tuple
     996             :          * deletion early.
     997             :          */
     998           4 :         _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, true,
     999             :                                      false, false, false);
    1000             : 
    1001             :         /*
    1002             :          * Do new binary search.  New insert location cannot overlap with any
    1003             :          * posting list now.
    1004             :          */
    1005             :         Assert(!insertstate->bounds_valid);
    1006           4 :         insertstate->postingoff = 0;
    1007           4 :         newitemoff = _bt_binsrch_insert(rel, insertstate);
    1008             :         Assert(insertstate->postingoff == 0);
    1009             :     }
    1010             : 
    1011     7161634 :     return newitemoff;
    1012             : }
    1013             : 
    1014             : /*
    1015             :  * Step right to next non-dead page, during insertion.
    1016             :  *
    1017             :  * This is a bit more complicated than moving right in a search.  We must
    1018             :  * write-lock the target page before releasing write lock on current page;
    1019             :  * else someone else's _bt_check_unique scan could fail to see our insertion.
    1020             :  * Write locks on intermediate dead pages won't do because we don't know when
    1021             :  * they will get de-linked from the tree.
    1022             :  *
    1023             :  * This is more aggressive than it needs to be for non-unique !heapkeyspace
    1024             :  * indexes.
    1025             :  */
    1026             : static void
    1027       10044 : _bt_stepright(Relation rel, Relation heaprel, BTInsertState insertstate,
    1028             :               BTStack stack)
    1029             : {
    1030             :     Page        page;
    1031             :     BTPageOpaque opaque;
    1032             :     Buffer      rbuf;
    1033             :     BlockNumber rblkno;
    1034             : 
    1035             :     Assert(heaprel != NULL);
    1036       10044 :     page = BufferGetPage(insertstate->buf);
    1037       10044 :     opaque = BTPageGetOpaque(page);
    1038             : 
    1039       10044 :     rbuf = InvalidBuffer;
    1040       10044 :     rblkno = opaque->btpo_next;
    1041             :     for (;;)
    1042             :     {
    1043       10044 :         rbuf = _bt_relandgetbuf(rel, rbuf, rblkno, BT_WRITE);
    1044       10044 :         page = BufferGetPage(rbuf);
    1045       10044 :         opaque = BTPageGetOpaque(page);
    1046             : 
    1047             :         /*
    1048             :          * If this page was incompletely split, finish the split now.  We do
    1049             :          * this while holding a lock on the left sibling, which is not good
    1050             :          * because finishing the split could be a fairly lengthy operation.
    1051             :          * But this should happen very seldom.
    1052             :          */
    1053       10044 :         if (P_INCOMPLETE_SPLIT(opaque))
    1054             :         {
    1055           0 :             _bt_finish_split(rel, heaprel, rbuf, stack);
    1056           0 :             rbuf = InvalidBuffer;
    1057           0 :             continue;
    1058             :         }
    1059             : 
    1060       10044 :         if (!P_IGNORE(opaque))
    1061       10044 :             break;
    1062           0 :         if (P_RIGHTMOST(opaque))
    1063           0 :             elog(ERROR, "fell off the end of index \"%s\"",
    1064             :                  RelationGetRelationName(rel));
    1065             : 
    1066           0 :         rblkno = opaque->btpo_next;
    1067             :     }
    1068             :     /* rbuf locked; unlock buf, update state for caller */
    1069       10044 :     _bt_relbuf(rel, insertstate->buf);
    1070       10044 :     insertstate->buf = rbuf;
    1071       10044 :     insertstate->bounds_valid = false;
    1072       10044 : }
    1073             : 
    1074             : /*----------
    1075             :  *  _bt_insertonpg() -- Insert a tuple on a particular page in the index.
    1076             :  *
    1077             :  *      This recursive procedure does the following things:
    1078             :  *
    1079             :  *          +  if postingoff != 0, splits existing posting list tuple
    1080             :  *             (since it overlaps with new 'itup' tuple).
    1081             :  *          +  if necessary, splits the target page, using 'itup_key' for
    1082             :  *             suffix truncation on leaf pages (caller passes NULL for
    1083             :  *             non-leaf pages).
    1084             :  *          +  inserts the new tuple (might be split from posting list).
    1085             :  *          +  if the page was split, pops the parent stack, and finds the
    1086             :  *             right place to insert the new child pointer (by walking
    1087             :  *             right using information stored in the parent stack).
    1088             :  *          +  invokes itself with the appropriate tuple for the right
    1089             :  *             child page on the parent.
    1090             :  *          +  updates the metapage if a true root or fast root is split.
    1091             :  *
    1092             :  *      On entry, we must have the correct buffer in which to do the
    1093             :  *      insertion, and the buffer must be pinned and write-locked.  On return,
    1094             :  *      we will have dropped both the pin and the lock on the buffer.
    1095             :  *
    1096             :  *      This routine only performs retail tuple insertions.  'itup' should
    1097             :  *      always be either a non-highkey leaf item, or a downlink (new high
    1098             :  *      key items are created indirectly, when a page is split).  When
    1099             :  *      inserting to a non-leaf page, 'cbuf' is the left-sibling of the page
    1100             :  *      we're inserting the downlink for.  This function will clear the
    1101             :  *      INCOMPLETE_SPLIT flag on it, and release the buffer.
    1102             :  *----------
    1103             :  */
    1104             : static void
    1105     7182354 : _bt_insertonpg(Relation rel,
    1106             :                Relation heaprel,
    1107             :                BTScanInsert itup_key,
    1108             :                Buffer buf,
    1109             :                Buffer cbuf,
    1110             :                BTStack stack,
    1111             :                IndexTuple itup,
    1112             :                Size itemsz,
    1113             :                OffsetNumber newitemoff,
    1114             :                int postingoff,
    1115             :                bool split_only_page)
    1116             : {
    1117             :     Page        page;
    1118             :     BTPageOpaque opaque;
    1119             :     bool        isleaf,
    1120             :                 isroot,
    1121             :                 isrightmost,
    1122             :                 isonly;
    1123     7182354 :     IndexTuple  oposting = NULL;
    1124     7182354 :     IndexTuple  origitup = NULL;
    1125     7182354 :     IndexTuple  nposting = NULL;
    1126             : 
    1127     7182354 :     page = BufferGetPage(buf);
    1128     7182354 :     opaque = BTPageGetOpaque(page);
    1129     7182354 :     isleaf = P_ISLEAF(opaque);
    1130     7182354 :     isroot = P_ISROOT(opaque);
    1131     7182354 :     isrightmost = P_RIGHTMOST(opaque);
    1132     7182354 :     isonly = P_LEFTMOST(opaque) && P_RIGHTMOST(opaque);
    1133             : 
    1134             :     /* child buffer must be given iff inserting on an internal page */
    1135             :     Assert(isleaf == !BufferIsValid(cbuf));
    1136             :     /* tuple must have appropriate number of attributes */
    1137             :     Assert(!isleaf ||
    1138             :            BTreeTupleGetNAtts(itup, rel) ==
    1139             :            IndexRelationGetNumberOfAttributes(rel));
    1140             :     Assert(isleaf ||
    1141             :            BTreeTupleGetNAtts(itup, rel) <=
    1142             :            IndexRelationGetNumberOfKeyAttributes(rel));
    1143             :     Assert(!BTreeTupleIsPosting(itup));
    1144             :     Assert(MAXALIGN(IndexTupleSize(itup)) == itemsz);
    1145             :     /* Caller must always finish incomplete split for us */
    1146             :     Assert(!P_INCOMPLETE_SPLIT(opaque));
    1147             : 
    1148             :     /*
    1149             :      * Every internal page should have exactly one negative infinity item at
    1150             :      * all times.  Only _bt_split() and _bt_newlevel() should add items that
    1151             :      * become negative infinity items through truncation, since they're the
    1152             :      * only routines that allocate new internal pages.
    1153             :      */
    1154             :     Assert(isleaf || newitemoff > P_FIRSTDATAKEY(opaque));
    1155             : 
    1156             :     /*
    1157             :      * Do we need to split an existing posting list item?
    1158             :      */
    1159     7182354 :     if (postingoff != 0)
    1160             :     {
    1161       23802 :         ItemId      itemid = PageGetItemId(page, newitemoff);
    1162             : 
    1163             :         /*
    1164             :          * The new tuple is a duplicate with a heap TID that falls inside the
    1165             :          * range of an existing posting list tuple on a leaf page.  Prepare to
    1166             :          * split an existing posting list.  Overwriting the posting list with
    1167             :          * its post-split version is treated as an extra step in either the
    1168             :          * insert or page split critical section.
    1169             :          */
    1170             :         Assert(isleaf && itup_key->heapkeyspace && itup_key->allequalimage);
    1171       23802 :         oposting = (IndexTuple) PageGetItem(page, itemid);
    1172             : 
    1173             :         /*
    1174             :          * postingoff value comes from earlier call to _bt_binsrch_posting().
    1175             :          * Its binary search might think that a plain tuple must be a posting
    1176             :          * list tuple that needs to be split.  This can happen with corruption
    1177             :          * involving an existing plain tuple that is a duplicate of the new
    1178             :          * item, up to and including its table TID.  Check for that here in
    1179             :          * passing.
    1180             :          *
    1181             :          * Also verify that our caller has made sure that the existing posting
    1182             :          * list tuple does not have its LP_DEAD bit set.
    1183             :          */
    1184       23802 :         if (!BTreeTupleIsPosting(oposting) || ItemIdIsDead(itemid))
    1185           0 :             ereport(ERROR,
    1186             :                     (errcode(ERRCODE_INDEX_CORRUPTED),
    1187             :                      errmsg_internal("table tid from new index tuple (%u,%u) overlaps with invalid duplicate tuple at offset %u of block %u in index \"%s\"",
    1188             :                                      ItemPointerGetBlockNumber(&itup->t_tid),
    1189             :                                      ItemPointerGetOffsetNumber(&itup->t_tid),
    1190             :                                      newitemoff, BufferGetBlockNumber(buf),
    1191             :                                      RelationGetRelationName(rel))));
    1192             : 
    1193             :         /* use a mutable copy of itup as our itup from here on */
    1194       23802 :         origitup = itup;
    1195       23802 :         itup = CopyIndexTuple(origitup);
    1196       23802 :         nposting = _bt_swap_posting(itup, oposting, postingoff);
    1197             :         /* itup now contains rightmost/max TID from oposting */
    1198             : 
    1199             :         /* Alter offset so that newitem goes after posting list */
    1200       23802 :         newitemoff = OffsetNumberNext(newitemoff);
    1201             :     }
    1202             : 
    1203             :     /*
    1204             :      * Do we need to split the page to fit the item on it?
    1205             :      *
    1206             :      * Note: PageGetFreeSpace() subtracts sizeof(ItemIdData) from its result,
    1207             :      * so this comparison is correct even though we appear to be accounting
    1208             :      * only for the item and not for its line pointer.
    1209             :      */
    1210     7182354 :     if (PageGetFreeSpace(page) < itemsz)
    1211             :     {
    1212             :         Buffer      rbuf;
    1213             : 
    1214             :         Assert(!split_only_page);
    1215             : 
    1216             :         /* split the buffer into left and right halves */
    1217       22074 :         rbuf = _bt_split(rel, heaprel, itup_key, buf, cbuf, newitemoff, itemsz,
    1218             :                          itup, origitup, nposting, postingoff);
    1219       22074 :         PredicateLockPageSplit(rel,
    1220             :                                BufferGetBlockNumber(buf),
    1221             :                                BufferGetBlockNumber(rbuf));
    1222             : 
    1223             :         /*----------
    1224             :          * By here,
    1225             :          *
    1226             :          *      +  our target page has been split;
    1227             :          *      +  the original tuple has been inserted;
    1228             :          *      +  we have write locks on both the old (left half)
    1229             :          *         and new (right half) buffers, after the split; and
    1230             :          *      +  we know the key we want to insert into the parent
    1231             :          *         (it's the "high key" on the left child page).
    1232             :          *
    1233             :          * We're ready to do the parent insertion.  We need to hold onto the
    1234             :          * locks for the child pages until we locate the parent, but we can
    1235             :          * at least release the lock on the right child before doing the
    1236             :          * actual insertion.  The lock on the left child will be released
    1237             :          * last of all by parent insertion, where it is the 'cbuf' of parent
    1238             :          * page.
    1239             :          *----------
    1240             :          */
    1241       22074 :         _bt_insert_parent(rel, heaprel, buf, rbuf, stack, isroot, isonly);
    1242             :     }
    1243             :     else
    1244             :     {
    1245     7160280 :         Buffer      metabuf = InvalidBuffer;
    1246     7160280 :         Page        metapg = NULL;
    1247     7160280 :         BTMetaPageData *metad = NULL;
    1248             :         BlockNumber blockcache;
    1249             : 
    1250             :         /*
    1251             :          * If we are doing this insert because we split a page that was the
    1252             :          * only one on its tree level, but was not the root, it may have been
    1253             :          * the "fast root".  We need to ensure that the fast root link points
    1254             :          * at or above the current page.  We can safely acquire a lock on the
    1255             :          * metapage here --- see comments for _bt_newlevel().
    1256             :          */
    1257     7160280 :         if (unlikely(split_only_page))
    1258             :         {
    1259             :             Assert(!isleaf);
    1260             :             Assert(BufferIsValid(cbuf));
    1261             : 
    1262          24 :             metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
    1263          24 :             metapg = BufferGetPage(metabuf);
    1264          24 :             metad = BTPageGetMeta(metapg);
    1265             : 
    1266          24 :             if (metad->btm_fastlevel >= opaque->btpo_level)
    1267             :             {
    1268             :                 /* no update wanted */
    1269           0 :                 _bt_relbuf(rel, metabuf);
    1270           0 :                 metabuf = InvalidBuffer;
    1271             :             }
    1272             :         }
    1273             : 
    1274             :         /* Do the update.  No ereport(ERROR) until changes are logged */
    1275     7160280 :         START_CRIT_SECTION();
    1276             : 
    1277     7160280 :         if (postingoff != 0)
    1278       23734 :             memcpy(oposting, nposting, MAXALIGN(IndexTupleSize(nposting)));
    1279             : 
    1280     7160280 :         if (PageAddItem(page, (Item) itup, itemsz, newitemoff, false,
    1281             :                         false) == InvalidOffsetNumber)
    1282           0 :             elog(PANIC, "failed to add new item to block %u in index \"%s\"",
    1283             :                  BufferGetBlockNumber(buf), RelationGetRelationName(rel));
    1284             : 
    1285     7160280 :         MarkBufferDirty(buf);
    1286             : 
    1287     7160280 :         if (BufferIsValid(metabuf))
    1288             :         {
    1289             :             /* upgrade meta-page if needed */
    1290          24 :             if (metad->btm_version < BTREE_NOVAC_VERSION)
    1291           0 :                 _bt_upgrademetapage(metapg);
    1292          24 :             metad->btm_fastroot = BufferGetBlockNumber(buf);
    1293          24 :             metad->btm_fastlevel = opaque->btpo_level;
    1294          24 :             MarkBufferDirty(metabuf);
    1295             :         }
    1296             : 
    1297             :         /*
    1298             :          * Clear INCOMPLETE_SPLIT flag on child if inserting the new item
    1299             :          * finishes a split
    1300             :          */
    1301     7160280 :         if (!isleaf)
    1302             :         {
    1303       20574 :             Page        cpage = BufferGetPage(cbuf);
    1304       20574 :             BTPageOpaque cpageop = BTPageGetOpaque(cpage);
    1305             : 
    1306             :             Assert(P_INCOMPLETE_SPLIT(cpageop));
    1307       20574 :             cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
    1308       20574 :             MarkBufferDirty(cbuf);
    1309             :         }
    1310             : 
    1311             :         /* XLOG stuff */
    1312     7160280 :         if (RelationNeedsWAL(rel))
    1313             :         {
    1314             :             xl_btree_insert xlrec;
    1315             :             xl_btree_metadata xlmeta;
    1316             :             uint8       xlinfo;
    1317             :             XLogRecPtr  recptr;
    1318             :             uint16      upostingoff;
    1319             : 
    1320     6654298 :             xlrec.offnum = newitemoff;
    1321             : 
    1322     6654298 :             XLogBeginInsert();
    1323     6654298 :             XLogRegisterData(&xlrec, SizeOfBtreeInsert);
    1324             : 
    1325     6654298 :             if (isleaf && postingoff == 0)
    1326             :             {
    1327             :                 /* Simple leaf insert */
    1328     6611244 :                 xlinfo = XLOG_BTREE_INSERT_LEAF;
    1329             :             }
    1330       43054 :             else if (postingoff != 0)
    1331             :             {
    1332             :                 /*
    1333             :                  * Leaf insert with posting list split.  Must include
    1334             :                  * postingoff field before newitem/orignewitem.
    1335             :                  */
    1336             :                 Assert(isleaf);
    1337       23734 :                 xlinfo = XLOG_BTREE_INSERT_POST;
    1338             :             }
    1339             :             else
    1340             :             {
    1341             :                 /* Internal page insert, which finishes a split on cbuf */
    1342       19320 :                 xlinfo = XLOG_BTREE_INSERT_UPPER;
    1343       19320 :                 XLogRegisterBuffer(1, cbuf, REGBUF_STANDARD);
    1344             : 
    1345       19320 :                 if (BufferIsValid(metabuf))
    1346             :                 {
    1347             :                     /* Actually, it's an internal page insert + meta update */
    1348          24 :                     xlinfo = XLOG_BTREE_INSERT_META;
    1349             : 
    1350             :                     Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
    1351          24 :                     xlmeta.version = metad->btm_version;
    1352          24 :                     xlmeta.root = metad->btm_root;
    1353          24 :                     xlmeta.level = metad->btm_level;
    1354          24 :                     xlmeta.fastroot = metad->btm_fastroot;
    1355          24 :                     xlmeta.fastlevel = metad->btm_fastlevel;
    1356          24 :                     xlmeta.last_cleanup_num_delpages = metad->btm_last_cleanup_num_delpages;
    1357          24 :                     xlmeta.allequalimage = metad->btm_allequalimage;
    1358             : 
    1359          24 :                     XLogRegisterBuffer(2, metabuf,
    1360             :                                        REGBUF_WILL_INIT | REGBUF_STANDARD);
    1361          24 :                     XLogRegisterBufData(2, &xlmeta,
    1362             :                                         sizeof(xl_btree_metadata));
    1363             :                 }
    1364             :             }
    1365             : 
    1366     6654298 :             XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
    1367     6654298 :             if (postingoff == 0)
    1368             :             {
    1369             :                 /* Just log itup from caller */
    1370     6630564 :                 XLogRegisterBufData(0, itup, IndexTupleSize(itup));
    1371             :             }
    1372             :             else
    1373             :             {
    1374             :                 /*
    1375             :                  * Insert with posting list split (XLOG_BTREE_INSERT_POST
    1376             :                  * record) case.
    1377             :                  *
    1378             :                  * Log postingoff.  Also log origitup, not itup.  REDO routine
    1379             :                  * must reconstruct final itup (as well as nposting) using
    1380             :                  * _bt_swap_posting().
    1381             :                  */
    1382       23734 :                 upostingoff = postingoff;
    1383             : 
    1384       23734 :                 XLogRegisterBufData(0, &upostingoff, sizeof(uint16));
    1385       23734 :                 XLogRegisterBufData(0, origitup,
    1386       23734 :                                     IndexTupleSize(origitup));
    1387             :             }
    1388             : 
    1389     6654298 :             recptr = XLogInsert(RM_BTREE_ID, xlinfo);
    1390             : 
    1391     6654298 :             if (BufferIsValid(metabuf))
    1392          24 :                 PageSetLSN(metapg, recptr);
    1393     6654298 :             if (!isleaf)
    1394       19320 :                 PageSetLSN(BufferGetPage(cbuf), recptr);
    1395             : 
    1396     6654298 :             PageSetLSN(page, recptr);
    1397             :         }
    1398             : 
    1399     7160280 :         END_CRIT_SECTION();
    1400             : 
    1401             :         /* Release subsidiary buffers */
    1402     7160280 :         if (BufferIsValid(metabuf))
    1403          24 :             _bt_relbuf(rel, metabuf);
    1404     7160280 :         if (!isleaf)
    1405       20574 :             _bt_relbuf(rel, cbuf);
    1406             : 
    1407             :         /*
    1408             :          * Cache the block number if this is the rightmost leaf page.  Cache
    1409             :          * may be used by a future inserter within _bt_search_insert().
    1410             :          */
    1411     7160280 :         blockcache = InvalidBlockNumber;
    1412     7160280 :         if (isrightmost && isleaf && !isroot)
    1413     3942334 :             blockcache = BufferGetBlockNumber(buf);
    1414             : 
    1415             :         /* Release buffer for insertion target block */
    1416     7160280 :         _bt_relbuf(rel, buf);
    1417             : 
    1418             :         /*
    1419             :          * If we decided to cache the insertion target block before releasing
    1420             :          * its buffer lock, then cache it now.  Check the height of the tree
    1421             :          * first, though.  We don't go for the optimization with small
    1422             :          * indexes.  Defer final check to this point to ensure that we don't
    1423             :          * call _bt_getrootheight while holding a buffer lock.
    1424             :          */
    1425    11102614 :         if (BlockNumberIsValid(blockcache) &&
    1426     3942334 :             _bt_getrootheight(rel) >= BTREE_FASTPATH_MIN_LEVEL)
    1427       62760 :             RelationSetTargetBlock(rel, blockcache);
    1428             :     }
    1429             : 
    1430             :     /* be tidy */
    1431     7182354 :     if (postingoff != 0)
    1432             :     {
    1433             :         /* itup is actually a modified copy of caller's original */
    1434       23802 :         pfree(nposting);
    1435       23802 :         pfree(itup);
    1436             :     }
    1437     7182354 : }
    1438             : 
    1439             : /*
    1440             :  *  _bt_split() -- split a page in the btree.
    1441             :  *
    1442             :  *      On entry, buf is the page to split, and is pinned and write-locked.
    1443             :  *      newitemoff etc. tell us about the new item that must be inserted
    1444             :  *      along with the data from the original page.
    1445             :  *
    1446             :  *      itup_key is used for suffix truncation on leaf pages (internal
    1447             :  *      page callers pass NULL).  When splitting a non-leaf page, 'cbuf'
    1448             :  *      is the left-sibling of the page we're inserting the downlink for.
    1449             :  *      This function will clear the INCOMPLETE_SPLIT flag on it, and
    1450             :  *      release the buffer.
    1451             :  *
    1452             :  *      orignewitem, nposting, and postingoff are needed when an insert of
    1453             :  *      orignewitem results in both a posting list split and a page split.
    1454             :  *      These extra posting list split details are used here in the same
    1455             :  *      way as they are used in the more common case where a posting list
    1456             :  *      split does not coincide with a page split.  We need to deal with
    1457             :  *      posting list splits directly in order to ensure that everything
    1458             :  *      that follows from the insert of orignewitem is handled as a single
    1459             :  *      atomic operation (though caller's insert of a new pivot/downlink
    1460             :  *      into parent page will still be a separate operation).  See
    1461             :  *      nbtree/README for details on the design of posting list splits.
    1462             :  *
    1463             :  *      Returns the new right sibling of buf, pinned and write-locked.
    1464             :  *      The pin and lock on buf are maintained.
    1465             :  */
    1466             : static Buffer
    1467       22074 : _bt_split(Relation rel, Relation heaprel, BTScanInsert itup_key, Buffer buf,
    1468             :           Buffer cbuf, OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem,
    1469             :           IndexTuple orignewitem, IndexTuple nposting, uint16 postingoff)
    1470             : {
    1471             :     Buffer      rbuf;
    1472             :     Page        origpage;
    1473             :     Page        leftpage,
    1474             :                 rightpage;
    1475             :     BlockNumber origpagenumber,
    1476             :                 rightpagenumber;
    1477             :     BTPageOpaque ropaque,
    1478             :                 lopaque,
    1479             :                 oopaque;
    1480       22074 :     Buffer      sbuf = InvalidBuffer;
    1481       22074 :     Page        spage = NULL;
    1482       22074 :     BTPageOpaque sopaque = NULL;
    1483             :     Size        itemsz;
    1484             :     ItemId      itemid;
    1485             :     IndexTuple  firstright,
    1486             :                 lefthighkey;
    1487             :     OffsetNumber firstrightoff;
    1488             :     OffsetNumber afterleftoff,
    1489             :                 afterrightoff,
    1490             :                 minusinfoff;
    1491             :     OffsetNumber origpagepostingoff;
    1492             :     OffsetNumber maxoff;
    1493             :     OffsetNumber i;
    1494             :     bool        newitemonleft,
    1495             :                 isleaf,
    1496             :                 isrightmost;
    1497             : 
    1498             :     /*
    1499             :      * origpage is the original page to be split.  leftpage is a temporary
    1500             :      * buffer that receives the left-sibling data, which will be copied back
    1501             :      * into origpage on success.  rightpage is the new page that will receive
    1502             :      * the right-sibling data.
    1503             :      *
    1504             :      * leftpage is allocated after choosing a split point.  rightpage's new
    1505             :      * buffer isn't acquired until after leftpage is initialized and has new
    1506             :      * high key, the last point where splitting the page may fail (barring
    1507             :      * corruption).  Failing before acquiring new buffer won't have lasting
    1508             :      * consequences, since origpage won't have been modified and leftpage is
    1509             :      * only workspace.
    1510             :      */
    1511       22074 :     origpage = BufferGetPage(buf);
    1512       22074 :     oopaque = BTPageGetOpaque(origpage);
    1513       22074 :     isleaf = P_ISLEAF(oopaque);
    1514       22074 :     isrightmost = P_RIGHTMOST(oopaque);
    1515       22074 :     maxoff = PageGetMaxOffsetNumber(origpage);
    1516       22074 :     origpagenumber = BufferGetBlockNumber(buf);
    1517             : 
    1518             :     /*
    1519             :      * Choose a point to split origpage at.
    1520             :      *
    1521             :      * A split point can be thought of as a point _between_ two existing data
    1522             :      * items on origpage (the lastleft and firstright tuples), provided you
    1523             :      * pretend that the new item that didn't fit is already on origpage.
    1524             :      *
    1525             :      * Since origpage does not actually contain newitem, the representation of
    1526             :      * split points needs to work with two boundary cases: splits where
    1527             :      * newitem is lastleft, and splits where newitem is firstright.
    1528             :      * newitemonleft resolves the ambiguity that would otherwise exist when
    1529             :      * newitemoff == firstrightoff.  In all other cases it's clear which side
    1530             :      * of the split every tuple goes on from context.  newitemonleft is
    1531             :      * usually (but not always) redundant information.
    1532             :      *
    1533             :      * firstrightoff is supposed to be an origpage offset number, but it's
    1534             :      * possible that its value will be maxoff+1, which is "past the end" of
    1535             :      * origpage.  This happens in the rare case where newitem goes after all
    1536             :      * existing items (i.e. newitemoff is maxoff+1) and we end up splitting
    1537             :      * origpage at the point that leaves newitem alone on new right page.  Any
    1538             :      * "!newitemonleft && newitemoff == firstrightoff" split point makes
    1539             :      * newitem the firstright tuple, though, so this case isn't a special
    1540             :      * case.
    1541             :      */
    1542       22074 :     firstrightoff = _bt_findsplitloc(rel, origpage, newitemoff, newitemsz,
    1543             :                                      newitem, &newitemonleft);
    1544             : 
    1545             :     /* Allocate temp buffer for leftpage */
    1546       22074 :     leftpage = PageGetTempPage(origpage);
    1547       22074 :     _bt_pageinit(leftpage, BufferGetPageSize(buf));
    1548       22074 :     lopaque = BTPageGetOpaque(leftpage);
    1549             : 
    1550             :     /*
    1551             :      * leftpage won't be the root when we're done.  Also, clear the SPLIT_END
    1552             :      * and HAS_GARBAGE flags.
    1553             :      */
    1554       22074 :     lopaque->btpo_flags = oopaque->btpo_flags;
    1555       22074 :     lopaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE);
    1556             :     /* set flag in leftpage indicating that rightpage has no downlink yet */
    1557       22074 :     lopaque->btpo_flags |= BTP_INCOMPLETE_SPLIT;
    1558       22074 :     lopaque->btpo_prev = oopaque->btpo_prev;
    1559             :     /* handle btpo_next after rightpage buffer acquired */
    1560       22074 :     lopaque->btpo_level = oopaque->btpo_level;
    1561             :     /* handle btpo_cycleid after rightpage buffer acquired */
    1562             : 
    1563             :     /*
    1564             :      * Copy the original page's LSN into leftpage, which will become the
    1565             :      * updated version of the page.  We need this because XLogInsert will
    1566             :      * examine the LSN and possibly dump it in a page image.
    1567             :      */
    1568       22074 :     PageSetLSN(leftpage, PageGetLSN(origpage));
    1569             : 
    1570             :     /*
    1571             :      * Determine page offset number of existing overlapped-with-orignewitem
    1572             :      * posting list when it is necessary to perform a posting list split in
    1573             :      * passing.  Note that newitem was already changed by caller (newitem no
    1574             :      * longer has the orignewitem TID).
    1575             :      *
    1576             :      * This page offset number (origpagepostingoff) will be used to pretend
    1577             :      * that the posting split has already taken place, even though the
    1578             :      * required modifications to origpage won't occur until we reach the
    1579             :      * critical section.  The lastleft and firstright tuples of our page split
    1580             :      * point should, in effect, come from an imaginary version of origpage
    1581             :      * that has the nposting tuple instead of the original posting list tuple.
    1582             :      *
    1583             :      * Note: _bt_findsplitloc() should have compensated for coinciding posting
    1584             :      * list splits in just the same way, at least in theory.  It doesn't
    1585             :      * bother with that, though.  In practice it won't affect its choice of
    1586             :      * split point.
    1587             :      */
    1588       22074 :     origpagepostingoff = InvalidOffsetNumber;
    1589       22074 :     if (postingoff != 0)
    1590             :     {
    1591             :         Assert(isleaf);
    1592             :         Assert(ItemPointerCompare(&orignewitem->t_tid,
    1593             :                                   &newitem->t_tid) < 0);
    1594             :         Assert(BTreeTupleIsPosting(nposting));
    1595          68 :         origpagepostingoff = OffsetNumberPrev(newitemoff);
    1596             :     }
    1597             : 
    1598             :     /*
    1599             :      * The high key for the new left page is a possibly-truncated copy of
    1600             :      * firstright on the leaf level (it's "firstright itself" on internal
    1601             :      * pages; see !isleaf comments below).  This may seem to be contrary to
    1602             :      * Lehman & Yao's approach of using a copy of lastleft as the new high key
    1603             :      * when splitting on the leaf level.  It isn't, though.
    1604             :      *
    1605             :      * Suffix truncation will leave the left page's high key fully equal to
    1606             :      * lastleft when lastleft and firstright are equal prior to heap TID (that
    1607             :      * is, the tiebreaker TID value comes from lastleft).  It isn't actually
    1608             :      * necessary for a new leaf high key to be a copy of lastleft for the L&Y
    1609             :      * "subtree" invariant to hold.  It's sufficient to make sure that the new
    1610             :      * leaf high key is strictly less than firstright, and greater than or
    1611             :      * equal to (not necessarily equal to) lastleft.  In other words, when
    1612             :      * suffix truncation isn't possible during a leaf page split, we take
    1613             :      * L&Y's exact approach to generating a new high key for the left page.
    1614             :      * (Actually, that is slightly inaccurate.  We don't just use a copy of
    1615             :      * lastleft.  A tuple with all the keys from firstright but the max heap
    1616             :      * TID from lastleft is used, to avoid introducing a special case.)
    1617             :      */
    1618       22074 :     if (!newitemonleft && newitemoff == firstrightoff)
    1619             :     {
    1620             :         /* incoming tuple becomes firstright */
    1621          14 :         itemsz = newitemsz;
    1622          14 :         firstright = newitem;
    1623             :     }
    1624             :     else
    1625             :     {
    1626             :         /* existing item at firstrightoff becomes firstright */
    1627       22060 :         itemid = PageGetItemId(origpage, firstrightoff);
    1628       22060 :         itemsz = ItemIdGetLength(itemid);
    1629       22060 :         firstright = (IndexTuple) PageGetItem(origpage, itemid);
    1630       22060 :         if (firstrightoff == origpagepostingoff)
    1631           2 :             firstright = nposting;
    1632             :     }
    1633             : 
    1634       22074 :     if (isleaf)
    1635             :     {
    1636             :         IndexTuple  lastleft;
    1637             : 
    1638             :         /* Attempt suffix truncation for leaf page splits */
    1639       21928 :         if (newitemonleft && newitemoff == firstrightoff)
    1640             :         {
    1641             :             /* incoming tuple becomes lastleft */
    1642         290 :             lastleft = newitem;
    1643             :         }
    1644             :         else
    1645             :         {
    1646             :             OffsetNumber lastleftoff;
    1647             : 
    1648             :             /* existing item before firstrightoff becomes lastleft */
    1649       21638 :             lastleftoff = OffsetNumberPrev(firstrightoff);
    1650             :             Assert(lastleftoff >= P_FIRSTDATAKEY(oopaque));
    1651       21638 :             itemid = PageGetItemId(origpage, lastleftoff);
    1652       21638 :             lastleft = (IndexTuple) PageGetItem(origpage, itemid);
    1653       21638 :             if (lastleftoff == origpagepostingoff)
    1654           6 :                 lastleft = nposting;
    1655             :         }
    1656             : 
    1657       21928 :         lefthighkey = _bt_truncate(rel, lastleft, firstright, itup_key);
    1658       21928 :         itemsz = IndexTupleSize(lefthighkey);
    1659             :     }
    1660             :     else
    1661             :     {
    1662             :         /*
    1663             :          * Don't perform suffix truncation on a copy of firstright to make
    1664             :          * left page high key for internal page splits.  Must use firstright
    1665             :          * as new high key directly.
    1666             :          *
    1667             :          * Each distinct separator key value originates as a leaf level high
    1668             :          * key; all other separator keys/pivot tuples are copied from one
    1669             :          * level down.  A separator key in a grandparent page must be
    1670             :          * identical to high key in rightmost parent page of the subtree to
    1671             :          * its left, which must itself be identical to high key in rightmost
    1672             :          * child page of that same subtree (this even applies to separator
    1673             :          * from grandparent's high key).  There must always be an unbroken
    1674             :          * "seam" of identical separator keys that guide index scans at every
    1675             :          * level, starting from the grandparent.  That's why suffix truncation
    1676             :          * is unsafe here.
    1677             :          *
    1678             :          * Internal page splits will truncate firstright into a "negative
    1679             :          * infinity" data item when it gets inserted on the new right page
    1680             :          * below, though.  This happens during the call to _bt_pgaddtup() for
    1681             :          * the new first data item for right page.  Do not confuse this
    1682             :          * mechanism with suffix truncation.  It is just a convenient way of
    1683             :          * implementing page splits that split the internal page "inside"
    1684             :          * firstright.  The lefthighkey separator key cannot appear a second
    1685             :          * time in the right page (only firstright's downlink goes in right
    1686             :          * page).
    1687             :          */
    1688         146 :         lefthighkey = firstright;
    1689             :     }
    1690             : 
    1691             :     /*
    1692             :      * Add new high key to leftpage
    1693             :      */
    1694       22074 :     afterleftoff = P_HIKEY;
    1695             : 
    1696             :     Assert(BTreeTupleGetNAtts(lefthighkey, rel) > 0);
    1697             :     Assert(BTreeTupleGetNAtts(lefthighkey, rel) <=
    1698             :            IndexRelationGetNumberOfKeyAttributes(rel));
    1699             :     Assert(itemsz == MAXALIGN(IndexTupleSize(lefthighkey)));
    1700       22074 :     if (PageAddItem(leftpage, (Item) lefthighkey, itemsz, afterleftoff, false,
    1701             :                     false) == InvalidOffsetNumber)
    1702           0 :         elog(ERROR, "failed to add high key to the left sibling"
    1703             :              " while splitting block %u of index \"%s\"",
    1704             :              origpagenumber, RelationGetRelationName(rel));
    1705       22074 :     afterleftoff = OffsetNumberNext(afterleftoff);
    1706             : 
    1707             :     /*
    1708             :      * Acquire a new right page to split into, now that left page has a new
    1709             :      * high key.  From here on, it's not okay to throw an error without
    1710             :      * zeroing rightpage first.  This coding rule ensures that we won't
    1711             :      * confuse future VACUUM operations, which might otherwise try to re-find
    1712             :      * a downlink to a leftover junk page as the page undergoes deletion.
    1713             :      *
    1714             :      * It would be reasonable to start the critical section just after the new
    1715             :      * rightpage buffer is acquired instead; that would allow us to avoid
    1716             :      * leftover junk pages without bothering to zero rightpage.  We do it this
    1717             :      * way because it avoids an unnecessary PANIC when either origpage or its
    1718             :      * existing sibling page are corrupt.
    1719             :      */
    1720       22074 :     rbuf = _bt_allocbuf(rel, heaprel);
    1721       22074 :     rightpage = BufferGetPage(rbuf);
    1722       22074 :     rightpagenumber = BufferGetBlockNumber(rbuf);
    1723             :     /* rightpage was initialized by _bt_allocbuf */
    1724       22074 :     ropaque = BTPageGetOpaque(rightpage);
    1725             : 
    1726             :     /*
    1727             :      * Finish off remaining leftpage special area fields.  They cannot be set
    1728             :      * before both origpage (leftpage) and rightpage buffers are acquired and
    1729             :      * locked.
    1730             :      *
    1731             :      * btpo_cycleid is only used with leaf pages, though we set it here in all
    1732             :      * cases just to be consistent.
    1733             :      */
    1734       22074 :     lopaque->btpo_next = rightpagenumber;
    1735       22074 :     lopaque->btpo_cycleid = _bt_vacuum_cycleid(rel);
    1736             : 
    1737             :     /*
    1738             :      * rightpage won't be the root when we're done.  Also, clear the SPLIT_END
    1739             :      * and HAS_GARBAGE flags.
    1740             :      */
    1741       22074 :     ropaque->btpo_flags = oopaque->btpo_flags;
    1742       22074 :     ropaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE);
    1743       22074 :     ropaque->btpo_prev = origpagenumber;
    1744       22074 :     ropaque->btpo_next = oopaque->btpo_next;
    1745       22074 :     ropaque->btpo_level = oopaque->btpo_level;
    1746       22074 :     ropaque->btpo_cycleid = lopaque->btpo_cycleid;
    1747             : 
    1748             :     /*
    1749             :      * Add new high key to rightpage where necessary.
    1750             :      *
    1751             :      * If the page we're splitting is not the rightmost page at its level in
    1752             :      * the tree, then the first entry on the page is the high key from
    1753             :      * origpage.
    1754             :      */
    1755       22074 :     afterrightoff = P_HIKEY;
    1756             : 
    1757       22074 :     if (!isrightmost)
    1758             :     {
    1759             :         IndexTuple  righthighkey;
    1760             : 
    1761        9560 :         itemid = PageGetItemId(origpage, P_HIKEY);
    1762        9560 :         itemsz = ItemIdGetLength(itemid);
    1763        9560 :         righthighkey = (IndexTuple) PageGetItem(origpage, itemid);
    1764             :         Assert(BTreeTupleGetNAtts(righthighkey, rel) > 0);
    1765             :         Assert(BTreeTupleGetNAtts(righthighkey, rel) <=
    1766             :                IndexRelationGetNumberOfKeyAttributes(rel));
    1767        9560 :         if (PageAddItem(rightpage, (Item) righthighkey, itemsz, afterrightoff,
    1768             :                         false, false) == InvalidOffsetNumber)
    1769             :         {
    1770           0 :             memset(rightpage, 0, BufferGetPageSize(rbuf));
    1771           0 :             elog(ERROR, "failed to add high key to the right sibling"
    1772             :                  " while splitting block %u of index \"%s\"",
    1773             :                  origpagenumber, RelationGetRelationName(rel));
    1774             :         }
    1775        9560 :         afterrightoff = OffsetNumberNext(afterrightoff);
    1776             :     }
    1777             : 
    1778             :     /*
    1779             :      * Internal page splits truncate first data item on right page -- it
    1780             :      * becomes "minus infinity" item for the page.  Set this up here.
    1781             :      */
    1782       22074 :     minusinfoff = InvalidOffsetNumber;
    1783       22074 :     if (!isleaf)
    1784         146 :         minusinfoff = afterrightoff;
    1785             : 
    1786             :     /*
    1787             :      * Now transfer all the data items (non-pivot tuples in isleaf case, or
    1788             :      * additional pivot tuples in !isleaf case) to the appropriate page.
    1789             :      *
    1790             :      * Note: we *must* insert at least the right page's items in item-number
    1791             :      * order, for the benefit of _bt_restore_page().
    1792             :      */
    1793     6743406 :     for (i = P_FIRSTDATAKEY(oopaque); i <= maxoff; i = OffsetNumberNext(i))
    1794             :     {
    1795             :         IndexTuple  dataitem;
    1796             : 
    1797     6721332 :         itemid = PageGetItemId(origpage, i);
    1798     6721332 :         itemsz = ItemIdGetLength(itemid);
    1799     6721332 :         dataitem = (IndexTuple) PageGetItem(origpage, itemid);
    1800             : 
    1801             :         /* replace original item with nposting due to posting split? */
    1802     6721332 :         if (i == origpagepostingoff)
    1803             :         {
    1804             :             Assert(BTreeTupleIsPosting(dataitem));
    1805             :             Assert(itemsz == MAXALIGN(IndexTupleSize(nposting)));
    1806          68 :             dataitem = nposting;
    1807             :         }
    1808             : 
    1809             :         /* does new item belong before this one? */
    1810     6721264 :         else if (i == newitemoff)
    1811             :         {
    1812       12768 :             if (newitemonleft)
    1813             :             {
    1814             :                 Assert(newitemoff <= firstrightoff);
    1815        3194 :                 if (!_bt_pgaddtup(leftpage, newitemsz, newitem, afterleftoff,
    1816             :                                   false))
    1817             :                 {
    1818           0 :                     memset(rightpage, 0, BufferGetPageSize(rbuf));
    1819           0 :                     elog(ERROR, "failed to add new item to the left sibling"
    1820             :                          " while splitting block %u of index \"%s\"",
    1821             :                          origpagenumber, RelationGetRelationName(rel));
    1822             :                 }
    1823        3194 :                 afterleftoff = OffsetNumberNext(afterleftoff);
    1824             :             }
    1825             :             else
    1826             :             {
    1827             :                 Assert(newitemoff >= firstrightoff);
    1828        9574 :                 if (!_bt_pgaddtup(rightpage, newitemsz, newitem, afterrightoff,
    1829             :                                   afterrightoff == minusinfoff))
    1830             :                 {
    1831           0 :                     memset(rightpage, 0, BufferGetPageSize(rbuf));
    1832           0 :                     elog(ERROR, "failed to add new item to the right sibling"
    1833             :                          " while splitting block %u of index \"%s\"",
    1834             :                          origpagenumber, RelationGetRelationName(rel));
    1835             :                 }
    1836        9574 :                 afterrightoff = OffsetNumberNext(afterrightoff);
    1837             :             }
    1838             :         }
    1839             : 
    1840             :         /* decide which page to put it on */
    1841     6721332 :         if (i < firstrightoff)
    1842             :         {
    1843     5087084 :             if (!_bt_pgaddtup(leftpage, itemsz, dataitem, afterleftoff, false))
    1844             :             {
    1845           0 :                 memset(rightpage, 0, BufferGetPageSize(rbuf));
    1846           0 :                 elog(ERROR, "failed to add old item to the left sibling"
    1847             :                      " while splitting block %u of index \"%s\"",
    1848             :                      origpagenumber, RelationGetRelationName(rel));
    1849             :             }
    1850     5087084 :             afterleftoff = OffsetNumberNext(afterleftoff);
    1851             :         }
    1852             :         else
    1853             :         {
    1854     1634248 :             if (!_bt_pgaddtup(rightpage, itemsz, dataitem, afterrightoff,
    1855             :                               afterrightoff == minusinfoff))
    1856             :             {
    1857           0 :                 memset(rightpage, 0, BufferGetPageSize(rbuf));
    1858           0 :                 elog(ERROR, "failed to add old item to the right sibling"
    1859             :                      " while splitting block %u of index \"%s\"",
    1860             :                      origpagenumber, RelationGetRelationName(rel));
    1861             :             }
    1862     1634248 :             afterrightoff = OffsetNumberNext(afterrightoff);
    1863             :         }
    1864             :     }
    1865             : 
    1866             :     /* Handle case where newitem goes at the end of rightpage */
    1867       22074 :     if (i <= newitemoff)
    1868             :     {
    1869             :         /*
    1870             :          * Can't have newitemonleft here; that would imply we were told to put
    1871             :          * *everything* on the left page, which cannot fit (if it could, we'd
    1872             :          * not be splitting the page).
    1873             :          */
    1874             :         Assert(!newitemonleft && newitemoff == maxoff + 1);
    1875        9306 :         if (!_bt_pgaddtup(rightpage, newitemsz, newitem, afterrightoff,
    1876             :                           afterrightoff == minusinfoff))
    1877             :         {
    1878           0 :             memset(rightpage, 0, BufferGetPageSize(rbuf));
    1879           0 :             elog(ERROR, "failed to add new item to the right sibling"
    1880             :                  " while splitting block %u of index \"%s\"",
    1881             :                  origpagenumber, RelationGetRelationName(rel));
    1882             :         }
    1883        9306 :         afterrightoff = OffsetNumberNext(afterrightoff);
    1884             :     }
    1885             : 
    1886             :     /*
    1887             :      * We have to grab the original right sibling (if any) and update its prev
    1888             :      * link.  We are guaranteed that this is deadlock-free, since we couple
    1889             :      * the locks in the standard order: left to right.
    1890             :      */
    1891       22074 :     if (!isrightmost)
    1892             :     {
    1893        9560 :         sbuf = _bt_getbuf(rel, oopaque->btpo_next, BT_WRITE);
    1894        9560 :         spage = BufferGetPage(sbuf);
    1895        9560 :         sopaque = BTPageGetOpaque(spage);
    1896        9560 :         if (sopaque->btpo_prev != origpagenumber)
    1897             :         {
    1898           0 :             memset(rightpage, 0, BufferGetPageSize(rbuf));
    1899           0 :             ereport(ERROR,
    1900             :                     (errcode(ERRCODE_INDEX_CORRUPTED),
    1901             :                      errmsg_internal("right sibling's left-link doesn't match: "
    1902             :                                      "block %u links to %u instead of expected %u in index \"%s\"",
    1903             :                                      oopaque->btpo_next, sopaque->btpo_prev, origpagenumber,
    1904             :                                      RelationGetRelationName(rel))));
    1905             :         }
    1906             : 
    1907             :         /*
    1908             :          * Check to see if we can set the SPLIT_END flag in the right-hand
    1909             :          * split page; this can save some I/O for vacuum since it need not
    1910             :          * proceed to the right sibling.  We can set the flag if the right
    1911             :          * sibling has a different cycleid: that means it could not be part of
    1912             :          * a group of pages that were all split off from the same ancestor
    1913             :          * page.  If you're confused, imagine that page A splits to A B and
    1914             :          * then again, yielding A C B, while vacuum is in progress.  Tuples
    1915             :          * originally in A could now be in either B or C, hence vacuum must
    1916             :          * examine both pages.  But if D, our right sibling, has a different
    1917             :          * cycleid then it could not contain any tuples that were in A when
    1918             :          * the vacuum started.
    1919             :          */
    1920        9560 :         if (sopaque->btpo_cycleid != ropaque->btpo_cycleid)
    1921           0 :             ropaque->btpo_flags |= BTP_SPLIT_END;
    1922             :     }
    1923             : 
    1924             :     /*
    1925             :      * Right sibling is locked, new siblings are prepared, but original page
    1926             :      * is not updated yet.
    1927             :      *
    1928             :      * NO EREPORT(ERROR) till right sibling is updated.  We can get away with
    1929             :      * not starting the critical section till here because we haven't been
    1930             :      * scribbling on the original page yet; see comments above.
    1931             :      */
    1932       22074 :     START_CRIT_SECTION();
    1933             : 
    1934             :     /*
    1935             :      * By here, the original data page has been split into two new halves, and
    1936             :      * these are correct.  The algorithm requires that the left page never
    1937             :      * move during a split, so we copy the new left page back on top of the
    1938             :      * original.  We need to do this before writing the WAL record, so that
    1939             :      * XLogInsert can WAL log an image of the page if necessary.
    1940             :      */
    1941       22074 :     PageRestoreTempPage(leftpage, origpage);
    1942             :     /* leftpage, lopaque must not be used below here */
    1943             : 
    1944       22074 :     MarkBufferDirty(buf);
    1945       22074 :     MarkBufferDirty(rbuf);
    1946             : 
    1947       22074 :     if (!isrightmost)
    1948             :     {
    1949        9560 :         sopaque->btpo_prev = rightpagenumber;
    1950        9560 :         MarkBufferDirty(sbuf);
    1951             :     }
    1952             : 
    1953             :     /*
    1954             :      * Clear INCOMPLETE_SPLIT flag on child if inserting the new item finishes
    1955             :      * a split
    1956             :      */
    1957       22074 :     if (!isleaf)
    1958             :     {
    1959         146 :         Page        cpage = BufferGetPage(cbuf);
    1960         146 :         BTPageOpaque cpageop = BTPageGetOpaque(cpage);
    1961             : 
    1962         146 :         cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
    1963         146 :         MarkBufferDirty(cbuf);
    1964             :     }
    1965             : 
    1966             :     /* XLOG stuff */
    1967       22074 :     if (RelationNeedsWAL(rel))
    1968             :     {
    1969             :         xl_btree_split xlrec;
    1970             :         uint8       xlinfo;
    1971             :         XLogRecPtr  recptr;
    1972             : 
    1973       20784 :         xlrec.level = ropaque->btpo_level;
    1974             :         /* See comments below on newitem, orignewitem, and posting lists */
    1975       20784 :         xlrec.firstrightoff = firstrightoff;
    1976       20784 :         xlrec.newitemoff = newitemoff;
    1977       20784 :         xlrec.postingoff = 0;
    1978       20784 :         if (postingoff != 0 && origpagepostingoff < firstrightoff)
    1979          28 :             xlrec.postingoff = postingoff;
    1980             : 
    1981       20784 :         XLogBeginInsert();
    1982       20784 :         XLogRegisterData(&xlrec, SizeOfBtreeSplit);
    1983             : 
    1984       20784 :         XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
    1985       20784 :         XLogRegisterBuffer(1, rbuf, REGBUF_WILL_INIT);
    1986             :         /* Log original right sibling, since we've changed its prev-pointer */
    1987       20784 :         if (!isrightmost)
    1988        9548 :             XLogRegisterBuffer(2, sbuf, REGBUF_STANDARD);
    1989       20784 :         if (!isleaf)
    1990         146 :             XLogRegisterBuffer(3, cbuf, REGBUF_STANDARD);
    1991             : 
    1992             :         /*
    1993             :          * Log the new item, if it was inserted on the left page. (If it was
    1994             :          * put on the right page, we don't need to explicitly WAL log it
    1995             :          * because it's included with all the other items on the right page.)
    1996             :          * Show the new item as belonging to the left page buffer, so that it
    1997             :          * is not stored if XLogInsert decides it needs a full-page image of
    1998             :          * the left page.  We always store newitemoff in the record, though.
    1999             :          *
    2000             :          * The details are sometimes slightly different for page splits that
    2001             :          * coincide with a posting list split.  If both the replacement
    2002             :          * posting list and newitem go on the right page, then we don't need
    2003             :          * to log anything extra, just like the simple !newitemonleft
    2004             :          * no-posting-split case (postingoff is set to zero in the WAL record,
    2005             :          * so recovery doesn't need to process a posting list split at all).
    2006             :          * Otherwise, we set postingoff and log orignewitem instead of
    2007             :          * newitem, despite having actually inserted newitem.  REDO routine
    2008             :          * must reconstruct nposting and newitem using _bt_swap_posting().
    2009             :          *
    2010             :          * Note: It's possible that our page split point is the point that
    2011             :          * makes the posting list lastleft and newitem firstright.  This is
    2012             :          * the only case where we log orignewitem/newitem despite newitem
    2013             :          * going on the right page.  If XLogInsert decides that it can omit
    2014             :          * orignewitem due to logging a full-page image of the left page,
    2015             :          * everything still works out, since recovery only needs to log
    2016             :          * orignewitem for items on the left page (just like the regular
    2017             :          * newitem-logged case).
    2018             :          */
    2019       20784 :         if (newitemonleft && xlrec.postingoff == 0)
    2020        3166 :             XLogRegisterBufData(0, newitem, newitemsz);
    2021       17618 :         else if (xlrec.postingoff != 0)
    2022             :         {
    2023             :             Assert(isleaf);
    2024             :             Assert(newitemonleft || firstrightoff == newitemoff);
    2025             :             Assert(newitemsz == IndexTupleSize(orignewitem));
    2026          28 :             XLogRegisterBufData(0, orignewitem, newitemsz);
    2027             :         }
    2028             : 
    2029             :         /* Log the left page's new high key */
    2030       20784 :         if (!isleaf)
    2031             :         {
    2032             :             /* lefthighkey isn't local copy, get current pointer */
    2033         146 :             itemid = PageGetItemId(origpage, P_HIKEY);
    2034         146 :             lefthighkey = (IndexTuple) PageGetItem(origpage, itemid);
    2035             :         }
    2036       20784 :         XLogRegisterBufData(0, lefthighkey,
    2037       20784 :                             MAXALIGN(IndexTupleSize(lefthighkey)));
    2038             : 
    2039             :         /*
    2040             :          * Log the contents of the right page in the format understood by
    2041             :          * _bt_restore_page().  The whole right page will be recreated.
    2042             :          *
    2043             :          * Direct access to page is not good but faster - we should implement
    2044             :          * some new func in page API.  Note we only store the tuples
    2045             :          * themselves, knowing that they were inserted in item-number order
    2046             :          * and so the line pointers can be reconstructed.  See comments for
    2047             :          * _bt_restore_page().
    2048             :          */
    2049       20784 :         XLogRegisterBufData(1,
    2050       20784 :                             (char *) rightpage + ((PageHeader) rightpage)->pd_upper,
    2051       20784 :                             ((PageHeader) rightpage)->pd_special - ((PageHeader) rightpage)->pd_upper);
    2052             : 
    2053       20784 :         xlinfo = newitemonleft ? XLOG_BTREE_SPLIT_L : XLOG_BTREE_SPLIT_R;
    2054       20784 :         recptr = XLogInsert(RM_BTREE_ID, xlinfo);
    2055             : 
    2056       20784 :         PageSetLSN(origpage, recptr);
    2057       20784 :         PageSetLSN(rightpage, recptr);
    2058       20784 :         if (!isrightmost)
    2059        9548 :             PageSetLSN(spage, recptr);
    2060       20784 :         if (!isleaf)
    2061         146 :             PageSetLSN(BufferGetPage(cbuf), recptr);
    2062             :     }
    2063             : 
    2064       22074 :     END_CRIT_SECTION();
    2065             : 
    2066             :     /* release the old right sibling */
    2067       22074 :     if (!isrightmost)
    2068        9560 :         _bt_relbuf(rel, sbuf);
    2069             : 
    2070             :     /* release the child */
    2071       22074 :     if (!isleaf)
    2072         146 :         _bt_relbuf(rel, cbuf);
    2073             : 
    2074             :     /* be tidy */
    2075       22074 :     if (isleaf)
    2076       21928 :         pfree(lefthighkey);
    2077             : 
    2078             :     /* split's done */
    2079       22074 :     return rbuf;
    2080             : }
    2081             : 
    2082             : /*
    2083             :  * _bt_insert_parent() -- Insert downlink into parent, completing split.
    2084             :  *
    2085             :  * On entry, buf and rbuf are the left and right split pages, which we
    2086             :  * still hold write locks on.  Both locks will be released here.  We
    2087             :  * release the rbuf lock once we have a write lock on the page that we
    2088             :  * intend to insert a downlink to rbuf on (i.e. buf's current parent page).
    2089             :  * The lock on buf is released at the same point as the lock on the parent
    2090             :  * page, since buf's INCOMPLETE_SPLIT flag must be cleared by the same
    2091             :  * atomic operation that completes the split by inserting a new downlink.
    2092             :  *
    2093             :  * stack - stack showing how we got here.  Will be NULL when splitting true
    2094             :  *          root, or during concurrent root split, where we can be inefficient
    2095             :  * isroot - we split the true root
    2096             :  * isonly - we split a page alone on its level (might have been fast root)
    2097             :  */
    2098             : static void
    2099       22074 : _bt_insert_parent(Relation rel,
    2100             :                   Relation heaprel,
    2101             :                   Buffer buf,
    2102             :                   Buffer rbuf,
    2103             :                   BTStack stack,
    2104             :                   bool isroot,
    2105             :                   bool isonly)
    2106             : {
    2107             :     Assert(heaprel != NULL);
    2108             : 
    2109             :     /*
    2110             :      * Here we have to do something Lehman and Yao don't talk about: deal with
    2111             :      * a root split and construction of a new root.  If our stack is empty
    2112             :      * then we have just split a node on what had been the root level when we
    2113             :      * descended the tree.  If it was still the root then we perform a
    2114             :      * new-root construction.  If it *wasn't* the root anymore, search to find
    2115             :      * the next higher level that someone constructed meanwhile, and find the
    2116             :      * right place to insert as for the normal case.
    2117             :      *
    2118             :      * If we have to search for the parent level, we do so by re-descending
    2119             :      * from the root.  This is not super-efficient, but it's rare enough not
    2120             :      * to matter.
    2121             :      */
    2122       22074 :     if (isroot)
    2123             :     {
    2124             :         Buffer      rootbuf;
    2125             : 
    2126             :         Assert(stack == NULL);
    2127             :         Assert(isonly);
    2128             :         /* create a new root node one level up and update the metapage */
    2129        1354 :         rootbuf = _bt_newlevel(rel, heaprel, buf, rbuf);
    2130             :         /* release the split buffers */
    2131        1354 :         _bt_relbuf(rel, rootbuf);
    2132        1354 :         _bt_relbuf(rel, rbuf);
    2133        1354 :         _bt_relbuf(rel, buf);
    2134             :     }
    2135             :     else
    2136             :     {
    2137       20720 :         BlockNumber bknum = BufferGetBlockNumber(buf);
    2138       20720 :         BlockNumber rbknum = BufferGetBlockNumber(rbuf);
    2139       20720 :         Page        page = BufferGetPage(buf);
    2140             :         IndexTuple  new_item;
    2141             :         BTStackData fakestack;
    2142             :         IndexTuple  ritem;
    2143             :         Buffer      pbuf;
    2144             : 
    2145       20720 :         if (stack == NULL)
    2146             :         {
    2147             :             BTPageOpaque opaque;
    2148             : 
    2149          24 :             elog(DEBUG2, "concurrent ROOT page split");
    2150          24 :             opaque = BTPageGetOpaque(page);
    2151             : 
    2152             :             /*
    2153             :              * We should never reach here when a leaf page split takes place
    2154             :              * despite the insert of newitem being able to apply the fastpath
    2155             :              * optimization.  Make sure of that with an assertion.
    2156             :              *
    2157             :              * This is more of a performance issue than a correctness issue.
    2158             :              * The fastpath won't have a descent stack.  Using a phony stack
    2159             :              * here works, but never rely on that.  The fastpath should be
    2160             :              * rejected within _bt_search_insert() when the rightmost leaf
    2161             :              * page will split, since it's faster to go through _bt_search()
    2162             :              * and get a stack in the usual way.
    2163             :              */
    2164             :             Assert(!(P_ISLEAF(opaque) &&
    2165             :                      BlockNumberIsValid(RelationGetTargetBlock(rel))));
    2166             : 
    2167             :             /* Find the leftmost page at the next level up */
    2168          24 :             pbuf = _bt_get_endpoint(rel, opaque->btpo_level + 1, false);
    2169             :             /* Set up a phony stack entry pointing there */
    2170          24 :             stack = &fakestack;
    2171          24 :             stack->bts_blkno = BufferGetBlockNumber(pbuf);
    2172          24 :             stack->bts_offset = InvalidOffsetNumber;
    2173          24 :             stack->bts_parent = NULL;
    2174          24 :             _bt_relbuf(rel, pbuf);
    2175             :         }
    2176             : 
    2177             :         /* get high key from left, a strict lower bound for new right page */
    2178       20720 :         ritem = (IndexTuple) PageGetItem(page,
    2179       20720 :                                          PageGetItemId(page, P_HIKEY));
    2180             : 
    2181             :         /* form an index tuple that points at the new right page */
    2182       20720 :         new_item = CopyIndexTuple(ritem);
    2183       20720 :         BTreeTupleSetDownLink(new_item, rbknum);
    2184             : 
    2185             :         /*
    2186             :          * Re-find and write lock the parent of buf.
    2187             :          *
    2188             :          * It's possible that the location of buf's downlink has changed since
    2189             :          * our initial _bt_search() descent.  _bt_getstackbuf() will detect
    2190             :          * and recover from this, updating the stack, which ensures that the
    2191             :          * new downlink will be inserted at the correct offset. Even buf's
    2192             :          * parent may have changed.
    2193             :          */
    2194       20720 :         pbuf = _bt_getstackbuf(rel, heaprel, stack, bknum);
    2195             : 
    2196             :         /*
    2197             :          * Unlock the right child.  The left child will be unlocked in
    2198             :          * _bt_insertonpg().
    2199             :          *
    2200             :          * Unlocking the right child must be delayed until here to ensure that
    2201             :          * no concurrent VACUUM operation can become confused.  Page deletion
    2202             :          * cannot be allowed to fail to re-find a downlink for the rbuf page.
    2203             :          * (Actually, this is just a vestige of how things used to work.  The
    2204             :          * page deletion code is expected to check for the INCOMPLETE_SPLIT
    2205             :          * flag on the left child.  It won't attempt deletion of the right
    2206             :          * child until the split is complete.  Despite all this, we opt to
    2207             :          * conservatively delay unlocking the right child until here.)
    2208             :          */
    2209       20720 :         _bt_relbuf(rel, rbuf);
    2210             : 
    2211       20720 :         if (pbuf == InvalidBuffer)
    2212           0 :             ereport(ERROR,
    2213             :                     (errcode(ERRCODE_INDEX_CORRUPTED),
    2214             :                      errmsg_internal("failed to re-find parent key in index \"%s\" for split pages %u/%u",
    2215             :                                      RelationGetRelationName(rel), bknum, rbknum)));
    2216             : 
    2217             :         /* Recursively insert into the parent */
    2218       41440 :         _bt_insertonpg(rel, heaprel, NULL, pbuf, buf, stack->bts_parent,
    2219       20720 :                        new_item, MAXALIGN(IndexTupleSize(new_item)),
    2220       20720 :                        stack->bts_offset + 1, 0, isonly);
    2221             : 
    2222             :         /* be tidy */
    2223       20720 :         pfree(new_item);
    2224             :     }
    2225       22074 : }
    2226             : 
    2227             : /*
    2228             :  * _bt_finish_split() -- Finish an incomplete split
    2229             :  *
    2230             :  * A crash or other failure can leave a split incomplete.  The insertion
    2231             :  * routines won't allow to insert on a page that is incompletely split.
    2232             :  * Before inserting on such a page, call _bt_finish_split().
    2233             :  *
    2234             :  * On entry, 'lbuf' must be locked in write-mode.  On exit, it is unlocked
    2235             :  * and unpinned.
    2236             :  *
    2237             :  * Caller must provide a valid heaprel, since finishing a page split requires
    2238             :  * allocating a new page if and when the parent page splits in turn.
    2239             :  */
    2240             : void
    2241           0 : _bt_finish_split(Relation rel, Relation heaprel, Buffer lbuf, BTStack stack)
    2242             : {
    2243           0 :     Page        lpage = BufferGetPage(lbuf);
    2244           0 :     BTPageOpaque lpageop = BTPageGetOpaque(lpage);
    2245             :     Buffer      rbuf;
    2246             :     Page        rpage;
    2247             :     BTPageOpaque rpageop;
    2248             :     bool        wasroot;
    2249             :     bool        wasonly;
    2250             : 
    2251             :     Assert(P_INCOMPLETE_SPLIT(lpageop));
    2252             :     Assert(heaprel != NULL);
    2253             : 
    2254             :     /* Lock right sibling, the one missing the downlink */
    2255           0 :     rbuf = _bt_getbuf(rel, lpageop->btpo_next, BT_WRITE);
    2256           0 :     rpage = BufferGetPage(rbuf);
    2257           0 :     rpageop = BTPageGetOpaque(rpage);
    2258             : 
    2259             :     /* Could this be a root split? */
    2260           0 :     if (!stack)
    2261             :     {
    2262             :         Buffer      metabuf;
    2263             :         Page        metapg;
    2264             :         BTMetaPageData *metad;
    2265             : 
    2266             :         /* acquire lock on the metapage */
    2267           0 :         metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
    2268           0 :         metapg = BufferGetPage(metabuf);
    2269           0 :         metad = BTPageGetMeta(metapg);
    2270             : 
    2271           0 :         wasroot = (metad->btm_root == BufferGetBlockNumber(lbuf));
    2272             : 
    2273           0 :         _bt_relbuf(rel, metabuf);
    2274             :     }
    2275             :     else
    2276           0 :         wasroot = false;
    2277             : 
    2278             :     /* Was this the only page on the level before split? */
    2279           0 :     wasonly = (P_LEFTMOST(lpageop) && P_RIGHTMOST(rpageop));
    2280             : 
    2281           0 :     elog(DEBUG1, "finishing incomplete split of %u/%u",
    2282             :          BufferGetBlockNumber(lbuf), BufferGetBlockNumber(rbuf));
    2283             : 
    2284           0 :     _bt_insert_parent(rel, heaprel, lbuf, rbuf, stack, wasroot, wasonly);
    2285           0 : }
    2286             : 
    2287             : /*
    2288             :  *  _bt_getstackbuf() -- Walk back up the tree one step, and find the pivot
    2289             :  *                       tuple whose downlink points to child page.
    2290             :  *
    2291             :  *      Caller passes child's block number, which is used to identify
    2292             :  *      associated pivot tuple in parent page using a linear search that
    2293             :  *      matches on pivot's downlink/block number.  The expected location of
    2294             :  *      the pivot tuple is taken from the stack one level above the child
    2295             :  *      page.  This is used as a starting point.  Insertions into the
    2296             :  *      parent level could cause the pivot tuple to move right; deletions
    2297             :  *      could cause it to move left, but not left of the page we previously
    2298             :  *      found it on.
    2299             :  *
    2300             :  *      Caller can use its stack to relocate the pivot tuple/downlink for
    2301             :  *      any same-level page to the right of the page found by its initial
    2302             :  *      descent.  This is necessary because of the possibility that caller
    2303             :  *      moved right to recover from a concurrent page split.  It's also
    2304             :  *      convenient for certain callers to be able to step right when there
    2305             :  *      wasn't a concurrent page split, while still using their original
    2306             :  *      stack.  For example, the checkingunique _bt_doinsert() case may
    2307             :  *      have to step right when there are many physical duplicates, and its
    2308             :  *      scantid forces an insertion to the right of the "first page the
    2309             :  *      value could be on".  (This is also relied on by all of our callers
    2310             :  *      when dealing with !heapkeyspace indexes.)
    2311             :  *
    2312             :  *      Returns write-locked parent page buffer, or InvalidBuffer if pivot
    2313             :  *      tuple not found (should not happen).  Adjusts bts_blkno &
    2314             :  *      bts_offset if changed.  Page split caller should insert its new
    2315             :  *      pivot tuple for its new right sibling page on parent page, at the
    2316             :  *      offset number bts_offset + 1.
    2317             :  */
    2318             : Buffer
    2319       26414 : _bt_getstackbuf(Relation rel, Relation heaprel, BTStack stack, BlockNumber child)
    2320             : {
    2321             :     BlockNumber blkno;
    2322             :     OffsetNumber start;
    2323             : 
    2324       26414 :     blkno = stack->bts_blkno;
    2325       26414 :     start = stack->bts_offset;
    2326             : 
    2327             :     for (;;)
    2328           2 :     {
    2329             :         Buffer      buf;
    2330             :         Page        page;
    2331             :         BTPageOpaque opaque;
    2332             : 
    2333       26416 :         buf = _bt_getbuf(rel, blkno, BT_WRITE);
    2334       26416 :         page = BufferGetPage(buf);
    2335       26416 :         opaque = BTPageGetOpaque(page);
    2336             : 
    2337             :         Assert(heaprel != NULL);
    2338       26416 :         if (P_INCOMPLETE_SPLIT(opaque))
    2339             :         {
    2340           0 :             _bt_finish_split(rel, heaprel, buf, stack->bts_parent);
    2341           0 :             continue;
    2342             :         }
    2343             : 
    2344       26416 :         if (!P_IGNORE(opaque))
    2345             :         {
    2346             :             OffsetNumber offnum,
    2347             :                         minoff,
    2348             :                         maxoff;
    2349             :             ItemId      itemid;
    2350             :             IndexTuple  item;
    2351             : 
    2352       26416 :             minoff = P_FIRSTDATAKEY(opaque);
    2353       26416 :             maxoff = PageGetMaxOffsetNumber(page);
    2354             : 
    2355             :             /*
    2356             :              * start = InvalidOffsetNumber means "search the whole page". We
    2357             :              * need this test anyway due to possibility that page has a high
    2358             :              * key now when it didn't before.
    2359             :              */
    2360       26416 :             if (start < minoff)
    2361          26 :                 start = minoff;
    2362             : 
    2363             :             /*
    2364             :              * Need this check too, to guard against possibility that page
    2365             :              * split since we visited it originally.
    2366             :              */
    2367       26416 :             if (start > maxoff)
    2368           2 :                 start = OffsetNumberNext(maxoff);
    2369             : 
    2370             :             /*
    2371             :              * These loops will check every item on the page --- but in an
    2372             :              * order that's attuned to the probability of where it actually
    2373             :              * is.  Scan to the right first, then to the left.
    2374             :              */
    2375       26416 :             for (offnum = start;
    2376       26792 :                  offnum <= maxoff;
    2377         376 :                  offnum = OffsetNumberNext(offnum))
    2378             :             {
    2379       26790 :                 itemid = PageGetItemId(page, offnum);
    2380       26790 :                 item = (IndexTuple) PageGetItem(page, itemid);
    2381             : 
    2382       26790 :                 if (BTreeTupleGetDownLink(item) == child)
    2383             :                 {
    2384             :                     /* Return accurate pointer to where link is now */
    2385       26414 :                     stack->bts_blkno = blkno;
    2386       26414 :                     stack->bts_offset = offnum;
    2387       26414 :                     return buf;
    2388             :                 }
    2389             :             }
    2390             : 
    2391           2 :             for (offnum = OffsetNumberPrev(start);
    2392         572 :                  offnum >= minoff;
    2393         570 :                  offnum = OffsetNumberPrev(offnum))
    2394             :             {
    2395         570 :                 itemid = PageGetItemId(page, offnum);
    2396         570 :                 item = (IndexTuple) PageGetItem(page, itemid);
    2397             : 
    2398         570 :                 if (BTreeTupleGetDownLink(item) == child)
    2399             :                 {
    2400             :                     /* Return accurate pointer to where link is now */
    2401           0 :                     stack->bts_blkno = blkno;
    2402           0 :                     stack->bts_offset = offnum;
    2403           0 :                     return buf;
    2404             :                 }
    2405             :             }
    2406             :         }
    2407             : 
    2408             :         /*
    2409             :          * The item we're looking for moved right at least one page.
    2410             :          *
    2411             :          * Lehman and Yao couple/chain locks when moving right here, which we
    2412             :          * can avoid.  See nbtree/README.
    2413             :          */
    2414           2 :         if (P_RIGHTMOST(opaque))
    2415             :         {
    2416           0 :             _bt_relbuf(rel, buf);
    2417           0 :             return InvalidBuffer;
    2418             :         }
    2419           2 :         blkno = opaque->btpo_next;
    2420           2 :         start = InvalidOffsetNumber;
    2421           2 :         _bt_relbuf(rel, buf);
    2422             :     }
    2423             : }
    2424             : 
    2425             : /*
    2426             :  *  _bt_newlevel() -- Create a new level above root page.
    2427             :  *
    2428             :  *      We've just split the old root page and need to create a new one.
    2429             :  *      In order to do this, we add a new root page to the file, then lock
    2430             :  *      the metadata page and update it.  This is guaranteed to be deadlock-
    2431             :  *      free, because all readers release their locks on the metadata page
    2432             :  *      before trying to lock the root, and all writers lock the root before
    2433             :  *      trying to lock the metadata page.  We have a write lock on the old
    2434             :  *      root page, so we have not introduced any cycles into the waits-for
    2435             :  *      graph.
    2436             :  *
    2437             :  *      On entry, lbuf (the old root) and rbuf (its new peer) are write-
    2438             :  *      locked. On exit, a new root page exists with entries for the
    2439             :  *      two new children, metapage is updated and unlocked/unpinned.
    2440             :  *      The new root buffer is returned to caller which has to unlock/unpin
    2441             :  *      lbuf, rbuf & rootbuf.
    2442             :  */
    2443             : static Buffer
    2444        1354 : _bt_newlevel(Relation rel, Relation heaprel, Buffer lbuf, Buffer rbuf)
    2445             : {
    2446             :     Buffer      rootbuf;
    2447             :     Page        lpage,
    2448             :                 rootpage;
    2449             :     BlockNumber lbkno,
    2450             :                 rbkno;
    2451             :     BlockNumber rootblknum;
    2452             :     BTPageOpaque rootopaque;
    2453             :     BTPageOpaque lopaque;
    2454             :     ItemId      itemid;
    2455             :     IndexTuple  item;
    2456             :     IndexTuple  left_item;
    2457             :     Size        left_item_sz;
    2458             :     IndexTuple  right_item;
    2459             :     Size        right_item_sz;
    2460             :     Buffer      metabuf;
    2461             :     Page        metapg;
    2462             :     BTMetaPageData *metad;
    2463             : 
    2464        1354 :     lbkno = BufferGetBlockNumber(lbuf);
    2465        1354 :     rbkno = BufferGetBlockNumber(rbuf);
    2466        1354 :     lpage = BufferGetPage(lbuf);
    2467        1354 :     lopaque = BTPageGetOpaque(lpage);
    2468             : 
    2469             :     /* get a new root page */
    2470        1354 :     rootbuf = _bt_allocbuf(rel, heaprel);
    2471        1354 :     rootpage = BufferGetPage(rootbuf);
    2472        1354 :     rootblknum = BufferGetBlockNumber(rootbuf);
    2473             : 
    2474             :     /* acquire lock on the metapage */
    2475        1354 :     metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
    2476        1354 :     metapg = BufferGetPage(metabuf);
    2477        1354 :     metad = BTPageGetMeta(metapg);
    2478             : 
    2479             :     /*
    2480             :      * Create downlink item for left page (old root).  The key value used is
    2481             :      * "minus infinity", a sentinel value that's reliably less than any real
    2482             :      * key value that could appear in the left page.
    2483             :      */
    2484        1354 :     left_item_sz = sizeof(IndexTupleData);
    2485        1354 :     left_item = (IndexTuple) palloc(left_item_sz);
    2486        1354 :     left_item->t_info = left_item_sz;
    2487        1354 :     BTreeTupleSetDownLink(left_item, lbkno);
    2488        1354 :     BTreeTupleSetNAtts(left_item, 0, false);
    2489             : 
    2490             :     /*
    2491             :      * Create downlink item for right page.  The key for it is obtained from
    2492             :      * the "high key" position in the left page.
    2493             :      */
    2494        1354 :     itemid = PageGetItemId(lpage, P_HIKEY);
    2495        1354 :     right_item_sz = ItemIdGetLength(itemid);
    2496        1354 :     item = (IndexTuple) PageGetItem(lpage, itemid);
    2497        1354 :     right_item = CopyIndexTuple(item);
    2498        1354 :     BTreeTupleSetDownLink(right_item, rbkno);
    2499             : 
    2500             :     /* NO EREPORT(ERROR) from here till newroot op is logged */
    2501        1354 :     START_CRIT_SECTION();
    2502             : 
    2503             :     /* upgrade metapage if needed */
    2504        1354 :     if (metad->btm_version < BTREE_NOVAC_VERSION)
    2505           0 :         _bt_upgrademetapage(metapg);
    2506             : 
    2507             :     /* set btree special data */
    2508        1354 :     rootopaque = BTPageGetOpaque(rootpage);
    2509        1354 :     rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
    2510        1354 :     rootopaque->btpo_flags = BTP_ROOT;
    2511        1354 :     rootopaque->btpo_level =
    2512        1354 :         (BTPageGetOpaque(lpage))->btpo_level + 1;
    2513        1354 :     rootopaque->btpo_cycleid = 0;
    2514             : 
    2515             :     /* update metapage data */
    2516        1354 :     metad->btm_root = rootblknum;
    2517        1354 :     metad->btm_level = rootopaque->btpo_level;
    2518        1354 :     metad->btm_fastroot = rootblknum;
    2519        1354 :     metad->btm_fastlevel = rootopaque->btpo_level;
    2520             : 
    2521             :     /*
    2522             :      * Insert the left page pointer into the new root page.  The root page is
    2523             :      * the rightmost page on its level so there is no "high key" in it; the
    2524             :      * two items will go into positions P_HIKEY and P_FIRSTKEY.
    2525             :      *
    2526             :      * Note: we *must* insert the two items in item-number order, for the
    2527             :      * benefit of _bt_restore_page().
    2528             :      */
    2529             :     Assert(BTreeTupleGetNAtts(left_item, rel) == 0);
    2530        1354 :     if (PageAddItem(rootpage, (Item) left_item, left_item_sz, P_HIKEY,
    2531             :                     false, false) == InvalidOffsetNumber)
    2532           0 :         elog(PANIC, "failed to add leftkey to new root page"
    2533             :              " while splitting block %u of index \"%s\"",
    2534             :              BufferGetBlockNumber(lbuf), RelationGetRelationName(rel));
    2535             : 
    2536             :     /*
    2537             :      * insert the right page pointer into the new root page.
    2538             :      */
    2539             :     Assert(BTreeTupleGetNAtts(right_item, rel) > 0);
    2540             :     Assert(BTreeTupleGetNAtts(right_item, rel) <=
    2541             :            IndexRelationGetNumberOfKeyAttributes(rel));
    2542        1354 :     if (PageAddItem(rootpage, (Item) right_item, right_item_sz, P_FIRSTKEY,
    2543             :                     false, false) == InvalidOffsetNumber)
    2544           0 :         elog(PANIC, "failed to add rightkey to new root page"
    2545             :              " while splitting block %u of index \"%s\"",
    2546             :              BufferGetBlockNumber(lbuf), RelationGetRelationName(rel));
    2547             : 
    2548             :     /* Clear the incomplete-split flag in the left child */
    2549             :     Assert(P_INCOMPLETE_SPLIT(lopaque));
    2550        1354 :     lopaque->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
    2551        1354 :     MarkBufferDirty(lbuf);
    2552             : 
    2553        1354 :     MarkBufferDirty(rootbuf);
    2554        1354 :     MarkBufferDirty(metabuf);
    2555             : 
    2556             :     /* XLOG stuff */
    2557        1354 :     if (RelationNeedsWAL(rel))
    2558             :     {
    2559             :         xl_btree_newroot xlrec;
    2560             :         XLogRecPtr  recptr;
    2561             :         xl_btree_metadata md;
    2562             : 
    2563        1318 :         xlrec.rootblk = rootblknum;
    2564        1318 :         xlrec.level = metad->btm_level;
    2565             : 
    2566        1318 :         XLogBeginInsert();
    2567        1318 :         XLogRegisterData(&xlrec, SizeOfBtreeNewroot);
    2568             : 
    2569        1318 :         XLogRegisterBuffer(0, rootbuf, REGBUF_WILL_INIT);
    2570        1318 :         XLogRegisterBuffer(1, lbuf, REGBUF_STANDARD);
    2571        1318 :         XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
    2572             : 
    2573             :         Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
    2574        1318 :         md.version = metad->btm_version;
    2575        1318 :         md.root = rootblknum;
    2576        1318 :         md.level = metad->btm_level;
    2577        1318 :         md.fastroot = rootblknum;
    2578        1318 :         md.fastlevel = metad->btm_level;
    2579        1318 :         md.last_cleanup_num_delpages = metad->btm_last_cleanup_num_delpages;
    2580        1318 :         md.allequalimage = metad->btm_allequalimage;
    2581             : 
    2582        1318 :         XLogRegisterBufData(2, &md, sizeof(xl_btree_metadata));
    2583             : 
    2584             :         /*
    2585             :          * Direct access to page is not good but faster - we should implement
    2586             :          * some new func in page API.
    2587             :          */
    2588        1318 :         XLogRegisterBufData(0,
    2589        1318 :                             (char *) rootpage + ((PageHeader) rootpage)->pd_upper,
    2590        1318 :                             ((PageHeader) rootpage)->pd_special -
    2591        1318 :                             ((PageHeader) rootpage)->pd_upper);
    2592             : 
    2593        1318 :         recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT);
    2594             : 
    2595        1318 :         PageSetLSN(lpage, recptr);
    2596        1318 :         PageSetLSN(rootpage, recptr);
    2597        1318 :         PageSetLSN(metapg, recptr);
    2598             :     }
    2599             : 
    2600        1354 :     END_CRIT_SECTION();
    2601             : 
    2602             :     /* done with metapage */
    2603        1354 :     _bt_relbuf(rel, metabuf);
    2604             : 
    2605        1354 :     pfree(left_item);
    2606        1354 :     pfree(right_item);
    2607             : 
    2608        1354 :     return rootbuf;
    2609             : }
    2610             : 
    2611             : /*
    2612             :  *  _bt_pgaddtup() -- add a data item to a particular page during split.
    2613             :  *
    2614             :  *      The difference between this routine and a bare PageAddItem call is
    2615             :  *      that this code can deal with the first data item on an internal btree
    2616             :  *      page in passing.  This data item (which is called "firstright" within
    2617             :  *      _bt_split()) has a key that must be treated as minus infinity after
    2618             :  *      the split.  Therefore, we truncate away all attributes when caller
    2619             :  *      specifies it's the first data item on page (downlink is not changed,
    2620             :  *      though).  This extra step is only needed for the right page of an
    2621             :  *      internal page split.  There is no need to do this for the first data
    2622             :  *      item on the existing/left page, since that will already have been
    2623             :  *      truncated during an earlier page split.
    2624             :  *
    2625             :  *      See _bt_split() for a high level explanation of why we truncate here.
    2626             :  *      Note that this routine has nothing to do with suffix truncation,
    2627             :  *      despite using some of the same infrastructure.
    2628             :  */
    2629             : static inline bool
    2630     6743406 : _bt_pgaddtup(Page page,
    2631             :              Size itemsize,
    2632             :              IndexTuple itup,
    2633             :              OffsetNumber itup_off,
    2634             :              bool newfirstdataitem)
    2635             : {
    2636             :     IndexTupleData trunctuple;
    2637             : 
    2638     6743406 :     if (newfirstdataitem)
    2639             :     {
    2640         146 :         trunctuple = *itup;
    2641         146 :         trunctuple.t_info = sizeof(IndexTupleData);
    2642         146 :         BTreeTupleSetNAtts(&trunctuple, 0, false);
    2643         146 :         itup = &trunctuple;
    2644         146 :         itemsize = sizeof(IndexTupleData);
    2645             :     }
    2646             : 
    2647     6743406 :     if (unlikely(PageAddItem(page, (Item) itup, itemsize, itup_off, false,
    2648             :                              false) == InvalidOffsetNumber))
    2649           0 :         return false;
    2650             : 
    2651     6743406 :     return true;
    2652             : }
    2653             : 
    2654             : /*
    2655             :  * _bt_delete_or_dedup_one_page - Try to avoid a leaf page split.
    2656             :  *
    2657             :  * There are three operations performed here: simple index deletion, bottom-up
    2658             :  * index deletion, and deduplication.  If all three operations fail to free
    2659             :  * enough space for the incoming item then caller will go on to split the
    2660             :  * page.  We always consider simple deletion first.  If that doesn't work out
    2661             :  * we consider alternatives.  Callers that only want us to consider simple
    2662             :  * deletion (without any fallback) ask for that using the 'simpleonly'
    2663             :  * argument.
    2664             :  *
    2665             :  * We usually pick only one alternative "complex" operation when simple
    2666             :  * deletion alone won't prevent a page split.  The 'checkingunique',
    2667             :  * 'uniquedup', and 'indexUnchanged' arguments are used for that.
    2668             :  *
    2669             :  * Note: We used to only delete LP_DEAD items when the BTP_HAS_GARBAGE page
    2670             :  * level flag was found set.  The flag was useful back when there wasn't
    2671             :  * necessarily one single page for a duplicate tuple to go on (before heap TID
    2672             :  * became a part of the key space in version 4 indexes).  But we don't
    2673             :  * actually look at the flag anymore (it's not a gating condition for our
    2674             :  * caller).  That would cause us to miss tuples that are safe to delete,
    2675             :  * without getting any benefit in return.  We know that the alternative is to
    2676             :  * split the page; scanning the line pointer array in passing won't have
    2677             :  * noticeable overhead.  (We still maintain the BTP_HAS_GARBAGE flag despite
    2678             :  * all this because !heapkeyspace indexes must still do a "getting tired"
    2679             :  * linear search, and so are likely to get some benefit from using it as a
    2680             :  * gating condition.)
    2681             :  */
    2682             : static void
    2683       50062 : _bt_delete_or_dedup_one_page(Relation rel, Relation heapRel,
    2684             :                              BTInsertState insertstate,
    2685             :                              bool simpleonly, bool checkingunique,
    2686             :                              bool uniquedup, bool indexUnchanged)
    2687             : {
    2688             :     OffsetNumber deletable[MaxIndexTuplesPerPage];
    2689       50062 :     int         ndeletable = 0;
    2690             :     OffsetNumber offnum,
    2691             :                 minoff,
    2692             :                 maxoff;
    2693       50062 :     Buffer      buffer = insertstate->buf;
    2694       50062 :     BTScanInsert itup_key = insertstate->itup_key;
    2695       50062 :     Page        page = BufferGetPage(buffer);
    2696       50062 :     BTPageOpaque opaque = BTPageGetOpaque(page);
    2697             : 
    2698             :     Assert(P_ISLEAF(opaque));
    2699             :     Assert(simpleonly || itup_key->heapkeyspace);
    2700             :     Assert(!simpleonly || (!checkingunique && !uniquedup && !indexUnchanged));
    2701             : 
    2702             :     /*
    2703             :      * Scan over all items to see which ones need to be deleted according to
    2704             :      * LP_DEAD flags.  We'll usually manage to delete a few extra items that
    2705             :      * are not marked LP_DEAD in passing.  Often the extra items that actually
    2706             :      * end up getting deleted are items that would have had their LP_DEAD bit
    2707             :      * set before long anyway (if we opted not to include them as extras).
    2708             :      */
    2709       50062 :     minoff = P_FIRSTDATAKEY(opaque);
    2710       50062 :     maxoff = PageGetMaxOffsetNumber(page);
    2711       50062 :     for (offnum = minoff;
    2712    13436780 :          offnum <= maxoff;
    2713    13386718 :          offnum = OffsetNumberNext(offnum))
    2714             :     {
    2715    13386718 :         ItemId      itemId = PageGetItemId(page, offnum);
    2716             : 
    2717    13386718 :         if (ItemIdIsDead(itemId))
    2718      254600 :             deletable[ndeletable++] = offnum;
    2719             :     }
    2720             : 
    2721       50062 :     if (ndeletable > 0)
    2722             :     {
    2723        7370 :         _bt_simpledel_pass(rel, buffer, heapRel, deletable, ndeletable,
    2724             :                            insertstate->itup, minoff, maxoff);
    2725        7370 :         insertstate->bounds_valid = false;
    2726             : 
    2727             :         /* Return when a page split has already been avoided */
    2728        7370 :         if (PageGetFreeSpace(page) >= insertstate->itemsz)
    2729       22860 :             return;
    2730             : 
    2731             :         /* Might as well assume duplicates (if checkingunique) */
    2732         100 :         uniquedup = true;
    2733             :     }
    2734             : 
    2735             :     /*
    2736             :      * We're done with simple deletion.  Return early with callers that only
    2737             :      * call here so that simple deletion can be considered.  This includes
    2738             :      * callers that explicitly ask for this and checkingunique callers that
    2739             :      * probably don't have any version churn duplicates on the page.
    2740             :      *
    2741             :      * Note: The page's BTP_HAS_GARBAGE hint flag may still be set when we
    2742             :      * return at this point (or when we go on the try either or both of our
    2743             :      * other strategies and they also fail).  We do not bother expending a
    2744             :      * separate write to clear it, however.  Caller will definitely clear it
    2745             :      * when it goes on to split the page (note also that the deduplication
    2746             :      * process will clear the flag in passing, just to keep things tidy).
    2747             :      */
    2748       42792 :     if (simpleonly || (checkingunique && !uniquedup))
    2749             :     {
    2750             :         Assert(!indexUnchanged);
    2751       15218 :         return;
    2752             :     }
    2753             : 
    2754             :     /* Assume bounds about to be invalidated (this is almost certain now) */
    2755       27574 :     insertstate->bounds_valid = false;
    2756             : 
    2757             :     /*
    2758             :      * Perform bottom-up index deletion pass when executor hint indicated that
    2759             :      * incoming item is logically unchanged, or for a unique index that is
    2760             :      * known to have physical duplicates for some other reason.  (There is a
    2761             :      * large overlap between these two cases for a unique index.  It's worth
    2762             :      * having both triggering conditions in order to apply the optimization in
    2763             :      * the event of successive related INSERT and DELETE statements.)
    2764             :      *
    2765             :      * We'll go on to do a deduplication pass when a bottom-up pass fails to
    2766             :      * delete an acceptable amount of free space (a significant fraction of
    2767             :      * the page, or space for the new item, whichever is greater).
    2768             :      *
    2769             :      * Note: Bottom-up index deletion uses the same equality/equivalence
    2770             :      * routines as deduplication internally.  However, it does not merge
    2771             :      * together index tuples, so the same correctness considerations do not
    2772             :      * apply.  We deliberately omit an index-is-allequalimage test here.
    2773             :      */
    2774       31404 :     if ((indexUnchanged || uniquedup) &&
    2775        3830 :         _bt_bottomupdel_pass(rel, buffer, heapRel, insertstate->itemsz))
    2776         372 :         return;
    2777             : 
    2778             :     /* Perform deduplication pass (when enabled and index-is-allequalimage) */
    2779       27202 :     if (BTGetDeduplicateItems(rel) && itup_key->allequalimage)
    2780       27184 :         _bt_dedup_pass(rel, buffer, insertstate->itup, insertstate->itemsz,
    2781       27184 :                        (indexUnchanged || uniquedup));
    2782             : }
    2783             : 
    2784             : /*
    2785             :  * _bt_simpledel_pass - Simple index tuple deletion pass.
    2786             :  *
    2787             :  * We delete all LP_DEAD-set index tuples on a leaf page.  The offset numbers
    2788             :  * of all such tuples are determined by caller (caller passes these to us as
    2789             :  * its 'deletable' argument).
    2790             :  *
    2791             :  * We might also delete extra index tuples that turn out to be safe to delete
    2792             :  * in passing (though they must be cheap to check in passing to begin with).
    2793             :  * There is no certainty that any extra tuples will be deleted, though.  The
    2794             :  * high level goal of the approach we take is to get the most out of each call
    2795             :  * here (without noticeably increasing the per-call overhead compared to what
    2796             :  * we need to do just to be able to delete the page's LP_DEAD-marked index
    2797             :  * tuples).
    2798             :  *
    2799             :  * The number of extra index tuples that turn out to be deletable might
    2800             :  * greatly exceed the number of LP_DEAD-marked index tuples due to various
    2801             :  * locality related effects.  For example, it's possible that the total number
    2802             :  * of table blocks (pointed to by all TIDs on the leaf page) is naturally
    2803             :  * quite low, in which case we might end up checking if it's possible to
    2804             :  * delete _most_ index tuples on the page (without the tableam needing to
    2805             :  * access additional table blocks).  The tableam will sometimes stumble upon
    2806             :  * _many_ extra deletable index tuples in indexes where this pattern is
    2807             :  * common.
    2808             :  *
    2809             :  * See nbtree/README for further details on simple index tuple deletion.
    2810             :  */
    2811             : static void
    2812        7370 : _bt_simpledel_pass(Relation rel, Buffer buffer, Relation heapRel,
    2813             :                    OffsetNumber *deletable, int ndeletable, IndexTuple newitem,
    2814             :                    OffsetNumber minoff, OffsetNumber maxoff)
    2815             : {
    2816        7370 :     Page        page = BufferGetPage(buffer);
    2817             :     BlockNumber *deadblocks;
    2818             :     int         ndeadblocks;
    2819             :     TM_IndexDeleteOp delstate;
    2820             :     OffsetNumber offnum;
    2821             : 
    2822             :     /* Get array of table blocks pointed to by LP_DEAD-set tuples */
    2823        7370 :     deadblocks = _bt_deadblocks(page, deletable, ndeletable, newitem,
    2824             :                                 &ndeadblocks);
    2825             : 
    2826             :     /* Initialize tableam state that describes index deletion operation */
    2827        7370 :     delstate.irel = rel;
    2828        7370 :     delstate.iblknum = BufferGetBlockNumber(buffer);
    2829        7370 :     delstate.bottomup = false;
    2830        7370 :     delstate.bottomupfreespace = 0;
    2831        7370 :     delstate.ndeltids = 0;
    2832        7370 :     delstate.deltids = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexDelete));
    2833        7370 :     delstate.status = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexStatus));
    2834             : 
    2835        7370 :     for (offnum = minoff;
    2836     2117638 :          offnum <= maxoff;
    2837     2110268 :          offnum = OffsetNumberNext(offnum))
    2838             :     {
    2839     2110268 :         ItemId      itemid = PageGetItemId(page, offnum);
    2840     2110268 :         IndexTuple  itup = (IndexTuple) PageGetItem(page, itemid);
    2841     2110268 :         TM_IndexDelete *odeltid = &delstate.deltids[delstate.ndeltids];
    2842     2110268 :         TM_IndexStatus *ostatus = &delstate.status[delstate.ndeltids];
    2843             :         BlockNumber tidblock;
    2844             :         void       *match;
    2845             : 
    2846     2110268 :         if (!BTreeTupleIsPosting(itup))
    2847             :         {
    2848     2022640 :             tidblock = ItemPointerGetBlockNumber(&itup->t_tid);
    2849     2022640 :             match = bsearch(&tidblock, deadblocks, ndeadblocks,
    2850             :                             sizeof(BlockNumber), _bt_blk_cmp);
    2851             : 
    2852     2022640 :             if (!match)
    2853             :             {
    2854             :                 Assert(!ItemIdIsDead(itemid));
    2855     1271644 :                 continue;
    2856             :             }
    2857             : 
    2858             :             /*
    2859             :              * TID's table block is among those pointed to by the TIDs from
    2860             :              * LP_DEAD-bit set tuples on page -- add TID to deltids
    2861             :              */
    2862      750996 :             odeltid->tid = itup->t_tid;
    2863      750996 :             odeltid->id = delstate.ndeltids;
    2864      750996 :             ostatus->idxoffnum = offnum;
    2865      750996 :             ostatus->knowndeletable = ItemIdIsDead(itemid);
    2866      750996 :             ostatus->promising = false; /* unused */
    2867      750996 :             ostatus->freespace = 0; /* unused */
    2868             : 
    2869      750996 :             delstate.ndeltids++;
    2870             :         }
    2871             :         else
    2872             :         {
    2873       87628 :             int         nitem = BTreeTupleGetNPosting(itup);
    2874             : 
    2875      439792 :             for (int p = 0; p < nitem; p++)
    2876             :             {
    2877      352164 :                 ItemPointer tid = BTreeTupleGetPostingN(itup, p);
    2878             : 
    2879      352164 :                 tidblock = ItemPointerGetBlockNumber(tid);
    2880      352164 :                 match = bsearch(&tidblock, deadblocks, ndeadblocks,
    2881             :                                 sizeof(BlockNumber), _bt_blk_cmp);
    2882             : 
    2883      352164 :                 if (!match)
    2884             :                 {
    2885             :                     Assert(!ItemIdIsDead(itemid));
    2886      307094 :                     continue;
    2887             :                 }
    2888             : 
    2889             :                 /*
    2890             :                  * TID's table block is among those pointed to by the TIDs
    2891             :                  * from LP_DEAD-bit set tuples on page -- add TID to deltids
    2892             :                  */
    2893       45070 :                 odeltid->tid = *tid;
    2894       45070 :                 odeltid->id = delstate.ndeltids;
    2895       45070 :                 ostatus->idxoffnum = offnum;
    2896       45070 :                 ostatus->knowndeletable = ItemIdIsDead(itemid);
    2897       45070 :                 ostatus->promising = false; /* unused */
    2898       45070 :                 ostatus->freespace = 0; /* unused */
    2899             : 
    2900       45070 :                 odeltid++;
    2901       45070 :                 ostatus++;
    2902       45070 :                 delstate.ndeltids++;
    2903             :             }
    2904             :         }
    2905             :     }
    2906             : 
    2907        7370 :     pfree(deadblocks);
    2908             : 
    2909             :     Assert(delstate.ndeltids >= ndeletable);
    2910             : 
    2911             :     /* Physically delete LP_DEAD tuples (plus any delete-safe extra TIDs) */
    2912        7370 :     _bt_delitems_delete_check(rel, buffer, heapRel, &delstate);
    2913             : 
    2914        7370 :     pfree(delstate.deltids);
    2915        7370 :     pfree(delstate.status);
    2916        7370 : }
    2917             : 
    2918             : /*
    2919             :  * _bt_deadblocks() -- Get LP_DEAD related table blocks.
    2920             :  *
    2921             :  * Builds sorted and unique-ified array of table block numbers from index
    2922             :  * tuple TIDs whose line pointers are marked LP_DEAD.  Also adds the table
    2923             :  * block from incoming newitem just in case it isn't among the LP_DEAD-related
    2924             :  * table blocks.
    2925             :  *
    2926             :  * Always counting the newitem's table block as an LP_DEAD related block makes
    2927             :  * sense because the cost is consistently low; it is practically certain that
    2928             :  * the table block will not incur a buffer miss in tableam.  On the other hand
    2929             :  * the benefit is often quite high.  There is a decent chance that there will
    2930             :  * be some deletable items from this block, since in general most garbage
    2931             :  * tuples became garbage in the recent past (in many cases this won't be the
    2932             :  * first logical row that core code added to/modified in table block
    2933             :  * recently).
    2934             :  *
    2935             :  * Returns final array, and sets *nblocks to its final size for caller.
    2936             :  */
    2937             : static BlockNumber *
    2938        7370 : _bt_deadblocks(Page page, OffsetNumber *deletable, int ndeletable,
    2939             :                IndexTuple newitem, int *nblocks)
    2940             : {
    2941             :     int         spacentids,
    2942             :                 ntids;
    2943             :     BlockNumber *tidblocks;
    2944             : 
    2945             :     /*
    2946             :      * Accumulate each TID's block in array whose initial size has space for
    2947             :      * one table block per LP_DEAD-set tuple (plus space for the newitem table
    2948             :      * block).  Array will only need to grow when there are LP_DEAD-marked
    2949             :      * posting list tuples (which is not that common).
    2950             :      */
    2951        7370 :     spacentids = ndeletable + 1;
    2952        7370 :     ntids = 0;
    2953        7370 :     tidblocks = (BlockNumber *) palloc(sizeof(BlockNumber) * spacentids);
    2954             : 
    2955             :     /*
    2956             :      * First add the table block for the incoming newitem.  This is the one
    2957             :      * case where simple deletion can visit a table block that doesn't have
    2958             :      * any known deletable items.
    2959             :      */
    2960             :     Assert(!BTreeTupleIsPosting(newitem) && !BTreeTupleIsPivot(newitem));
    2961        7370 :     tidblocks[ntids++] = ItemPointerGetBlockNumber(&newitem->t_tid);
    2962             : 
    2963      261970 :     for (int i = 0; i < ndeletable; i++)
    2964             :     {
    2965      254600 :         ItemId      itemid = PageGetItemId(page, deletable[i]);
    2966      254600 :         IndexTuple  itup = (IndexTuple) PageGetItem(page, itemid);
    2967             : 
    2968             :         Assert(ItemIdIsDead(itemid));
    2969             : 
    2970      254600 :         if (!BTreeTupleIsPosting(itup))
    2971             :         {
    2972      246278 :             if (ntids + 1 > spacentids)
    2973             :             {
    2974         198 :                 spacentids *= 2;
    2975             :                 tidblocks = (BlockNumber *)
    2976         198 :                     repalloc(tidblocks, sizeof(BlockNumber) * spacentids);
    2977             :             }
    2978             : 
    2979      246278 :             tidblocks[ntids++] = ItemPointerGetBlockNumber(&itup->t_tid);
    2980             :         }
    2981             :         else
    2982             :         {
    2983        8322 :             int         nposting = BTreeTupleGetNPosting(itup);
    2984             : 
    2985        8322 :             if (ntids + nposting > spacentids)
    2986             :             {
    2987         160 :                 spacentids = Max(spacentids * 2, ntids + nposting);
    2988             :                 tidblocks = (BlockNumber *)
    2989         160 :                     repalloc(tidblocks, sizeof(BlockNumber) * spacentids);
    2990             :             }
    2991             : 
    2992       27414 :             for (int j = 0; j < nposting; j++)
    2993             :             {
    2994       19092 :                 ItemPointer tid = BTreeTupleGetPostingN(itup, j);
    2995             : 
    2996       19092 :                 tidblocks[ntids++] = ItemPointerGetBlockNumber(tid);
    2997             :             }
    2998             :         }
    2999             :     }
    3000             : 
    3001        7370 :     qsort(tidblocks, ntids, sizeof(BlockNumber), _bt_blk_cmp);
    3002        7370 :     *nblocks = qunique(tidblocks, ntids, sizeof(BlockNumber), _bt_blk_cmp);
    3003             : 
    3004        7370 :     return tidblocks;
    3005             : }
    3006             : 
    3007             : /*
    3008             :  * _bt_blk_cmp() -- qsort comparison function for _bt_simpledel_pass
    3009             :  */
    3010             : static inline int
    3011     5309740 : _bt_blk_cmp(const void *arg1, const void *arg2)
    3012             : {
    3013     5309740 :     BlockNumber b1 = *((BlockNumber *) arg1);
    3014     5309740 :     BlockNumber b2 = *((BlockNumber *) arg2);
    3015             : 
    3016     5309740 :     return pg_cmp_u32(b1, b2);
    3017             : }

Generated by: LCOV version 1.16