LCOV - code coverage report
Current view: top level - src/backend/access/nbtree - nbtinsert.c (source / functions) Coverage Total Hit
Test: PostgreSQL 19devel Lines: 89.4 % 782 699
Test Date: 2026-03-27 22:16:19 Functions: 94.1 % 17 16
Legend: Lines:     hit not hit

            Line data    Source code
       1              : /*-------------------------------------------------------------------------
       2              :  *
       3              :  * nbtinsert.c
       4              :  *    Item insertion in Lehman and Yao btrees for Postgres.
       5              :  *
       6              :  * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
       7              :  * Portions Copyright (c) 1994, Regents of the University of California
       8              :  *
       9              :  *
      10              :  * IDENTIFICATION
      11              :  *    src/backend/access/nbtree/nbtinsert.c
      12              :  *
      13              :  *-------------------------------------------------------------------------
      14              :  */
      15              : 
      16              : #include "postgres.h"
      17              : 
      18              : #include "access/nbtree.h"
      19              : #include "access/nbtxlog.h"
      20              : #include "access/tableam.h"
      21              : #include "access/transam.h"
      22              : #include "access/xloginsert.h"
      23              : #include "common/int.h"
      24              : #include "common/pg_prng.h"
      25              : #include "lib/qunique.h"
      26              : #include "miscadmin.h"
      27              : #include "storage/lmgr.h"
      28              : #include "storage/predicate.h"
      29              : #include "utils/injection_point.h"
      30              : 
      31              : /* Minimum tree height for application of fastpath optimization */
      32              : #define BTREE_FASTPATH_MIN_LEVEL    2
      33              : 
      34              : 
      35              : static BTStack _bt_search_insert(Relation rel, Relation heaprel,
      36              :                                  BTInsertState insertstate);
      37              : static TransactionId _bt_check_unique(Relation rel, BTInsertState insertstate,
      38              :                                       Relation heapRel,
      39              :                                       IndexUniqueCheck checkUnique, bool *is_unique,
      40              :                                       uint32 *speculativeToken);
      41              : static OffsetNumber _bt_findinsertloc(Relation rel,
      42              :                                       BTInsertState insertstate,
      43              :                                       bool checkingunique,
      44              :                                       bool indexUnchanged,
      45              :                                       BTStack stack,
      46              :                                       Relation heapRel);
      47              : static void _bt_stepright(Relation rel, Relation heaprel,
      48              :                           BTInsertState insertstate, BTStack stack);
      49              : static void _bt_insertonpg(Relation rel, Relation heaprel, BTScanInsert itup_key,
      50              :                            Buffer buf,
      51              :                            Buffer cbuf,
      52              :                            BTStack stack,
      53              :                            IndexTuple itup,
      54              :                            Size itemsz,
      55              :                            OffsetNumber newitemoff,
      56              :                            int postingoff,
      57              :                            bool split_only_page);
      58              : static Buffer _bt_split(Relation rel, Relation heaprel, BTScanInsert itup_key,
      59              :                         Buffer buf, Buffer cbuf, OffsetNumber newitemoff,
      60              :                         Size newitemsz, IndexTuple newitem, IndexTuple orignewitem,
      61              :                         IndexTuple nposting, uint16 postingoff);
      62              : static void _bt_insert_parent(Relation rel, Relation heaprel, Buffer buf,
      63              :                               Buffer rbuf, BTStack stack, bool isroot, bool isonly);
      64              : static void _bt_freestack(BTStack stack);
      65              : static Buffer _bt_newlevel(Relation rel, Relation heaprel, Buffer lbuf, Buffer rbuf);
      66              : static inline bool _bt_pgaddtup(Page page, Size itemsize, const IndexTupleData *itup,
      67              :                                 OffsetNumber itup_off, bool newfirstdataitem);
      68              : static void _bt_delete_or_dedup_one_page(Relation rel, Relation heapRel,
      69              :                                          BTInsertState insertstate,
      70              :                                          bool simpleonly, bool checkingunique,
      71              :                                          bool uniquedup, bool indexUnchanged);
      72              : static void _bt_simpledel_pass(Relation rel, Buffer buffer, Relation heapRel,
      73              :                                OffsetNumber *deletable, int ndeletable,
      74              :                                IndexTuple newitem, OffsetNumber minoff,
      75              :                                OffsetNumber maxoff);
      76              : static BlockNumber *_bt_deadblocks(Page page, OffsetNumber *deletable,
      77              :                                    int ndeletable, IndexTuple newitem,
      78              :                                    int *nblocks);
      79              : static inline int _bt_blk_cmp(const void *arg1, const void *arg2);
      80              : 
      81              : /*
      82              :  *  _bt_doinsert() -- Handle insertion of a single index tuple in the tree.
      83              :  *
      84              :  *      This routine is called by the public interface routine, btinsert.
      85              :  *      By here, itup is filled in, including the TID.
      86              :  *
      87              :  *      If checkUnique is UNIQUE_CHECK_NO or UNIQUE_CHECK_PARTIAL, this
      88              :  *      will allow duplicates.  Otherwise (UNIQUE_CHECK_YES or
      89              :  *      UNIQUE_CHECK_EXISTING) it will throw error for a duplicate.
      90              :  *      For UNIQUE_CHECK_EXISTING we merely run the duplicate check, and
      91              :  *      don't actually insert.
      92              :  *
      93              :  *      indexUnchanged executor hint indicates if itup is from an
      94              :  *      UPDATE that didn't logically change the indexed value, but
      95              :  *      must nevertheless have a new entry to point to a successor
      96              :  *      version.
      97              :  *
      98              :  *      The result value is only significant for UNIQUE_CHECK_PARTIAL:
      99              :  *      it must be true if the entry is known unique, else false.
     100              :  *      (In the current implementation we'll also return true after a
     101              :  *      successful UNIQUE_CHECK_YES or UNIQUE_CHECK_EXISTING call, but
     102              :  *      that's just a coding artifact.)
     103              :  */
     104              : bool
     105      4647255 : _bt_doinsert(Relation rel, IndexTuple itup,
     106              :              IndexUniqueCheck checkUnique, bool indexUnchanged,
     107              :              Relation heapRel)
     108              : {
     109      4647255 :     bool        is_unique = false;
     110              :     BTInsertStateData insertstate;
     111              :     BTScanInsert itup_key;
     112              :     BTStack     stack;
     113      4647255 :     bool        checkingunique = (checkUnique != UNIQUE_CHECK_NO);
     114              : 
     115              :     /* we need an insertion scan key to do our search, so build one */
     116      4647255 :     itup_key = _bt_mkscankey(rel, itup);
     117              : 
     118      4647255 :     if (checkingunique)
     119              :     {
     120      3320498 :         if (!itup_key->anynullkeys)
     121              :         {
     122              :             /* No (heapkeyspace) scantid until uniqueness established */
     123      3310382 :             itup_key->scantid = NULL;
     124              :         }
     125              :         else
     126              :         {
     127              :             /*
     128              :              * Scan key for new tuple contains NULL key values.  Bypass
     129              :              * checkingunique steps.  They are unnecessary because core code
     130              :              * considers NULL unequal to every value, including NULL.
     131              :              *
     132              :              * This optimization avoids O(N^2) behavior within the
     133              :              * _bt_findinsertloc() heapkeyspace path when a unique index has a
     134              :              * large number of "duplicates" with NULL key values.
     135              :              */
     136        10116 :             checkingunique = false;
     137              :             /* Tuple is unique in the sense that core code cares about */
     138              :             Assert(checkUnique != UNIQUE_CHECK_EXISTING);
     139        10116 :             is_unique = true;
     140              :         }
     141              :     }
     142              : 
     143              :     /*
     144              :      * Fill in the BTInsertState working area, to track the current page and
     145              :      * position within the page to insert on.
     146              :      *
     147              :      * Note that itemsz is passed down to lower level code that deals with
     148              :      * inserting the item.  It must be MAXALIGN()'d.  This ensures that space
     149              :      * accounting code consistently considers the alignment overhead that we
     150              :      * expect PageAddItem() will add later.  (Actually, index_form_tuple() is
     151              :      * already conservative about alignment, but we don't rely on that from
     152              :      * this distance.  Besides, preserving the "true" tuple size in index
     153              :      * tuple headers for the benefit of nbtsplitloc.c might happen someday.
     154              :      * Note that heapam does not MAXALIGN() each heap tuple's lp_len field.)
     155              :      */
     156      4647255 :     insertstate.itup = itup;
     157      4647255 :     insertstate.itemsz = MAXALIGN(IndexTupleSize(itup));
     158      4647255 :     insertstate.itup_key = itup_key;
     159      4647255 :     insertstate.bounds_valid = false;
     160      4647255 :     insertstate.buf = InvalidBuffer;
     161      4647255 :     insertstate.postingoff = 0;
     162              : 
     163      4647267 : search:
     164              : 
     165              :     /*
     166              :      * Find and lock the leaf page that the tuple should be added to by
     167              :      * searching from the root page.  insertstate.buf will hold a buffer that
     168              :      * is locked in exclusive mode afterwards.
     169              :      */
     170      4647267 :     stack = _bt_search_insert(rel, heapRel, &insertstate);
     171              : 
     172              :     /*
     173              :      * checkingunique inserts are not allowed to go ahead when two tuples with
     174              :      * equal key attribute values would be visible to new MVCC snapshots once
     175              :      * the xact commits.  Check for conflicts in the locked page/buffer (if
     176              :      * needed) here.
     177              :      *
     178              :      * It might be necessary to check a page to the right in _bt_check_unique,
     179              :      * though that should be very rare.  In practice the first page the value
     180              :      * could be on (with scantid omitted) is almost always also the only page
     181              :      * that a matching tuple might be found on.  This is due to the behavior
     182              :      * of _bt_findsplitloc with duplicate tuples -- a group of duplicates can
     183              :      * only be allowed to cross a page boundary when there is no candidate
     184              :      * leaf page split point that avoids it.  Also, _bt_check_unique can use
     185              :      * the leaf page high key to determine that there will be no duplicates on
     186              :      * the right sibling without actually visiting it (it uses the high key in
     187              :      * cases where the new item happens to belong at the far right of the leaf
     188              :      * page).
     189              :      *
     190              :      * NOTE: obviously, _bt_check_unique can only detect keys that are already
     191              :      * in the index; so it cannot defend against concurrent insertions of the
     192              :      * same key.  We protect against that by means of holding a write lock on
     193              :      * the first page the value could be on, with omitted/-inf value for the
     194              :      * implicit heap TID tiebreaker attribute.  Any other would-be inserter of
     195              :      * the same key must acquire a write lock on the same page, so only one
     196              :      * would-be inserter can be making the check at one time.  Furthermore,
     197              :      * once we are past the check we hold write locks continuously until we
     198              :      * have performed our insertion, so no later inserter can fail to see our
     199              :      * insertion.  (This requires some care in _bt_findinsertloc.)
     200              :      *
     201              :      * If we must wait for another xact, we release the lock while waiting,
     202              :      * and then must perform a new search.
     203              :      *
     204              :      * For a partial uniqueness check, we don't wait for the other xact. Just
     205              :      * let the tuple in and return false for possibly non-unique, or true for
     206              :      * definitely unique.
     207              :      */
     208      4647267 :     if (checkingunique)
     209              :     {
     210              :         TransactionId xwait;
     211              :         uint32      speculativeToken;
     212              : 
     213      3310394 :         xwait = _bt_check_unique(rel, &insertstate, heapRel, checkUnique,
     214              :                                  &is_unique, &speculativeToken);
     215              : 
     216      3310071 :         if (unlikely(TransactionIdIsValid(xwait)))
     217              :         {
     218              :             /* Have to wait for the other guy ... */
     219           12 :             _bt_relbuf(rel, insertstate.buf);
     220           12 :             insertstate.buf = InvalidBuffer;
     221              : 
     222              :             /*
     223              :              * If it's a speculative insertion, wait for it to finish (ie. to
     224              :              * go ahead with the insertion, or kill the tuple).  Otherwise
     225              :              * wait for the transaction to finish as usual.
     226              :              */
     227           12 :             if (speculativeToken)
     228            0 :                 SpeculativeInsertionWait(xwait, speculativeToken);
     229              :             else
     230           12 :                 XactLockTableWait(xwait, rel, &itup->t_tid, XLTW_InsertIndex);
     231              : 
     232              :             /* start over... */
     233           12 :             if (stack)
     234            0 :                 _bt_freestack(stack);
     235           12 :             goto search;
     236              :         }
     237              : 
     238              :         /* Uniqueness is established -- restore heap tid as scantid */
     239      3310059 :         if (itup_key->heapkeyspace)
     240      3310059 :             itup_key->scantid = &itup->t_tid;
     241              :     }
     242              : 
     243      4646932 :     if (checkUnique != UNIQUE_CHECK_EXISTING)
     244              :     {
     245              :         OffsetNumber newitemoff;
     246              : 
     247              :         /*
     248              :          * The only conflict predicate locking cares about for indexes is when
     249              :          * an index tuple insert conflicts with an existing lock.  We don't
     250              :          * know the actual page we're going to insert on for sure just yet in
     251              :          * checkingunique and !heapkeyspace cases, but it's okay to use the
     252              :          * first page the value could be on (with scantid omitted) instead.
     253              :          */
     254      4646896 :         CheckForSerializableConflictIn(rel, NULL, BufferGetBlockNumber(insertstate.buf));
     255              : 
     256              :         /*
     257              :          * Do the insertion.  Note that insertstate contains cached binary
     258              :          * search bounds established within _bt_check_unique when insertion is
     259              :          * checkingunique.
     260              :          */
     261      4646893 :         newitemoff = _bt_findinsertloc(rel, &insertstate, checkingunique,
     262              :                                        indexUnchanged, stack, heapRel);
     263      4646893 :         _bt_insertonpg(rel, heapRel, itup_key, insertstate.buf, InvalidBuffer,
     264              :                        stack, itup, insertstate.itemsz, newitemoff,
     265              :                        insertstate.postingoff, false);
     266              :     }
     267              :     else
     268              :     {
     269              :         /* just release the buffer */
     270           36 :         _bt_relbuf(rel, insertstate.buf);
     271              :     }
     272              : 
     273              :     /* be tidy */
     274      4646929 :     if (stack)
     275      4081338 :         _bt_freestack(stack);
     276      4646929 :     pfree(itup_key);
     277              : 
     278      4646929 :     return is_unique;
     279              : }
     280              : 
     281              : /*
     282              :  *  _bt_search_insert() -- _bt_search() wrapper for inserts
     283              :  *
     284              :  * Search the tree for a particular scankey, or more precisely for the first
     285              :  * leaf page it could be on.  Try to make use of the fastpath optimization's
     286              :  * rightmost leaf page cache before actually searching the tree from the root
     287              :  * page, though.
     288              :  *
     289              :  * Return value is a stack of parent-page pointers (though see notes about
     290              :  * fastpath optimization and page splits below).  insertstate->buf is set to
     291              :  * the address of the leaf-page buffer, which is write-locked and pinned in
     292              :  * all cases (if necessary by creating a new empty root page for caller).
     293              :  *
     294              :  * The fastpath optimization avoids most of the work of searching the tree
     295              :  * repeatedly when a single backend inserts successive new tuples on the
     296              :  * rightmost leaf page of an index.  A backend cache of the rightmost leaf
     297              :  * page is maintained within _bt_insertonpg(), and used here.  The cache is
     298              :  * invalidated here when an insert of a non-pivot tuple must take place on a
     299              :  * non-rightmost leaf page.
     300              :  *
     301              :  * The optimization helps with indexes on an auto-incremented field.  It also
     302              :  * helps with indexes on datetime columns, as well as indexes with lots of
     303              :  * NULL values.  (NULLs usually get inserted in the rightmost page for single
     304              :  * column indexes, since they usually get treated as coming after everything
     305              :  * else in the key space.  Individual NULL tuples will generally be placed on
     306              :  * the rightmost leaf page due to the influence of the heap TID column.)
     307              :  *
     308              :  * Note that we avoid applying the optimization when there is insufficient
     309              :  * space on the rightmost page to fit caller's new item.  This is necessary
     310              :  * because we'll need to return a real descent stack when a page split is
     311              :  * expected (actually, caller can cope with a leaf page split that uses a NULL
     312              :  * stack, but that's very slow and so must be avoided).  Note also that the
     313              :  * fastpath optimization acquires the lock on the page conditionally as a way
     314              :  * of reducing extra contention when there are concurrent insertions into the
     315              :  * rightmost page (we give up if we'd have to wait for the lock).  We assume
     316              :  * that it isn't useful to apply the optimization when there is contention,
     317              :  * since each per-backend cache won't stay valid for long.
     318              :  */
     319              : static BTStack
     320      4647267 : _bt_search_insert(Relation rel, Relation heaprel, BTInsertState insertstate)
     321              : {
     322              :     Assert(insertstate->buf == InvalidBuffer);
     323              :     Assert(!insertstate->bounds_valid);
     324              :     Assert(insertstate->postingoff == 0);
     325              : 
     326      4647267 :     if (RelationGetTargetBlock(rel) != InvalidBlockNumber)
     327              :     {
     328              :         /* Simulate a _bt_getbuf() call with conditional locking */
     329        33425 :         insertstate->buf = ReadBuffer(rel, RelationGetTargetBlock(rel));
     330        33425 :         if (_bt_conditionallockbuf(rel, insertstate->buf))
     331              :         {
     332              :             Page        page;
     333              :             BTPageOpaque opaque;
     334              : 
     335        32771 :             _bt_checkpage(rel, insertstate->buf);
     336        32771 :             page = BufferGetPage(insertstate->buf);
     337        32771 :             opaque = BTPageGetOpaque(page);
     338              : 
     339              :             /*
     340              :              * Check if the page is still the rightmost leaf page and has
     341              :              * enough free space to accommodate the new tuple.  Also check
     342              :              * that the insertion scan key is strictly greater than the first
     343              :              * non-pivot tuple on the page.  (Note that we expect itup_key's
     344              :              * scantid to be unset when our caller is a checkingunique
     345              :              * inserter.)
     346              :              */
     347        32771 :             if (P_RIGHTMOST(opaque) &&
     348        32736 :                 P_ISLEAF(opaque) &&
     349        32736 :                 !P_IGNORE(opaque) &&
     350        65259 :                 PageGetFreeSpace(page) > insertstate->itemsz &&
     351        65046 :                 PageGetMaxOffsetNumber(page) >= P_HIKEY &&
     352        32523 :                 _bt_compare(rel, insertstate->itup_key, page, P_HIKEY) > 0)
     353              :             {
     354              :                 /*
     355              :                  * Caller can use the fastpath optimization because cached
     356              :                  * block is still rightmost leaf page, which can fit caller's
     357              :                  * new tuple without splitting.  Keep block in local cache for
     358              :                  * next insert, and have caller use NULL stack.
     359              :                  *
     360              :                  * Note that _bt_insert_parent() has an assertion that catches
     361              :                  * leaf page splits that somehow follow from a fastpath insert
     362              :                  * (it should only be passed a NULL stack when it must deal
     363              :                  * with a concurrent root page split, and never because a NULL
     364              :                  * stack was returned here).
     365              :                  */
     366        32505 :                 return NULL;
     367              :             }
     368              : 
     369              :             /* Page unsuitable for caller, drop lock and pin */
     370          266 :             _bt_relbuf(rel, insertstate->buf);
     371              :         }
     372              :         else
     373              :         {
     374              :             /* Lock unavailable, drop pin */
     375          654 :             ReleaseBuffer(insertstate->buf);
     376              :         }
     377              : 
     378              :         /* Forget block, since cache doesn't appear to be useful */
     379          920 :         RelationSetTargetBlock(rel, InvalidBlockNumber);
     380              :     }
     381              : 
     382              :     /* Cannot use optimization -- descend tree, return proper descent stack */
     383      4614762 :     return _bt_search(rel, heaprel, insertstate->itup_key, &insertstate->buf,
     384              :                       BT_WRITE, true);
     385              : }
     386              : 
     387              : /*
     388              :  *  _bt_check_unique() -- Check for violation of unique index constraint
     389              :  *
     390              :  * Returns InvalidTransactionId if there is no conflict, else an xact ID
     391              :  * we must wait for to see if it commits a conflicting tuple.   If an actual
     392              :  * conflict is detected, no return --- just ereport().  If an xact ID is
     393              :  * returned, and the conflicting tuple still has a speculative insertion in
     394              :  * progress, *speculativeToken is set to non-zero, and the caller can wait for
     395              :  * the verdict on the insertion using SpeculativeInsertionWait().
     396              :  *
     397              :  * However, if checkUnique == UNIQUE_CHECK_PARTIAL, we always return
     398              :  * InvalidTransactionId because we don't want to wait.  In this case we
     399              :  * set *is_unique to false if there is a potential conflict, and the
     400              :  * core code must redo the uniqueness check later.
     401              :  *
     402              :  * As a side-effect, sets state in insertstate that can later be used by
     403              :  * _bt_findinsertloc() to reuse most of the binary search work we do
     404              :  * here.
     405              :  *
     406              :  * This code treats NULLs as equal, unlike the default semantics for unique
     407              :  * indexes.  So do not call here when there are NULL values in scan key and
     408              :  * the index uses the default NULLS DISTINCT mode.
     409              :  */
     410              : static TransactionId
     411      3310394 : _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
     412              :                  IndexUniqueCheck checkUnique, bool *is_unique,
     413              :                  uint32 *speculativeToken)
     414              : {
     415      3310394 :     IndexTuple  itup = insertstate->itup;
     416      3310394 :     IndexTuple  curitup = NULL;
     417      3310394 :     ItemId      curitemid = NULL;
     418      3310394 :     BTScanInsert itup_key = insertstate->itup_key;
     419              :     SnapshotData SnapshotDirty;
     420              :     OffsetNumber offset;
     421              :     OffsetNumber maxoff;
     422              :     Page        page;
     423              :     BTPageOpaque opaque;
     424      3310394 :     Buffer      nbuf = InvalidBuffer;
     425      3310394 :     bool        found = false;
     426      3310394 :     bool        inposting = false;
     427      3310394 :     bool        prevalldead = true;
     428      3310394 :     int         curposti = 0;
     429              : 
     430              :     /* Assume unique until we find a duplicate */
     431      3310394 :     *is_unique = true;
     432              : 
     433      3310394 :     InitDirtySnapshot(SnapshotDirty);
     434              : 
     435      3310394 :     page = BufferGetPage(insertstate->buf);
     436      3310394 :     opaque = BTPageGetOpaque(page);
     437      3310394 :     maxoff = PageGetMaxOffsetNumber(page);
     438              : 
     439              :     /*
     440              :      * Find the first tuple with the same key.
     441              :      *
     442              :      * This also saves the binary search bounds in insertstate.  We use them
     443              :      * in the fastpath below, but also in the _bt_findinsertloc() call later.
     444              :      */
     445              :     Assert(!insertstate->bounds_valid);
     446      3310394 :     offset = _bt_binsrch_insert(rel, insertstate);
     447              : 
     448              :     /*
     449              :      * Scan over all equal tuples, looking for live conflicts.
     450              :      */
     451              :     Assert(!insertstate->bounds_valid || insertstate->low == offset);
     452              :     Assert(!itup_key->anynullkeys);
     453              :     Assert(itup_key->scantid == NULL);
     454              :     for (;;)
     455              :     {
     456              :         /*
     457              :          * Each iteration of the loop processes one heap TID, not one index
     458              :          * tuple.  Current offset number for page isn't usually advanced on
     459              :          * iterations that process heap TIDs from posting list tuples.
     460              :          *
     461              :          * "inposting" state is set when _inside_ a posting list --- not when
     462              :          * we're at the start (or end) of a posting list.  We advance curposti
     463              :          * at the end of the iteration when inside a posting list tuple.  In
     464              :          * general, every loop iteration either advances the page offset or
     465              :          * advances curposti --- an iteration that handles the rightmost/max
     466              :          * heap TID in a posting list finally advances the page offset (and
     467              :          * unsets "inposting").
     468              :          *
     469              :          * Make sure the offset points to an actual index tuple before trying
     470              :          * to examine it...
     471              :          */
     472     11164452 :         if (offset <= maxoff)
     473              :         {
     474              :             /*
     475              :              * Fastpath: In most cases, we can use cached search bounds to
     476              :              * limit our consideration to items that are definitely
     477              :              * duplicates.  This fastpath doesn't apply when the original page
     478              :              * is empty, or when initial offset is past the end of the
     479              :              * original page, which may indicate that we need to examine a
     480              :              * second or subsequent page.
     481              :              *
     482              :              * Note that this optimization allows us to avoid calling
     483              :              * _bt_compare() directly when there are no duplicates, as long as
     484              :              * the offset where the key will go is not at the end of the page.
     485              :              */
     486      9193255 :             if (nbuf == InvalidBuffer && offset == insertstate->stricthigh)
     487              :             {
     488              :                 Assert(insertstate->bounds_valid);
     489              :                 Assert(insertstate->low >= P_FIRSTDATAKEY(opaque));
     490              :                 Assert(insertstate->low <= insertstate->stricthigh);
     491              :                 Assert(_bt_compare(rel, itup_key, page, offset) < 0);
     492      1205978 :                 break;
     493              :             }
     494              : 
     495              :             /*
     496              :              * We can skip items that are already marked killed.
     497              :              *
     498              :              * In the presence of heavy update activity an index may contain
     499              :              * many killed items with the same key; running _bt_compare() on
     500              :              * each killed item gets expensive.  Just advance over killed
     501              :              * items as quickly as we can.  We only apply _bt_compare() when
     502              :              * we get to a non-killed item.  We could reuse the bounds to
     503              :              * avoid _bt_compare() calls for known equal tuples, but it
     504              :              * doesn't seem worth it.
     505              :              */
     506      7987277 :             if (!inposting)
     507      4977766 :                 curitemid = PageGetItemId(page, offset);
     508      7987277 :             if (inposting || !ItemIdIsDead(curitemid))
     509              :             {
     510              :                 ItemPointerData htid;
     511      7693854 :                 bool        all_dead = false;
     512              : 
     513      7693854 :                 if (!inposting)
     514              :                 {
     515              :                     /* Plain tuple, or first TID in posting list tuple */
     516      4684343 :                     if (_bt_compare(rel, itup_key, page, offset) != 0)
     517       116112 :                         break;  /* we're past all the equal tuples */
     518              : 
     519              :                     /* Advanced curitup */
     520      4568231 :                     curitup = (IndexTuple) PageGetItem(page, curitemid);
     521              :                     Assert(!BTreeTupleIsPivot(curitup));
     522              :                 }
     523              : 
     524              :                 /* okay, we gotta fetch the heap tuple using htid ... */
     525      7577742 :                 if (!BTreeTupleIsPosting(curitup))
     526              :                 {
     527              :                     /* ... htid is from simple non-pivot tuple */
     528              :                     Assert(!inposting);
     529      4540288 :                     htid = curitup->t_tid;
     530              :                 }
     531      3037454 :                 else if (!inposting)
     532              :                 {
     533              :                     /* ... htid is first TID in new posting list */
     534        27943 :                     inposting = true;
     535        27943 :                     prevalldead = true;
     536        27943 :                     curposti = 0;
     537        27943 :                     htid = *BTreeTupleGetPostingN(curitup, 0);
     538              :                 }
     539              :                 else
     540              :                 {
     541              :                     /* ... htid is second or subsequent TID in posting list */
     542              :                     Assert(curposti > 0);
     543      3009511 :                     htid = *BTreeTupleGetPostingN(curitup, curposti);
     544              :                 }
     545              : 
     546              :                 /*
     547              :                  * If we are doing a recheck, we expect to find the tuple we
     548              :                  * are rechecking.  It's not a duplicate, but we have to keep
     549              :                  * scanning.
     550              :                  */
     551      7577895 :                 if (checkUnique == UNIQUE_CHECK_EXISTING &&
     552          153 :                     ItemPointerCompare(&htid, &itup->t_tid) == 0)
     553              :                 {
     554           36 :                     found = true;
     555              :                 }
     556              : 
     557              :                 /*
     558              :                  * Check if there's any table tuples for this index entry
     559              :                  * satisfying SnapshotDirty. This is necessary because for AMs
     560              :                  * with optimizations like heap's HOT, we have just a single
     561              :                  * index entry for the entire chain.
     562              :                  */
     563      7577706 :                 else if (table_index_fetch_tuple_check(heapRel, &htid,
     564              :                                                        &SnapshotDirty,
     565              :                                                        &all_dead))
     566              :                 {
     567              :                     TransactionId xwait;
     568              : 
     569              :                     /*
     570              :                      * It is a duplicate. If we are only doing a partial
     571              :                      * check, then don't bother checking if the tuple is being
     572              :                      * updated in another transaction. Just return the fact
     573              :                      * that it is a potential conflict and leave the full
     574              :                      * check till later. Don't invalidate binary search
     575              :                      * bounds.
     576              :                      */
     577          477 :                     if (checkUnique == UNIQUE_CHECK_PARTIAL)
     578              :                     {
     579          142 :                         if (nbuf != InvalidBuffer)
     580            0 :                             _bt_relbuf(rel, nbuf);
     581          142 :                         *is_unique = false;
     582          154 :                         return InvalidTransactionId;
     583              :                     }
     584              : 
     585              :                     /*
     586              :                      * If this tuple is being updated by other transaction
     587              :                      * then we have to wait for its commit/abort.
     588              :                      */
     589          670 :                     xwait = (TransactionIdIsValid(SnapshotDirty.xmin)) ?
     590          335 :                         SnapshotDirty.xmin : SnapshotDirty.xmax;
     591              : 
     592          335 :                     if (TransactionIdIsValid(xwait))
     593              :                     {
     594           12 :                         if (nbuf != InvalidBuffer)
     595            0 :                             _bt_relbuf(rel, nbuf);
     596              :                         /* Tell _bt_doinsert to wait... */
     597           12 :                         *speculativeToken = SnapshotDirty.speculativeToken;
     598              :                         /* Caller releases lock on buf immediately */
     599           12 :                         insertstate->bounds_valid = false;
     600           12 :                         return xwait;
     601              :                     }
     602              : 
     603              :                     /*
     604              :                      * Otherwise we have a definite conflict.  But before
     605              :                      * complaining, look to see if the tuple we want to insert
     606              :                      * is itself now committed dead --- if so, don't complain.
     607              :                      * This is a waste of time in normal scenarios but we must
     608              :                      * do it to support CREATE INDEX CONCURRENTLY.
     609              :                      *
     610              :                      * We must follow HOT-chains here because during
     611              :                      * concurrent index build, we insert the root TID though
     612              :                      * the actual tuple may be somewhere in the HOT-chain.
     613              :                      * While following the chain we might not stop at the
     614              :                      * exact tuple which triggered the insert, but that's OK
     615              :                      * because if we find a live tuple anywhere in this chain,
     616              :                      * we have a unique key conflict.  The other live tuple is
     617              :                      * not part of this chain because it had a different index
     618              :                      * entry.
     619              :                      */
     620          323 :                     htid = itup->t_tid;
     621          323 :                     if (table_index_fetch_tuple_check(heapRel, &htid,
     622              :                                                       SnapshotSelf, NULL))
     623              :                     {
     624              :                         /* Normal case --- it's still live */
     625              :                     }
     626              :                     else
     627              :                     {
     628              :                         /*
     629              :                          * It's been deleted, so no error, and no need to
     630              :                          * continue searching
     631              :                          */
     632            0 :                         break;
     633              :                     }
     634              : 
     635              :                     /*
     636              :                      * Check for a conflict-in as we would if we were going to
     637              :                      * write to this page.  We aren't actually going to write,
     638              :                      * but we want a chance to report SSI conflicts that would
     639              :                      * otherwise be masked by this unique constraint
     640              :                      * violation.
     641              :                      */
     642          323 :                     CheckForSerializableConflictIn(rel, NULL, BufferGetBlockNumber(insertstate->buf));
     643              : 
     644              :                     /*
     645              :                      * This is a definite conflict.  Break the tuple down into
     646              :                      * datums and report the error.  But first, make sure we
     647              :                      * release the buffer locks we're holding ---
     648              :                      * BuildIndexValueDescription could make catalog accesses,
     649              :                      * which in the worst case might touch this same index and
     650              :                      * cause deadlocks.
     651              :                      */
     652          319 :                     if (nbuf != InvalidBuffer)
     653            0 :                         _bt_relbuf(rel, nbuf);
     654          319 :                     _bt_relbuf(rel, insertstate->buf);
     655          319 :                     insertstate->buf = InvalidBuffer;
     656          319 :                     insertstate->bounds_valid = false;
     657              : 
     658              :                     {
     659              :                         Datum       values[INDEX_MAX_KEYS];
     660              :                         bool        isnull[INDEX_MAX_KEYS];
     661              :                         char       *key_desc;
     662              : 
     663          319 :                         index_deform_tuple(itup, RelationGetDescr(rel),
     664              :                                            values, isnull);
     665              : 
     666          319 :                         key_desc = BuildIndexValueDescription(rel, values,
     667              :                                                               isnull);
     668              : 
     669          319 :                         ereport(ERROR,
     670              :                                 (errcode(ERRCODE_UNIQUE_VIOLATION),
     671              :                                  errmsg("duplicate key value violates unique constraint \"%s\"",
     672              :                                         RelationGetRelationName(rel)),
     673              :                                  key_desc ? errdetail("Key %s already exists.",
     674              :                                                       key_desc) : 0,
     675              :                                  errtableconstraint(heapRel,
     676              :                                                     RelationGetRelationName(rel))));
     677              :                     }
     678              :                 }
     679      7577229 :                 else if (all_dead && (!inposting ||
     680        18607 :                                       (prevalldead &&
     681        18607 :                                        curposti == BTreeTupleGetNPosting(curitup) - 1)))
     682              :                 {
     683              :                     /*
     684              :                      * The conflicting tuple (or all HOT chains pointed to by
     685              :                      * all posting list TIDs) is dead to everyone, so try to
     686              :                      * mark the index entry killed. It's ok if we're not
     687              :                      * allowed to, this isn't required for correctness.
     688              :                      */
     689              :                     Buffer      buf;
     690              : 
     691              :                     /* Be sure to operate on the proper buffer */
     692        54062 :                     if (nbuf != InvalidBuffer)
     693            3 :                         buf = nbuf;
     694              :                     else
     695        54059 :                         buf = insertstate->buf;
     696              : 
     697              :                     /*
     698              :                      * Use the hint bit infrastructure to check if we can
     699              :                      * update the page while just holding a share lock.
     700              :                      *
     701              :                      * Can't use BufferSetHintBits16() here as we update two
     702              :                      * different locations.
     703              :                      */
     704        54062 :                     if (BufferBeginSetHintBits(buf))
     705              :                     {
     706        54062 :                         ItemIdMarkDead(curitemid);
     707        54062 :                         opaque->btpo_flags |= BTP_HAS_GARBAGE;
     708        54062 :                         BufferFinishSetHintBits(buf, true, true);
     709              :                     }
     710              :                 }
     711              : 
     712              :                 /*
     713              :                  * Remember if posting list tuple has even a single HOT chain
     714              :                  * whose members are not all dead
     715              :                  */
     716      7577265 :                 if (!all_dead && inposting)
     717      3018827 :                     prevalldead = false;
     718              :             }
     719              :         }
     720              : 
     721      9841885 :         if (inposting && curposti < BTreeTupleGetNPosting(curitup) - 1)
     722              :         {
     723              :             /* Advance to next TID in same posting list */
     724      3009511 :             curposti++;
     725      3009511 :             continue;
     726              :         }
     727      6832374 :         else if (offset < maxoff)
     728              :         {
     729              :             /* Advance to next tuple */
     730      4837851 :             curposti = 0;
     731      4837851 :             inposting = false;
     732      4837851 :             offset = OffsetNumberNext(offset);
     733              :         }
     734              :         else
     735              :         {
     736              :             int         highkeycmp;
     737              : 
     738              :             /* If scankey == hikey we gotta check the next page too */
     739      1994523 :             if (P_RIGHTMOST(opaque))
     740      1921647 :                 break;
     741        72876 :             highkeycmp = _bt_compare(rel, itup_key, page, P_HIKEY);
     742              :             Assert(highkeycmp <= 0);
     743        72876 :             if (highkeycmp != 0)
     744        66180 :                 break;
     745              :             /* Advance to next non-dead page --- there must be one */
     746              :             for (;;)
     747            0 :             {
     748         6696 :                 BlockNumber nblkno = opaque->btpo_next;
     749              : 
     750         6696 :                 nbuf = _bt_relandgetbuf(rel, nbuf, nblkno, BT_READ);
     751         6696 :                 page = BufferGetPage(nbuf);
     752         6696 :                 opaque = BTPageGetOpaque(page);
     753         6696 :                 if (!P_IGNORE(opaque))
     754         6696 :                     break;
     755            0 :                 if (P_RIGHTMOST(opaque))
     756            0 :                     elog(ERROR, "fell off the end of index \"%s\"",
     757              :                          RelationGetRelationName(rel));
     758              :             }
     759              :             /* Will also advance to next tuple */
     760         6696 :             curposti = 0;
     761         6696 :             inposting = false;
     762         6696 :             maxoff = PageGetMaxOffsetNumber(page);
     763         6696 :             offset = P_FIRSTDATAKEY(opaque);
     764              :             /* Don't invalidate binary search bounds */
     765              :         }
     766              :     }
     767              : 
     768              :     /*
     769              :      * If we are doing a recheck then we should have found the tuple we are
     770              :      * checking.  Otherwise there's something very wrong --- probably, the
     771              :      * index is on a non-immutable expression.
     772              :      */
     773      3309917 :     if (checkUnique == UNIQUE_CHECK_EXISTING && !found)
     774            0 :         ereport(ERROR,
     775              :                 (errcode(ERRCODE_INTERNAL_ERROR),
     776              :                  errmsg("failed to re-find tuple within index \"%s\"",
     777              :                         RelationGetRelationName(rel)),
     778              :                  errhint("This may be because of a non-immutable index expression."),
     779              :                  errtableconstraint(heapRel,
     780              :                                     RelationGetRelationName(rel))));
     781              : 
     782      3309917 :     if (nbuf != InvalidBuffer)
     783         3840 :         _bt_relbuf(rel, nbuf);
     784              : 
     785      3309917 :     return InvalidTransactionId;
     786              : }
     787              : 
     788              : 
     789              : /*
     790              :  *  _bt_findinsertloc() -- Finds an insert location for a tuple
     791              :  *
     792              :  *      On entry, insertstate buffer contains the page the new tuple belongs
     793              :  *      on.  It is exclusive-locked and pinned by the caller.
     794              :  *
     795              :  *      If 'checkingunique' is true, the buffer on entry is the first page
     796              :  *      that contains duplicates of the new key.  If there are duplicates on
     797              :  *      multiple pages, the correct insertion position might be some page to
     798              :  *      the right, rather than the first page.  In that case, this function
     799              :  *      moves right to the correct target page.
     800              :  *
     801              :  *      (In a !heapkeyspace index, there can be multiple pages with the same
     802              :  *      high key, where the new tuple could legitimately be placed on.  In
     803              :  *      that case, the caller passes the first page containing duplicates,
     804              :  *      just like when checkingunique=true.  If that page doesn't have enough
     805              :  *      room for the new tuple, this function moves right, trying to find a
     806              :  *      legal page that does.)
     807              :  *
     808              :  *      If 'indexUnchanged' is true, this is for an UPDATE that didn't
     809              :  *      logically change the indexed value, but must nevertheless have a new
     810              :  *      entry to point to a successor version.  This hint from the executor
     811              :  *      will influence our behavior when the page might have to be split and
     812              :  *      we must consider our options.  Bottom-up index deletion can avoid
     813              :  *      pathological version-driven page splits, but we only want to go to the
     814              :  *      trouble of trying it when we already have moderate confidence that
     815              :  *      it's appropriate.  The hint should not significantly affect our
     816              :  *      behavior over time unless practically all inserts on to the leaf page
     817              :  *      get the hint.
     818              :  *
     819              :  *      On exit, insertstate buffer contains the chosen insertion page, and
     820              :  *      the offset within that page is returned.  If _bt_findinsertloc needed
     821              :  *      to move right, the lock and pin on the original page are released, and
     822              :  *      the new buffer is exclusively locked and pinned instead.
     823              :  *
     824              :  *      If insertstate contains cached binary search bounds, we will take
     825              :  *      advantage of them.  This avoids repeating comparisons that we made in
     826              :  *      _bt_check_unique() already.
     827              :  */
     828              : static OffsetNumber
     829      4646893 : _bt_findinsertloc(Relation rel,
     830              :                   BTInsertState insertstate,
     831              :                   bool checkingunique,
     832              :                   bool indexUnchanged,
     833              :                   BTStack stack,
     834              :                   Relation heapRel)
     835              : {
     836      4646893 :     BTScanInsert itup_key = insertstate->itup_key;
     837      4646893 :     Page        page = BufferGetPage(insertstate->buf);
     838              :     BTPageOpaque opaque;
     839              :     OffsetNumber newitemoff;
     840              : 
     841      4646893 :     opaque = BTPageGetOpaque(page);
     842              : 
     843              :     /* Check 1/3 of a page restriction */
     844      4646893 :     if (unlikely(insertstate->itemsz > BTMaxItemSize))
     845            0 :         _bt_check_third_page(rel, heapRel, itup_key->heapkeyspace, page,
     846              :                              insertstate->itup);
     847              : 
     848              :     Assert(P_ISLEAF(opaque) && !P_INCOMPLETE_SPLIT(opaque));
     849              :     Assert(!insertstate->bounds_valid || checkingunique);
     850              :     Assert(!itup_key->heapkeyspace || itup_key->scantid != NULL);
     851              :     Assert(itup_key->heapkeyspace || itup_key->scantid == NULL);
     852              :     Assert(!itup_key->allequalimage || itup_key->heapkeyspace);
     853              : 
     854      4646893 :     if (itup_key->heapkeyspace)
     855              :     {
     856              :         /* Keep track of whether checkingunique duplicate seen */
     857      4646893 :         bool        uniquedup = indexUnchanged;
     858              : 
     859              :         /*
     860              :          * If we're inserting into a unique index, we may have to walk right
     861              :          * through leaf pages to find the one leaf page that we must insert on
     862              :          * to.
     863              :          *
     864              :          * This is needed for checkingunique callers because a scantid was not
     865              :          * used when we called _bt_search().  scantid can only be set after
     866              :          * _bt_check_unique() has checked for duplicates.  The buffer
     867              :          * initially stored in insertstate->buf has the page where the first
     868              :          * duplicate key might be found, which isn't always the page that new
     869              :          * tuple belongs on.  The heap TID attribute for new tuple (scantid)
     870              :          * could force us to insert on a sibling page, though that should be
     871              :          * very rare in practice.
     872              :          */
     873      4646893 :         if (checkingunique)
     874              :         {
     875      3310020 :             if (insertstate->low < insertstate->stricthigh)
     876              :             {
     877              :                 /* Encountered a duplicate in _bt_check_unique() */
     878              :                 Assert(insertstate->bounds_valid);
     879       254850 :                 uniquedup = true;
     880              :             }
     881              : 
     882              :             for (;;)
     883              :             {
     884              :                 /*
     885              :                  * Does the new tuple belong on this page?
     886              :                  *
     887              :                  * The earlier _bt_check_unique() call may well have
     888              :                  * established a strict upper bound on the offset for the new
     889              :                  * item.  If it's not the last item of the page (i.e. if there
     890              :                  * is at least one tuple on the page that goes after the tuple
     891              :                  * we're inserting) then we know that the tuple belongs on
     892              :                  * this page.  We can skip the high key check.
     893              :                  */
     894      3316716 :                 if (insertstate->bounds_valid &&
     895      6607698 :                     insertstate->low <= insertstate->stricthigh &&
     896      3303849 :                     insertstate->stricthigh <= PageGetMaxOffsetNumber(page))
     897      1307236 :                     break;
     898              : 
     899              :                 /* Test '<=', not '!=', since scantid is set now */
     900      2090819 :                 if (P_RIGHTMOST(opaque) ||
     901        81339 :                     _bt_compare(rel, itup_key, page, P_HIKEY) <= 0)
     902              :                     break;
     903              : 
     904         6696 :                 _bt_stepright(rel, heapRel, insertstate, stack);
     905              :                 /* Update local state after stepping right */
     906         6696 :                 page = BufferGetPage(insertstate->buf);
     907         6696 :                 opaque = BTPageGetOpaque(page);
     908              :                 /* Assume duplicates (if checkingunique) */
     909         6696 :                 uniquedup = true;
     910              :             }
     911              :         }
     912              : 
     913              :         /*
     914              :          * If the target page cannot fit newitem, try to avoid splitting the
     915              :          * page on insert by performing deletion or deduplication now
     916              :          */
     917      4646893 :         if (PageGetFreeSpace(page) < insertstate->itemsz)
     918        33223 :             _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, false,
     919              :                                          checkingunique, uniquedup,
     920              :                                          indexUnchanged);
     921              :     }
     922              :     else
     923              :     {
     924              :         /*----------
     925              :          * This is a !heapkeyspace (version 2 or 3) index.  The current page
     926              :          * is the first page that we could insert the new tuple to, but there
     927              :          * may be other pages to the right that we could opt to use instead.
     928              :          *
     929              :          * If the new key is equal to one or more existing keys, we can
     930              :          * legitimately place it anywhere in the series of equal keys.  In
     931              :          * fact, if the new key is equal to the page's "high key" we can place
     932              :          * it on the next page.  If it is equal to the high key, and there's
     933              :          * not room to insert the new tuple on the current page without
     934              :          * splitting, then we move right hoping to find more free space and
     935              :          * avoid a split.
     936              :          *
     937              :          * Keep scanning right until we
     938              :          *      (a) find a page with enough free space,
     939              :          *      (b) reach the last page where the tuple can legally go, or
     940              :          *      (c) get tired of searching.
     941              :          * (c) is not flippant; it is important because if there are many
     942              :          * pages' worth of equal keys, it's better to split one of the early
     943              :          * pages than to scan all the way to the end of the run of equal keys
     944              :          * on every insert.  We implement "get tired" as a random choice,
     945              :          * since stopping after scanning a fixed number of pages wouldn't work
     946              :          * well (we'd never reach the right-hand side of previously split
     947              :          * pages).  The probability of moving right is set at 0.99, which may
     948              :          * seem too high to change the behavior much, but it does an excellent
     949              :          * job of preventing O(N^2) behavior with many equal keys.
     950              :          *----------
     951              :          */
     952            0 :         while (PageGetFreeSpace(page) < insertstate->itemsz)
     953              :         {
     954              :             /*
     955              :              * Before considering moving right, see if we can obtain enough
     956              :              * space by erasing LP_DEAD items
     957              :              */
     958            0 :             if (P_HAS_GARBAGE(opaque))
     959              :             {
     960              :                 /* Perform simple deletion */
     961            0 :                 _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, true,
     962              :                                              false, false, false);
     963              : 
     964            0 :                 if (PageGetFreeSpace(page) >= insertstate->itemsz)
     965            0 :                     break;      /* OK, now we have enough space */
     966              :             }
     967              : 
     968              :             /*
     969              :              * Nope, so check conditions (b) and (c) enumerated above
     970              :              *
     971              :              * The earlier _bt_check_unique() call may well have established a
     972              :              * strict upper bound on the offset for the new item.  If it's not
     973              :              * the last item of the page (i.e. if there is at least one tuple
     974              :              * on the page that's greater than the tuple we're inserting to)
     975              :              * then we know that the tuple belongs on this page.  We can skip
     976              :              * the high key check.
     977              :              */
     978            0 :             if (insertstate->bounds_valid &&
     979            0 :                 insertstate->low <= insertstate->stricthigh &&
     980            0 :                 insertstate->stricthigh <= PageGetMaxOffsetNumber(page))
     981            0 :                 break;
     982              : 
     983            0 :             if (P_RIGHTMOST(opaque) ||
     984            0 :                 _bt_compare(rel, itup_key, page, P_HIKEY) != 0 ||
     985            0 :                 pg_prng_uint32(&pg_global_prng_state) <= (PG_UINT32_MAX / 100))
     986              :                 break;
     987              : 
     988            0 :             _bt_stepright(rel, heapRel, insertstate, stack);
     989              :             /* Update local state after stepping right */
     990            0 :             page = BufferGetPage(insertstate->buf);
     991            0 :             opaque = BTPageGetOpaque(page);
     992              :         }
     993              :     }
     994              : 
     995              :     /*
     996              :      * We should now be on the correct page.  Find the offset within the page
     997              :      * for the new tuple. (Possibly reusing earlier search bounds.)
     998              :      */
     999              :     Assert(P_RIGHTMOST(opaque) ||
    1000              :            _bt_compare(rel, itup_key, page, P_HIKEY) <= 0);
    1001              : 
    1002      4646893 :     newitemoff = _bt_binsrch_insert(rel, insertstate);
    1003              : 
    1004      4646893 :     if (insertstate->postingoff == -1)
    1005              :     {
    1006              :         /*
    1007              :          * There is an overlapping posting list tuple with its LP_DEAD bit
    1008              :          * set.  We don't want to unnecessarily unset its LP_DEAD bit while
    1009              :          * performing a posting list split, so perform simple index tuple
    1010              :          * deletion early.
    1011              :          */
    1012            3 :         _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, true,
    1013              :                                      false, false, false);
    1014              : 
    1015              :         /*
    1016              :          * Do new binary search.  New insert location cannot overlap with any
    1017              :          * posting list now.
    1018              :          */
    1019              :         Assert(!insertstate->bounds_valid);
    1020            3 :         insertstate->postingoff = 0;
    1021            3 :         newitemoff = _bt_binsrch_insert(rel, insertstate);
    1022              :         Assert(insertstate->postingoff == 0);
    1023              :     }
    1024              : 
    1025      4646893 :     return newitemoff;
    1026              : }
    1027              : 
    1028              : /*
    1029              :  * Step right to next non-dead page, during insertion.
    1030              :  *
    1031              :  * This is a bit more complicated than moving right in a search.  We must
    1032              :  * write-lock the target page before releasing write lock on current page;
    1033              :  * else someone else's _bt_check_unique scan could fail to see our insertion.
    1034              :  * Write locks on intermediate dead pages won't do because we don't know when
    1035              :  * they will get de-linked from the tree.
    1036              :  *
    1037              :  * This is more aggressive than it needs to be for non-unique !heapkeyspace
    1038              :  * indexes.
    1039              :  */
    1040              : static void
    1041         6696 : _bt_stepright(Relation rel, Relation heaprel, BTInsertState insertstate,
    1042              :               BTStack stack)
    1043              : {
    1044              :     Page        page;
    1045              :     BTPageOpaque opaque;
    1046              :     Buffer      rbuf;
    1047              :     BlockNumber rblkno;
    1048              : 
    1049              :     Assert(heaprel != NULL);
    1050         6696 :     page = BufferGetPage(insertstate->buf);
    1051         6696 :     opaque = BTPageGetOpaque(page);
    1052              : 
    1053         6696 :     rbuf = InvalidBuffer;
    1054         6696 :     rblkno = opaque->btpo_next;
    1055              :     for (;;)
    1056              :     {
    1057         6696 :         rbuf = _bt_relandgetbuf(rel, rbuf, rblkno, BT_WRITE);
    1058         6696 :         page = BufferGetPage(rbuf);
    1059         6696 :         opaque = BTPageGetOpaque(page);
    1060              : 
    1061              :         /*
    1062              :          * If this page was incompletely split, finish the split now.  We do
    1063              :          * this while holding a lock on the left sibling, which is not good
    1064              :          * because finishing the split could be a fairly lengthy operation.
    1065              :          * But this should happen very seldom.
    1066              :          */
    1067         6696 :         if (P_INCOMPLETE_SPLIT(opaque))
    1068              :         {
    1069            0 :             _bt_finish_split(rel, heaprel, rbuf, stack);
    1070            0 :             rbuf = InvalidBuffer;
    1071            0 :             continue;
    1072              :         }
    1073              : 
    1074         6696 :         if (!P_IGNORE(opaque))
    1075         6696 :             break;
    1076            0 :         if (P_RIGHTMOST(opaque))
    1077            0 :             elog(ERROR, "fell off the end of index \"%s\"",
    1078              :                  RelationGetRelationName(rel));
    1079              : 
    1080            0 :         rblkno = opaque->btpo_next;
    1081              :     }
    1082              :     /* rbuf locked; unlock buf, update state for caller */
    1083         6696 :     _bt_relbuf(rel, insertstate->buf);
    1084         6696 :     insertstate->buf = rbuf;
    1085         6696 :     insertstate->bounds_valid = false;
    1086         6696 : }
    1087              : 
    1088              : /*----------
    1089              :  *  _bt_insertonpg() -- Insert a tuple on a particular page in the index.
    1090              :  *
    1091              :  *      This recursive procedure does the following things:
    1092              :  *
    1093              :  *          +  if postingoff != 0, splits existing posting list tuple
    1094              :  *             (since it overlaps with new 'itup' tuple).
    1095              :  *          +  if necessary, splits the target page, using 'itup_key' for
    1096              :  *             suffix truncation on leaf pages (caller passes NULL for
    1097              :  *             non-leaf pages).
    1098              :  *          +  inserts the new tuple (might be split from posting list).
    1099              :  *          +  if the page was split, pops the parent stack, and finds the
    1100              :  *             right place to insert the new child pointer (by walking
    1101              :  *             right using information stored in the parent stack).
    1102              :  *          +  invokes itself with the appropriate tuple for the right
    1103              :  *             child page on the parent.
    1104              :  *          +  updates the metapage if a true root or fast root is split.
    1105              :  *
    1106              :  *      On entry, we must have the correct buffer in which to do the
    1107              :  *      insertion, and the buffer must be pinned and write-locked.  On return,
    1108              :  *      we will have dropped both the pin and the lock on the buffer.
    1109              :  *
    1110              :  *      This routine only performs retail tuple insertions.  'itup' should
    1111              :  *      always be either a non-highkey leaf item, or a downlink (new high
    1112              :  *      key items are created indirectly, when a page is split).  When
    1113              :  *      inserting to a non-leaf page, 'cbuf' is the left-sibling of the page
    1114              :  *      we're inserting the downlink for.  This function will clear the
    1115              :  *      INCOMPLETE_SPLIT flag on it, and release the buffer.
    1116              :  *----------
    1117              :  */
    1118              : static void
    1119      4660392 : _bt_insertonpg(Relation rel,
    1120              :                Relation heaprel,
    1121              :                BTScanInsert itup_key,
    1122              :                Buffer buf,
    1123              :                Buffer cbuf,
    1124              :                BTStack stack,
    1125              :                IndexTuple itup,
    1126              :                Size itemsz,
    1127              :                OffsetNumber newitemoff,
    1128              :                int postingoff,
    1129              :                bool split_only_page)
    1130              : {
    1131              :     Page        page;
    1132              :     BTPageOpaque opaque;
    1133              :     bool        isleaf,
    1134              :                 isroot,
    1135              :                 isrightmost,
    1136              :                 isonly;
    1137      4660392 :     IndexTuple  oposting = NULL;
    1138      4660392 :     IndexTuple  origitup = NULL;
    1139      4660392 :     IndexTuple  nposting = NULL;
    1140              :     XLogRecPtr  recptr;
    1141              : 
    1142      4660392 :     page = BufferGetPage(buf);
    1143      4660392 :     opaque = BTPageGetOpaque(page);
    1144      4660392 :     isleaf = P_ISLEAF(opaque);
    1145      4660392 :     isroot = P_ISROOT(opaque);
    1146      4660392 :     isrightmost = P_RIGHTMOST(opaque);
    1147      4660392 :     isonly = P_LEFTMOST(opaque) && P_RIGHTMOST(opaque);
    1148              : 
    1149              :     /* child buffer must be given iff inserting on an internal page */
    1150              :     Assert(isleaf == !BufferIsValid(cbuf));
    1151              :     /* tuple must have appropriate number of attributes */
    1152              :     Assert(!isleaf ||
    1153              :            BTreeTupleGetNAtts(itup, rel) ==
    1154              :            IndexRelationGetNumberOfAttributes(rel));
    1155              :     Assert(isleaf ||
    1156              :            BTreeTupleGetNAtts(itup, rel) <=
    1157              :            IndexRelationGetNumberOfKeyAttributes(rel));
    1158              :     Assert(!BTreeTupleIsPosting(itup));
    1159              :     Assert(MAXALIGN(IndexTupleSize(itup)) == itemsz);
    1160              :     /* Caller must always finish incomplete split for us */
    1161              :     Assert(!P_INCOMPLETE_SPLIT(opaque));
    1162              : 
    1163              :     /*
    1164              :      * Every internal page should have exactly one negative infinity item at
    1165              :      * all times.  Only _bt_split() and _bt_newlevel() should add items that
    1166              :      * become negative infinity items through truncation, since they're the
    1167              :      * only routines that allocate new internal pages.
    1168              :      */
    1169              :     Assert(isleaf || newitemoff > P_FIRSTDATAKEY(opaque));
    1170              : 
    1171              :     /*
    1172              :      * Do we need to split an existing posting list item?
    1173              :      */
    1174      4660392 :     if (postingoff != 0)
    1175              :     {
    1176        21975 :         ItemId      itemid = PageGetItemId(page, newitemoff);
    1177              : 
    1178              :         /*
    1179              :          * The new tuple is a duplicate with a heap TID that falls inside the
    1180              :          * range of an existing posting list tuple on a leaf page.  Prepare to
    1181              :          * split an existing posting list.  Overwriting the posting list with
    1182              :          * its post-split version is treated as an extra step in either the
    1183              :          * insert or page split critical section.
    1184              :          */
    1185              :         Assert(isleaf && itup_key->heapkeyspace && itup_key->allequalimage);
    1186        21975 :         oposting = (IndexTuple) PageGetItem(page, itemid);
    1187              : 
    1188              :         /*
    1189              :          * postingoff value comes from earlier call to _bt_binsrch_posting().
    1190              :          * Its binary search might think that a plain tuple must be a posting
    1191              :          * list tuple that needs to be split.  This can happen with corruption
    1192              :          * involving an existing plain tuple that is a duplicate of the new
    1193              :          * item, up to and including its table TID.  Check for that here in
    1194              :          * passing.
    1195              :          *
    1196              :          * Also verify that our caller has made sure that the existing posting
    1197              :          * list tuple does not have its LP_DEAD bit set.
    1198              :          */
    1199        21975 :         if (!BTreeTupleIsPosting(oposting) || ItemIdIsDead(itemid))
    1200            0 :             ereport(ERROR,
    1201              :                     (errcode(ERRCODE_INDEX_CORRUPTED),
    1202              :                      errmsg_internal("table tid from new index tuple (%u,%u) overlaps with invalid duplicate tuple at offset %u of block %u in index \"%s\"",
    1203              :                                      ItemPointerGetBlockNumber(&itup->t_tid),
    1204              :                                      ItemPointerGetOffsetNumber(&itup->t_tid),
    1205              :                                      newitemoff, BufferGetBlockNumber(buf),
    1206              :                                      RelationGetRelationName(rel))));
    1207              : 
    1208              :         /* use a mutable copy of itup as our itup from here on */
    1209        21975 :         origitup = itup;
    1210        21975 :         itup = CopyIndexTuple(origitup);
    1211        21975 :         nposting = _bt_swap_posting(itup, oposting, postingoff);
    1212              :         /* itup now contains rightmost/max TID from oposting */
    1213              : 
    1214              :         /* Alter offset so that newitem goes after posting list */
    1215        21975 :         newitemoff = OffsetNumberNext(newitemoff);
    1216              :     }
    1217              : 
    1218              :     /*
    1219              :      * Do we need to split the page to fit the item on it?
    1220              :      *
    1221              :      * Note: PageGetFreeSpace() subtracts sizeof(ItemIdData) from its result,
    1222              :      * so this comparison is correct even though we appear to be accounting
    1223              :      * only for the item and not for its line pointer.
    1224              :      */
    1225      4660392 :     if (PageGetFreeSpace(page) < itemsz)
    1226              :     {
    1227              :         Buffer      rbuf;
    1228              : 
    1229              :         Assert(!split_only_page);
    1230              : 
    1231              :         /* split the buffer into left and right halves */
    1232        14271 :         rbuf = _bt_split(rel, heaprel, itup_key, buf, cbuf, newitemoff, itemsz,
    1233              :                          itup, origitup, nposting, postingoff);
    1234        14271 :         PredicateLockPageSplit(rel,
    1235              :                                BufferGetBlockNumber(buf),
    1236              :                                BufferGetBlockNumber(rbuf));
    1237              : 
    1238              :         /*----------
    1239              :          * By here,
    1240              :          *
    1241              :          *      +  our target page has been split;
    1242              :          *      +  the original tuple has been inserted;
    1243              :          *      +  we have write locks on both the old (left half)
    1244              :          *         and new (right half) buffers, after the split; and
    1245              :          *      +  we know the key we want to insert into the parent
    1246              :          *         (it's the "high key" on the left child page).
    1247              :          *
    1248              :          * We're ready to do the parent insertion.  We need to hold onto the
    1249              :          * locks for the child pages until we locate the parent, but we can
    1250              :          * at least release the lock on the right child before doing the
    1251              :          * actual insertion.  The lock on the left child will be released
    1252              :          * last of all by parent insertion, where it is the 'cbuf' of parent
    1253              :          * page.
    1254              :          *----------
    1255              :          */
    1256              : #ifdef USE_INJECTION_POINTS
    1257        14271 :         if (P_ISLEAF(opaque))
    1258        14063 :             INJECTION_POINT("nbtree-leave-leaf-split-incomplete", NULL);
    1259              :         else
    1260          208 :             INJECTION_POINT("nbtree-leave-internal-split-incomplete", NULL);
    1261              : #endif
    1262              : 
    1263        14271 :         _bt_insert_parent(rel, heaprel, buf, rbuf, stack, isroot, isonly);
    1264              :     }
    1265              :     else
    1266              :     {
    1267      4646121 :         Buffer      metabuf = InvalidBuffer;
    1268      4646121 :         Page        metapg = NULL;
    1269      4646121 :         BTMetaPageData *metad = NULL;
    1270              :         BlockNumber blockcache;
    1271              : 
    1272              :         /*
    1273              :          * If we are doing this insert because we split a page that was the
    1274              :          * only one on its tree level, but was not the root, it may have been
    1275              :          * the "fast root".  We need to ensure that the fast root link points
    1276              :          * at or above the current page.  We can safely acquire a lock on the
    1277              :          * metapage here --- see comments for _bt_newlevel().
    1278              :          */
    1279      4646121 :         if (unlikely(split_only_page))
    1280              :         {
    1281              :             Assert(!isleaf);
    1282              :             Assert(BufferIsValid(cbuf));
    1283              : 
    1284           16 :             metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
    1285           16 :             metapg = BufferGetPage(metabuf);
    1286           16 :             metad = BTPageGetMeta(metapg);
    1287              : 
    1288           16 :             if (metad->btm_fastlevel >= opaque->btpo_level)
    1289              :             {
    1290              :                 /* no update wanted */
    1291            0 :                 _bt_relbuf(rel, metabuf);
    1292            0 :                 metabuf = InvalidBuffer;
    1293              :             }
    1294              :         }
    1295              : 
    1296              :         /* Do the update.  No ereport(ERROR) until changes are logged */
    1297      4646121 :         START_CRIT_SECTION();
    1298              : 
    1299      4646121 :         if (postingoff != 0)
    1300        21930 :             memcpy(oposting, nposting, MAXALIGN(IndexTupleSize(nposting)));
    1301              : 
    1302      4646121 :         if (PageAddItem(page, itup, itemsz, newitemoff, false, false) == InvalidOffsetNumber)
    1303            0 :             elog(PANIC, "failed to add new item to block %u in index \"%s\"",
    1304              :                  BufferGetBlockNumber(buf), RelationGetRelationName(rel));
    1305              : 
    1306      4646121 :         MarkBufferDirty(buf);
    1307              : 
    1308      4646121 :         if (BufferIsValid(metabuf))
    1309              :         {
    1310              :             /* upgrade meta-page if needed */
    1311           16 :             if (metad->btm_version < BTREE_NOVAC_VERSION)
    1312            0 :                 _bt_upgrademetapage(metapg);
    1313           16 :             metad->btm_fastroot = BufferGetBlockNumber(buf);
    1314           16 :             metad->btm_fastlevel = opaque->btpo_level;
    1315           16 :             MarkBufferDirty(metabuf);
    1316              :         }
    1317              : 
    1318              :         /*
    1319              :          * Clear INCOMPLETE_SPLIT flag on child if inserting the new item
    1320              :          * finishes a split
    1321              :          */
    1322      4646121 :         if (!isleaf)
    1323              :         {
    1324        13291 :             Page        cpage = BufferGetPage(cbuf);
    1325        13291 :             BTPageOpaque cpageop = BTPageGetOpaque(cpage);
    1326              : 
    1327              :             Assert(P_INCOMPLETE_SPLIT(cpageop));
    1328        13291 :             cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
    1329        13291 :             MarkBufferDirty(cbuf);
    1330              :         }
    1331              : 
    1332              :         /* XLOG stuff */
    1333      4646121 :         if (RelationNeedsWAL(rel))
    1334      4380015 :         {
    1335              :             xl_btree_insert xlrec;
    1336              :             xl_btree_metadata xlmeta;
    1337              :             uint8       xlinfo;
    1338              :             uint16      upostingoff;
    1339              : 
    1340      4380015 :             xlrec.offnum = newitemoff;
    1341              : 
    1342      4380015 :             XLogBeginInsert();
    1343      4380015 :             XLogRegisterData(&xlrec, SizeOfBtreeInsert);
    1344              : 
    1345      4380015 :             if (isleaf && postingoff == 0)
    1346              :             {
    1347              :                 /* Simple leaf insert */
    1348      4345450 :                 xlinfo = XLOG_BTREE_INSERT_LEAF;
    1349              :             }
    1350        34565 :             else if (postingoff != 0)
    1351              :             {
    1352              :                 /*
    1353              :                  * Leaf insert with posting list split.  Must include
    1354              :                  * postingoff field before newitem/orignewitem.
    1355              :                  */
    1356              :                 Assert(isleaf);
    1357        21930 :                 xlinfo = XLOG_BTREE_INSERT_POST;
    1358              :             }
    1359              :             else
    1360              :             {
    1361              :                 /* Internal page insert, which finishes a split on cbuf */
    1362        12635 :                 xlinfo = XLOG_BTREE_INSERT_UPPER;
    1363        12635 :                 XLogRegisterBuffer(1, cbuf, REGBUF_STANDARD);
    1364              : 
    1365        12635 :                 if (BufferIsValid(metabuf))
    1366              :                 {
    1367              :                     /* Actually, it's an internal page insert + meta update */
    1368           16 :                     xlinfo = XLOG_BTREE_INSERT_META;
    1369              : 
    1370              :                     Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
    1371           16 :                     xlmeta.version = metad->btm_version;
    1372           16 :                     xlmeta.root = metad->btm_root;
    1373           16 :                     xlmeta.level = metad->btm_level;
    1374           16 :                     xlmeta.fastroot = metad->btm_fastroot;
    1375           16 :                     xlmeta.fastlevel = metad->btm_fastlevel;
    1376           16 :                     xlmeta.last_cleanup_num_delpages = metad->btm_last_cleanup_num_delpages;
    1377           16 :                     xlmeta.allequalimage = metad->btm_allequalimage;
    1378              : 
    1379           16 :                     XLogRegisterBuffer(2, metabuf,
    1380              :                                        REGBUF_WILL_INIT | REGBUF_STANDARD);
    1381           16 :                     XLogRegisterBufData(2, &xlmeta,
    1382              :                                         sizeof(xl_btree_metadata));
    1383              :                 }
    1384              :             }
    1385              : 
    1386      4380015 :             XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
    1387      4380015 :             if (postingoff == 0)
    1388              :             {
    1389              :                 /* Just log itup from caller */
    1390      4358085 :                 XLogRegisterBufData(0, itup, IndexTupleSize(itup));
    1391              :             }
    1392              :             else
    1393              :             {
    1394              :                 /*
    1395              :                  * Insert with posting list split (XLOG_BTREE_INSERT_POST
    1396              :                  * record) case.
    1397              :                  *
    1398              :                  * Log postingoff.  Also log origitup, not itup.  REDO routine
    1399              :                  * must reconstruct final itup (as well as nposting) using
    1400              :                  * _bt_swap_posting().
    1401              :                  */
    1402        21930 :                 upostingoff = postingoff;
    1403              : 
    1404        21930 :                 XLogRegisterBufData(0, &upostingoff, sizeof(uint16));
    1405        21930 :                 XLogRegisterBufData(0, origitup,
    1406        21930 :                                     IndexTupleSize(origitup));
    1407              :             }
    1408              : 
    1409      4380015 :             recptr = XLogInsert(RM_BTREE_ID, xlinfo);
    1410              :         }
    1411              :         else
    1412       266106 :             recptr = XLogGetFakeLSN(rel);
    1413              : 
    1414      4646121 :         if (BufferIsValid(metabuf))
    1415           16 :             PageSetLSN(metapg, recptr);
    1416      4646121 :         if (!isleaf)
    1417        13291 :             PageSetLSN(BufferGetPage(cbuf), recptr);
    1418              : 
    1419      4646121 :         PageSetLSN(page, recptr);
    1420              : 
    1421      4646121 :         END_CRIT_SECTION();
    1422              : 
    1423              :         /* Release subsidiary buffers */
    1424      4646121 :         if (BufferIsValid(metabuf))
    1425           16 :             _bt_relbuf(rel, metabuf);
    1426      4646121 :         if (!isleaf)
    1427        13291 :             _bt_relbuf(rel, cbuf);
    1428              : 
    1429              :         /*
    1430              :          * Cache the block number if this is the rightmost leaf page.  Cache
    1431              :          * may be used by a future inserter within _bt_search_insert().
    1432              :          */
    1433      4646121 :         blockcache = InvalidBlockNumber;
    1434      4646121 :         if (isrightmost && isleaf && !isroot)
    1435      2713761 :             blockcache = BufferGetBlockNumber(buf);
    1436              : 
    1437              :         /* Release buffer for insertion target block */
    1438      4646121 :         _bt_relbuf(rel, buf);
    1439              : 
    1440              :         /*
    1441              :          * If we decided to cache the insertion target block before releasing
    1442              :          * its buffer lock, then cache it now.  Check the height of the tree
    1443              :          * first, though.  We don't go for the optimization with small
    1444              :          * indexes.  Defer final check to this point to ensure that we don't
    1445              :          * call _bt_getrootheight while holding a buffer lock.
    1446              :          */
    1447      7359882 :         if (BlockNumberIsValid(blockcache) &&
    1448      2713761 :             _bt_getrootheight(rel) >= BTREE_FASTPATH_MIN_LEVEL)
    1449        33438 :             RelationSetTargetBlock(rel, blockcache);
    1450              :     }
    1451              : 
    1452              :     /* be tidy */
    1453      4660392 :     if (postingoff != 0)
    1454              :     {
    1455              :         /* itup is actually a modified copy of caller's original */
    1456        21975 :         pfree(nposting);
    1457        21975 :         pfree(itup);
    1458              :     }
    1459      4660392 : }
    1460              : 
    1461              : /*
    1462              :  *  _bt_split() -- split a page in the btree.
    1463              :  *
    1464              :  *      On entry, buf is the page to split, and is pinned and write-locked.
    1465              :  *      newitemoff etc. tell us about the new item that must be inserted
    1466              :  *      along with the data from the original page.
    1467              :  *
    1468              :  *      itup_key is used for suffix truncation on leaf pages (internal
    1469              :  *      page callers pass NULL).  When splitting a non-leaf page, 'cbuf'
    1470              :  *      is the left-sibling of the page we're inserting the downlink for.
    1471              :  *      This function will clear the INCOMPLETE_SPLIT flag on it, and
    1472              :  *      release the buffer.
    1473              :  *
    1474              :  *      orignewitem, nposting, and postingoff are needed when an insert of
    1475              :  *      orignewitem results in both a posting list split and a page split.
    1476              :  *      These extra posting list split details are used here in the same
    1477              :  *      way as they are used in the more common case where a posting list
    1478              :  *      split does not coincide with a page split.  We need to deal with
    1479              :  *      posting list splits directly in order to ensure that everything
    1480              :  *      that follows from the insert of orignewitem is handled as a single
    1481              :  *      atomic operation (though caller's insert of a new pivot/downlink
    1482              :  *      into parent page will still be a separate operation).  See
    1483              :  *      nbtree/README for details on the design of posting list splits.
    1484              :  *
    1485              :  *      Returns the new right sibling of buf, pinned and write-locked.
    1486              :  *      The pin and lock on buf are maintained.
    1487              :  */
    1488              : static Buffer
    1489        14271 : _bt_split(Relation rel, Relation heaprel, BTScanInsert itup_key, Buffer buf,
    1490              :           Buffer cbuf, OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem,
    1491              :           IndexTuple orignewitem, IndexTuple nposting, uint16 postingoff)
    1492              : {
    1493              :     Buffer      rbuf;
    1494              :     Page        origpage;
    1495              :     Page        leftpage,
    1496              :                 rightpage;
    1497              :     PGAlignedBlock leftpage_buf,
    1498              :                 rightpage_buf;
    1499              :     BlockNumber origpagenumber,
    1500              :                 rightpagenumber;
    1501              :     BTPageOpaque ropaque,
    1502              :                 lopaque,
    1503              :                 oopaque;
    1504        14271 :     Buffer      sbuf = InvalidBuffer;
    1505        14271 :     Page        spage = NULL;
    1506        14271 :     BTPageOpaque sopaque = NULL;
    1507              :     Size        itemsz;
    1508              :     ItemId      itemid;
    1509              :     IndexTuple  firstright,
    1510              :                 lefthighkey;
    1511              :     OffsetNumber firstrightoff;
    1512              :     OffsetNumber afterleftoff,
    1513              :                 afterrightoff,
    1514              :                 minusinfoff;
    1515              :     OffsetNumber origpagepostingoff;
    1516              :     OffsetNumber maxoff;
    1517              :     OffsetNumber i;
    1518              :     bool        newitemonleft,
    1519              :                 isleaf,
    1520              :                 isrightmost;
    1521              :     XLogRecPtr  recptr;
    1522              : 
    1523              :     /*
    1524              :      * origpage is the original page to be split.  leftpage is a temporary
    1525              :      * buffer that receives the left-sibling data, which will be copied back
    1526              :      * into origpage on success.  rightpage is the new page that will receive
    1527              :      * the right-sibling data.
    1528              :      *
    1529              :      * leftpage is allocated after choosing a split point.  rightpage's new
    1530              :      * buffer isn't acquired until after leftpage is initialized and has new
    1531              :      * high key, the last point where splitting the page may fail (barring
    1532              :      * corruption).  Failing before acquiring new buffer won't have lasting
    1533              :      * consequences, since origpage won't have been modified and leftpage is
    1534              :      * only workspace.
    1535              :      */
    1536        14271 :     origpage = BufferGetPage(buf);
    1537        14271 :     oopaque = BTPageGetOpaque(origpage);
    1538        14271 :     isleaf = P_ISLEAF(oopaque);
    1539        14271 :     isrightmost = P_RIGHTMOST(oopaque);
    1540        14271 :     maxoff = PageGetMaxOffsetNumber(origpage);
    1541        14271 :     origpagenumber = BufferGetBlockNumber(buf);
    1542              : 
    1543              :     /*
    1544              :      * Choose a point to split origpage at.
    1545              :      *
    1546              :      * A split point can be thought of as a point _between_ two existing data
    1547              :      * items on origpage (the lastleft and firstright tuples), provided you
    1548              :      * pretend that the new item that didn't fit is already on origpage.
    1549              :      *
    1550              :      * Since origpage does not actually contain newitem, the representation of
    1551              :      * split points needs to work with two boundary cases: splits where
    1552              :      * newitem is lastleft, and splits where newitem is firstright.
    1553              :      * newitemonleft resolves the ambiguity that would otherwise exist when
    1554              :      * newitemoff == firstrightoff.  In all other cases it's clear which side
    1555              :      * of the split every tuple goes on from context.  newitemonleft is
    1556              :      * usually (but not always) redundant information.
    1557              :      *
    1558              :      * firstrightoff is supposed to be an origpage offset number, but it's
    1559              :      * possible that its value will be maxoff+1, which is "past the end" of
    1560              :      * origpage.  This happens in the rare case where newitem goes after all
    1561              :      * existing items (i.e. newitemoff is maxoff+1) and we end up splitting
    1562              :      * origpage at the point that leaves newitem alone on new right page.  Any
    1563              :      * "!newitemonleft && newitemoff == firstrightoff" split point makes
    1564              :      * newitem the firstright tuple, though, so this case isn't a special
    1565              :      * case.
    1566              :      */
    1567        14271 :     firstrightoff = _bt_findsplitloc(rel, origpage, newitemoff, newitemsz,
    1568              :                                      newitem, &newitemonleft);
    1569              : 
    1570              :     /* Use temporary buffer for leftpage */
    1571        14271 :     leftpage = leftpage_buf.data;
    1572        14271 :     _bt_pageinit(leftpage, BufferGetPageSize(buf));
    1573        14271 :     lopaque = BTPageGetOpaque(leftpage);
    1574              : 
    1575              :     /*
    1576              :      * leftpage won't be the root when we're done.  Also, clear the SPLIT_END
    1577              :      * and HAS_GARBAGE flags.
    1578              :      */
    1579        14271 :     lopaque->btpo_flags = oopaque->btpo_flags;
    1580        14271 :     lopaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE);
    1581              :     /* set flag in leftpage indicating that rightpage has no downlink yet */
    1582        14271 :     lopaque->btpo_flags |= BTP_INCOMPLETE_SPLIT;
    1583        14271 :     lopaque->btpo_prev = oopaque->btpo_prev;
    1584              :     /* handle btpo_next after rightpage buffer acquired */
    1585        14271 :     lopaque->btpo_level = oopaque->btpo_level;
    1586              :     /* handle btpo_cycleid after rightpage buffer acquired */
    1587              : 
    1588              :     /*
    1589              :      * Copy the original page's LSN into leftpage, which will become the
    1590              :      * updated version of the page.  We need this because XLogInsert will
    1591              :      * examine the LSN and possibly dump it in a page image.
    1592              :      */
    1593        14271 :     PageSetLSN(leftpage, PageGetLSN(origpage));
    1594              : 
    1595              :     /*
    1596              :      * Determine page offset number of existing overlapped-with-orignewitem
    1597              :      * posting list when it is necessary to perform a posting list split in
    1598              :      * passing.  Note that newitem was already changed by caller (newitem no
    1599              :      * longer has the orignewitem TID).
    1600              :      *
    1601              :      * This page offset number (origpagepostingoff) will be used to pretend
    1602              :      * that the posting split has already taken place, even though the
    1603              :      * required modifications to origpage won't occur until we reach the
    1604              :      * critical section.  The lastleft and firstright tuples of our page split
    1605              :      * point should, in effect, come from an imaginary version of origpage
    1606              :      * that has the nposting tuple instead of the original posting list tuple.
    1607              :      *
    1608              :      * Note: _bt_findsplitloc() should have compensated for coinciding posting
    1609              :      * list splits in just the same way, at least in theory.  It doesn't
    1610              :      * bother with that, though.  In practice it won't affect its choice of
    1611              :      * split point.
    1612              :      */
    1613        14271 :     origpagepostingoff = InvalidOffsetNumber;
    1614        14271 :     if (postingoff != 0)
    1615              :     {
    1616              :         Assert(isleaf);
    1617              :         Assert(ItemPointerCompare(&orignewitem->t_tid,
    1618              :                                   &newitem->t_tid) < 0);
    1619              :         Assert(BTreeTupleIsPosting(nposting));
    1620           45 :         origpagepostingoff = OffsetNumberPrev(newitemoff);
    1621              :     }
    1622              : 
    1623              :     /*
    1624              :      * The high key for the new left page is a possibly-truncated copy of
    1625              :      * firstright on the leaf level (it's "firstright itself" on internal
    1626              :      * pages; see !isleaf comments below).  This may seem to be contrary to
    1627              :      * Lehman & Yao's approach of using a copy of lastleft as the new high key
    1628              :      * when splitting on the leaf level.  It isn't, though.
    1629              :      *
    1630              :      * Suffix truncation will leave the left page's high key fully equal to
    1631              :      * lastleft when lastleft and firstright are equal prior to heap TID (that
    1632              :      * is, the tiebreaker TID value comes from lastleft).  It isn't actually
    1633              :      * necessary for a new leaf high key to be a copy of lastleft for the L&Y
    1634              :      * "subtree" invariant to hold.  It's sufficient to make sure that the new
    1635              :      * leaf high key is strictly less than firstright, and greater than or
    1636              :      * equal to (not necessarily equal to) lastleft.  In other words, when
    1637              :      * suffix truncation isn't possible during a leaf page split, we take
    1638              :      * L&Y's exact approach to generating a new high key for the left page.
    1639              :      * (Actually, that is slightly inaccurate.  We don't just use a copy of
    1640              :      * lastleft.  A tuple with all the keys from firstright but the max heap
    1641              :      * TID from lastleft is used, to avoid introducing a special case.)
    1642              :      */
    1643        14271 :     if (!newitemonleft && newitemoff == firstrightoff)
    1644              :     {
    1645              :         /* incoming tuple becomes firstright */
    1646           36 :         itemsz = newitemsz;
    1647           36 :         firstright = newitem;
    1648              :     }
    1649              :     else
    1650              :     {
    1651              :         /* existing item at firstrightoff becomes firstright */
    1652        14235 :         itemid = PageGetItemId(origpage, firstrightoff);
    1653        14235 :         itemsz = ItemIdGetLength(itemid);
    1654        14235 :         firstright = (IndexTuple) PageGetItem(origpage, itemid);
    1655        14235 :         if (firstrightoff == origpagepostingoff)
    1656            0 :             firstright = nposting;
    1657              :     }
    1658              : 
    1659        14271 :     if (isleaf)
    1660              :     {
    1661              :         IndexTuple  lastleft;
    1662              : 
    1663              :         /* Attempt suffix truncation for leaf page splits */
    1664        14063 :         if (newitemonleft && newitemoff == firstrightoff)
    1665              :         {
    1666              :             /* incoming tuple becomes lastleft */
    1667          196 :             lastleft = newitem;
    1668              :         }
    1669              :         else
    1670              :         {
    1671              :             OffsetNumber lastleftoff;
    1672              : 
    1673              :             /* existing item before firstrightoff becomes lastleft */
    1674        13867 :             lastleftoff = OffsetNumberPrev(firstrightoff);
    1675              :             Assert(lastleftoff >= P_FIRSTDATAKEY(oopaque));
    1676        13867 :             itemid = PageGetItemId(origpage, lastleftoff);
    1677        13867 :             lastleft = (IndexTuple) PageGetItem(origpage, itemid);
    1678        13867 :             if (lastleftoff == origpagepostingoff)
    1679            5 :                 lastleft = nposting;
    1680              :         }
    1681              : 
    1682        14063 :         lefthighkey = _bt_truncate(rel, lastleft, firstright, itup_key);
    1683        14063 :         itemsz = IndexTupleSize(lefthighkey);
    1684              :     }
    1685              :     else
    1686              :     {
    1687              :         /*
    1688              :          * Don't perform suffix truncation on a copy of firstright to make
    1689              :          * left page high key for internal page splits.  Must use firstright
    1690              :          * as new high key directly.
    1691              :          *
    1692              :          * Each distinct separator key value originates as a leaf level high
    1693              :          * key; all other separator keys/pivot tuples are copied from one
    1694              :          * level down.  A separator key in a grandparent page must be
    1695              :          * identical to high key in rightmost parent page of the subtree to
    1696              :          * its left, which must itself be identical to high key in rightmost
    1697              :          * child page of that same subtree (this even applies to separator
    1698              :          * from grandparent's high key).  There must always be an unbroken
    1699              :          * "seam" of identical separator keys that guide index scans at every
    1700              :          * level, starting from the grandparent.  That's why suffix truncation
    1701              :          * is unsafe here.
    1702              :          *
    1703              :          * Internal page splits will truncate firstright into a "negative
    1704              :          * infinity" data item when it gets inserted on the new right page
    1705              :          * below, though.  This happens during the call to _bt_pgaddtup() for
    1706              :          * the new first data item for right page.  Do not confuse this
    1707              :          * mechanism with suffix truncation.  It is just a convenient way of
    1708              :          * implementing page splits that split the internal page "inside"
    1709              :          * firstright.  The lefthighkey separator key cannot appear a second
    1710              :          * time in the right page (only firstright's downlink goes in right
    1711              :          * page).
    1712              :          */
    1713          208 :         lefthighkey = firstright;
    1714              :     }
    1715              : 
    1716              :     /*
    1717              :      * Add new high key to leftpage
    1718              :      */
    1719        14271 :     afterleftoff = P_HIKEY;
    1720              : 
    1721              :     Assert(BTreeTupleGetNAtts(lefthighkey, rel) > 0);
    1722              :     Assert(BTreeTupleGetNAtts(lefthighkey, rel) <=
    1723              :            IndexRelationGetNumberOfKeyAttributes(rel));
    1724              :     Assert(itemsz == MAXALIGN(IndexTupleSize(lefthighkey)));
    1725        14271 :     if (PageAddItem(leftpage, lefthighkey, itemsz, afterleftoff, false, false) == InvalidOffsetNumber)
    1726            0 :         elog(ERROR, "failed to add high key to the left sibling"
    1727              :              " while splitting block %u of index \"%s\"",
    1728              :              origpagenumber, RelationGetRelationName(rel));
    1729        14271 :     afterleftoff = OffsetNumberNext(afterleftoff);
    1730              : 
    1731              :     /*
    1732              :      * Acquire a new right page to split into, now that left page has a new
    1733              :      * high key.
    1734              :      *
    1735              :      * To not confuse future VACUUM operations, we zero the right page and
    1736              :      * work on an in-memory copy of it before writing WAL, then copy its
    1737              :      * contents back to the actual page once we start the critical section
    1738              :      * work.  This simplifies the split work, so as there is no need to zero
    1739              :      * the right page before throwing an error.
    1740              :      */
    1741        14271 :     rbuf = _bt_allocbuf(rel, heaprel);
    1742        14271 :     rightpage = rightpage_buf.data;
    1743              : 
    1744              :     /*
    1745              :      * Copy the contents of the right page into its temporary location, and
    1746              :      * zero the original space.
    1747              :      */
    1748        14271 :     memcpy(rightpage, BufferGetPage(rbuf), BLCKSZ);
    1749        14271 :     memset(BufferGetPage(rbuf), 0, BLCKSZ);
    1750        14271 :     rightpagenumber = BufferGetBlockNumber(rbuf);
    1751              :     /* rightpage was initialized by _bt_allocbuf */
    1752        14271 :     ropaque = BTPageGetOpaque(rightpage);
    1753              : 
    1754              :     /*
    1755              :      * Finish off remaining leftpage special area fields.  They cannot be set
    1756              :      * before both origpage (leftpage) and rightpage buffers are acquired and
    1757              :      * locked.
    1758              :      *
    1759              :      * btpo_cycleid is only used with leaf pages, though we set it here in all
    1760              :      * cases just to be consistent.
    1761              :      */
    1762        14271 :     lopaque->btpo_next = rightpagenumber;
    1763        14271 :     lopaque->btpo_cycleid = _bt_vacuum_cycleid(rel);
    1764              : 
    1765              :     /*
    1766              :      * rightpage won't be the root when we're done.  Also, clear the SPLIT_END
    1767              :      * and HAS_GARBAGE flags.
    1768              :      */
    1769        14271 :     ropaque->btpo_flags = oopaque->btpo_flags;
    1770        14271 :     ropaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE);
    1771        14271 :     ropaque->btpo_prev = origpagenumber;
    1772        14271 :     ropaque->btpo_next = oopaque->btpo_next;
    1773        14271 :     ropaque->btpo_level = oopaque->btpo_level;
    1774        14271 :     ropaque->btpo_cycleid = lopaque->btpo_cycleid;
    1775              : 
    1776              :     /*
    1777              :      * Add new high key to rightpage where necessary.
    1778              :      *
    1779              :      * If the page we're splitting is not the rightmost page at its level in
    1780              :      * the tree, then the first entry on the page is the high key from
    1781              :      * origpage.
    1782              :      */
    1783        14271 :     afterrightoff = P_HIKEY;
    1784              : 
    1785        14271 :     if (!isrightmost)
    1786              :     {
    1787              :         IndexTuple  righthighkey;
    1788              : 
    1789         5805 :         itemid = PageGetItemId(origpage, P_HIKEY);
    1790         5805 :         itemsz = ItemIdGetLength(itemid);
    1791         5805 :         righthighkey = (IndexTuple) PageGetItem(origpage, itemid);
    1792              :         Assert(BTreeTupleGetNAtts(righthighkey, rel) > 0);
    1793              :         Assert(BTreeTupleGetNAtts(righthighkey, rel) <=
    1794              :                IndexRelationGetNumberOfKeyAttributes(rel));
    1795         5805 :         if (PageAddItem(rightpage, righthighkey, itemsz, afterrightoff, false, false) == InvalidOffsetNumber)
    1796              :         {
    1797            0 :             elog(ERROR, "failed to add high key to the right sibling"
    1798              :                  " while splitting block %u of index \"%s\"",
    1799              :                  origpagenumber, RelationGetRelationName(rel));
    1800              :         }
    1801         5805 :         afterrightoff = OffsetNumberNext(afterrightoff);
    1802              :     }
    1803              : 
    1804              :     /*
    1805              :      * Internal page splits truncate first data item on right page -- it
    1806              :      * becomes "minus infinity" item for the page.  Set this up here.
    1807              :      */
    1808        14271 :     minusinfoff = InvalidOffsetNumber;
    1809        14271 :     if (!isleaf)
    1810          208 :         minusinfoff = afterrightoff;
    1811              : 
    1812              :     /*
    1813              :      * Now transfer all the data items (non-pivot tuples in isleaf case, or
    1814              :      * additional pivot tuples in !isleaf case) to the appropriate page.
    1815              :      *
    1816              :      * Note: we *must* insert at least the right page's items in item-number
    1817              :      * order, for the benefit of _bt_restore_page().
    1818              :      */
    1819      4358535 :     for (i = P_FIRSTDATAKEY(oopaque); i <= maxoff; i = OffsetNumberNext(i))
    1820              :     {
    1821              :         IndexTuple  dataitem;
    1822              : 
    1823      4344264 :         itemid = PageGetItemId(origpage, i);
    1824      4344264 :         itemsz = ItemIdGetLength(itemid);
    1825      4344264 :         dataitem = (IndexTuple) PageGetItem(origpage, itemid);
    1826              : 
    1827              :         /* replace original item with nposting due to posting split? */
    1828      4344264 :         if (i == origpagepostingoff)
    1829              :         {
    1830              :             Assert(BTreeTupleIsPosting(dataitem));
    1831              :             Assert(itemsz == MAXALIGN(IndexTupleSize(nposting)));
    1832           45 :             dataitem = nposting;
    1833              :         }
    1834              : 
    1835              :         /* does new item belong before this one? */
    1836      4344219 :         else if (i == newitemoff)
    1837              :         {
    1838         8023 :             if (newitemonleft)
    1839              :             {
    1840              :                 Assert(newitemoff <= firstrightoff);
    1841         2022 :                 if (!_bt_pgaddtup(leftpage, newitemsz, newitem, afterleftoff,
    1842              :                                   false))
    1843              :                 {
    1844            0 :                     elog(ERROR, "failed to add new item to the left sibling"
    1845              :                          " while splitting block %u of index \"%s\"",
    1846              :                          origpagenumber, RelationGetRelationName(rel));
    1847              :                 }
    1848         2022 :                 afterleftoff = OffsetNumberNext(afterleftoff);
    1849              :             }
    1850              :             else
    1851              :             {
    1852              :                 Assert(newitemoff >= firstrightoff);
    1853         6001 :                 if (!_bt_pgaddtup(rightpage, newitemsz, newitem, afterrightoff,
    1854              :                                   afterrightoff == minusinfoff))
    1855              :                 {
    1856            0 :                     elog(ERROR, "failed to add new item to the right sibling"
    1857              :                          " while splitting block %u of index \"%s\"",
    1858              :                          origpagenumber, RelationGetRelationName(rel));
    1859              :                 }
    1860         6001 :                 afterrightoff = OffsetNumberNext(afterrightoff);
    1861              :             }
    1862              :         }
    1863              : 
    1864              :         /* decide which page to put it on */
    1865      4344264 :         if (i < firstrightoff)
    1866              :         {
    1867      3350946 :             if (!_bt_pgaddtup(leftpage, itemsz, dataitem, afterleftoff, false))
    1868              :             {
    1869            0 :                 elog(ERROR, "failed to add old item to the left sibling"
    1870              :                      " while splitting block %u of index \"%s\"",
    1871              :                      origpagenumber, RelationGetRelationName(rel));
    1872              :             }
    1873      3350946 :             afterleftoff = OffsetNumberNext(afterleftoff);
    1874              :         }
    1875              :         else
    1876              :         {
    1877       993318 :             if (!_bt_pgaddtup(rightpage, itemsz, dataitem, afterrightoff,
    1878              :                               afterrightoff == minusinfoff))
    1879              :             {
    1880            0 :                 elog(ERROR, "failed to add old item to the right sibling"
    1881              :                      " while splitting block %u of index \"%s\"",
    1882              :                      origpagenumber, RelationGetRelationName(rel));
    1883              :             }
    1884       993318 :             afterrightoff = OffsetNumberNext(afterrightoff);
    1885              :         }
    1886              :     }
    1887              : 
    1888              :     /* Handle case where newitem goes at the end of rightpage */
    1889        14271 :     if (i <= newitemoff)
    1890              :     {
    1891              :         /*
    1892              :          * Can't have newitemonleft here; that would imply we were told to put
    1893              :          * *everything* on the left page, which cannot fit (if it could, we'd
    1894              :          * not be splitting the page).
    1895              :          */
    1896              :         Assert(!newitemonleft && newitemoff == maxoff + 1);
    1897         6248 :         if (!_bt_pgaddtup(rightpage, newitemsz, newitem, afterrightoff,
    1898              :                           afterrightoff == minusinfoff))
    1899              :         {
    1900            0 :             elog(ERROR, "failed to add new item to the right sibling"
    1901              :                  " while splitting block %u of index \"%s\"",
    1902              :                  origpagenumber, RelationGetRelationName(rel));
    1903              :         }
    1904         6248 :         afterrightoff = OffsetNumberNext(afterrightoff);
    1905              :     }
    1906              : 
    1907              :     /*
    1908              :      * We have to grab the original right sibling (if any) and update its prev
    1909              :      * link.  We are guaranteed that this is deadlock-free, since we couple
    1910              :      * the locks in the standard order: left to right.
    1911              :      */
    1912        14271 :     if (!isrightmost)
    1913              :     {
    1914         5805 :         sbuf = _bt_getbuf(rel, oopaque->btpo_next, BT_WRITE);
    1915         5805 :         spage = BufferGetPage(sbuf);
    1916         5805 :         sopaque = BTPageGetOpaque(spage);
    1917         5805 :         if (sopaque->btpo_prev != origpagenumber)
    1918              :         {
    1919            0 :             ereport(ERROR,
    1920              :                     (errcode(ERRCODE_INDEX_CORRUPTED),
    1921              :                      errmsg_internal("right sibling's left-link doesn't match: "
    1922              :                                      "block %u links to %u instead of expected %u in index \"%s\"",
    1923              :                                      oopaque->btpo_next, sopaque->btpo_prev, origpagenumber,
    1924              :                                      RelationGetRelationName(rel))));
    1925              :         }
    1926              : 
    1927              :         /*
    1928              :          * Check to see if we can set the SPLIT_END flag in the right-hand
    1929              :          * split page; this can save some I/O for vacuum since it need not
    1930              :          * proceed to the right sibling.  We can set the flag if the right
    1931              :          * sibling has a different cycleid: that means it could not be part of
    1932              :          * a group of pages that were all split off from the same ancestor
    1933              :          * page.  If you're confused, imagine that page A splits to A B and
    1934              :          * then again, yielding A C B, while vacuum is in progress.  Tuples
    1935              :          * originally in A could now be in either B or C, hence vacuum must
    1936              :          * examine both pages.  But if D, our right sibling, has a different
    1937              :          * cycleid then it could not contain any tuples that were in A when
    1938              :          * the vacuum started.
    1939              :          */
    1940         5805 :         if (sopaque->btpo_cycleid != ropaque->btpo_cycleid)
    1941            1 :             ropaque->btpo_flags |= BTP_SPLIT_END;
    1942              :     }
    1943              : 
    1944              :     /*
    1945              :      * Right sibling is locked, new siblings are prepared, but original page
    1946              :      * is not updated yet.
    1947              :      *
    1948              :      * NO EREPORT(ERROR) till right sibling is updated.  We can get away with
    1949              :      * not starting the critical section till here because we haven't been
    1950              :      * scribbling on the original page yet; see comments above.
    1951              :      */
    1952        14271 :     START_CRIT_SECTION();
    1953              : 
    1954              :     /*
    1955              :      * By here, the original data page has been split into two new halves, and
    1956              :      * these are correct.  The algorithm requires that the left page never
    1957              :      * move during a split, so we copy the new left page back on top of the
    1958              :      * original.  We need to do this before writing the WAL record, so that
    1959              :      * XLogInsert can WAL log an image of the page if necessary.
    1960              :      */
    1961        14271 :     memcpy(origpage, leftpage, BLCKSZ);
    1962              :     /* leftpage, lopaque must not be used below here */
    1963              : 
    1964              :     /*
    1965              :      * Move the contents of the right page from its temporary location to the
    1966              :      * destination buffer, before writing the WAL record.  Unlike the left
    1967              :      * page, the right page and its opaque area are still needed to complete
    1968              :      * the update of the page, so reinitialize them.
    1969              :      */
    1970        14271 :     rightpage = BufferGetPage(rbuf);
    1971        14271 :     memcpy(rightpage, rightpage_buf.data, BLCKSZ);
    1972        14271 :     ropaque = BTPageGetOpaque(rightpage);
    1973              : 
    1974        14271 :     MarkBufferDirty(buf);
    1975        14271 :     MarkBufferDirty(rbuf);
    1976              : 
    1977        14271 :     if (!isrightmost)
    1978              :     {
    1979         5805 :         sopaque->btpo_prev = rightpagenumber;
    1980         5805 :         MarkBufferDirty(sbuf);
    1981              :     }
    1982              : 
    1983              :     /*
    1984              :      * Clear INCOMPLETE_SPLIT flag on child if inserting the new item finishes
    1985              :      * a split
    1986              :      */
    1987        14271 :     if (!isleaf)
    1988              :     {
    1989          208 :         Page        cpage = BufferGetPage(cbuf);
    1990          208 :         BTPageOpaque cpageop = BTPageGetOpaque(cpage);
    1991              : 
    1992          208 :         cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
    1993          208 :         MarkBufferDirty(cbuf);
    1994              :     }
    1995              : 
    1996              :     /* XLOG stuff */
    1997        14271 :     if (RelationNeedsWAL(rel))
    1998        13594 :     {
    1999              :         xl_btree_split xlrec;
    2000              :         uint8       xlinfo;
    2001              : 
    2002        13594 :         xlrec.level = ropaque->btpo_level;
    2003              :         /* See comments below on newitem, orignewitem, and posting lists */
    2004        13594 :         xlrec.firstrightoff = firstrightoff;
    2005        13594 :         xlrec.newitemoff = newitemoff;
    2006        13594 :         xlrec.postingoff = 0;
    2007        13594 :         if (postingoff != 0 && origpagepostingoff < firstrightoff)
    2008           22 :             xlrec.postingoff = postingoff;
    2009              : 
    2010        13594 :         XLogBeginInsert();
    2011        13594 :         XLogRegisterData(&xlrec, SizeOfBtreeSplit);
    2012              : 
    2013        13594 :         XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
    2014        13594 :         XLogRegisterBuffer(1, rbuf, REGBUF_WILL_INIT);
    2015              :         /* Log original right sibling, since we've changed its prev-pointer */
    2016        13594 :         if (!isrightmost)
    2017         5797 :             XLogRegisterBuffer(2, sbuf, REGBUF_STANDARD);
    2018        13594 :         if (!isleaf)
    2019          208 :             XLogRegisterBuffer(3, cbuf, REGBUF_STANDARD);
    2020              : 
    2021              :         /*
    2022              :          * Log the new item, if it was inserted on the left page. (If it was
    2023              :          * put on the right page, we don't need to explicitly WAL log it
    2024              :          * because it's included with all the other items on the right page.)
    2025              :          * Show the new item as belonging to the left page buffer, so that it
    2026              :          * is not stored if XLogInsert decides it needs a full-page image of
    2027              :          * the left page.  We always store newitemoff in the record, though.
    2028              :          *
    2029              :          * The details are sometimes slightly different for page splits that
    2030              :          * coincide with a posting list split.  If both the replacement
    2031              :          * posting list and newitem go on the right page, then we don't need
    2032              :          * to log anything extra, just like the simple !newitemonleft
    2033              :          * no-posting-split case (postingoff is set to zero in the WAL record,
    2034              :          * so recovery doesn't need to process a posting list split at all).
    2035              :          * Otherwise, we set postingoff and log orignewitem instead of
    2036              :          * newitem, despite having actually inserted newitem.  REDO routine
    2037              :          * must reconstruct nposting and newitem using _bt_swap_posting().
    2038              :          *
    2039              :          * Note: It's possible that our page split point is the point that
    2040              :          * makes the posting list lastleft and newitem firstright.  This is
    2041              :          * the only case where we log orignewitem/newitem despite newitem
    2042              :          * going on the right page.  If XLogInsert decides that it can omit
    2043              :          * orignewitem due to logging a full-page image of the left page,
    2044              :          * everything still works out, since recovery only needs to log
    2045              :          * orignewitem for items on the left page (just like the regular
    2046              :          * newitem-logged case).
    2047              :          */
    2048        13594 :         if (newitemonleft && xlrec.postingoff == 0)
    2049         2001 :             XLogRegisterBufData(0, newitem, newitemsz);
    2050        11593 :         else if (xlrec.postingoff != 0)
    2051              :         {
    2052              :             Assert(isleaf);
    2053              :             Assert(newitemonleft || firstrightoff == newitemoff);
    2054              :             Assert(newitemsz == IndexTupleSize(orignewitem));
    2055           22 :             XLogRegisterBufData(0, orignewitem, newitemsz);
    2056              :         }
    2057              : 
    2058              :         /* Log the left page's new high key */
    2059        13594 :         if (!isleaf)
    2060              :         {
    2061              :             /* lefthighkey isn't local copy, get current pointer */
    2062          208 :             itemid = PageGetItemId(origpage, P_HIKEY);
    2063          208 :             lefthighkey = (IndexTuple) PageGetItem(origpage, itemid);
    2064              :         }
    2065        13594 :         XLogRegisterBufData(0, lefthighkey,
    2066        13594 :                             MAXALIGN(IndexTupleSize(lefthighkey)));
    2067              : 
    2068              :         /*
    2069              :          * Log the contents of the right page in the format understood by
    2070              :          * _bt_restore_page().  The whole right page will be recreated.
    2071              :          *
    2072              :          * Direct access to page is not good but faster - we should implement
    2073              :          * some new func in page API.  Note we only store the tuples
    2074              :          * themselves, knowing that they were inserted in item-number order
    2075              :          * and so the line pointers can be reconstructed.  See comments for
    2076              :          * _bt_restore_page().
    2077              :          */
    2078        13594 :         XLogRegisterBufData(1,
    2079        13594 :                             (char *) rightpage + ((PageHeader) rightpage)->pd_upper,
    2080        13594 :                             ((PageHeader) rightpage)->pd_special - ((PageHeader) rightpage)->pd_upper);
    2081              : 
    2082        13594 :         xlinfo = newitemonleft ? XLOG_BTREE_SPLIT_L : XLOG_BTREE_SPLIT_R;
    2083        13594 :         recptr = XLogInsert(RM_BTREE_ID, xlinfo);
    2084              :     }
    2085              :     else
    2086          677 :         recptr = XLogGetFakeLSN(rel);
    2087              : 
    2088        14271 :     PageSetLSN(origpage, recptr);
    2089        14271 :     PageSetLSN(rightpage, recptr);
    2090        14271 :     if (!isrightmost)
    2091         5805 :         PageSetLSN(spage, recptr);
    2092        14271 :     if (!isleaf)
    2093          208 :         PageSetLSN(BufferGetPage(cbuf), recptr);
    2094              : 
    2095        14271 :     END_CRIT_SECTION();
    2096              : 
    2097              :     /* release the old right sibling */
    2098        14271 :     if (!isrightmost)
    2099         5805 :         _bt_relbuf(rel, sbuf);
    2100              : 
    2101              :     /* release the child */
    2102        14271 :     if (!isleaf)
    2103          208 :         _bt_relbuf(rel, cbuf);
    2104              : 
    2105              :     /* be tidy */
    2106        14271 :     if (isleaf)
    2107        14063 :         pfree(lefthighkey);
    2108              : 
    2109              :     /* split's done */
    2110        14271 :     return rbuf;
    2111              : }
    2112              : 
    2113              : /*
    2114              :  * _bt_insert_parent() -- Insert downlink into parent, completing split.
    2115              :  *
    2116              :  * On entry, buf and rbuf are the left and right split pages, which we
    2117              :  * still hold write locks on.  Both locks will be released here.  We
    2118              :  * release the rbuf lock once we have a write lock on the page that we
    2119              :  * intend to insert a downlink to rbuf on (i.e. buf's current parent page).
    2120              :  * The lock on buf is released at the same point as the lock on the parent
    2121              :  * page, since buf's INCOMPLETE_SPLIT flag must be cleared by the same
    2122              :  * atomic operation that completes the split by inserting a new downlink.
    2123              :  *
    2124              :  * stack - stack showing how we got here.  Will be NULL when splitting true
    2125              :  *          root, or during concurrent root split, where we can be inefficient
    2126              :  * isroot - we split the true root
    2127              :  * isonly - we split a page alone on its level (might have been fast root)
    2128              :  */
    2129              : static void
    2130        14271 : _bt_insert_parent(Relation rel,
    2131              :                   Relation heaprel,
    2132              :                   Buffer buf,
    2133              :                   Buffer rbuf,
    2134              :                   BTStack stack,
    2135              :                   bool isroot,
    2136              :                   bool isonly)
    2137              : {
    2138              :     Assert(heaprel != NULL);
    2139              : 
    2140              :     /*
    2141              :      * Here we have to do something Lehman and Yao don't talk about: deal with
    2142              :      * a root split and construction of a new root.  If our stack is empty
    2143              :      * then we have just split a node on what had been the root level when we
    2144              :      * descended the tree.  If it was still the root then we perform a
    2145              :      * new-root construction.  If it *wasn't* the root anymore, search to find
    2146              :      * the next higher level that someone constructed meanwhile, and find the
    2147              :      * right place to insert as for the normal case.
    2148              :      *
    2149              :      * If we have to search for the parent level, we do so by re-descending
    2150              :      * from the root.  This is not super-efficient, but it's rare enough not
    2151              :      * to matter.
    2152              :      */
    2153        14271 :     if (isroot)
    2154              :     {
    2155              :         Buffer      rootbuf;
    2156              : 
    2157              :         Assert(stack == NULL);
    2158              :         Assert(isonly);
    2159              :         /* create a new root node one level up and update the metapage */
    2160          772 :         rootbuf = _bt_newlevel(rel, heaprel, buf, rbuf);
    2161              :         /* release the split buffers */
    2162          772 :         _bt_relbuf(rel, rootbuf);
    2163          772 :         _bt_relbuf(rel, rbuf);
    2164          772 :         _bt_relbuf(rel, buf);
    2165              :     }
    2166              :     else
    2167              :     {
    2168        13499 :         BlockNumber bknum = BufferGetBlockNumber(buf);
    2169        13499 :         BlockNumber rbknum = BufferGetBlockNumber(rbuf);
    2170        13499 :         Page        page = BufferGetPage(buf);
    2171              :         IndexTuple  new_item;
    2172              :         BTStackData fakestack;
    2173              :         IndexTuple  ritem;
    2174              :         Buffer      pbuf;
    2175              : 
    2176        13499 :         if (stack == NULL)
    2177              :         {
    2178              :             BTPageOpaque opaque;
    2179              : 
    2180           16 :             elog(DEBUG2, "concurrent ROOT page split");
    2181           16 :             opaque = BTPageGetOpaque(page);
    2182              : 
    2183              :             /*
    2184              :              * We should never reach here when a leaf page split takes place
    2185              :              * despite the insert of newitem being able to apply the fastpath
    2186              :              * optimization.  Make sure of that with an assertion.
    2187              :              *
    2188              :              * This is more of a performance issue than a correctness issue.
    2189              :              * The fastpath won't have a descent stack.  Using a phony stack
    2190              :              * here works, but never rely on that.  The fastpath should be
    2191              :              * rejected within _bt_search_insert() when the rightmost leaf
    2192              :              * page will split, since it's faster to go through _bt_search()
    2193              :              * and get a stack in the usual way.
    2194              :              */
    2195              :             Assert(!(P_ISLEAF(opaque) &&
    2196              :                      BlockNumberIsValid(RelationGetTargetBlock(rel))));
    2197              : 
    2198              :             /* Find the leftmost page at the next level up */
    2199           16 :             pbuf = _bt_get_endpoint(rel, opaque->btpo_level + 1, false);
    2200              :             /* Set up a phony stack entry pointing there */
    2201           16 :             stack = &fakestack;
    2202           16 :             stack->bts_blkno = BufferGetBlockNumber(pbuf);
    2203           16 :             stack->bts_offset = InvalidOffsetNumber;
    2204           16 :             stack->bts_parent = NULL;
    2205           16 :             _bt_relbuf(rel, pbuf);
    2206              :         }
    2207              : 
    2208              :         /* get high key from left, a strict lower bound for new right page */
    2209        13499 :         ritem = (IndexTuple) PageGetItem(page,
    2210        13499 :                                          PageGetItemId(page, P_HIKEY));
    2211              : 
    2212              :         /* form an index tuple that points at the new right page */
    2213        13499 :         new_item = CopyIndexTuple(ritem);
    2214        13499 :         BTreeTupleSetDownLink(new_item, rbknum);
    2215              : 
    2216              :         /*
    2217              :          * Re-find and write lock the parent of buf.
    2218              :          *
    2219              :          * It's possible that the location of buf's downlink has changed since
    2220              :          * our initial _bt_search() descent.  _bt_getstackbuf() will detect
    2221              :          * and recover from this, updating the stack, which ensures that the
    2222              :          * new downlink will be inserted at the correct offset. Even buf's
    2223              :          * parent may have changed.
    2224              :          */
    2225        13499 :         pbuf = _bt_getstackbuf(rel, heaprel, stack, bknum);
    2226              : 
    2227              :         /*
    2228              :          * Unlock the right child.  The left child will be unlocked in
    2229              :          * _bt_insertonpg().
    2230              :          *
    2231              :          * Unlocking the right child must be delayed until here to ensure that
    2232              :          * no concurrent VACUUM operation can become confused.  Page deletion
    2233              :          * cannot be allowed to fail to re-find a downlink for the rbuf page.
    2234              :          * (Actually, this is just a vestige of how things used to work.  The
    2235              :          * page deletion code is expected to check for the INCOMPLETE_SPLIT
    2236              :          * flag on the left child.  It won't attempt deletion of the right
    2237              :          * child until the split is complete.  Despite all this, we opt to
    2238              :          * conservatively delay unlocking the right child until here.)
    2239              :          */
    2240        13499 :         _bt_relbuf(rel, rbuf);
    2241              : 
    2242        13499 :         if (pbuf == InvalidBuffer)
    2243            0 :             ereport(ERROR,
    2244              :                     (errcode(ERRCODE_INDEX_CORRUPTED),
    2245              :                      errmsg_internal("failed to re-find parent key in index \"%s\" for split pages %u/%u",
    2246              :                                      RelationGetRelationName(rel), bknum, rbknum)));
    2247              : 
    2248              :         /* Recursively insert into the parent */
    2249        26998 :         _bt_insertonpg(rel, heaprel, NULL, pbuf, buf, stack->bts_parent,
    2250        13499 :                        new_item, MAXALIGN(IndexTupleSize(new_item)),
    2251        13499 :                        stack->bts_offset + 1, 0, isonly);
    2252              : 
    2253              :         /* be tidy */
    2254        13499 :         pfree(new_item);
    2255              :     }
    2256        14271 : }
    2257              : 
    2258              : /*
    2259              :  * _bt_finish_split() -- Finish an incomplete split
    2260              :  *
    2261              :  * A crash or other failure can leave a split incomplete.  The insertion
    2262              :  * routines won't allow to insert on a page that is incompletely split.
    2263              :  * Before inserting on such a page, call _bt_finish_split().
    2264              :  *
    2265              :  * On entry, 'lbuf' must be locked in write-mode.  On exit, it is unlocked
    2266              :  * and unpinned.
    2267              :  *
    2268              :  * Caller must provide a valid heaprel, since finishing a page split requires
    2269              :  * allocating a new page if and when the parent page splits in turn.
    2270              :  */
    2271              : void
    2272            0 : _bt_finish_split(Relation rel, Relation heaprel, Buffer lbuf, BTStack stack)
    2273              : {
    2274            0 :     Page        lpage = BufferGetPage(lbuf);
    2275            0 :     BTPageOpaque lpageop = BTPageGetOpaque(lpage);
    2276              :     Buffer      rbuf;
    2277              :     Page        rpage;
    2278              :     BTPageOpaque rpageop;
    2279              :     bool        wasroot;
    2280              :     bool        wasonly;
    2281              : 
    2282              :     Assert(P_INCOMPLETE_SPLIT(lpageop));
    2283              :     Assert(heaprel != NULL);
    2284              : 
    2285              :     /* Lock right sibling, the one missing the downlink */
    2286            0 :     rbuf = _bt_getbuf(rel, lpageop->btpo_next, BT_WRITE);
    2287            0 :     rpage = BufferGetPage(rbuf);
    2288            0 :     rpageop = BTPageGetOpaque(rpage);
    2289              : 
    2290              :     /* Could this be a root split? */
    2291            0 :     if (!stack)
    2292              :     {
    2293              :         Buffer      metabuf;
    2294              :         Page        metapg;
    2295              :         BTMetaPageData *metad;
    2296              : 
    2297              :         /* acquire lock on the metapage */
    2298            0 :         metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
    2299            0 :         metapg = BufferGetPage(metabuf);
    2300            0 :         metad = BTPageGetMeta(metapg);
    2301              : 
    2302            0 :         wasroot = (metad->btm_root == BufferGetBlockNumber(lbuf));
    2303              : 
    2304            0 :         _bt_relbuf(rel, metabuf);
    2305              :     }
    2306              :     else
    2307            0 :         wasroot = false;
    2308              : 
    2309              :     /* Was this the only page on the level before split? */
    2310            0 :     wasonly = (P_LEFTMOST(lpageop) && P_RIGHTMOST(rpageop));
    2311              : 
    2312            0 :     INJECTION_POINT("nbtree-finish-incomplete-split", NULL);
    2313            0 :     elog(DEBUG1, "finishing incomplete split of %u/%u",
    2314              :          BufferGetBlockNumber(lbuf), BufferGetBlockNumber(rbuf));
    2315              : 
    2316            0 :     _bt_insert_parent(rel, heaprel, lbuf, rbuf, stack, wasroot, wasonly);
    2317            0 : }
    2318              : 
    2319              : /*
    2320              :  *  _bt_getstackbuf() -- Walk back up the tree one step, and find the pivot
    2321              :  *                       tuple whose downlink points to child page.
    2322              :  *
    2323              :  *      Caller passes child's block number, which is used to identify
    2324              :  *      associated pivot tuple in parent page using a linear search that
    2325              :  *      matches on pivot's downlink/block number.  The expected location of
    2326              :  *      the pivot tuple is taken from the stack one level above the child
    2327              :  *      page.  This is used as a starting point.  Insertions into the
    2328              :  *      parent level could cause the pivot tuple to move right; deletions
    2329              :  *      could cause it to move left, but not left of the page we previously
    2330              :  *      found it on.
    2331              :  *
    2332              :  *      Caller can use its stack to relocate the pivot tuple/downlink for
    2333              :  *      any same-level page to the right of the page found by its initial
    2334              :  *      descent.  This is necessary because of the possibility that caller
    2335              :  *      moved right to recover from a concurrent page split.  It's also
    2336              :  *      convenient for certain callers to be able to step right when there
    2337              :  *      wasn't a concurrent page split, while still using their original
    2338              :  *      stack.  For example, the checkingunique _bt_doinsert() case may
    2339              :  *      have to step right when there are many physical duplicates, and its
    2340              :  *      scantid forces an insertion to the right of the "first page the
    2341              :  *      value could be on".  (This is also relied on by all of our callers
    2342              :  *      when dealing with !heapkeyspace indexes.)
    2343              :  *
    2344              :  *      Returns write-locked parent page buffer, or InvalidBuffer if pivot
    2345              :  *      tuple not found (should not happen).  Adjusts bts_blkno &
    2346              :  *      bts_offset if changed.  Page split caller should insert its new
    2347              :  *      pivot tuple for its new right sibling page on parent page, at the
    2348              :  *      offset number bts_offset + 1.
    2349              :  */
    2350              : Buffer
    2351        16982 : _bt_getstackbuf(Relation rel, Relation heaprel, BTStack stack, BlockNumber child)
    2352              : {
    2353              :     BlockNumber blkno;
    2354              :     OffsetNumber start;
    2355              : 
    2356        16982 :     blkno = stack->bts_blkno;
    2357        16982 :     start = stack->bts_offset;
    2358              : 
    2359              :     for (;;)
    2360            2 :     {
    2361              :         Buffer      buf;
    2362              :         Page        page;
    2363              :         BTPageOpaque opaque;
    2364              : 
    2365        16984 :         buf = _bt_getbuf(rel, blkno, BT_WRITE);
    2366        16984 :         page = BufferGetPage(buf);
    2367        16984 :         opaque = BTPageGetOpaque(page);
    2368              : 
    2369              :         Assert(heaprel != NULL);
    2370        16984 :         if (P_INCOMPLETE_SPLIT(opaque))
    2371              :         {
    2372            0 :             _bt_finish_split(rel, heaprel, buf, stack->bts_parent);
    2373            0 :             continue;
    2374              :         }
    2375              : 
    2376        16984 :         if (!P_IGNORE(opaque))
    2377              :         {
    2378              :             OffsetNumber offnum,
    2379              :                         minoff,
    2380              :                         maxoff;
    2381              :             ItemId      itemid;
    2382              :             IndexTuple  item;
    2383              : 
    2384        16982 :             minoff = P_FIRSTDATAKEY(opaque);
    2385        16982 :             maxoff = PageGetMaxOffsetNumber(page);
    2386              : 
    2387              :             /*
    2388              :              * start = InvalidOffsetNumber means "search the whole page". We
    2389              :              * need this test anyway due to possibility that page has a high
    2390              :              * key now when it didn't before.
    2391              :              */
    2392        16982 :             if (start < minoff)
    2393           18 :                 start = minoff;
    2394              : 
    2395              :             /*
    2396              :              * Need this check too, to guard against possibility that page
    2397              :              * split since we visited it originally.
    2398              :              */
    2399        16982 :             if (start > maxoff)
    2400            0 :                 start = OffsetNumberNext(maxoff);
    2401              : 
    2402              :             /*
    2403              :              * These loops will check every item on the page --- but in an
    2404              :              * order that's attuned to the probability of where it actually
    2405              :              * is.  Scan to the right first, then to the left.
    2406              :              */
    2407        16982 :             for (offnum = start;
    2408        17044 :                  offnum <= maxoff;
    2409           62 :                  offnum = OffsetNumberNext(offnum))
    2410              :             {
    2411        17044 :                 itemid = PageGetItemId(page, offnum);
    2412        17044 :                 item = (IndexTuple) PageGetItem(page, itemid);
    2413              : 
    2414        17044 :                 if (BTreeTupleGetDownLink(item) == child)
    2415              :                 {
    2416              :                     /* Return accurate pointer to where link is now */
    2417        16982 :                     stack->bts_blkno = blkno;
    2418        16982 :                     stack->bts_offset = offnum;
    2419        16982 :                     return buf;
    2420              :                 }
    2421              :             }
    2422              : 
    2423            0 :             for (offnum = OffsetNumberPrev(start);
    2424            0 :                  offnum >= minoff;
    2425            0 :                  offnum = OffsetNumberPrev(offnum))
    2426              :             {
    2427            0 :                 itemid = PageGetItemId(page, offnum);
    2428            0 :                 item = (IndexTuple) PageGetItem(page, itemid);
    2429              : 
    2430            0 :                 if (BTreeTupleGetDownLink(item) == child)
    2431              :                 {
    2432              :                     /* Return accurate pointer to where link is now */
    2433            0 :                     stack->bts_blkno = blkno;
    2434            0 :                     stack->bts_offset = offnum;
    2435            0 :                     return buf;
    2436              :                 }
    2437              :             }
    2438              :         }
    2439              : 
    2440              :         /*
    2441              :          * The item we're looking for moved right at least one page.
    2442              :          *
    2443              :          * Lehman and Yao couple/chain locks when moving right here, which we
    2444              :          * can avoid.  See nbtree/README.
    2445              :          */
    2446            2 :         if (P_RIGHTMOST(opaque))
    2447              :         {
    2448            0 :             _bt_relbuf(rel, buf);
    2449            0 :             return InvalidBuffer;
    2450              :         }
    2451            2 :         blkno = opaque->btpo_next;
    2452            2 :         start = InvalidOffsetNumber;
    2453            2 :         _bt_relbuf(rel, buf);
    2454              :     }
    2455              : }
    2456              : 
    2457              : /*
    2458              :  * _bt_freestack() -- free a retracement stack made by _bt_search_insert.
    2459              :  */
    2460              : static void
    2461      4081338 : _bt_freestack(BTStack stack)
    2462              : {
    2463              :     BTStack     ostack;
    2464              : 
    2465      8219136 :     while (stack != NULL)
    2466              :     {
    2467      4137798 :         ostack = stack;
    2468      4137798 :         stack = stack->bts_parent;
    2469      4137798 :         pfree(ostack);
    2470              :     }
    2471      4081338 : }
    2472              : 
    2473              : /*
    2474              :  *  _bt_newlevel() -- Create a new level above root page.
    2475              :  *
    2476              :  *      We've just split the old root page and need to create a new one.
    2477              :  *      In order to do this, we add a new root page to the file, then lock
    2478              :  *      the metadata page and update it.  This is guaranteed to be deadlock-
    2479              :  *      free, because all readers release their locks on the metadata page
    2480              :  *      before trying to lock the root, and all writers lock the root before
    2481              :  *      trying to lock the metadata page.  We have a write lock on the old
    2482              :  *      root page, so we have not introduced any cycles into the waits-for
    2483              :  *      graph.
    2484              :  *
    2485              :  *      On entry, lbuf (the old root) and rbuf (its new peer) are write-
    2486              :  *      locked. On exit, a new root page exists with entries for the
    2487              :  *      two new children, metapage is updated and unlocked/unpinned.
    2488              :  *      The new root buffer is returned to caller which has to unlock/unpin
    2489              :  *      lbuf, rbuf & rootbuf.
    2490              :  */
    2491              : static Buffer
    2492          772 : _bt_newlevel(Relation rel, Relation heaprel, Buffer lbuf, Buffer rbuf)
    2493              : {
    2494              :     Buffer      rootbuf;
    2495              :     Page        lpage,
    2496              :                 rootpage;
    2497              :     BlockNumber lbkno,
    2498              :                 rbkno;
    2499              :     BlockNumber rootblknum;
    2500              :     BTPageOpaque rootopaque;
    2501              :     BTPageOpaque lopaque;
    2502              :     ItemId      itemid;
    2503              :     IndexTuple  item;
    2504              :     IndexTuple  left_item;
    2505              :     Size        left_item_sz;
    2506              :     IndexTuple  right_item;
    2507              :     Size        right_item_sz;
    2508              :     Buffer      metabuf;
    2509              :     Page        metapg;
    2510              :     BTMetaPageData *metad;
    2511              :     XLogRecPtr  recptr;
    2512              : 
    2513          772 :     lbkno = BufferGetBlockNumber(lbuf);
    2514          772 :     rbkno = BufferGetBlockNumber(rbuf);
    2515          772 :     lpage = BufferGetPage(lbuf);
    2516          772 :     lopaque = BTPageGetOpaque(lpage);
    2517              : 
    2518              :     /* get a new root page */
    2519          772 :     rootbuf = _bt_allocbuf(rel, heaprel);
    2520          772 :     rootpage = BufferGetPage(rootbuf);
    2521          772 :     rootblknum = BufferGetBlockNumber(rootbuf);
    2522              : 
    2523              :     /* acquire lock on the metapage */
    2524          772 :     metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
    2525          772 :     metapg = BufferGetPage(metabuf);
    2526          772 :     metad = BTPageGetMeta(metapg);
    2527              : 
    2528              :     /*
    2529              :      * Create downlink item for left page (old root).  The key value used is
    2530              :      * "minus infinity", a sentinel value that's reliably less than any real
    2531              :      * key value that could appear in the left page.
    2532              :      */
    2533          772 :     left_item_sz = sizeof(IndexTupleData);
    2534          772 :     left_item = (IndexTuple) palloc(left_item_sz);
    2535          772 :     left_item->t_info = left_item_sz;
    2536          772 :     BTreeTupleSetDownLink(left_item, lbkno);
    2537          772 :     BTreeTupleSetNAtts(left_item, 0, false);
    2538              : 
    2539              :     /*
    2540              :      * Create downlink item for right page.  The key for it is obtained from
    2541              :      * the "high key" position in the left page.
    2542              :      */
    2543          772 :     itemid = PageGetItemId(lpage, P_HIKEY);
    2544          772 :     right_item_sz = ItemIdGetLength(itemid);
    2545          772 :     item = (IndexTuple) PageGetItem(lpage, itemid);
    2546          772 :     right_item = CopyIndexTuple(item);
    2547          772 :     BTreeTupleSetDownLink(right_item, rbkno);
    2548              : 
    2549              :     /* NO EREPORT(ERROR) from here till newroot op is logged */
    2550          772 :     START_CRIT_SECTION();
    2551              : 
    2552              :     /* upgrade metapage if needed */
    2553          772 :     if (metad->btm_version < BTREE_NOVAC_VERSION)
    2554            0 :         _bt_upgrademetapage(metapg);
    2555              : 
    2556              :     /* set btree special data */
    2557          772 :     rootopaque = BTPageGetOpaque(rootpage);
    2558          772 :     rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
    2559          772 :     rootopaque->btpo_flags = BTP_ROOT;
    2560          772 :     rootopaque->btpo_level =
    2561          772 :         (BTPageGetOpaque(lpage))->btpo_level + 1;
    2562          772 :     rootopaque->btpo_cycleid = 0;
    2563              : 
    2564              :     /* update metapage data */
    2565          772 :     metad->btm_root = rootblknum;
    2566          772 :     metad->btm_level = rootopaque->btpo_level;
    2567          772 :     metad->btm_fastroot = rootblknum;
    2568          772 :     metad->btm_fastlevel = rootopaque->btpo_level;
    2569              : 
    2570              :     /*
    2571              :      * Insert the left page pointer into the new root page.  The root page is
    2572              :      * the rightmost page on its level so there is no "high key" in it; the
    2573              :      * two items will go into positions P_HIKEY and P_FIRSTKEY.
    2574              :      *
    2575              :      * Note: we *must* insert the two items in item-number order, for the
    2576              :      * benefit of _bt_restore_page().
    2577              :      */
    2578              :     Assert(BTreeTupleGetNAtts(left_item, rel) == 0);
    2579          772 :     if (PageAddItem(rootpage, left_item, left_item_sz, P_HIKEY, false, false) == InvalidOffsetNumber)
    2580            0 :         elog(PANIC, "failed to add leftkey to new root page"
    2581              :              " while splitting block %u of index \"%s\"",
    2582              :              BufferGetBlockNumber(lbuf), RelationGetRelationName(rel));
    2583              : 
    2584              :     /*
    2585              :      * insert the right page pointer into the new root page.
    2586              :      */
    2587              :     Assert(BTreeTupleGetNAtts(right_item, rel) > 0);
    2588              :     Assert(BTreeTupleGetNAtts(right_item, rel) <=
    2589              :            IndexRelationGetNumberOfKeyAttributes(rel));
    2590          772 :     if (PageAddItem(rootpage, right_item, right_item_sz, P_FIRSTKEY, false, false) == InvalidOffsetNumber)
    2591            0 :         elog(PANIC, "failed to add rightkey to new root page"
    2592              :              " while splitting block %u of index \"%s\"",
    2593              :              BufferGetBlockNumber(lbuf), RelationGetRelationName(rel));
    2594              : 
    2595              :     /* Clear the incomplete-split flag in the left child */
    2596              :     Assert(P_INCOMPLETE_SPLIT(lopaque));
    2597          772 :     lopaque->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
    2598          772 :     MarkBufferDirty(lbuf);
    2599              : 
    2600          772 :     MarkBufferDirty(rootbuf);
    2601          772 :     MarkBufferDirty(metabuf);
    2602              : 
    2603              :     /* XLOG stuff */
    2604          772 :     if (RelationNeedsWAL(rel))
    2605          751 :     {
    2606              :         xl_btree_newroot xlrec;
    2607              :         xl_btree_metadata md;
    2608              : 
    2609          751 :         xlrec.rootblk = rootblknum;
    2610          751 :         xlrec.level = metad->btm_level;
    2611              : 
    2612          751 :         XLogBeginInsert();
    2613          751 :         XLogRegisterData(&xlrec, SizeOfBtreeNewroot);
    2614              : 
    2615          751 :         XLogRegisterBuffer(0, rootbuf, REGBUF_WILL_INIT);
    2616          751 :         XLogRegisterBuffer(1, lbuf, REGBUF_STANDARD);
    2617          751 :         XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
    2618              : 
    2619              :         Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
    2620          751 :         md.version = metad->btm_version;
    2621          751 :         md.root = rootblknum;
    2622          751 :         md.level = metad->btm_level;
    2623          751 :         md.fastroot = rootblknum;
    2624          751 :         md.fastlevel = metad->btm_level;
    2625          751 :         md.last_cleanup_num_delpages = metad->btm_last_cleanup_num_delpages;
    2626          751 :         md.allequalimage = metad->btm_allequalimage;
    2627              : 
    2628          751 :         XLogRegisterBufData(2, &md, sizeof(xl_btree_metadata));
    2629              : 
    2630              :         /*
    2631              :          * Direct access to page is not good but faster - we should implement
    2632              :          * some new func in page API.
    2633              :          */
    2634          751 :         XLogRegisterBufData(0,
    2635          751 :                             (char *) rootpage + ((PageHeader) rootpage)->pd_upper,
    2636          751 :                             ((PageHeader) rootpage)->pd_special -
    2637          751 :                             ((PageHeader) rootpage)->pd_upper);
    2638              : 
    2639          751 :         recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT);
    2640              :     }
    2641              :     else
    2642           21 :         recptr = XLogGetFakeLSN(rel);
    2643              : 
    2644          772 :     PageSetLSN(lpage, recptr);
    2645          772 :     PageSetLSN(rootpage, recptr);
    2646          772 :     PageSetLSN(metapg, recptr);
    2647              : 
    2648          772 :     END_CRIT_SECTION();
    2649              : 
    2650              :     /* done with metapage */
    2651          772 :     _bt_relbuf(rel, metabuf);
    2652              : 
    2653          772 :     pfree(left_item);
    2654          772 :     pfree(right_item);
    2655              : 
    2656          772 :     return rootbuf;
    2657              : }
    2658              : 
    2659              : /*
    2660              :  *  _bt_pgaddtup() -- add a data item to a particular page during split.
    2661              :  *
    2662              :  *      The difference between this routine and a bare PageAddItem call is
    2663              :  *      that this code can deal with the first data item on an internal btree
    2664              :  *      page in passing.  This data item (which is called "firstright" within
    2665              :  *      _bt_split()) has a key that must be treated as minus infinity after
    2666              :  *      the split.  Therefore, we truncate away all attributes when caller
    2667              :  *      specifies it's the first data item on page (downlink is not changed,
    2668              :  *      though).  This extra step is only needed for the right page of an
    2669              :  *      internal page split.  There is no need to do this for the first data
    2670              :  *      item on the existing/left page, since that will already have been
    2671              :  *      truncated during an earlier page split.
    2672              :  *
    2673              :  *      See _bt_split() for a high level explanation of why we truncate here.
    2674              :  *      Note that this routine has nothing to do with suffix truncation,
    2675              :  *      despite using some of the same infrastructure.
    2676              :  */
    2677              : static inline bool
    2678      4358535 : _bt_pgaddtup(Page page,
    2679              :              Size itemsize,
    2680              :              const IndexTupleData *itup,
    2681              :              OffsetNumber itup_off,
    2682              :              bool newfirstdataitem)
    2683              : {
    2684              :     IndexTupleData trunctuple;
    2685              : 
    2686      4358535 :     if (newfirstdataitem)
    2687              :     {
    2688          208 :         trunctuple = *itup;
    2689          208 :         trunctuple.t_info = sizeof(IndexTupleData);
    2690          208 :         BTreeTupleSetNAtts(&trunctuple, 0, false);
    2691          208 :         itup = &trunctuple;
    2692          208 :         itemsize = sizeof(IndexTupleData);
    2693              :     }
    2694              : 
    2695      4358535 :     if (unlikely(PageAddItem(page, itup, itemsize, itup_off, false, false) == InvalidOffsetNumber))
    2696            0 :         return false;
    2697              : 
    2698      4358535 :     return true;
    2699              : }
    2700              : 
    2701              : /*
    2702              :  * _bt_delete_or_dedup_one_page - Try to avoid a leaf page split.
    2703              :  *
    2704              :  * There are three operations performed here: simple index deletion, bottom-up
    2705              :  * index deletion, and deduplication.  If all three operations fail to free
    2706              :  * enough space for the incoming item then caller will go on to split the
    2707              :  * page.  We always consider simple deletion first.  If that doesn't work out
    2708              :  * we consider alternatives.  Callers that only want us to consider simple
    2709              :  * deletion (without any fallback) ask for that using the 'simpleonly'
    2710              :  * argument.
    2711              :  *
    2712              :  * We usually pick only one alternative "complex" operation when simple
    2713              :  * deletion alone won't prevent a page split.  The 'checkingunique',
    2714              :  * 'uniquedup', and 'indexUnchanged' arguments are used for that.
    2715              :  *
    2716              :  * Note: We used to only delete LP_DEAD items when the BTP_HAS_GARBAGE page
    2717              :  * level flag was found set.  The flag was useful back when there wasn't
    2718              :  * necessarily one single page for a duplicate tuple to go on (before heap TID
    2719              :  * became a part of the key space in version 4 indexes).  But we don't
    2720              :  * actually look at the flag anymore (it's not a gating condition for our
    2721              :  * caller).  That would cause us to miss tuples that are safe to delete,
    2722              :  * without getting any benefit in return.  We know that the alternative is to
    2723              :  * split the page; scanning the line pointer array in passing won't have
    2724              :  * noticeable overhead.  (We still maintain the BTP_HAS_GARBAGE flag despite
    2725              :  * all this because !heapkeyspace indexes must still do a "getting tired"
    2726              :  * linear search, and so are likely to get some benefit from using it as a
    2727              :  * gating condition.)
    2728              :  */
    2729              : static void
    2730        33226 : _bt_delete_or_dedup_one_page(Relation rel, Relation heapRel,
    2731              :                              BTInsertState insertstate,
    2732              :                              bool simpleonly, bool checkingunique,
    2733              :                              bool uniquedup, bool indexUnchanged)
    2734              : {
    2735              :     OffsetNumber deletable[MaxIndexTuplesPerPage];
    2736        33226 :     int         ndeletable = 0;
    2737              :     OffsetNumber offnum,
    2738              :                 minoff,
    2739              :                 maxoff;
    2740        33226 :     Buffer      buffer = insertstate->buf;
    2741        33226 :     BTScanInsert itup_key = insertstate->itup_key;
    2742        33226 :     Page        page = BufferGetPage(buffer);
    2743        33226 :     BTPageOpaque opaque = BTPageGetOpaque(page);
    2744              : 
    2745              :     Assert(P_ISLEAF(opaque));
    2746              :     Assert(simpleonly || itup_key->heapkeyspace);
    2747              :     Assert(!simpleonly || (!checkingunique && !uniquedup && !indexUnchanged));
    2748              : 
    2749              :     /*
    2750              :      * Scan over all items to see which ones need to be deleted according to
    2751              :      * LP_DEAD flags.  We'll usually manage to delete a few extra items that
    2752              :      * are not marked LP_DEAD in passing.  Often the extra items that actually
    2753              :      * end up getting deleted are items that would have had their LP_DEAD bit
    2754              :      * set before long anyway (if we opted not to include them as extras).
    2755              :      */
    2756        33226 :     minoff = P_FIRSTDATAKEY(opaque);
    2757        33226 :     maxoff = PageGetMaxOffsetNumber(page);
    2758        33226 :     for (offnum = minoff;
    2759      8941203 :          offnum <= maxoff;
    2760      8907977 :          offnum = OffsetNumberNext(offnum))
    2761              :     {
    2762      8907977 :         ItemId      itemId = PageGetItemId(page, offnum);
    2763              : 
    2764      8907977 :         if (ItemIdIsDead(itemId))
    2765       155730 :             deletable[ndeletable++] = offnum;
    2766              :     }
    2767              : 
    2768        33226 :     if (ndeletable > 0)
    2769              :     {
    2770         5210 :         _bt_simpledel_pass(rel, buffer, heapRel, deletable, ndeletable,
    2771              :                            insertstate->itup, minoff, maxoff);
    2772         5210 :         insertstate->bounds_valid = false;
    2773              : 
    2774              :         /* Return when a page split has already been avoided */
    2775         5210 :         if (PageGetFreeSpace(page) >= insertstate->itemsz)
    2776        15023 :             return;
    2777              : 
    2778              :         /* Might as well assume duplicates (if checkingunique) */
    2779           53 :         uniquedup = true;
    2780              :     }
    2781              : 
    2782              :     /*
    2783              :      * We're done with simple deletion.  Return early with callers that only
    2784              :      * call here so that simple deletion can be considered.  This includes
    2785              :      * callers that explicitly ask for this and checkingunique callers that
    2786              :      * probably don't have any version churn duplicates on the page.
    2787              :      *
    2788              :      * Note: The page's BTP_HAS_GARBAGE hint flag may still be set when we
    2789              :      * return at this point (or when we go on the try either or both of our
    2790              :      * other strategies and they also fail).  We do not bother expending a
    2791              :      * separate write to clear it, however.  Caller will definitely clear it
    2792              :      * when it goes on to split the page (note also that the deduplication
    2793              :      * process will clear the flag in passing, just to keep things tidy).
    2794              :      */
    2795        28069 :     if (simpleonly || (checkingunique && !uniquedup))
    2796              :     {
    2797              :         Assert(!indexUnchanged);
    2798         9596 :         return;
    2799              :     }
    2800              : 
    2801              :     /* Assume bounds about to be invalidated (this is almost certain now) */
    2802        18473 :     insertstate->bounds_valid = false;
    2803              : 
    2804              :     /*
    2805              :      * Perform bottom-up index deletion pass when executor hint indicated that
    2806              :      * incoming item is logically unchanged, or for a unique index that is
    2807              :      * known to have physical duplicates for some other reason.  (There is a
    2808              :      * large overlap between these two cases for a unique index.  It's worth
    2809              :      * having both triggering conditions in order to apply the optimization in
    2810              :      * the event of successive related INSERT and DELETE statements.)
    2811              :      *
    2812              :      * We'll go on to do a deduplication pass when a bottom-up pass fails to
    2813              :      * delete an acceptable amount of free space (a significant fraction of
    2814              :      * the page, or space for the new item, whichever is greater).
    2815              :      *
    2816              :      * Note: Bottom-up index deletion uses the same equality/equivalence
    2817              :      * routines as deduplication internally.  However, it does not merge
    2818              :      * together index tuples, so the same correctness considerations do not
    2819              :      * apply.  We deliberately omit an index-is-allequalimage test here.
    2820              :      */
    2821        20866 :     if ((indexUnchanged || uniquedup) &&
    2822         2393 :         _bt_bottomupdel_pass(rel, buffer, heapRel, insertstate->itemsz))
    2823          270 :         return;
    2824              : 
    2825              :     /* Perform deduplication pass (when enabled and index-is-allequalimage) */
    2826        18203 :     if (BTGetDeduplicateItems(rel) && itup_key->allequalimage)
    2827        18191 :         _bt_dedup_pass(rel, buffer, insertstate->itup, insertstate->itemsz,
    2828        18191 :                        (indexUnchanged || uniquedup));
    2829              : }
    2830              : 
    2831              : /*
    2832              :  * _bt_simpledel_pass - Simple index tuple deletion pass.
    2833              :  *
    2834              :  * We delete all LP_DEAD-set index tuples on a leaf page.  The offset numbers
    2835              :  * of all such tuples are determined by caller (caller passes these to us as
    2836              :  * its 'deletable' argument).
    2837              :  *
    2838              :  * We might also delete extra index tuples that turn out to be safe to delete
    2839              :  * in passing (though they must be cheap to check in passing to begin with).
    2840              :  * There is no certainty that any extra tuples will be deleted, though.  The
    2841              :  * high level goal of the approach we take is to get the most out of each call
    2842              :  * here (without noticeably increasing the per-call overhead compared to what
    2843              :  * we need to do just to be able to delete the page's LP_DEAD-marked index
    2844              :  * tuples).
    2845              :  *
    2846              :  * The number of extra index tuples that turn out to be deletable might
    2847              :  * greatly exceed the number of LP_DEAD-marked index tuples due to various
    2848              :  * locality related effects.  For example, it's possible that the total number
    2849              :  * of table blocks (pointed to by all TIDs on the leaf page) is naturally
    2850              :  * quite low, in which case we might end up checking if it's possible to
    2851              :  * delete _most_ index tuples on the page (without the tableam needing to
    2852              :  * access additional table blocks).  The tableam will sometimes stumble upon
    2853              :  * _many_ extra deletable index tuples in indexes where this pattern is
    2854              :  * common.
    2855              :  *
    2856              :  * See nbtree/README for further details on simple index tuple deletion.
    2857              :  */
    2858              : static void
    2859         5210 : _bt_simpledel_pass(Relation rel, Buffer buffer, Relation heapRel,
    2860              :                    OffsetNumber *deletable, int ndeletable, IndexTuple newitem,
    2861              :                    OffsetNumber minoff, OffsetNumber maxoff)
    2862              : {
    2863         5210 :     Page        page = BufferGetPage(buffer);
    2864              :     BlockNumber *deadblocks;
    2865              :     int         ndeadblocks;
    2866              :     TM_IndexDeleteOp delstate;
    2867              :     OffsetNumber offnum;
    2868              : 
    2869              :     /* Get array of table blocks pointed to by LP_DEAD-set tuples */
    2870         5210 :     deadblocks = _bt_deadblocks(page, deletable, ndeletable, newitem,
    2871              :                                 &ndeadblocks);
    2872              : 
    2873              :     /* Initialize tableam state that describes index deletion operation */
    2874         5210 :     delstate.irel = rel;
    2875         5210 :     delstate.iblknum = BufferGetBlockNumber(buffer);
    2876         5210 :     delstate.bottomup = false;
    2877         5210 :     delstate.bottomupfreespace = 0;
    2878         5210 :     delstate.ndeltids = 0;
    2879         5210 :     delstate.deltids = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexDelete));
    2880         5210 :     delstate.status = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexStatus));
    2881              : 
    2882         5210 :     for (offnum = minoff;
    2883      1496264 :          offnum <= maxoff;
    2884      1491054 :          offnum = OffsetNumberNext(offnum))
    2885              :     {
    2886      1491054 :         ItemId      itemid = PageGetItemId(page, offnum);
    2887      1491054 :         IndexTuple  itup = (IndexTuple) PageGetItem(page, itemid);
    2888      1491054 :         TM_IndexDelete *odeltid = &delstate.deltids[delstate.ndeltids];
    2889      1491054 :         TM_IndexStatus *ostatus = &delstate.status[delstate.ndeltids];
    2890              :         BlockNumber tidblock;
    2891              :         void       *match;
    2892              : 
    2893      1491054 :         if (!BTreeTupleIsPosting(itup))
    2894              :         {
    2895      1424252 :             tidblock = ItemPointerGetBlockNumber(&itup->t_tid);
    2896      1424252 :             match = bsearch(&tidblock, deadblocks, ndeadblocks,
    2897              :                             sizeof(BlockNumber), _bt_blk_cmp);
    2898              : 
    2899      1424252 :             if (!match)
    2900              :             {
    2901              :                 Assert(!ItemIdIsDead(itemid));
    2902       931245 :                 continue;
    2903              :             }
    2904              : 
    2905              :             /*
    2906              :              * TID's table block is among those pointed to by the TIDs from
    2907              :              * LP_DEAD-bit set tuples on page -- add TID to deltids
    2908              :              */
    2909       493007 :             odeltid->tid = itup->t_tid;
    2910       493007 :             odeltid->id = delstate.ndeltids;
    2911       493007 :             ostatus->idxoffnum = offnum;
    2912       493007 :             ostatus->knowndeletable = ItemIdIsDead(itemid);
    2913       493007 :             ostatus->promising = false; /* unused */
    2914       493007 :             ostatus->freespace = 0; /* unused */
    2915              : 
    2916       493007 :             delstate.ndeltids++;
    2917              :         }
    2918              :         else
    2919              :         {
    2920        66802 :             int         nitem = BTreeTupleGetNPosting(itup);
    2921              : 
    2922       314896 :             for (int p = 0; p < nitem; p++)
    2923              :             {
    2924       248094 :                 ItemPointer tid = BTreeTupleGetPostingN(itup, p);
    2925              : 
    2926       248094 :                 tidblock = ItemPointerGetBlockNumber(tid);
    2927       248094 :                 match = bsearch(&tidblock, deadblocks, ndeadblocks,
    2928              :                                 sizeof(BlockNumber), _bt_blk_cmp);
    2929              : 
    2930       248094 :                 if (!match)
    2931              :                 {
    2932              :                     Assert(!ItemIdIsDead(itemid));
    2933       218480 :                     continue;
    2934              :                 }
    2935              : 
    2936              :                 /*
    2937              :                  * TID's table block is among those pointed to by the TIDs
    2938              :                  * from LP_DEAD-bit set tuples on page -- add TID to deltids
    2939              :                  */
    2940        29614 :                 odeltid->tid = *tid;
    2941        29614 :                 odeltid->id = delstate.ndeltids;
    2942        29614 :                 ostatus->idxoffnum = offnum;
    2943        29614 :                 ostatus->knowndeletable = ItemIdIsDead(itemid);
    2944        29614 :                 ostatus->promising = false; /* unused */
    2945        29614 :                 ostatus->freespace = 0; /* unused */
    2946              : 
    2947        29614 :                 odeltid++;
    2948        29614 :                 ostatus++;
    2949        29614 :                 delstate.ndeltids++;
    2950              :             }
    2951              :         }
    2952              :     }
    2953              : 
    2954         5210 :     pfree(deadblocks);
    2955              : 
    2956              :     Assert(delstate.ndeltids >= ndeletable);
    2957              : 
    2958              :     /* Physically delete LP_DEAD tuples (plus any delete-safe extra TIDs) */
    2959         5210 :     _bt_delitems_delete_check(rel, buffer, heapRel, &delstate);
    2960              : 
    2961         5210 :     pfree(delstate.deltids);
    2962         5210 :     pfree(delstate.status);
    2963         5210 : }
    2964              : 
    2965              : /*
    2966              :  * _bt_deadblocks() -- Get LP_DEAD related table blocks.
    2967              :  *
    2968              :  * Builds sorted and unique-ified array of table block numbers from index
    2969              :  * tuple TIDs whose line pointers are marked LP_DEAD.  Also adds the table
    2970              :  * block from incoming newitem just in case it isn't among the LP_DEAD-related
    2971              :  * table blocks.
    2972              :  *
    2973              :  * Always counting the newitem's table block as an LP_DEAD related block makes
    2974              :  * sense because the cost is consistently low; it is practically certain that
    2975              :  * the table block will not incur a buffer miss in tableam.  On the other hand
    2976              :  * the benefit is often quite high.  There is a decent chance that there will
    2977              :  * be some deletable items from this block, since in general most garbage
    2978              :  * tuples became garbage in the recent past (in many cases this won't be the
    2979              :  * first logical row that core code added to/modified in table block
    2980              :  * recently).
    2981              :  *
    2982              :  * Returns final array, and sets *nblocks to its final size for caller.
    2983              :  */
    2984              : static BlockNumber *
    2985         5210 : _bt_deadblocks(Page page, OffsetNumber *deletable, int ndeletable,
    2986              :                IndexTuple newitem, int *nblocks)
    2987              : {
    2988              :     int         spacentids,
    2989              :                 ntids;
    2990              :     BlockNumber *tidblocks;
    2991              : 
    2992              :     /*
    2993              :      * Accumulate each TID's block in array whose initial size has space for
    2994              :      * one table block per LP_DEAD-set tuple (plus space for the newitem table
    2995              :      * block).  Array will only need to grow when there are LP_DEAD-marked
    2996              :      * posting list tuples (which is not that common).
    2997              :      */
    2998         5210 :     spacentids = ndeletable + 1;
    2999         5210 :     ntids = 0;
    3000         5210 :     tidblocks = palloc_array(BlockNumber, spacentids);
    3001              : 
    3002              :     /*
    3003              :      * First add the table block for the incoming newitem.  This is the one
    3004              :      * case where simple deletion can visit a table block that doesn't have
    3005              :      * any known deletable items.
    3006              :      */
    3007              :     Assert(!BTreeTupleIsPosting(newitem) && !BTreeTupleIsPivot(newitem));
    3008         5210 :     tidblocks[ntids++] = ItemPointerGetBlockNumber(&newitem->t_tid);
    3009              : 
    3010       160940 :     for (int i = 0; i < ndeletable; i++)
    3011              :     {
    3012       155730 :         ItemId      itemid = PageGetItemId(page, deletable[i]);
    3013       155730 :         IndexTuple  itup = (IndexTuple) PageGetItem(page, itemid);
    3014              : 
    3015              :         Assert(ItemIdIsDead(itemid));
    3016              : 
    3017       155730 :         if (!BTreeTupleIsPosting(itup))
    3018              :         {
    3019       151233 :             if (ntids + 1 > spacentids)
    3020              :             {
    3021          133 :                 spacentids *= 2;
    3022              :                 tidblocks = (BlockNumber *)
    3023          133 :                     repalloc(tidblocks, sizeof(BlockNumber) * spacentids);
    3024              :             }
    3025              : 
    3026       151233 :             tidblocks[ntids++] = ItemPointerGetBlockNumber(&itup->t_tid);
    3027              :         }
    3028              :         else
    3029              :         {
    3030         4497 :             int         nposting = BTreeTupleGetNPosting(itup);
    3031              : 
    3032         4497 :             if (ntids + nposting > spacentids)
    3033              :             {
    3034          119 :                 spacentids = Max(spacentids * 2, ntids + nposting);
    3035              :                 tidblocks = (BlockNumber *)
    3036          119 :                     repalloc(tidblocks, sizeof(BlockNumber) * spacentids);
    3037              :             }
    3038              : 
    3039        15631 :             for (int j = 0; j < nposting; j++)
    3040              :             {
    3041        11134 :                 ItemPointer tid = BTreeTupleGetPostingN(itup, j);
    3042              : 
    3043        11134 :                 tidblocks[ntids++] = ItemPointerGetBlockNumber(tid);
    3044              :             }
    3045              :         }
    3046              :     }
    3047              : 
    3048         5210 :     qsort(tidblocks, ntids, sizeof(BlockNumber), _bt_blk_cmp);
    3049         5210 :     *nblocks = qunique(tidblocks, ntids, sizeof(BlockNumber), _bt_blk_cmp);
    3050              : 
    3051         5210 :     return tidblocks;
    3052              : }
    3053              : 
    3054              : /*
    3055              :  * _bt_blk_cmp() -- qsort comparison function for _bt_simpledel_pass
    3056              :  */
    3057              : static inline int
    3058      3637992 : _bt_blk_cmp(const void *arg1, const void *arg2)
    3059              : {
    3060      3637992 :     BlockNumber b1 = *((const BlockNumber *) arg1);
    3061      3637992 :     BlockNumber b2 = *((const BlockNumber *) arg2);
    3062              : 
    3063      3637992 :     return pg_cmp_u32(b1, b2);
    3064              : }
        

Generated by: LCOV version 2.0-1