LCOV - code coverage report
Current view: top level - src/backend/access/nbtree - nbtdedup.c (source / functions) Hit Total Coverage
Test: PostgreSQL 18devel Lines: 284 291 97.6 %
Date: 2025-01-18 04:15:08 Functions: 11 11 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * nbtdedup.c
       4             :  *    Deduplicate or bottom-up delete items in Postgres btrees.
       5             :  *
       6             :  * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
       7             :  * Portions Copyright (c) 1994, Regents of the University of California
       8             :  *
       9             :  *
      10             :  * IDENTIFICATION
      11             :  *    src/backend/access/nbtree/nbtdedup.c
      12             :  *
      13             :  *-------------------------------------------------------------------------
      14             :  */
      15             : #include "postgres.h"
      16             : 
      17             : #include "access/nbtree.h"
      18             : #include "access/nbtxlog.h"
      19             : #include "access/xloginsert.h"
      20             : #include "miscadmin.h"
      21             : #include "utils/rel.h"
      22             : 
      23             : static void _bt_bottomupdel_finish_pending(Page page, BTDedupState state,
      24             :                                            TM_IndexDeleteOp *delstate);
      25             : static bool _bt_do_singleval(Relation rel, Page page, BTDedupState state,
      26             :                              OffsetNumber minoff, IndexTuple newitem);
      27             : static void _bt_singleval_fillfactor(Page page, BTDedupState state,
      28             :                                      Size newitemsz);
      29             : #ifdef USE_ASSERT_CHECKING
      30             : static bool _bt_posting_valid(IndexTuple posting);
      31             : #endif
      32             : 
      33             : /*
      34             :  * Perform a deduplication pass.
      35             :  *
      36             :  * The general approach taken here is to perform as much deduplication as
      37             :  * possible to free as much space as possible.  Note, however, that "single
      38             :  * value" strategy is used for !bottomupdedup callers when the page is full of
      39             :  * tuples of a single value.  Deduplication passes that apply the strategy
      40             :  * will leave behind a few untouched tuples at the end of the page, preparing
      41             :  * the page for an anticipated page split that uses nbtsplitloc.c's own single
      42             :  * value strategy.  Our high level goal is to delay merging the untouched
      43             :  * tuples until after the page splits.
      44             :  *
      45             :  * When a call to _bt_bottomupdel_pass() just took place (and failed), our
      46             :  * high level goal is to prevent a page split entirely by buying more time.
      47             :  * We still hope that a page split can be avoided altogether.  That's why
      48             :  * single value strategy is not even considered for bottomupdedup callers.
      49             :  *
      50             :  * The page will have to be split if we cannot successfully free at least
      51             :  * newitemsz (we also need space for newitem's line pointer, which isn't
      52             :  * included in caller's newitemsz).
      53             :  *
      54             :  * Note: Caller should have already deleted all existing items with their
      55             :  * LP_DEAD bits set.
      56             :  */
      57             : void
      58       25458 : _bt_dedup_pass(Relation rel, Buffer buf, IndexTuple newitem, Size newitemsz,
      59             :                bool bottomupdedup)
      60             : {
      61             :     OffsetNumber offnum,
      62             :                 minoff,
      63             :                 maxoff;
      64       25458 :     Page        page = BufferGetPage(buf);
      65       25458 :     BTPageOpaque opaque = BTPageGetOpaque(page);
      66             :     Page        newpage;
      67             :     BTDedupState state;
      68       25458 :     Size        pagesaving PG_USED_FOR_ASSERTS_ONLY = 0;
      69       25458 :     bool        singlevalstrat = false;
      70       25458 :     int         nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
      71             : 
      72             :     /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
      73       25458 :     newitemsz += sizeof(ItemIdData);
      74             : 
      75             :     /*
      76             :      * Initialize deduplication state.
      77             :      *
      78             :      * It would be possible for maxpostingsize (limit on posting list tuple
      79             :      * size) to be set to one third of the page.  However, it seems like a
      80             :      * good idea to limit the size of posting lists to one sixth of a page.
      81             :      * That ought to leave us with a good split point when pages full of
      82             :      * duplicates can be split several times.
      83             :      */
      84       25458 :     state = (BTDedupState) palloc(sizeof(BTDedupStateData));
      85       25458 :     state->deduplicate = true;
      86       25458 :     state->nmaxitems = 0;
      87       25458 :     state->maxpostingsize = Min(BTMaxItemSize(page) / 2, INDEX_SIZE_MASK);
      88             :     /* Metadata about base tuple of current pending posting list */
      89       25458 :     state->base = NULL;
      90       25458 :     state->baseoff = InvalidOffsetNumber;
      91       25458 :     state->basetupsize = 0;
      92             :     /* Metadata about current pending posting list TIDs */
      93       25458 :     state->htids = palloc(state->maxpostingsize);
      94       25458 :     state->nhtids = 0;
      95       25458 :     state->nitems = 0;
      96             :     /* Size of all physical tuples to be replaced by pending posting list */
      97       25458 :     state->phystupsize = 0;
      98             :     /* nintervals should be initialized to zero */
      99       25458 :     state->nintervals = 0;
     100             : 
     101       25458 :     minoff = P_FIRSTDATAKEY(opaque);
     102       25458 :     maxoff = PageGetMaxOffsetNumber(page);
     103             : 
     104             :     /*
     105             :      * Consider applying "single value" strategy, though only if the page
     106             :      * seems likely to be split in the near future
     107             :      */
     108       25458 :     if (!bottomupdedup)
     109       22360 :         singlevalstrat = _bt_do_singleval(rel, page, state, minoff, newitem);
     110             : 
     111             :     /*
     112             :      * Deduplicate items from page, and write them to newpage.
     113             :      *
     114             :      * Copy the original page's LSN into newpage copy.  This will become the
     115             :      * updated version of the page.  We need this because XLogInsert will
     116             :      * examine the LSN and possibly dump it in a page image.
     117             :      */
     118       25458 :     newpage = PageGetTempPageCopySpecial(page);
     119       25458 :     PageSetLSN(newpage, PageGetLSN(page));
     120             : 
     121             :     /* Copy high key, if any */
     122       25458 :     if (!P_RIGHTMOST(opaque))
     123             :     {
     124       20376 :         ItemId      hitemid = PageGetItemId(page, P_HIKEY);
     125       20376 :         Size        hitemsz = ItemIdGetLength(hitemid);
     126       20376 :         IndexTuple  hitem = (IndexTuple) PageGetItem(page, hitemid);
     127             : 
     128       20376 :         if (PageAddItem(newpage, (Item) hitem, hitemsz, P_HIKEY,
     129             :                         false, false) == InvalidOffsetNumber)
     130           0 :             elog(ERROR, "deduplication failed to add highkey");
     131             :     }
     132             : 
     133     5706220 :     for (offnum = minoff;
     134             :          offnum <= maxoff;
     135     5680762 :          offnum = OffsetNumberNext(offnum))
     136             :     {
     137     5680762 :         ItemId      itemid = PageGetItemId(page, offnum);
     138     5680762 :         IndexTuple  itup = (IndexTuple) PageGetItem(page, itemid);
     139             : 
     140             :         Assert(!ItemIdIsDead(itemid));
     141             : 
     142     5680762 :         if (offnum == minoff)
     143             :         {
     144             :             /*
     145             :              * No previous/base tuple for the data item -- use the data item
     146             :              * as base tuple of pending posting list
     147             :              */
     148       25458 :             _bt_dedup_start_pending(state, itup, offnum);
     149             :         }
     150    11308138 :         else if (state->deduplicate &&
     151     6438060 :                  _bt_keep_natts_fast(rel, state->base, itup) > nkeyatts &&
     152      785226 :                  _bt_dedup_save_htid(state, itup))
     153             :         {
     154             :             /*
     155             :              * Tuple is equal to base tuple of pending posting list.  Heap
     156             :              * TID(s) for itup have been saved in state.
     157             :              */
     158             :         }
     159             :         else
     160             :         {
     161             :             /*
     162             :              * Tuple is not equal to pending posting list tuple, or
     163             :              * _bt_dedup_save_htid() opted to not merge current item into
     164             :              * pending posting list for some other reason (e.g., adding more
     165             :              * TIDs would have caused posting list to exceed current
     166             :              * maxpostingsize).
     167             :              *
     168             :              * If state contains pending posting list with more than one item,
     169             :              * form new posting tuple and add it to our temp page (newpage).
     170             :              * Else add pending interval's base tuple to the temp page as-is.
     171             :              */
     172     4882350 :             pagesaving += _bt_dedup_finish_pending(newpage, state);
     173             : 
     174     4882350 :             if (singlevalstrat)
     175             :             {
     176             :                 /*
     177             :                  * Single value strategy's extra steps.
     178             :                  *
     179             :                  * Lower maxpostingsize for sixth and final large posting list
     180             :                  * tuple at the point where 5 maxpostingsize-capped tuples
     181             :                  * have either been formed or observed.
     182             :                  *
     183             :                  * When a sixth maxpostingsize-capped item is formed/observed,
     184             :                  * stop merging together tuples altogether.  The few tuples
     185             :                  * that remain at the end of the page won't be merged together
     186             :                  * at all (at least not until after a future page split takes
     187             :                  * place, when this page's newly allocated right sibling page
     188             :                  * gets its first deduplication pass).
     189             :                  */
     190        5136 :                 if (state->nmaxitems == 5)
     191         640 :                     _bt_singleval_fillfactor(page, state, newitemsz);
     192        4496 :                 else if (state->nmaxitems == 6)
     193             :                 {
     194         210 :                     state->deduplicate = false;
     195         210 :                     singlevalstrat = false; /* won't be back here */
     196             :                 }
     197             :             }
     198             : 
     199             :             /* itup starts new pending posting list */
     200     4882350 :             _bt_dedup_start_pending(state, itup, offnum);
     201             :         }
     202             :     }
     203             : 
     204             :     /* Handle the last item */
     205       25458 :     pagesaving += _bt_dedup_finish_pending(newpage, state);
     206             : 
     207             :     /*
     208             :      * If no items suitable for deduplication were found, newpage must be
     209             :      * exactly the same as the original page, so just return from function.
     210             :      *
     211             :      * We could determine whether or not to proceed on the basis the space
     212             :      * savings being sufficient to avoid an immediate page split instead.  We
     213             :      * don't do that because there is some small value in nbtsplitloc.c always
     214             :      * operating against a page that is fully deduplicated (apart from
     215             :      * newitem).  Besides, most of the cost has already been paid.
     216             :      */
     217       25458 :     if (state->nintervals == 0)
     218             :     {
     219             :         /* cannot leak memory here */
     220        4372 :         pfree(newpage);
     221        4372 :         pfree(state->htids);
     222        4372 :         pfree(state);
     223        4372 :         return;
     224             :     }
     225             : 
     226             :     /*
     227             :      * By here, it's clear that deduplication will definitely go ahead.
     228             :      *
     229             :      * Clear the BTP_HAS_GARBAGE page flag.  The index must be a heapkeyspace
     230             :      * index, and as such we'll never pay attention to BTP_HAS_GARBAGE anyway.
     231             :      * But keep things tidy.
     232             :      */
     233       21086 :     if (P_HAS_GARBAGE(opaque))
     234             :     {
     235           0 :         BTPageOpaque nopaque = BTPageGetOpaque(newpage);
     236             : 
     237           0 :         nopaque->btpo_flags &= ~BTP_HAS_GARBAGE;
     238             :     }
     239             : 
     240       21086 :     START_CRIT_SECTION();
     241             : 
     242       21086 :     PageRestoreTempPage(newpage, page);
     243       21086 :     MarkBufferDirty(buf);
     244             : 
     245             :     /* XLOG stuff */
     246       21086 :     if (RelationNeedsWAL(rel))
     247             :     {
     248             :         XLogRecPtr  recptr;
     249             :         xl_btree_dedup xlrec_dedup;
     250             : 
     251       20960 :         xlrec_dedup.nintervals = state->nintervals;
     252             : 
     253       20960 :         XLogBeginInsert();
     254       20960 :         XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
     255       20960 :         XLogRegisterData((char *) &xlrec_dedup, SizeOfBtreeDedup);
     256             : 
     257             :         /*
     258             :          * The intervals array is not in the buffer, but pretend that it is.
     259             :          * When XLogInsert stores the whole buffer, the array need not be
     260             :          * stored too.
     261             :          */
     262       20960 :         XLogRegisterBufData(0, (char *) state->intervals,
     263       20960 :                             state->nintervals * sizeof(BTDedupInterval));
     264             : 
     265       20960 :         recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DEDUP);
     266             : 
     267       20960 :         PageSetLSN(page, recptr);
     268             :     }
     269             : 
     270       21086 :     END_CRIT_SECTION();
     271             : 
     272             :     /* Local space accounting should agree with page accounting */
     273             :     Assert(pagesaving < newitemsz || PageGetExactFreeSpace(page) >= newitemsz);
     274             : 
     275             :     /* cannot leak memory here */
     276       21086 :     pfree(state->htids);
     277       21086 :     pfree(state);
     278             : }
     279             : 
     280             : /*
     281             :  * Perform bottom-up index deletion pass.
     282             :  *
     283             :  * See if duplicate index tuples (plus certain nearby tuples) are eligible to
     284             :  * be deleted via bottom-up index deletion.  The high level goal here is to
     285             :  * entirely prevent "unnecessary" page splits caused by MVCC version churn
     286             :  * from UPDATEs (when the UPDATEs don't logically modify any of the columns
     287             :  * covered by the 'rel' index).  This is qualitative, not quantitative -- we
     288             :  * do not particularly care about once-off opportunities to delete many index
     289             :  * tuples together.
     290             :  *
     291             :  * See nbtree/README for details on the design of nbtree bottom-up deletion.
     292             :  * See access/tableam.h for a description of how we're expected to cooperate
     293             :  * with the tableam.
     294             :  *
     295             :  * Returns true on success, in which case caller can assume page split will be
     296             :  * avoided for a reasonable amount of time.  Returns false when caller should
     297             :  * deduplicate the page (if possible at all).
     298             :  *
     299             :  * Note: Occasionally we return true despite failing to delete enough items to
     300             :  * avoid a split.  This makes caller skip deduplication and go split the page
     301             :  * right away.  Our return value is always just advisory information.
     302             :  *
     303             :  * Note: Caller should have already deleted all existing items with their
     304             :  * LP_DEAD bits set.
     305             :  */
     306             : bool
     307        3504 : _bt_bottomupdel_pass(Relation rel, Buffer buf, Relation heapRel,
     308             :                      Size newitemsz)
     309             : {
     310             :     OffsetNumber offnum,
     311             :                 minoff,
     312             :                 maxoff;
     313        3504 :     Page        page = BufferGetPage(buf);
     314        3504 :     BTPageOpaque opaque = BTPageGetOpaque(page);
     315             :     BTDedupState state;
     316             :     TM_IndexDeleteOp delstate;
     317             :     bool        neverdedup;
     318        3504 :     int         nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
     319             : 
     320             :     /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
     321        3504 :     newitemsz += sizeof(ItemIdData);
     322             : 
     323             :     /* Initialize deduplication state */
     324        3504 :     state = (BTDedupState) palloc(sizeof(BTDedupStateData));
     325        3504 :     state->deduplicate = true;
     326        3504 :     state->nmaxitems = 0;
     327        3504 :     state->maxpostingsize = BLCKSZ; /* We're not really deduplicating */
     328        3504 :     state->base = NULL;
     329        3504 :     state->baseoff = InvalidOffsetNumber;
     330        3504 :     state->basetupsize = 0;
     331        3504 :     state->htids = palloc(state->maxpostingsize);
     332        3504 :     state->nhtids = 0;
     333        3504 :     state->nitems = 0;
     334        3504 :     state->phystupsize = 0;
     335        3504 :     state->nintervals = 0;
     336             : 
     337             :     /*
     338             :      * Initialize tableam state that describes bottom-up index deletion
     339             :      * operation.
     340             :      *
     341             :      * We'll go on to ask the tableam to search for TIDs whose index tuples we
     342             :      * can safely delete.  The tableam will search until our leaf page space
     343             :      * target is satisfied, or until the cost of continuing with the tableam
     344             :      * operation seems too high.  It focuses its efforts on TIDs associated
     345             :      * with duplicate index tuples that we mark "promising".
     346             :      *
     347             :      * This space target is a little arbitrary.  The tableam must be able to
     348             :      * keep the costs and benefits in balance.  We provide the tableam with
     349             :      * exhaustive information about what might work, without directly
     350             :      * concerning ourselves with avoiding work during the tableam call.  Our
     351             :      * role in costing the bottom-up deletion process is strictly advisory.
     352             :      */
     353        3504 :     delstate.irel = rel;
     354        3504 :     delstate.iblknum = BufferGetBlockNumber(buf);
     355        3504 :     delstate.bottomup = true;
     356        3504 :     delstate.bottomupfreespace = Max(BLCKSZ / 16, newitemsz);
     357        3504 :     delstate.ndeltids = 0;
     358        3504 :     delstate.deltids = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexDelete));
     359        3504 :     delstate.status = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexStatus));
     360             : 
     361        3504 :     minoff = P_FIRSTDATAKEY(opaque);
     362        3504 :     maxoff = PageGetMaxOffsetNumber(page);
     363     1007508 :     for (offnum = minoff;
     364             :          offnum <= maxoff;
     365     1004004 :          offnum = OffsetNumberNext(offnum))
     366             :     {
     367     1004004 :         ItemId      itemid = PageGetItemId(page, offnum);
     368     1004004 :         IndexTuple  itup = (IndexTuple) PageGetItem(page, itemid);
     369             : 
     370             :         Assert(!ItemIdIsDead(itemid));
     371             : 
     372     1004004 :         if (offnum == minoff)
     373             :         {
     374             :             /* itup starts first pending interval */
     375        3504 :             _bt_dedup_start_pending(state, itup, offnum);
     376             :         }
     377     1136582 :         else if (_bt_keep_natts_fast(rel, state->base, itup) > nkeyatts &&
     378      136082 :                  _bt_dedup_save_htid(state, itup))
     379             :         {
     380             :             /* Tuple is equal; just added its TIDs to pending interval */
     381             :         }
     382             :         else
     383             :         {
     384             :             /* Finalize interval -- move its TIDs to delete state */
     385      864418 :             _bt_bottomupdel_finish_pending(page, state, &delstate);
     386             : 
     387             :             /* itup starts new pending interval */
     388      864418 :             _bt_dedup_start_pending(state, itup, offnum);
     389             :         }
     390             :     }
     391             :     /* Finalize final interval -- move its TIDs to delete state */
     392        3504 :     _bt_bottomupdel_finish_pending(page, state, &delstate);
     393             : 
     394             :     /*
     395             :      * We don't give up now in the event of having few (or even zero)
     396             :      * promising tuples for the tableam because it's not up to us as the index
     397             :      * AM to manage costs (note that the tableam might have heuristics of its
     398             :      * own that work out what to do).  We should at least avoid having our
     399             :      * caller do a useless deduplication pass after we return in the event of
     400             :      * zero promising tuples, though.
     401             :      */
     402        3504 :     neverdedup = false;
     403        3504 :     if (state->nintervals == 0)
     404           2 :         neverdedup = true;
     405             : 
     406        3504 :     pfree(state->htids);
     407        3504 :     pfree(state);
     408             : 
     409             :     /* Ask tableam which TIDs are deletable, then physically delete them */
     410        3504 :     _bt_delitems_delete_check(rel, buf, heapRel, &delstate);
     411             : 
     412        3504 :     pfree(delstate.deltids);
     413        3504 :     pfree(delstate.status);
     414             : 
     415             :     /* Report "success" to caller unconditionally to avoid deduplication */
     416        3504 :     if (neverdedup)
     417           2 :         return true;
     418             : 
     419             :     /* Don't dedup when we won't end up back here any time soon anyway */
     420        3502 :     return PageGetExactFreeSpace(page) >= Max(BLCKSZ / 24, newitemsz);
     421             : }
     422             : 
     423             : /*
     424             :  * Create a new pending posting list tuple based on caller's base tuple.
     425             :  *
     426             :  * Every tuple processed by deduplication either becomes the base tuple for a
     427             :  * posting list, or gets its heap TID(s) accepted into a pending posting list.
     428             :  * A tuple that starts out as the base tuple for a posting list will only
     429             :  * actually be rewritten within _bt_dedup_finish_pending() when it turns out
     430             :  * that there are duplicates that can be merged into the base tuple.
     431             :  */
     432             : void
     433    11306780 : _bt_dedup_start_pending(BTDedupState state, IndexTuple base,
     434             :                         OffsetNumber baseoff)
     435             : {
     436             :     Assert(state->nhtids == 0);
     437             :     Assert(state->nitems == 0);
     438             :     Assert(!BTreeTupleIsPivot(base));
     439             : 
     440             :     /*
     441             :      * Copy heap TID(s) from new base tuple for new candidate posting list
     442             :      * into working state's array
     443             :      */
     444    11306780 :     if (!BTreeTupleIsPosting(base))
     445             :     {
     446     9379096 :         memcpy(state->htids, &base->t_tid, sizeof(ItemPointerData));
     447     9379096 :         state->nhtids = 1;
     448     9379096 :         state->basetupsize = IndexTupleSize(base);
     449             :     }
     450             :     else
     451             :     {
     452             :         int         nposting;
     453             : 
     454     1927684 :         nposting = BTreeTupleGetNPosting(base);
     455     1927684 :         memcpy(state->htids, BTreeTupleGetPosting(base),
     456             :                sizeof(ItemPointerData) * nposting);
     457     1927684 :         state->nhtids = nposting;
     458             :         /* basetupsize should not include existing posting list */
     459     1927684 :         state->basetupsize = BTreeTupleGetPostingOffset(base);
     460             :     }
     461             : 
     462             :     /*
     463             :      * Save new base tuple itself -- it'll be needed if we actually create a
     464             :      * new posting list from new pending posting list.
     465             :      *
     466             :      * Must maintain physical size of all existing tuples (including line
     467             :      * pointer overhead) so that we can calculate space savings on page.
     468             :      */
     469    11306780 :     state->nitems = 1;
     470    11306780 :     state->base = base;
     471    11306780 :     state->baseoff = baseoff;
     472    11306780 :     state->phystupsize = MAXALIGN(IndexTupleSize(base)) + sizeof(ItemIdData);
     473             :     /* Also save baseoff in pending state for interval */
     474    11306780 :     state->intervals[state->nintervals].baseoff = state->baseoff;
     475    11306780 : }
     476             : 
     477             : /*
     478             :  * Save itup heap TID(s) into pending posting list where possible.
     479             :  *
     480             :  * Returns bool indicating if the pending posting list managed by state now
     481             :  * includes itup's heap TID(s).
     482             :  */
     483             : bool
     484     2453636 : _bt_dedup_save_htid(BTDedupState state, IndexTuple itup)
     485             : {
     486             :     int         nhtids;
     487             :     ItemPointer htids;
     488             :     Size        mergedtupsz;
     489             : 
     490             :     Assert(!BTreeTupleIsPivot(itup));
     491             : 
     492     2453636 :     if (!BTreeTupleIsPosting(itup))
     493             :     {
     494     2441486 :         nhtids = 1;
     495     2441486 :         htids = &itup->t_tid;
     496             :     }
     497             :     else
     498             :     {
     499       12150 :         nhtids = BTreeTupleGetNPosting(itup);
     500       12150 :         htids = BTreeTupleGetPosting(itup);
     501             :     }
     502             : 
     503             :     /*
     504             :      * Don't append (have caller finish pending posting list as-is) if
     505             :      * appending heap TID(s) from itup would put us over maxpostingsize limit.
     506             :      *
     507             :      * This calculation needs to match the code used within _bt_form_posting()
     508             :      * for new posting list tuples.
     509             :      */
     510     2453636 :     mergedtupsz = MAXALIGN(state->basetupsize +
     511             :                            (state->nhtids + nhtids) * sizeof(ItemPointerData));
     512             : 
     513     2453636 :     if (mergedtupsz > state->maxpostingsize)
     514             :     {
     515             :         /*
     516             :          * Count this as an oversized item for single value strategy, though
     517             :          * only when there are 50 TIDs in the final posting list tuple.  This
     518             :          * limit (which is fairly arbitrary) avoids confusion about how many
     519             :          * 1/6 of a page tuples have been encountered/created by the current
     520             :          * deduplication pass.
     521             :          *
     522             :          * Note: We deliberately don't consider which deduplication pass
     523             :          * merged together tuples to create this item (could be a previous
     524             :          * deduplication pass, or current pass).  See _bt_do_singleval()
     525             :          * comments.
     526             :          */
     527       20188 :         if (state->nhtids > 50)
     528       19346 :             state->nmaxitems++;
     529             : 
     530       20188 :         return false;
     531             :     }
     532             : 
     533             :     /*
     534             :      * Save heap TIDs to pending posting list tuple -- itup can be merged into
     535             :      * pending posting list
     536             :      */
     537     2433448 :     state->nitems++;
     538     2433448 :     memcpy(state->htids + state->nhtids, htids,
     539             :            sizeof(ItemPointerData) * nhtids);
     540     2433448 :     state->nhtids += nhtids;
     541     2433448 :     state->phystupsize += MAXALIGN(IndexTupleSize(itup)) + sizeof(ItemIdData);
     542             : 
     543     2433448 :     return true;
     544             : }
     545             : 
     546             : /*
     547             :  * Finalize pending posting list tuple, and add it to the page.  Final tuple
     548             :  * is based on saved base tuple, and saved list of heap TIDs.
     549             :  *
     550             :  * Returns space saving from deduplicating to make a new posting list tuple.
     551             :  * Note that this includes line pointer overhead.  This is zero in the case
     552             :  * where no deduplication was possible.
     553             :  */
     554             : Size
     555     5726214 : _bt_dedup_finish_pending(Page newpage, BTDedupState state)
     556             : {
     557             :     OffsetNumber tupoff;
     558             :     Size        tuplesz;
     559             :     Size        spacesaving;
     560             : 
     561             :     Assert(state->nitems > 0);
     562             :     Assert(state->nitems <= state->nhtids);
     563             :     Assert(state->intervals[state->nintervals].baseoff == state->baseoff);
     564             : 
     565     5726214 :     tupoff = OffsetNumberNext(PageGetMaxOffsetNumber(newpage));
     566     5726214 :     if (state->nitems == 1)
     567             :     {
     568             :         /* Use original, unchanged base tuple */
     569     5347300 :         tuplesz = IndexTupleSize(state->base);
     570             :         Assert(tuplesz == MAXALIGN(IndexTupleSize(state->base)));
     571             :         Assert(tuplesz <= BTMaxItemSize(newpage));
     572     5347300 :         if (PageAddItem(newpage, (Item) state->base, tuplesz, tupoff,
     573             :                         false, false) == InvalidOffsetNumber)
     574           0 :             elog(ERROR, "deduplication failed to add tuple to page");
     575             : 
     576     5347300 :         spacesaving = 0;
     577             :     }
     578             :     else
     579             :     {
     580             :         IndexTuple  final;
     581             : 
     582             :         /* Form a tuple with a posting list */
     583      378914 :         final = _bt_form_posting(state->base, state->htids, state->nhtids);
     584      378914 :         tuplesz = IndexTupleSize(final);
     585             :         Assert(tuplesz <= state->maxpostingsize);
     586             : 
     587             :         /* Save final number of items for posting list */
     588      378914 :         state->intervals[state->nintervals].nitems = state->nitems;
     589             : 
     590             :         Assert(tuplesz == MAXALIGN(IndexTupleSize(final)));
     591             :         Assert(tuplesz <= BTMaxItemSize(newpage));
     592      378914 :         if (PageAddItem(newpage, (Item) final, tuplesz, tupoff, false,
     593             :                         false) == InvalidOffsetNumber)
     594           0 :             elog(ERROR, "deduplication failed to add tuple to page");
     595             : 
     596      378914 :         pfree(final);
     597      378914 :         spacesaving = state->phystupsize - (tuplesz + sizeof(ItemIdData));
     598             :         /* Increment nintervals, since we wrote a new posting list tuple */
     599      378914 :         state->nintervals++;
     600             :         Assert(spacesaving > 0 && spacesaving < BLCKSZ);
     601             :     }
     602             : 
     603             :     /* Reset state for next pending posting list */
     604     5726214 :     state->nhtids = 0;
     605     5726214 :     state->nitems = 0;
     606     5726214 :     state->phystupsize = 0;
     607             : 
     608     5726214 :     return spacesaving;
     609             : }
     610             : 
     611             : /*
     612             :  * Finalize interval during bottom-up index deletion.
     613             :  *
     614             :  * During a bottom-up pass we expect that TIDs will be recorded in dedup state
     615             :  * first, and then get moved over to delstate (in variable-sized batches) by
     616             :  * calling here.  Call here happens when the number of TIDs in a dedup
     617             :  * interval is known, and interval gets finalized (i.e. when caller sees next
     618             :  * tuple on the page is not a duplicate, or when caller runs out of tuples to
     619             :  * process from leaf page).
     620             :  *
     621             :  * This is where bottom-up deletion determines and remembers which entries are
     622             :  * duplicates.  This will be important information to the tableam delete
     623             :  * infrastructure later on.  Plain index tuple duplicates are marked
     624             :  * "promising" here, per tableam contract.
     625             :  *
     626             :  * Our approach to marking entries whose TIDs come from posting lists is more
     627             :  * complicated.  Posting lists can only be formed by a deduplication pass (or
     628             :  * during an index build), so recent version churn affecting the pointed-to
     629             :  * logical rows is not particularly likely.  We may still give a weak signal
     630             :  * about posting list tuples' entries (by marking just one of its TIDs/entries
     631             :  * promising), though this is only a possibility in the event of further
     632             :  * duplicate index tuples in final interval that covers posting list tuple (as
     633             :  * in the plain tuple case).  A weak signal/hint will be useful to the tableam
     634             :  * when it has no stronger signal to go with for the deletion operation as a
     635             :  * whole.
     636             :  *
     637             :  * The heuristics we use work well in practice because we only need to give
     638             :  * the tableam the right _general_ idea about where to look.  Garbage tends to
     639             :  * naturally get concentrated in relatively few table blocks with workloads
     640             :  * that bottom-up deletion targets.  The tableam cannot possibly rank all
     641             :  * available table blocks sensibly based on the hints we provide, but that's
     642             :  * okay -- only the extremes matter.  The tableam just needs to be able to
     643             :  * predict which few table blocks will have the most tuples that are safe to
     644             :  * delete for each deletion operation, with low variance across related
     645             :  * deletion operations.
     646             :  */
     647             : static void
     648      867922 : _bt_bottomupdel_finish_pending(Page page, BTDedupState state,
     649             :                                TM_IndexDeleteOp *delstate)
     650             : {
     651      867922 :     bool        dupinterval = (state->nitems > 1);
     652             : 
     653             :     Assert(state->nitems > 0);
     654             :     Assert(state->nitems <= state->nhtids);
     655             :     Assert(state->intervals[state->nintervals].baseoff == state->baseoff);
     656             : 
     657     1871926 :     for (int i = 0; i < state->nitems; i++)
     658             :     {
     659     1004004 :         OffsetNumber offnum = state->baseoff + i;
     660     1004004 :         ItemId      itemid = PageGetItemId(page, offnum);
     661     1004004 :         IndexTuple  itup = (IndexTuple) PageGetItem(page, itemid);
     662     1004004 :         TM_IndexDelete *ideltid = &delstate->deltids[delstate->ndeltids];
     663     1004004 :         TM_IndexStatus *istatus = &delstate->status[delstate->ndeltids];
     664             : 
     665     1004004 :         if (!BTreeTupleIsPosting(itup))
     666             :         {
     667             :             /* Simple case: A plain non-pivot tuple */
     668      796626 :             ideltid->tid = itup->t_tid;
     669      796626 :             ideltid->id = delstate->ndeltids;
     670      796626 :             istatus->idxoffnum = offnum;
     671      796626 :             istatus->knowndeletable = false; /* for now */
     672      796626 :             istatus->promising = dupinterval;    /* simple rule */
     673      796626 :             istatus->freespace = ItemIdGetLength(itemid) + sizeof(ItemIdData);
     674             : 
     675      796626 :             delstate->ndeltids++;
     676             :         }
     677             :         else
     678             :         {
     679             :             /*
     680             :              * Complicated case: A posting list tuple.
     681             :              *
     682             :              * We make the conservative assumption that there can only be at
     683             :              * most one affected logical row per posting list tuple.  There
     684             :              * will be at most one promising entry in deltids to represent
     685             :              * this presumed lone logical row.  Note that this isn't even
     686             :              * considered unless the posting list tuple is also in an interval
     687             :              * of duplicates -- this complicated rule is just a variant of the
     688             :              * simple rule used to decide if plain index tuples are promising.
     689             :              */
     690      207378 :             int         nitem = BTreeTupleGetNPosting(itup);
     691      207378 :             bool        firstpromising = false;
     692      207378 :             bool        lastpromising = false;
     693             : 
     694             :             Assert(_bt_posting_valid(itup));
     695             : 
     696      207378 :             if (dupinterval)
     697             :             {
     698             :                 /*
     699             :                  * Complicated rule: either the first or last TID in the
     700             :                  * posting list gets marked promising (if any at all)
     701             :                  */
     702             :                 BlockNumber minblocklist,
     703             :                             midblocklist,
     704             :                             maxblocklist;
     705             :                 ItemPointer mintid,
     706             :                             midtid,
     707             :                             maxtid;
     708             : 
     709       18562 :                 mintid = BTreeTupleGetHeapTID(itup);
     710       18562 :                 midtid = BTreeTupleGetPostingN(itup, nitem / 2);
     711       18562 :                 maxtid = BTreeTupleGetMaxHeapTID(itup);
     712       18562 :                 minblocklist = ItemPointerGetBlockNumber(mintid);
     713       18562 :                 midblocklist = ItemPointerGetBlockNumber(midtid);
     714       18562 :                 maxblocklist = ItemPointerGetBlockNumber(maxtid);
     715             : 
     716             :                 /* Only entry with predominant table block can be promising */
     717       18562 :                 firstpromising = (minblocklist == midblocklist);
     718       18562 :                 lastpromising = (!firstpromising &&
     719             :                                  midblocklist == maxblocklist);
     720             :             }
     721             : 
     722     1167210 :             for (int p = 0; p < nitem; p++)
     723             :             {
     724      959832 :                 ItemPointer htid = BTreeTupleGetPostingN(itup, p);
     725             : 
     726      959832 :                 ideltid->tid = *htid;
     727      959832 :                 ideltid->id = delstate->ndeltids;
     728      959832 :                 istatus->idxoffnum = offnum;
     729      959832 :                 istatus->knowndeletable = false; /* for now */
     730      959832 :                 istatus->promising = false;
     731      959832 :                 if ((firstpromising && p == 0) ||
     732      126330 :                     (lastpromising && p == nitem - 1))
     733       12356 :                     istatus->promising = true;
     734      959832 :                 istatus->freespace = sizeof(ItemPointerData);    /* at worst */
     735             : 
     736      959832 :                 ideltid++;
     737      959832 :                 istatus++;
     738      959832 :                 delstate->ndeltids++;
     739             :             }
     740             :         }
     741             :     }
     742             : 
     743      867922 :     if (dupinterval)
     744             :     {
     745       91050 :         state->intervals[state->nintervals].nitems = state->nitems;
     746       91050 :         state->nintervals++;
     747             :     }
     748             : 
     749             :     /* Reset state for next interval */
     750      867922 :     state->nhtids = 0;
     751      867922 :     state->nitems = 0;
     752      867922 :     state->phystupsize = 0;
     753      867922 : }
     754             : 
     755             : /*
     756             :  * Determine if page non-pivot tuples (data items) are all duplicates of the
     757             :  * same value -- if they are, deduplication's "single value" strategy should
     758             :  * be applied.  The general goal of this strategy is to ensure that
     759             :  * nbtsplitloc.c (which uses its own single value strategy) will find a useful
     760             :  * split point as further duplicates are inserted, and successive rightmost
     761             :  * page splits occur among pages that store the same duplicate value.  When
     762             :  * the page finally splits, it should end up BTREE_SINGLEVAL_FILLFACTOR% full,
     763             :  * just like it would if deduplication were disabled.
     764             :  *
     765             :  * We expect that affected workloads will require _several_ single value
     766             :  * strategy deduplication passes (over a page that only stores duplicates)
     767             :  * before the page is finally split.  The first deduplication pass should only
     768             :  * find regular non-pivot tuples.  Later deduplication passes will find
     769             :  * existing maxpostingsize-capped posting list tuples, which must be skipped
     770             :  * over.  The penultimate pass is generally the first pass that actually
     771             :  * reaches _bt_singleval_fillfactor(), and so will deliberately leave behind a
     772             :  * few untouched non-pivot tuples.  The final deduplication pass won't free
     773             :  * any space -- it will skip over everything without merging anything (it
     774             :  * retraces the steps of the penultimate pass).
     775             :  *
     776             :  * Fortunately, having several passes isn't too expensive.  Each pass (after
     777             :  * the first pass) won't spend many cycles on the large posting list tuples
     778             :  * left by previous passes.  Each pass will find a large contiguous group of
     779             :  * smaller duplicate tuples to merge together at the end of the page.
     780             :  */
     781             : static bool
     782       22360 : _bt_do_singleval(Relation rel, Page page, BTDedupState state,
     783             :                  OffsetNumber minoff, IndexTuple newitem)
     784             : {
     785       22360 :     int         nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
     786             :     ItemId      itemid;
     787             :     IndexTuple  itup;
     788             : 
     789       22360 :     itemid = PageGetItemId(page, minoff);
     790       22360 :     itup = (IndexTuple) PageGetItem(page, itemid);
     791             : 
     792       22360 :     if (_bt_keep_natts_fast(rel, newitem, itup) > nkeyatts)
     793             :     {
     794        2044 :         itemid = PageGetItemId(page, PageGetMaxOffsetNumber(page));
     795        2044 :         itup = (IndexTuple) PageGetItem(page, itemid);
     796             : 
     797        2044 :         if (_bt_keep_natts_fast(rel, newitem, itup) > nkeyatts)
     798        1296 :             return true;
     799             :     }
     800             : 
     801       21064 :     return false;
     802             : }
     803             : 
     804             : /*
     805             :  * Lower maxpostingsize when using "single value" strategy, to avoid a sixth
     806             :  * and final maxpostingsize-capped tuple.  The sixth and final posting list
     807             :  * tuple will end up somewhat smaller than the first five.  (Note: The first
     808             :  * five tuples could actually just be very large duplicate tuples that
     809             :  * couldn't be merged together at all.  Deduplication will simply not modify
     810             :  * the page when that happens.)
     811             :  *
     812             :  * When there are six posting lists on the page (after current deduplication
     813             :  * pass goes on to create/observe a sixth very large tuple), caller should end
     814             :  * its deduplication pass.  It isn't useful to try to deduplicate items that
     815             :  * are supposed to end up on the new right sibling page following the
     816             :  * anticipated page split.  A future deduplication pass of future right
     817             :  * sibling page might take care of it.  (This is why the first single value
     818             :  * strategy deduplication pass for a given leaf page will generally find only
     819             :  * plain non-pivot tuples -- see _bt_do_singleval() comments.)
     820             :  */
     821             : static void
     822         640 : _bt_singleval_fillfactor(Page page, BTDedupState state, Size newitemsz)
     823             : {
     824             :     Size        leftfree;
     825             :     int         reduction;
     826             : 
     827             :     /* This calculation needs to match nbtsplitloc.c */
     828         640 :     leftfree = PageGetPageSize(page) - SizeOfPageHeaderData -
     829             :         MAXALIGN(sizeof(BTPageOpaqueData));
     830             :     /* Subtract size of new high key (includes pivot heap TID space) */
     831         640 :     leftfree -= newitemsz + MAXALIGN(sizeof(ItemPointerData));
     832             : 
     833             :     /*
     834             :      * Reduce maxpostingsize by an amount equal to target free space on left
     835             :      * half of page
     836             :      */
     837         640 :     reduction = leftfree * ((100 - BTREE_SINGLEVAL_FILLFACTOR) / 100.0);
     838         640 :     if (state->maxpostingsize > reduction)
     839         640 :         state->maxpostingsize -= reduction;
     840             :     else
     841           0 :         state->maxpostingsize = 0;
     842         640 : }
     843             : 
     844             : /*
     845             :  * Build a posting list tuple based on caller's "base" index tuple and list of
     846             :  * heap TIDs.  When nhtids == 1, builds a standard non-pivot tuple without a
     847             :  * posting list. (Posting list tuples can never have a single heap TID, partly
     848             :  * because that ensures that deduplication always reduces final MAXALIGN()'d
     849             :  * size of entire tuple.)
     850             :  *
     851             :  * Convention is that posting list starts at a MAXALIGN()'d offset (rather
     852             :  * than a SHORTALIGN()'d offset), in line with the approach taken when
     853             :  * appending a heap TID to new pivot tuple/high key during suffix truncation.
     854             :  * This sometimes wastes a little space that was only needed as alignment
     855             :  * padding in the original tuple.  Following this convention simplifies the
     856             :  * space accounting used when deduplicating a page (the same convention
     857             :  * simplifies the accounting for choosing a point to split a page at).
     858             :  *
     859             :  * Note: Caller's "htids" array must be unique and already in ascending TID
     860             :  * order.  Any existing heap TIDs from "base" won't automatically appear in
     861             :  * returned posting list tuple (they must be included in htids array.)
     862             :  */
     863             : IndexTuple
     864      480322 : _bt_form_posting(IndexTuple base, ItemPointer htids, int nhtids)
     865             : {
     866             :     uint32      keysize,
     867             :                 newsize;
     868             :     IndexTuple  itup;
     869             : 
     870      480322 :     if (BTreeTupleIsPosting(base))
     871      145512 :         keysize = BTreeTupleGetPostingOffset(base);
     872             :     else
     873      334810 :         keysize = IndexTupleSize(base);
     874             : 
     875             :     Assert(!BTreeTupleIsPivot(base));
     876             :     Assert(nhtids > 0 && nhtids <= PG_UINT16_MAX);
     877             :     Assert(keysize == MAXALIGN(keysize));
     878             : 
     879             :     /* Determine final size of new tuple */
     880      480322 :     if (nhtids > 1)
     881      417816 :         newsize = MAXALIGN(keysize +
     882             :                            nhtids * sizeof(ItemPointerData));
     883             :     else
     884       62506 :         newsize = keysize;
     885             : 
     886             :     Assert(newsize <= INDEX_SIZE_MASK);
     887             :     Assert(newsize == MAXALIGN(newsize));
     888             : 
     889             :     /* Allocate memory using palloc0() (matches index_form_tuple()) */
     890      480322 :     itup = palloc0(newsize);
     891      480322 :     memcpy(itup, base, keysize);
     892      480322 :     itup->t_info &= ~INDEX_SIZE_MASK;
     893      480322 :     itup->t_info |= newsize;
     894      480322 :     if (nhtids > 1)
     895             :     {
     896             :         /* Form posting list tuple */
     897      417816 :         BTreeTupleSetPosting(itup, nhtids, keysize);
     898      417816 :         memcpy(BTreeTupleGetPosting(itup), htids,
     899             :                sizeof(ItemPointerData) * nhtids);
     900             :         Assert(_bt_posting_valid(itup));
     901             :     }
     902             :     else
     903             :     {
     904             :         /* Form standard non-pivot tuple */
     905       62506 :         itup->t_info &= ~INDEX_ALT_TID_MASK;
     906       62506 :         ItemPointerCopy(htids, &itup->t_tid);
     907             :         Assert(ItemPointerIsValid(&itup->t_tid));
     908             :     }
     909             : 
     910      480322 :     return itup;
     911             : }
     912             : 
     913             : /*
     914             :  * Generate a replacement tuple by "updating" a posting list tuple so that it
     915             :  * no longer has TIDs that need to be deleted.
     916             :  *
     917             :  * Used by both VACUUM and index deletion.  Caller's vacposting argument
     918             :  * points to the existing posting list tuple to be updated.
     919             :  *
     920             :  * On return, caller's vacposting argument will point to final "updated"
     921             :  * tuple, which will be palloc()'d in caller's memory context.
     922             :  */
     923             : void
     924       45018 : _bt_update_posting(BTVacuumPosting vacposting)
     925             : {
     926       45018 :     IndexTuple  origtuple = vacposting->itup;
     927             :     uint32      keysize,
     928             :                 newsize;
     929             :     IndexTuple  itup;
     930             :     int         nhtids;
     931             :     int         ui,
     932             :                 d;
     933             :     ItemPointer htids;
     934             : 
     935       45018 :     nhtids = BTreeTupleGetNPosting(origtuple) - vacposting->ndeletedtids;
     936             : 
     937             :     Assert(_bt_posting_valid(origtuple));
     938             :     Assert(nhtids > 0 && nhtids < BTreeTupleGetNPosting(origtuple));
     939             : 
     940             :     /*
     941             :      * Determine final size of new tuple.
     942             :      *
     943             :      * This calculation needs to match the code used within _bt_form_posting()
     944             :      * for new posting list tuples.  We avoid calling _bt_form_posting() here
     945             :      * to save ourselves a second memory allocation for a htids workspace.
     946             :      */
     947       45018 :     keysize = BTreeTupleGetPostingOffset(origtuple);
     948       45018 :     if (nhtids > 1)
     949        8030 :         newsize = MAXALIGN(keysize +
     950             :                            nhtids * sizeof(ItemPointerData));
     951             :     else
     952       36988 :         newsize = keysize;
     953             : 
     954             :     Assert(newsize <= INDEX_SIZE_MASK);
     955             :     Assert(newsize == MAXALIGN(newsize));
     956             : 
     957             :     /* Allocate memory using palloc0() (matches index_form_tuple()) */
     958       45018 :     itup = palloc0(newsize);
     959       45018 :     memcpy(itup, origtuple, keysize);
     960       45018 :     itup->t_info &= ~INDEX_SIZE_MASK;
     961       45018 :     itup->t_info |= newsize;
     962             : 
     963       45018 :     if (nhtids > 1)
     964             :     {
     965             :         /* Form posting list tuple */
     966        8030 :         BTreeTupleSetPosting(itup, nhtids, keysize);
     967        8030 :         htids = BTreeTupleGetPosting(itup);
     968             :     }
     969             :     else
     970             :     {
     971             :         /* Form standard non-pivot tuple */
     972       36988 :         itup->t_info &= ~INDEX_ALT_TID_MASK;
     973       36988 :         htids = &itup->t_tid;
     974             :     }
     975             : 
     976       45018 :     ui = 0;
     977       45018 :     d = 0;
     978      236348 :     for (int i = 0; i < BTreeTupleGetNPosting(origtuple); i++)
     979             :     {
     980      191330 :         if (d < vacposting->ndeletedtids && vacposting->deletetids[d] == i)
     981             :         {
     982       95414 :             d++;
     983       95414 :             continue;
     984             :         }
     985       95916 :         htids[ui++] = *BTreeTupleGetPostingN(origtuple, i);
     986             :     }
     987             :     Assert(ui == nhtids);
     988             :     Assert(d == vacposting->ndeletedtids);
     989             :     Assert(nhtids == 1 || _bt_posting_valid(itup));
     990             :     Assert(nhtids > 1 || ItemPointerIsValid(&itup->t_tid));
     991             : 
     992             :     /* vacposting arg's itup will now point to updated version */
     993       45018 :     vacposting->itup = itup;
     994       45018 : }
     995             : 
     996             : /*
     997             :  * Prepare for a posting list split by swapping heap TID in newitem with heap
     998             :  * TID from original posting list (the 'oposting' heap TID located at offset
     999             :  * 'postingoff').  Modifies newitem, so caller should pass their own private
    1000             :  * copy that can safely be modified.
    1001             :  *
    1002             :  * Returns new posting list tuple, which is palloc()'d in caller's context.
    1003             :  * This is guaranteed to be the same size as 'oposting'.  Modified newitem is
    1004             :  * what caller actually inserts. (This happens inside the same critical
    1005             :  * section that performs an in-place update of old posting list using new
    1006             :  * posting list returned here.)
    1007             :  *
    1008             :  * While the keys from newitem and oposting must be opclass equal, and must
    1009             :  * generate identical output when run through the underlying type's output
    1010             :  * function, it doesn't follow that their representations match exactly.
    1011             :  * Caller must avoid assuming that there can't be representational differences
    1012             :  * that make datums from oposting bigger or smaller than the corresponding
    1013             :  * datums from newitem.  For example, differences in TOAST input state might
    1014             :  * break a faulty assumption about tuple size (the executor is entitled to
    1015             :  * apply TOAST compression based on its own criteria).  It also seems possible
    1016             :  * that further representational variation will be introduced in the future,
    1017             :  * in order to support nbtree features like page-level prefix compression.
    1018             :  *
    1019             :  * See nbtree/README for details on the design of posting list splits.
    1020             :  */
    1021             : IndexTuple
    1022       26098 : _bt_swap_posting(IndexTuple newitem, IndexTuple oposting, int postingoff)
    1023             : {
    1024             :     int         nhtids;
    1025             :     char       *replacepos;
    1026             :     char       *replaceposright;
    1027             :     Size        nmovebytes;
    1028             :     IndexTuple  nposting;
    1029             : 
    1030       26098 :     nhtids = BTreeTupleGetNPosting(oposting);
    1031             :     Assert(_bt_posting_valid(oposting));
    1032             : 
    1033             :     /*
    1034             :      * The postingoff argument originated as a _bt_binsrch_posting() return
    1035             :      * value.  It will be 0 in the event of corruption that makes a leaf page
    1036             :      * contain a non-pivot tuple that's somehow identical to newitem (no two
    1037             :      * non-pivot tuples should ever have the same TID).  This has been known
    1038             :      * to happen in the field from time to time.
    1039             :      *
    1040             :      * Perform a basic sanity check to catch this case now.
    1041             :      */
    1042       26098 :     if (!(postingoff > 0 && postingoff < nhtids))
    1043           0 :         elog(ERROR, "posting list tuple with %d items cannot be split at offset %d",
    1044             :              nhtids, postingoff);
    1045             : 
    1046             :     /*
    1047             :      * Move item pointers in posting list to make a gap for the new item's
    1048             :      * heap TID.  We shift TIDs one place to the right, losing original
    1049             :      * rightmost TID. (nmovebytes must not include TIDs to the left of
    1050             :      * postingoff, nor the existing rightmost/max TID that gets overwritten.)
    1051             :      */
    1052       26098 :     nposting = CopyIndexTuple(oposting);
    1053       26098 :     replacepos = (char *) BTreeTupleGetPostingN(nposting, postingoff);
    1054       26098 :     replaceposright = (char *) BTreeTupleGetPostingN(nposting, postingoff + 1);
    1055       26098 :     nmovebytes = (nhtids - postingoff - 1) * sizeof(ItemPointerData);
    1056       26098 :     memmove(replaceposright, replacepos, nmovebytes);
    1057             : 
    1058             :     /* Fill the gap at postingoff with TID of new item (original new TID) */
    1059             :     Assert(!BTreeTupleIsPivot(newitem) && !BTreeTupleIsPosting(newitem));
    1060       26098 :     ItemPointerCopy(&newitem->t_tid, (ItemPointer) replacepos);
    1061             : 
    1062             :     /* Now copy oposting's rightmost/max TID into new item (final new TID) */
    1063       26098 :     ItemPointerCopy(BTreeTupleGetMaxHeapTID(oposting), &newitem->t_tid);
    1064             : 
    1065             :     Assert(ItemPointerCompare(BTreeTupleGetMaxHeapTID(nposting),
    1066             :                               BTreeTupleGetHeapTID(newitem)) < 0);
    1067             :     Assert(_bt_posting_valid(nposting));
    1068             : 
    1069       26098 :     return nposting;
    1070             : }
    1071             : 
    1072             : /*
    1073             :  * Verify posting list invariants for "posting", which must be a posting list
    1074             :  * tuple.  Used within assertions.
    1075             :  */
    1076             : #ifdef USE_ASSERT_CHECKING
    1077             : static bool
    1078             : _bt_posting_valid(IndexTuple posting)
    1079             : {
    1080             :     ItemPointerData last;
    1081             :     ItemPointer htid;
    1082             : 
    1083             :     if (!BTreeTupleIsPosting(posting) || BTreeTupleGetNPosting(posting) < 2)
    1084             :         return false;
    1085             : 
    1086             :     /* Remember first heap TID for loop */
    1087             :     ItemPointerCopy(BTreeTupleGetHeapTID(posting), &last);
    1088             :     if (!ItemPointerIsValid(&last))
    1089             :         return false;
    1090             : 
    1091             :     /* Iterate, starting from second TID */
    1092             :     for (int i = 1; i < BTreeTupleGetNPosting(posting); i++)
    1093             :     {
    1094             :         htid = BTreeTupleGetPostingN(posting, i);
    1095             : 
    1096             :         if (!ItemPointerIsValid(htid))
    1097             :             return false;
    1098             :         if (ItemPointerCompare(htid, &last) <= 0)
    1099             :             return false;
    1100             :         ItemPointerCopy(htid, &last);
    1101             :     }
    1102             : 
    1103             :     return true;
    1104             : }
    1105             : #endif

Generated by: LCOV version 1.14