LCOV - code coverage report
Current view: top level - src/backend/access/nbtree - nbtdedup.c (source / functions) Hit Total Coverage
Test: PostgreSQL 19devel Lines: 286 293 97.6 %
Date: 2025-08-31 07:17:45 Functions: 11 11 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * nbtdedup.c
       4             :  *    Deduplicate or bottom-up delete items in Postgres btrees.
       5             :  *
       6             :  * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
       7             :  * Portions Copyright (c) 1994, Regents of the University of California
       8             :  *
       9             :  *
      10             :  * IDENTIFICATION
      11             :  *    src/backend/access/nbtree/nbtdedup.c
      12             :  *
      13             :  *-------------------------------------------------------------------------
      14             :  */
      15             : #include "postgres.h"
      16             : 
      17             : #include "access/nbtree.h"
      18             : #include "access/nbtxlog.h"
      19             : #include "access/tableam.h"
      20             : #include "access/xloginsert.h"
      21             : #include "miscadmin.h"
      22             : #include "utils/rel.h"
      23             : 
/* Forward declarations for file-local helpers (definitions appear below) */
static void _bt_bottomupdel_finish_pending(Page page, BTDedupState state,
                                           TM_IndexDeleteOp *delstate);
static bool _bt_do_singleval(Relation rel, Page page, BTDedupState state,
                             OffsetNumber minoff, IndexTuple newitem);
static void _bt_singleval_fillfactor(Page page, BTDedupState state,
                                     Size newitemsz);
#ifdef USE_ASSERT_CHECKING
static bool _bt_posting_valid(IndexTuple posting);
#endif
      33             : 
      34             : /*
      35             :  * Perform a deduplication pass.
      36             :  *
      37             :  * The general approach taken here is to perform as much deduplication as
      38             :  * possible to free as much space as possible.  Note, however, that "single
      39             :  * value" strategy is used for !bottomupdedup callers when the page is full of
      40             :  * tuples of a single value.  Deduplication passes that apply the strategy
      41             :  * will leave behind a few untouched tuples at the end of the page, preparing
      42             :  * the page for an anticipated page split that uses nbtsplitloc.c's own single
      43             :  * value strategy.  Our high level goal is to delay merging the untouched
      44             :  * tuples until after the page splits.
      45             :  *
      46             :  * When a call to _bt_bottomupdel_pass() just took place (and failed), our
      47             :  * high level goal is to prevent a page split entirely by buying more time.
      48             :  * We still hope that a page split can be avoided altogether.  That's why
      49             :  * single value strategy is not even considered for bottomupdedup callers.
      50             :  *
      51             :  * The page will have to be split if we cannot successfully free at least
      52             :  * newitemsz (we also need space for newitem's line pointer, which isn't
      53             :  * included in caller's newitemsz).
      54             :  *
      55             :  * Note: Caller should have already deleted all existing items with their
      56             :  * LP_DEAD bits set.
      57             :  */
void
_bt_dedup_pass(Relation rel, Buffer buf, IndexTuple newitem, Size newitemsz,
               bool bottomupdedup)
{
    OffsetNumber offnum,
                minoff,
                maxoff;
    Page        page = BufferGetPage(buf);
    BTPageOpaque opaque = BTPageGetOpaque(page);
    Page        newpage;
    BTDedupState state;
    Size        pagesaving PG_USED_FOR_ASSERTS_ONLY = 0;
    bool        singlevalstrat = false;
    int         nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);

    /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
    newitemsz += sizeof(ItemIdData);

    /*
     * Initialize deduplication state.
     *
     * It would be possible for maxpostingsize (limit on posting list tuple
     * size) to be set to one third of the page.  However, it seems like a
     * good idea to limit the size of posting lists to one sixth of a page.
     * That ought to leave us with a good split point when pages full of
     * duplicates can be split several times.
     */
    state = (BTDedupState) palloc(sizeof(BTDedupStateData));
    state->deduplicate = true;
    state->nmaxitems = 0;
    state->maxpostingsize = Min(BTMaxItemSize / 2, INDEX_SIZE_MASK);
    /* Metadata about base tuple of current pending posting list */
    state->base = NULL;
    state->baseoff = InvalidOffsetNumber;
    state->basetupsize = 0;
    /* Metadata about current pending posting list TIDs */
    state->htids = palloc(state->maxpostingsize);
    state->nhtids = 0;
    state->nitems = 0;
    /* Size of all physical tuples to be replaced by pending posting list */
    state->phystupsize = 0;
    /* nintervals should be initialized to zero */
    state->nintervals = 0;

    minoff = P_FIRSTDATAKEY(opaque);
    maxoff = PageGetMaxOffsetNumber(page);

    /*
     * Consider applying "single value" strategy, though only if the page
     * seems likely to be split in the near future
     */
    if (!bottomupdedup)
        singlevalstrat = _bt_do_singleval(rel, page, state, minoff, newitem);

    /*
     * Deduplicate items from page, and write them to newpage.
     *
     * Copy the original page's LSN into newpage copy.  This will become the
     * updated version of the page.  We need this because XLogInsert will
     * examine the LSN and possibly dump it in a page image.
     */
    newpage = PageGetTempPageCopySpecial(page);
    PageSetLSN(newpage, PageGetLSN(page));

    /* Copy high key, if any */
    if (!P_RIGHTMOST(opaque))
    {
        ItemId      hitemid = PageGetItemId(page, P_HIKEY);
        Size        hitemsz = ItemIdGetLength(hitemid);
        IndexTuple  hitem = (IndexTuple) PageGetItem(page, hitemid);

        if (PageAddItem(newpage, (Item) hitem, hitemsz, P_HIKEY,
                        false, false) == InvalidOffsetNumber)
            elog(ERROR, "deduplication failed to add highkey");
    }

    for (offnum = minoff;
         offnum <= maxoff;
         offnum = OffsetNumberNext(offnum))
    {
        ItemId      itemid = PageGetItemId(page, offnum);
        IndexTuple  itup = (IndexTuple) PageGetItem(page, itemid);

        Assert(!ItemIdIsDead(itemid));

        if (offnum == minoff)
        {
            /*
             * No previous/base tuple for the data item -- use the data item
             * as base tuple of pending posting list
             */
            _bt_dedup_start_pending(state, itup, offnum);
        }
        else if (state->deduplicate &&
                 _bt_keep_natts_fast(rel, state->base, itup) > nkeyatts &&
                 _bt_dedup_save_htid(state, itup))
        {
            /*
             * Tuple is equal to base tuple of pending posting list.  Heap
             * TID(s) for itup have been saved in state.
             */
        }
        else
        {
            /*
             * Tuple is not equal to pending posting list tuple, or
             * _bt_dedup_save_htid() opted to not merge current item into
             * pending posting list for some other reason (e.g., adding more
             * TIDs would have caused posting list to exceed current
             * maxpostingsize).
             *
             * If state contains pending posting list with more than one item,
             * form new posting tuple and add it to our temp page (newpage).
             * Else add pending interval's base tuple to the temp page as-is.
             */
            pagesaving += _bt_dedup_finish_pending(newpage, state);

            if (singlevalstrat)
            {
                /*
                 * Single value strategy's extra steps.
                 *
                 * Lower maxpostingsize for sixth and final large posting list
                 * tuple at the point where 5 maxpostingsize-capped tuples
                 * have either been formed or observed.
                 *
                 * When a sixth maxpostingsize-capped item is formed/observed,
                 * stop merging together tuples altogether.  The few tuples
                 * that remain at the end of the page won't be merged together
                 * at all (at least not until after a future page split takes
                 * place, when this page's newly allocated right sibling page
                 * gets its first deduplication pass).
                 */
                if (state->nmaxitems == 5)
                    _bt_singleval_fillfactor(page, state, newitemsz);
                else if (state->nmaxitems == 6)
                {
                    state->deduplicate = false;
                    singlevalstrat = false; /* won't be back here */
                }
            }

            /* itup starts new pending posting list */
            _bt_dedup_start_pending(state, itup, offnum);
        }
    }

    /* Handle the last item */
    pagesaving += _bt_dedup_finish_pending(newpage, state);

    /*
     * If no items suitable for deduplication were found, newpage must be
     * exactly the same as the original page, so just return from function.
     *
     * We could determine whether or not to proceed on the basis the space
     * savings being sufficient to avoid an immediate page split instead.  We
     * don't do that because there is some small value in nbtsplitloc.c always
     * operating against a page that is fully deduplicated (apart from
     * newitem).  Besides, most of the cost has already been paid.
     */
    if (state->nintervals == 0)
    {
        /* cannot leak memory here */
        pfree(newpage);
        pfree(state->htids);
        pfree(state);
        return;
    }

    /*
     * By here, it's clear that deduplication will definitely go ahead.
     *
     * Clear the BTP_HAS_GARBAGE page flag.  The index must be a heapkeyspace
     * index, and as such we'll never pay attention to BTP_HAS_GARBAGE anyway.
     * But keep things tidy.
     */
    if (P_HAS_GARBAGE(opaque))
    {
        BTPageOpaque nopaque = BTPageGetOpaque(newpage);

        nopaque->btpo_flags &= ~BTP_HAS_GARBAGE;
    }

    /*
     * Critical section: swapping in the rewritten page and emitting the WAL
     * record must complete as a unit
     */
    START_CRIT_SECTION();

    PageRestoreTempPage(newpage, page);
    MarkBufferDirty(buf);

    /* XLOG stuff */
    if (RelationNeedsWAL(rel))
    {
        XLogRecPtr  recptr;
        xl_btree_dedup xlrec_dedup;

        xlrec_dedup.nintervals = state->nintervals;

        XLogBeginInsert();
        XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
        XLogRegisterData(&xlrec_dedup, SizeOfBtreeDedup);

        /*
         * The intervals array is not in the buffer, but pretend that it is.
         * When XLogInsert stores the whole buffer, the array need not be
         * stored too.
         */
        XLogRegisterBufData(0, state->intervals,
                            state->nintervals * sizeof(BTDedupInterval));

        recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DEDUP);

        PageSetLSN(page, recptr);
    }

    END_CRIT_SECTION();

    /* Local space accounting should agree with page accounting */
    Assert(pagesaving < newitemsz || PageGetExactFreeSpace(page) >= newitemsz);

    /* cannot leak memory here */
    pfree(state->htids);
    pfree(state);
}
     280             : 
     281             : /*
     282             :  * Perform bottom-up index deletion pass.
     283             :  *
     284             :  * See if duplicate index tuples (plus certain nearby tuples) are eligible to
     285             :  * be deleted via bottom-up index deletion.  The high level goal here is to
     286             :  * entirely prevent "unnecessary" page splits caused by MVCC version churn
     287             :  * from UPDATEs (when the UPDATEs don't logically modify any of the columns
     288             :  * covered by the 'rel' index).  This is qualitative, not quantitative -- we
     289             :  * do not particularly care about once-off opportunities to delete many index
     290             :  * tuples together.
     291             :  *
     292             :  * See nbtree/README for details on the design of nbtree bottom-up deletion.
     293             :  * See access/tableam.h for a description of how we're expected to cooperate
     294             :  * with the tableam.
     295             :  *
     296             :  * Returns true on success, in which case caller can assume page split will be
     297             :  * avoided for a reasonable amount of time.  Returns false when caller should
     298             :  * deduplicate the page (if possible at all).
     299             :  *
     300             :  * Note: Occasionally we return true despite failing to delete enough items to
     301             :  * avoid a split.  This makes caller skip deduplication and go split the page
     302             :  * right away.  Our return value is always just advisory information.
     303             :  *
     304             :  * Note: Caller should have already deleted all existing items with their
     305             :  * LP_DEAD bits set.
     306             :  */
bool
_bt_bottomupdel_pass(Relation rel, Buffer buf, Relation heapRel,
                     Size newitemsz)
{
    OffsetNumber offnum,
                minoff,
                maxoff;
    Page        page = BufferGetPage(buf);
    BTPageOpaque opaque = BTPageGetOpaque(page);
    BTDedupState state;
    TM_IndexDeleteOp delstate;
    bool        neverdedup;
    int         nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);

    /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
    newitemsz += sizeof(ItemIdData);

    /* Initialize deduplication state */
    state = (BTDedupState) palloc(sizeof(BTDedupStateData));
    state->deduplicate = true;
    state->nmaxitems = 0;
    state->maxpostingsize = BLCKSZ; /* We're not really deduplicating */
    state->base = NULL;
    state->baseoff = InvalidOffsetNumber;
    state->basetupsize = 0;
    state->htids = palloc(state->maxpostingsize);
    state->nhtids = 0;
    state->nitems = 0;
    state->phystupsize = 0;
    state->nintervals = 0;

    /*
     * Initialize tableam state that describes bottom-up index deletion
     * operation.
     *
     * We'll go on to ask the tableam to search for TIDs whose index tuples we
     * can safely delete.  The tableam will search until our leaf page space
     * target is satisfied, or until the cost of continuing with the tableam
     * operation seems too high.  It focuses its efforts on TIDs associated
     * with duplicate index tuples that we mark "promising".
     *
     * This space target is a little arbitrary.  The tableam must be able to
     * keep the costs and benefits in balance.  We provide the tableam with
     * exhaustive information about what might work, without directly
     * concerning ourselves with avoiding work during the tableam call.  Our
     * role in costing the bottom-up deletion process is strictly advisory.
     */
    delstate.irel = rel;
    delstate.iblknum = BufferGetBlockNumber(buf);
    delstate.bottomup = true;
    delstate.bottomupfreespace = Max(BLCKSZ / 16, newitemsz);
    delstate.ndeltids = 0;
    delstate.deltids = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexDelete));
    delstate.status = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexStatus));

    minoff = P_FIRSTDATAKEY(opaque);
    maxoff = PageGetMaxOffsetNumber(page);
    for (offnum = minoff;
         offnum <= maxoff;
         offnum = OffsetNumberNext(offnum))
    {
        ItemId      itemid = PageGetItemId(page, offnum);
        IndexTuple  itup = (IndexTuple) PageGetItem(page, itemid);

        Assert(!ItemIdIsDead(itemid));

        if (offnum == minoff)
        {
            /* itup starts first pending interval */
            _bt_dedup_start_pending(state, itup, offnum);
        }
        else if (_bt_keep_natts_fast(rel, state->base, itup) > nkeyatts &&
                 _bt_dedup_save_htid(state, itup))
        {
            /* Tuple is equal; just added its TIDs to pending interval */
        }
        else
        {
            /* Finalize interval -- move its TIDs to delete state */
            _bt_bottomupdel_finish_pending(page, state, &delstate);

            /* itup starts new pending interval */
            _bt_dedup_start_pending(state, itup, offnum);
        }
    }
    /* Finalize final interval -- move its TIDs to delete state */
    _bt_bottomupdel_finish_pending(page, state, &delstate);

    /*
     * We don't give up now in the event of having few (or even zero)
     * promising tuples for the tableam because it's not up to us as the index
     * AM to manage costs (note that the tableam might have heuristics of its
     * own that work out what to do).  We should at least avoid having our
     * caller do a useless deduplication pass after we return in the event of
     * zero promising tuples, though.
     */
    neverdedup = false;
    if (state->nintervals == 0)
        neverdedup = true;

    /* Dedup state is no longer needed; free it before the tableam call */
    pfree(state->htids);
    pfree(state);

    /* Ask tableam which TIDs are deletable, then physically delete them */
    _bt_delitems_delete_check(rel, buf, heapRel, &delstate);

    pfree(delstate.deltids);
    pfree(delstate.status);

    /* Report "success" to caller unconditionally to avoid deduplication */
    if (neverdedup)
        return true;

    /* Don't dedup when we won't end up back here any time soon anyway */
    return PageGetExactFreeSpace(page) >= Max(BLCKSZ / 24, newitemsz);
}
     423             : 
     424             : /*
     425             :  * Create a new pending posting list tuple based on caller's base tuple.
     426             :  *
     427             :  * Every tuple processed by deduplication either becomes the base tuple for a
     428             :  * posting list, or gets its heap TID(s) accepted into a pending posting list.
     429             :  * A tuple that starts out as the base tuple for a posting list will only
     430             :  * actually be rewritten within _bt_dedup_finish_pending() when it turns out
     431             :  * that there are duplicates that can be merged into the base tuple.
     432             :  */
     433             : void
     434    11770152 : _bt_dedup_start_pending(BTDedupState state, IndexTuple base,
     435             :                         OffsetNumber baseoff)
     436             : {
     437             :     Assert(state->nhtids == 0);
     438             :     Assert(state->nitems == 0);
     439             :     Assert(!BTreeTupleIsPivot(base));
     440             : 
     441             :     /*
     442             :      * Copy heap TID(s) from new base tuple for new candidate posting list
     443             :      * into working state's array
     444             :      */
     445    11770152 :     if (!BTreeTupleIsPosting(base))
     446             :     {
     447     9824008 :         memcpy(state->htids, &base->t_tid, sizeof(ItemPointerData));
     448     9824008 :         state->nhtids = 1;
     449     9824008 :         state->basetupsize = IndexTupleSize(base);
     450             :     }
     451             :     else
     452             :     {
     453             :         int         nposting;
     454             : 
     455     1946144 :         nposting = BTreeTupleGetNPosting(base);
     456     1946144 :         memcpy(state->htids, BTreeTupleGetPosting(base),
     457             :                sizeof(ItemPointerData) * nposting);
     458     1946144 :         state->nhtids = nposting;
     459             :         /* basetupsize should not include existing posting list */
     460     1946144 :         state->basetupsize = BTreeTupleGetPostingOffset(base);
     461             :     }
     462             : 
     463             :     /*
     464             :      * Save new base tuple itself -- it'll be needed if we actually create a
     465             :      * new posting list from new pending posting list.
     466             :      *
     467             :      * Must maintain physical size of all existing tuples (including line
     468             :      * pointer overhead) so that we can calculate space savings on page.
     469             :      */
     470    11770152 :     state->nitems = 1;
     471    11770152 :     state->base = base;
     472    11770152 :     state->baseoff = baseoff;
     473    11770152 :     state->phystupsize = MAXALIGN(IndexTupleSize(base)) + sizeof(ItemIdData);
     474             :     /* Also save baseoff in pending state for interval */
     475    11770152 :     state->intervals[state->nintervals].baseoff = state->baseoff;
     476    11770152 : }
     477             : 
     478             : /*
     479             :  * Save itup heap TID(s) into pending posting list where possible.
     480             :  *
     481             :  * Returns bool indicating if the pending posting list managed by state now
     482             :  * includes itup's heap TID(s).
     483             :  */
     484             : bool
     485     2542634 : _bt_dedup_save_htid(BTDedupState state, IndexTuple itup)
     486             : {
     487             :     int         nhtids;
     488             :     ItemPointer htids;
     489             :     Size        mergedtupsz;
     490             : 
     491             :     Assert(!BTreeTupleIsPivot(itup));
     492             : 
     493     2542634 :     if (!BTreeTupleIsPosting(itup))
     494             :     {
     495     2529704 :         nhtids = 1;
     496     2529704 :         htids = &itup->t_tid;
     497             :     }
     498             :     else
     499             :     {
     500       12930 :         nhtids = BTreeTupleGetNPosting(itup);
     501       12930 :         htids = BTreeTupleGetPosting(itup);
     502             :     }
     503             : 
     504             :     /*
     505             :      * Don't append (have caller finish pending posting list as-is) if
     506             :      * appending heap TID(s) from itup would put us over maxpostingsize limit.
     507             :      *
     508             :      * This calculation needs to match the code used within _bt_form_posting()
     509             :      * for new posting list tuples.
     510             :      */
     511     2542634 :     mergedtupsz = MAXALIGN(state->basetupsize +
     512             :                            (state->nhtids + nhtids) * sizeof(ItemPointerData));
     513             : 
     514     2542634 :     if (mergedtupsz > state->maxpostingsize)
     515             :     {
     516             :         /*
     517             :          * Count this as an oversized item for single value strategy, though
     518             :          * only when there are 50 TIDs in the final posting list tuple.  This
     519             :          * limit (which is fairly arbitrary) avoids confusion about how many
     520             :          * 1/6 of a page tuples have been encountered/created by the current
     521             :          * deduplication pass.
     522             :          *
     523             :          * Note: We deliberately don't consider which deduplication pass
     524             :          * merged together tuples to create this item (could be a previous
     525             :          * deduplication pass, or current pass).  See _bt_do_singleval()
     526             :          * comments.
     527             :          */
     528       21062 :         if (state->nhtids > 50)
     529       20024 :             state->nmaxitems++;
     530             : 
     531       21062 :         return false;
     532             :     }
     533             : 
     534             :     /*
     535             :      * Save heap TIDs to pending posting list tuple -- itup can be merged into
     536             :      * pending posting list
     537             :      */
     538     2521572 :     state->nitems++;
     539     2521572 :     memcpy(state->htids + state->nhtids, htids,
     540             :            sizeof(ItemPointerData) * nhtids);
     541     2521572 :     state->nhtids += nhtids;
     542     2521572 :     state->phystupsize += MAXALIGN(IndexTupleSize(itup)) + sizeof(ItemIdData);
     543             : 
     544     2521572 :     return true;
     545             : }
     546             : 
     547             : /*
     548             :  * Finalize pending posting list tuple, and add it to the page.  Final tuple
     549             :  * is based on saved base tuple, and saved list of heap TIDs.
     550             :  *
     551             :  * Returns space saving from deduplicating to make a new posting list tuple.
     552             :  * Note that this includes line pointer overhead.  This is zero in the case
     553             :  * where no deduplication was possible.
     554             :  */
     555             : Size
     556     6093696 : _bt_dedup_finish_pending(Page newpage, BTDedupState state)
     557             : {
     558             :     OffsetNumber tupoff;
     559             :     Size        tuplesz;
     560             :     Size        spacesaving;
     561             : 
     562             :     Assert(state->nitems > 0);
     563             :     Assert(state->nitems <= state->nhtids);
     564             :     Assert(state->intervals[state->nintervals].baseoff == state->baseoff);
     565             : 
     566     6093696 :     tupoff = OffsetNumberNext(PageGetMaxOffsetNumber(newpage));
     567     6093696 :     if (state->nitems == 1)
     568             :     {
     569             :         /* Use original, unchanged base tuple */
     570     5684708 :         tuplesz = IndexTupleSize(state->base);
     571             :         Assert(tuplesz == MAXALIGN(IndexTupleSize(state->base)));
     572             :         Assert(tuplesz <= BTMaxItemSize);
     573     5684708 :         if (PageAddItem(newpage, (Item) state->base, tuplesz, tupoff,
     574             :                         false, false) == InvalidOffsetNumber)
     575           0 :             elog(ERROR, "deduplication failed to add tuple to page");
     576             : 
     577     5684708 :         spacesaving = 0;
     578             :     }
     579             :     else
     580             :     {
     581             :         IndexTuple  final;
     582             : 
     583             :         /* Form a tuple with a posting list */
     584      408988 :         final = _bt_form_posting(state->base, state->htids, state->nhtids);
     585      408988 :         tuplesz = IndexTupleSize(final);
     586             :         Assert(tuplesz <= state->maxpostingsize);
     587             : 
     588             :         /* Save final number of items for posting list */
     589      408988 :         state->intervals[state->nintervals].nitems = state->nitems;
     590             : 
     591             :         Assert(tuplesz == MAXALIGN(IndexTupleSize(final)));
     592             :         Assert(tuplesz <= BTMaxItemSize);
     593      408988 :         if (PageAddItem(newpage, (Item) final, tuplesz, tupoff, false,
     594             :                         false) == InvalidOffsetNumber)
     595           0 :             elog(ERROR, "deduplication failed to add tuple to page");
     596             : 
     597      408988 :         pfree(final);
     598      408988 :         spacesaving = state->phystupsize - (tuplesz + sizeof(ItemIdData));
     599             :         /* Increment nintervals, since we wrote a new posting list tuple */
     600      408988 :         state->nintervals++;
     601             :         Assert(spacesaving > 0 && spacesaving < BLCKSZ);
     602             :     }
     603             : 
     604             :     /* Reset state for next pending posting list */
     605     6093696 :     state->nhtids = 0;
     606     6093696 :     state->nitems = 0;
     607     6093696 :     state->phystupsize = 0;
     608             : 
     609     6093696 :     return spacesaving;
     610             : }
     611             : 
/*
 * Finalize interval during bottom-up index deletion.
 *
 * During a bottom-up pass we expect that TIDs will be recorded in dedup state
 * first, and then get moved over to delstate (in variable-sized batches) by
 * calling here.  Call here happens when the number of TIDs in a dedup
 * interval is known, and interval gets finalized (i.e. when caller sees next
 * tuple on the page is not a duplicate, or when caller runs out of tuples to
 * process from leaf page).
 *
 * This is where bottom-up deletion determines and remembers which entries are
 * duplicates.  This will be important information to the tableam delete
 * infrastructure later on.  Plain index tuple duplicates are marked
 * "promising" here, per tableam contract.
 *
 * Our approach to marking entries whose TIDs come from posting lists is more
 * complicated.  Posting lists can only be formed by a deduplication pass (or
 * during an index build), so recent version churn affecting the pointed-to
 * logical rows is not particularly likely.  We may still give a weak signal
 * about posting list tuples' entries (by marking just one of its TIDs/entries
 * promising), though this is only a possibility in the event of further
 * duplicate index tuples in final interval that covers posting list tuple (as
 * in the plain tuple case).  A weak signal/hint will be useful to the tableam
 * when it has no stronger signal to go with for the deletion operation as a
 * whole.
 *
 * The heuristics we use work well in practice because we only need to give
 * the tableam the right _general_ idea about where to look.  Garbage tends to
 * naturally get concentrated in relatively few table blocks with workloads
 * that bottom-up deletion targets.  The tableam cannot possibly rank all
 * available table blocks sensibly based on the hints we provide, but that's
 * okay -- only the extremes matter.  The tableam just needs to be able to
 * predict which few table blocks will have the most tuples that are safe to
 * delete for each deletion operation, with low variance across related
 * deletion operations.
 */
static void
_bt_bottomupdel_finish_pending(Page page, BTDedupState state,
							   TM_IndexDeleteOp *delstate)
{
	/* Interval is a "duplicate interval" when it spans more than one item */
	bool		dupinterval = (state->nitems > 1);

	Assert(state->nitems > 0);
	Assert(state->nitems <= state->nhtids);
	Assert(state->intervals[state->nintervals].baseoff == state->baseoff);

	/*
	 * Walk the page items covered by the pending interval, transferring
	 * their TIDs into delstate's deltids/status arrays
	 */
	for (int i = 0; i < state->nitems; i++)
	{
		/* Items in an interval are physically contiguous, starting at baseoff */
		OffsetNumber offnum = state->baseoff + i;
		ItemId		itemid = PageGetItemId(page, offnum);
		IndexTuple	itup = (IndexTuple) PageGetItem(page, itemid);
		TM_IndexDelete *ideltid = &delstate->deltids[delstate->ndeltids];
		TM_IndexStatus *istatus = &delstate->status[delstate->ndeltids];

		if (!BTreeTupleIsPosting(itup))
		{
			/* Simple case: A plain non-pivot tuple */
			ideltid->tid = itup->t_tid;
			ideltid->id = delstate->ndeltids;
			istatus->idxoffnum = offnum;
			istatus->knowndeletable = false; /* for now */
			istatus->promising = dupinterval;	/* simple rule */
			istatus->freespace = ItemIdGetLength(itemid) + sizeof(ItemIdData);

			delstate->ndeltids++;
		}
		else
		{
			/*
			 * Complicated case: A posting list tuple.
			 *
			 * We make the conservative assumption that there can only be at
			 * most one affected logical row per posting list tuple.  There
			 * will be at most one promising entry in deltids to represent
			 * this presumed lone logical row.  Note that this isn't even
			 * considered unless the posting list tuple is also in an interval
			 * of duplicates -- this complicated rule is just a variant of the
			 * simple rule used to decide if plain index tuples are promising.
			 */
			int			nitem = BTreeTupleGetNPosting(itup);
			bool		firstpromising = false;
			bool		lastpromising = false;

			Assert(_bt_posting_valid(itup));

			if (dupinterval)
			{
				/*
				 * Complicated rule: either the first or last TID in the
				 * posting list gets marked promising (if any at all)
				 */
				BlockNumber minblocklist,
							midblocklist,
							maxblocklist;
				ItemPointer mintid,
							midtid,
							maxtid;

				/* Sample the first, middle, and last TIDs' table blocks */
				mintid = BTreeTupleGetHeapTID(itup);
				midtid = BTreeTupleGetPostingN(itup, nitem / 2);
				maxtid = BTreeTupleGetMaxHeapTID(itup);
				minblocklist = ItemPointerGetBlockNumber(mintid);
				midblocklist = ItemPointerGetBlockNumber(midtid);
				maxblocklist = ItemPointerGetBlockNumber(maxtid);

				/* Only entry with predominant table block can be promising */
				firstpromising = (minblocklist == midblocklist);
				lastpromising = (!firstpromising &&
								 midblocklist == maxblocklist);
			}

			/* Transfer every TID from the posting list into delstate */
			for (int p = 0; p < nitem; p++)
			{
				ItemPointer htid = BTreeTupleGetPostingN(itup, p);

				ideltid->tid = *htid;
				ideltid->id = delstate->ndeltids;
				istatus->idxoffnum = offnum;
				istatus->knowndeletable = false; /* for now */
				istatus->promising = false;
				if ((firstpromising && p == 0) ||
					(lastpromising && p == nitem - 1))
					istatus->promising = true;
				istatus->freespace = sizeof(ItemPointerData);	/* at worst */

				/* Step both output cursors in lockstep with ndeltids */
				ideltid++;
				istatus++;
				delstate->ndeltids++;
			}
		}
	}

	/* Only duplicate intervals are recorded in the intervals array */
	if (dupinterval)
	{
		state->intervals[state->nintervals].nitems = state->nitems;
		state->nintervals++;
	}

	/* Reset state for next interval */
	state->nhtids = 0;
	state->nitems = 0;
	state->phystupsize = 0;
}
     755             : 
     756             : /*
     757             :  * Determine if page non-pivot tuples (data items) are all duplicates of the
     758             :  * same value -- if they are, deduplication's "single value" strategy should
     759             :  * be applied.  The general goal of this strategy is to ensure that
     760             :  * nbtsplitloc.c (which uses its own single value strategy) will find a useful
     761             :  * split point as further duplicates are inserted, and successive rightmost
     762             :  * page splits occur among pages that store the same duplicate value.  When
     763             :  * the page finally splits, it should end up BTREE_SINGLEVAL_FILLFACTOR% full,
     764             :  * just like it would if deduplication were disabled.
     765             :  *
     766             :  * We expect that affected workloads will require _several_ single value
     767             :  * strategy deduplication passes (over a page that only stores duplicates)
     768             :  * before the page is finally split.  The first deduplication pass should only
     769             :  * find regular non-pivot tuples.  Later deduplication passes will find
     770             :  * existing maxpostingsize-capped posting list tuples, which must be skipped
     771             :  * over.  The penultimate pass is generally the first pass that actually
     772             :  * reaches _bt_singleval_fillfactor(), and so will deliberately leave behind a
     773             :  * few untouched non-pivot tuples.  The final deduplication pass won't free
     774             :  * any space -- it will skip over everything without merging anything (it
     775             :  * retraces the steps of the penultimate pass).
     776             :  *
     777             :  * Fortunately, having several passes isn't too expensive.  Each pass (after
     778             :  * the first pass) won't spend many cycles on the large posting list tuples
     779             :  * left by previous passes.  Each pass will find a large contiguous group of
     780             :  * smaller duplicate tuples to merge together at the end of the page.
     781             :  */
     782             : static bool
     783       23748 : _bt_do_singleval(Relation rel, Page page, BTDedupState state,
     784             :                  OffsetNumber minoff, IndexTuple newitem)
     785             : {
     786       23748 :     int         nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
     787             :     ItemId      itemid;
     788             :     IndexTuple  itup;
     789             : 
     790       23748 :     itemid = PageGetItemId(page, minoff);
     791       23748 :     itup = (IndexTuple) PageGetItem(page, itemid);
     792             : 
     793       23748 :     if (_bt_keep_natts_fast(rel, newitem, itup) > nkeyatts)
     794             :     {
     795        2174 :         itemid = PageGetItemId(page, PageGetMaxOffsetNumber(page));
     796        2174 :         itup = (IndexTuple) PageGetItem(page, itemid);
     797             : 
     798        2174 :         if (_bt_keep_natts_fast(rel, newitem, itup) > nkeyatts)
     799        1394 :             return true;
     800             :     }
     801             : 
     802       22354 :     return false;
     803             : }
     804             : 
     805             : /*
     806             :  * Lower maxpostingsize when using "single value" strategy, to avoid a sixth
     807             :  * and final maxpostingsize-capped tuple.  The sixth and final posting list
     808             :  * tuple will end up somewhat smaller than the first five.  (Note: The first
     809             :  * five tuples could actually just be very large duplicate tuples that
     810             :  * couldn't be merged together at all.  Deduplication will simply not modify
     811             :  * the page when that happens.)
     812             :  *
     813             :  * When there are six posting lists on the page (after current deduplication
     814             :  * pass goes on to create/observe a sixth very large tuple), caller should end
     815             :  * its deduplication pass.  It isn't useful to try to deduplicate items that
     816             :  * are supposed to end up on the new right sibling page following the
     817             :  * anticipated page split.  A future deduplication pass of future right
     818             :  * sibling page might take care of it.  (This is why the first single value
     819             :  * strategy deduplication pass for a given leaf page will generally find only
     820             :  * plain non-pivot tuples -- see _bt_do_singleval() comments.)
     821             :  */
     822             : static void
     823         684 : _bt_singleval_fillfactor(Page page, BTDedupState state, Size newitemsz)
     824             : {
     825             :     Size        leftfree;
     826             :     int         reduction;
     827             : 
     828             :     /* This calculation needs to match nbtsplitloc.c */
     829         684 :     leftfree = PageGetPageSize(page) - SizeOfPageHeaderData -
     830             :         MAXALIGN(sizeof(BTPageOpaqueData));
     831             :     /* Subtract size of new high key (includes pivot heap TID space) */
     832         684 :     leftfree -= newitemsz + MAXALIGN(sizeof(ItemPointerData));
     833             : 
     834             :     /*
     835             :      * Reduce maxpostingsize by an amount equal to target free space on left
     836             :      * half of page
     837             :      */
     838         684 :     reduction = leftfree * ((100 - BTREE_SINGLEVAL_FILLFACTOR) / 100.0);
     839         684 :     if (state->maxpostingsize > reduction)
     840         684 :         state->maxpostingsize -= reduction;
     841             :     else
     842           0 :         state->maxpostingsize = 0;
     843         684 : }
     844             : 
/*
 * Build a posting list tuple based on caller's "base" index tuple and list of
 * heap TIDs.  When nhtids == 1, builds a standard non-pivot tuple without a
 * posting list. (Posting list tuples can never have a single heap TID, partly
 * because that ensures that deduplication always reduces final MAXALIGN()'d
 * size of entire tuple.)
 *
 * Convention is that posting list starts at a MAXALIGN()'d offset (rather
 * than a SHORTALIGN()'d offset), in line with the approach taken when
 * appending a heap TID to new pivot tuple/high key during suffix truncation.
 * This sometimes wastes a little space that was only needed as alignment
 * padding in the original tuple.  Following this convention simplifies the
 * space accounting used when deduplicating a page (the same convention
 * simplifies the accounting for choosing a point to split a page at).
 *
 * Note: Caller's "htids" array must be unique and already in ascending TID
 * order.  Any existing heap TIDs from "base" won't automatically appear in
 * returned posting list tuple (they must be included in htids array.)
 */
IndexTuple
_bt_form_posting(IndexTuple base, ItemPointer htids, int nhtids)
{
	uint32		keysize,
				newsize;
	IndexTuple	itup;

	/*
	 * Only the key portion of "base" is copied to the new tuple.  When base
	 * is itself a posting list tuple, its key ends where its posting list
	 * begins; otherwise the whole tuple is key.
	 */
	if (BTreeTupleIsPosting(base))
		keysize = BTreeTupleGetPostingOffset(base);
	else
		keysize = IndexTupleSize(base);

	Assert(!BTreeTupleIsPivot(base));
	Assert(nhtids > 0 && nhtids <= PG_UINT16_MAX);
	Assert(keysize == MAXALIGN(keysize));

	/*
	 * Determine final size of new tuple.  This calculation is matched by the
	 * maxpostingsize check in _bt_dedup_save_htid() and by
	 * _bt_update_posting().
	 */
	if (nhtids > 1)
		newsize = MAXALIGN(keysize +
						   nhtids * sizeof(ItemPointerData));
	else
		newsize = keysize;

	Assert(newsize <= INDEX_SIZE_MASK);
	Assert(newsize == MAXALIGN(newsize));

	/* Allocate memory using palloc0() (matches index_form_tuple()) */
	itup = palloc0(newsize);
	memcpy(itup, base, keysize);
	/* Store newsize in the size bits of t_info, preserving the flag bits */
	itup->t_info &= ~INDEX_SIZE_MASK;
	itup->t_info |= newsize;
	if (nhtids > 1)
	{
		/* Form posting list tuple */
		BTreeTupleSetPosting(itup, nhtids, keysize);
		memcpy(BTreeTupleGetPosting(itup), htids,
			   sizeof(ItemPointerData) * nhtids);
		Assert(_bt_posting_valid(itup));
	}
	else
	{
		/* Form standard non-pivot tuple */
		itup->t_info &= ~INDEX_ALT_TID_MASK;
		ItemPointerCopy(htids, &itup->t_tid);
		Assert(ItemPointerIsValid(&itup->t_tid));
	}

	return itup;
}
     913             : 
/*
 * Generate a replacement tuple by "updating" a posting list tuple so that it
 * no longer has TIDs that need to be deleted.
 *
 * Used by both VACUUM and index deletion.  Caller's vacposting argument
 * points to the existing posting list tuple to be updated.
 *
 * On return, caller's vacposting argument will point to final "updated"
 * tuple, which will be palloc()'d in caller's memory context.
 */
void
_bt_update_posting(BTVacuumPosting vacposting)
{
	IndexTuple	origtuple = vacposting->itup;
	uint32		keysize,
				newsize;
	IndexTuple	itup;
	int			nhtids;
	int			ui,
				d;
	ItemPointer htids;

	/* Number of TIDs that will remain after deletion */
	nhtids = BTreeTupleGetNPosting(origtuple) - vacposting->ndeletedtids;

	Assert(_bt_posting_valid(origtuple));
	Assert(nhtids > 0 && nhtids < BTreeTupleGetNPosting(origtuple));

	/*
	 * Determine final size of new tuple.
	 *
	 * This calculation needs to match the code used within _bt_form_posting()
	 * for new posting list tuples.  We avoid calling _bt_form_posting() here
	 * to save ourselves a second memory allocation for a htids workspace.
	 */
	keysize = BTreeTupleGetPostingOffset(origtuple);
	if (nhtids > 1)
		newsize = MAXALIGN(keysize +
						   nhtids * sizeof(ItemPointerData));
	else
		newsize = keysize;

	Assert(newsize <= INDEX_SIZE_MASK);
	Assert(newsize == MAXALIGN(newsize));

	/* Allocate memory using palloc0() (matches index_form_tuple()) */
	itup = palloc0(newsize);
	/* Copy only the key portion of origtuple, then fix up the size bits */
	memcpy(itup, origtuple, keysize);
	itup->t_info &= ~INDEX_SIZE_MASK;
	itup->t_info |= newsize;

	if (nhtids > 1)
	{
		/* Form posting list tuple */
		BTreeTupleSetPosting(itup, nhtids, keysize);
		htids = BTreeTupleGetPosting(itup);
	}
	else
	{
		/* Form standard non-pivot tuple */
		itup->t_info &= ~INDEX_ALT_TID_MASK;
		htids = &itup->t_tid;
	}

	/*
	 * Copy surviving TIDs from origtuple into the new tuple, skipping the
	 * ones listed (in ascending index order) in caller's deletetids array
	 */
	ui = 0;
	d = 0;
	for (int i = 0; i < BTreeTupleGetNPosting(origtuple); i++)
	{
		if (d < vacposting->ndeletedtids && vacposting->deletetids[d] == i)
		{
			/* TID at posting list offset i is to be deleted -- skip it */
			d++;
			continue;
		}
		htids[ui++] = *BTreeTupleGetPostingN(origtuple, i);
	}
	Assert(ui == nhtids);
	Assert(d == vacposting->ndeletedtids);
	Assert(nhtids == 1 || _bt_posting_valid(itup));
	Assert(nhtids > 1 || ItemPointerIsValid(&itup->t_tid));

	/* vacposting arg's itup will now point to updated version */
	vacposting->itup = itup;
}
     996             : 
     997             : /*
     998             :  * Prepare for a posting list split by swapping heap TID in newitem with heap
     999             :  * TID from original posting list (the 'oposting' heap TID located at offset
    1000             :  * 'postingoff').  Modifies newitem, so caller should pass their own private
    1001             :  * copy that can safely be modified.
    1002             :  *
    1003             :  * Returns new posting list tuple, which is palloc()'d in caller's context.
    1004             :  * This is guaranteed to be the same size as 'oposting'.  Modified newitem is
    1005             :  * what caller actually inserts. (This happens inside the same critical
    1006             :  * section that performs an in-place update of old posting list using new
    1007             :  * posting list returned here.)
    1008             :  *
    1009             :  * While the keys from newitem and oposting must be opclass equal, and must
    1010             :  * generate identical output when run through the underlying type's output
    1011             :  * function, it doesn't follow that their representations match exactly.
    1012             :  * Caller must avoid assuming that there can't be representational differences
    1013             :  * that make datums from oposting bigger or smaller than the corresponding
    1014             :  * datums from newitem.  For example, differences in TOAST input state might
    1015             :  * break a faulty assumption about tuple size (the executor is entitled to
    1016             :  * apply TOAST compression based on its own criteria).  It also seems possible
    1017             :  * that further representational variation will be introduced in the future,
    1018             :  * in order to support nbtree features like page-level prefix compression.
    1019             :  *
    1020             :  * See nbtree/README for details on the design of posting list splits.
    1021             :  */
IndexTuple
_bt_swap_posting(IndexTuple newitem, IndexTuple oposting, int postingoff)
{
	int			nhtids;
	char	   *replacepos;
	char	   *replaceposright;
	Size		nmovebytes;
	IndexTuple	nposting;

	nhtids = BTreeTupleGetNPosting(oposting);
	Assert(_bt_posting_valid(oposting));

	/*
	 * The postingoff argument originated as a _bt_binsrch_posting() return
	 * value.  It will be 0 in the event of corruption that makes a leaf page
	 * contain a non-pivot tuple that's somehow identical to newitem (no two
	 * non-pivot tuples should ever have the same TID).  This has been known
	 * to happen in the field from time to time.
	 *
	 * Perform a basic sanity check to catch this case now.
	 */
	if (!(postingoff > 0 && postingoff < nhtids))
		elog(ERROR, "posting list tuple with %d items cannot be split at offset %d",
			 nhtids, postingoff);

	/*
	 * Move item pointers in posting list to make a gap for the new item's
	 * heap TID.  We shift TIDs one place to the right, losing original
	 * rightmost TID. (nmovebytes must not include TIDs to the left of
	 * postingoff, nor the existing rightmost/max TID that gets overwritten.)
	 *
	 * Note that we work on a palloc'd copy (nposting); oposting itself is
	 * left untouched, since caller still needs it to fetch the max TID below.
	 */
	nposting = CopyIndexTuple(oposting);
	replacepos = (char *) BTreeTupleGetPostingN(nposting, postingoff);
	replaceposright = (char *) BTreeTupleGetPostingN(nposting, postingoff + 1);
	nmovebytes = (nhtids - postingoff - 1) * sizeof(ItemPointerData);
	/* memmove (not memcpy) because source and destination ranges overlap */
	memmove(replaceposright, replacepos, nmovebytes);

	/* Fill the gap at postingoff with TID of new item (original new TID) */
	Assert(!BTreeTupleIsPivot(newitem) && !BTreeTupleIsPosting(newitem));
	ItemPointerCopy(&newitem->t_tid, (ItemPointer) replacepos);

	/* Now copy oposting's rightmost/max TID into new item (final new TID) */
	ItemPointerCopy(BTreeTupleGetMaxHeapTID(oposting), &newitem->t_tid);

	/* newitem must now sort strictly after every TID kept in nposting */
	Assert(ItemPointerCompare(BTreeTupleGetMaxHeapTID(nposting),
							  BTreeTupleGetHeapTID(newitem)) < 0);
	Assert(_bt_posting_valid(nposting));

	return nposting;
}
    1072             : 
    1073             : /*
    1074             :  * Verify posting list invariants for "posting", which must be a posting list
    1075             :  * tuple.  Used within assertions.
    1076             :  */
    1077             : #ifdef USE_ASSERT_CHECKING
    1078             : static bool
    1079             : _bt_posting_valid(IndexTuple posting)
    1080             : {
    1081             :     ItemPointerData last;
    1082             :     ItemPointer htid;
    1083             : 
    1084             :     if (!BTreeTupleIsPosting(posting) || BTreeTupleGetNPosting(posting) < 2)
    1085             :         return false;
    1086             : 
    1087             :     /* Remember first heap TID for loop */
    1088             :     ItemPointerCopy(BTreeTupleGetHeapTID(posting), &last);
    1089             :     if (!ItemPointerIsValid(&last))
    1090             :         return false;
    1091             : 
    1092             :     /* Iterate, starting from second TID */
    1093             :     for (int i = 1; i < BTreeTupleGetNPosting(posting); i++)
    1094             :     {
    1095             :         htid = BTreeTupleGetPostingN(posting, i);
    1096             : 
    1097             :         if (!ItemPointerIsValid(htid))
    1098             :             return false;
    1099             :         if (ItemPointerCompare(htid, &last) <= 0)
    1100             :             return false;
    1101             :         ItemPointerCopy(htid, &last);
    1102             :     }
    1103             : 
    1104             :     return true;
    1105             : }
    1106             : #endif

Generated by: LCOV version 1.16