LCOV - code coverage report
Current view: top level - src/backend/access/nbtree - nbtdedup.c (source / functions) Coverage Total Hit
Test: PostgreSQL 19devel Lines: 97.6 % 295 288
Test Date: 2026-03-14 23:15:27 Functions: 100.0 % 11 11
Legend: Lines:     hit not hit

            Line data    Source code
       1              : /*-------------------------------------------------------------------------
       2              :  *
       3              :  * nbtdedup.c
       4              :  *    Deduplicate or bottom-up delete items in Postgres btrees.
       5              :  *
       6              :  * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
       7              :  * Portions Copyright (c) 1994, Regents of the University of California
       8              :  *
       9              :  *
      10              :  * IDENTIFICATION
      11              :  *    src/backend/access/nbtree/nbtdedup.c
      12              :  *
      13              :  *-------------------------------------------------------------------------
      14              :  */
      15              : #include "postgres.h"
      16              : 
      17              : #include "access/nbtree.h"
      18              : #include "access/nbtxlog.h"
      19              : #include "access/tableam.h"
      20              : #include "access/xloginsert.h"
      21              : #include "miscadmin.h"
      22              : #include "utils/rel.h"
      23              : 
      24              : static void _bt_bottomupdel_finish_pending(Page page, BTDedupState state,
      25              :                                            TM_IndexDeleteOp *delstate);
      26              : static bool _bt_do_singleval(Relation rel, Page page, BTDedupState state,
      27              :                              OffsetNumber minoff, IndexTuple newitem);
      28              : static void _bt_singleval_fillfactor(Page page, BTDedupState state,
      29              :                                      Size newitemsz);
      30              : #ifdef USE_ASSERT_CHECKING
      31              : static bool _bt_posting_valid(IndexTuple posting);
      32              : #endif
      33              : 
      34              : /*
      35              :  * Perform a deduplication pass.
      36              :  *
      37              :  * The general approach taken here is to perform as much deduplication as
      38              :  * possible to free as much space as possible.  Note, however, that "single
      39              :  * value" strategy is used for !bottomupdedup callers when the page is full of
      40              :  * tuples of a single value.  Deduplication passes that apply the strategy
      41              :  * will leave behind a few untouched tuples at the end of the page, preparing
      42              :  * the page for an anticipated page split that uses nbtsplitloc.c's own single
      43              :  * value strategy.  Our high level goal is to delay merging the untouched
      44              :  * tuples until after the page splits.
      45              :  *
      46              :  * When a call to _bt_bottomupdel_pass() just took place (and failed), our
      47              :  * high level goal is to prevent a page split entirely by buying more time.
      48              :  * We still hope that a page split can be avoided altogether.  That's why
      49              :  * single value strategy is not even considered for bottomupdedup callers.
      50              :  *
      51              :  * The page will have to be split if we cannot successfully free at least
      52              :  * newitemsz (we also need space for newitem's line pointer, which isn't
      53              :  * included in caller's newitemsz).
      54              :  *
      55              :  * Note: Caller should have already deleted all existing items with their
      56              :  * LP_DEAD bits set.
      57              :  */
void
_bt_dedup_pass(Relation rel, Buffer buf, IndexTuple newitem, Size newitemsz,
			   bool bottomupdedup)
{
	OffsetNumber offnum,
				minoff,
				maxoff;
	Page		page = BufferGetPage(buf);
	BTPageOpaque opaque = BTPageGetOpaque(page);
	Page		newpage;
	BTDedupState state;
	Size		pagesaving PG_USED_FOR_ASSERTS_ONLY = 0;
	bool		singlevalstrat = false;
	int			nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
	XLogRecPtr	recptr;

	/* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
	newitemsz += sizeof(ItemIdData);

	/*
	 * Initialize deduplication state.
	 *
	 * It would be possible for maxpostingsize (limit on posting list tuple
	 * size) to be set to one third of the page.  However, it seems like a
	 * good idea to limit the size of posting lists to one sixth of a page.
	 * That ought to leave us with a good split point when pages full of
	 * duplicates can be split several times.
	 *
	 * (BTMaxItemSize is roughly one third of a page, so BTMaxItemSize / 2
	 * is the "one sixth" cap described above; INDEX_SIZE_MASK is the hard
	 * limit an index tuple's length field can represent.)
	 */
	state = palloc_object(BTDedupStateData);
	state->deduplicate = true;
	state->nmaxitems = 0;
	state->maxpostingsize = Min(BTMaxItemSize / 2, INDEX_SIZE_MASK);
	/* Metadata about base tuple of current pending posting list */
	state->base = NULL;
	state->baseoff = InvalidOffsetNumber;
	state->basetupsize = 0;
	/* Metadata about current pending posting list TIDs */
	state->htids = palloc(state->maxpostingsize);
	state->nhtids = 0;
	state->nitems = 0;
	/* Size of all physical tuples to be replaced by pending posting list */
	state->phystupsize = 0;
	/* nintervals should be initialized to zero */
	state->nintervals = 0;

	minoff = P_FIRSTDATAKEY(opaque);
	maxoff = PageGetMaxOffsetNumber(page);

	/*
	 * Consider applying "single value" strategy, though only if the page
	 * seems likely to be split in the near future
	 */
	if (!bottomupdedup)
		singlevalstrat = _bt_do_singleval(rel, page, state, minoff, newitem);

	/*
	 * Deduplicate items from page, and write them to newpage.
	 *
	 * Copy the original page's LSN into newpage copy.  This will become the
	 * updated version of the page.  We need this because XLogInsert will
	 * examine the LSN and possibly dump it in a page image.
	 */
	newpage = PageGetTempPageCopySpecial(page);
	PageSetLSN(newpage, PageGetLSN(page));

	/* Copy high key, if any (only non-rightmost pages have a high key) */
	if (!P_RIGHTMOST(opaque))
	{
		ItemId		hitemid = PageGetItemId(page, P_HIKEY);
		Size		hitemsz = ItemIdGetLength(hitemid);
		IndexTuple	hitem = (IndexTuple) PageGetItem(page, hitemid);

		if (PageAddItem(newpage, hitem, hitemsz, P_HIKEY, false, false) == InvalidOffsetNumber)
			elog(ERROR, "deduplication failed to add highkey");
	}

	for (offnum = minoff;
		 offnum <= maxoff;
		 offnum = OffsetNumberNext(offnum))
	{
		ItemId		itemid = PageGetItemId(page, offnum);
		IndexTuple	itup = (IndexTuple) PageGetItem(page, itemid);

		/* Caller must already have removed all LP_DEAD items */
		Assert(!ItemIdIsDead(itemid));

		if (offnum == minoff)
		{
			/*
			 * No previous/base tuple for the data item -- use the data item
			 * as base tuple of pending posting list
			 */
			_bt_dedup_start_pending(state, itup, offnum);
		}
		else if (state->deduplicate &&
				 _bt_keep_natts_fast(rel, state->base, itup) > nkeyatts &&
				 _bt_dedup_save_htid(state, itup))
		{
			/*
			 * Tuple is equal to base tuple of pending posting list.  Heap
			 * TID(s) for itup have been saved in state.
			 */
		}
		else
		{
			/*
			 * Tuple is not equal to pending posting list tuple, or
			 * _bt_dedup_save_htid() opted to not merge current item into
			 * pending posting list for some other reason (e.g., adding more
			 * TIDs would have caused posting list to exceed current
			 * maxpostingsize).
			 *
			 * If state contains pending posting list with more than one item,
			 * form new posting tuple and add it to our temp page (newpage).
			 * Else add pending interval's base tuple to the temp page as-is.
			 */
			pagesaving += _bt_dedup_finish_pending(newpage, state);

			if (singlevalstrat)
			{
				/*
				 * Single value strategy's extra steps.
				 *
				 * Lower maxpostingsize for sixth and final large posting list
				 * tuple at the point where 5 maxpostingsize-capped tuples
				 * have either been formed or observed.
				 *
				 * When a sixth maxpostingsize-capped item is formed/observed,
				 * stop merging together tuples altogether.  The few tuples
				 * that remain at the end of the page won't be merged together
				 * at all (at least not until after a future page split takes
				 * place, when this page's newly allocated right sibling page
				 * gets its first deduplication pass).
				 */
				if (state->nmaxitems == 5)
					_bt_singleval_fillfactor(page, state, newitemsz);
				else if (state->nmaxitems == 6)
				{
					state->deduplicate = false;
					singlevalstrat = false; /* won't be back here */
				}
			}

			/* itup starts new pending posting list */
			_bt_dedup_start_pending(state, itup, offnum);
		}
	}

	/* Handle the last item */
	pagesaving += _bt_dedup_finish_pending(newpage, state);

	/*
	 * If no items suitable for deduplication were found, newpage must be
	 * exactly the same as the original page, so just return from function.
	 *
	 * We could determine whether or not to proceed on the basis the space
	 * savings being sufficient to avoid an immediate page split instead.  We
	 * don't do that because there is some small value in nbtsplitloc.c always
	 * operating against a page that is fully deduplicated (apart from
	 * newitem).  Besides, most of the cost has already been paid.
	 */
	if (state->nintervals == 0)
	{
		/* cannot leak memory here */
		pfree(newpage);
		pfree(state->htids);
		pfree(state);
		return;
	}

	/*
	 * By here, it's clear that deduplication will definitely go ahead.
	 *
	 * Clear the BTP_HAS_GARBAGE page flag.  The index must be a heapkeyspace
	 * index, and as such we'll never pay attention to BTP_HAS_GARBAGE anyway.
	 * But keep things tidy.
	 */
	if (P_HAS_GARBAGE(opaque))
	{
		BTPageOpaque nopaque = BTPageGetOpaque(newpage);

		nopaque->btpo_flags &= ~BTP_HAS_GARBAGE;
	}

	/*
	 * Critical section: the page swap plus its WAL record must succeed or
	 * fail atomically; an error in here becomes a PANIC.
	 */
	START_CRIT_SECTION();

	PageRestoreTempPage(newpage, page);
	MarkBufferDirty(buf);

	/* XLOG stuff */
	if (RelationNeedsWAL(rel))
	{
		xl_btree_dedup xlrec_dedup;

		xlrec_dedup.nintervals = state->nintervals;

		XLogBeginInsert();
		XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
		XLogRegisterData(&xlrec_dedup, SizeOfBtreeDedup);

		/*
		 * The intervals array is not in the buffer, but pretend that it is.
		 * When XLogInsert stores the whole buffer, the array need not be
		 * stored too.
		 */
		XLogRegisterBufData(0, state->intervals,
							state->nintervals * sizeof(BTDedupInterval));

		recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DEDUP);
	}
	else
		recptr = XLogGetFakeLSN(rel);

	PageSetLSN(page, recptr);

	END_CRIT_SECTION();

	/* Local space accounting should agree with page accounting */
	Assert(pagesaving < newitemsz || PageGetExactFreeSpace(page) >= newitemsz);

	/* cannot leak memory here */
	pfree(state->htids);
	pfree(state);
}
     281              : 
     282              : /*
     283              :  * Perform bottom-up index deletion pass.
     284              :  *
     285              :  * See if duplicate index tuples (plus certain nearby tuples) are eligible to
     286              :  * be deleted via bottom-up index deletion.  The high level goal here is to
     287              :  * entirely prevent "unnecessary" page splits caused by MVCC version churn
     288              :  * from UPDATEs (when the UPDATEs don't logically modify any of the columns
     289              :  * covered by the 'rel' index).  This is qualitative, not quantitative -- we
     290              :  * do not particularly care about once-off opportunities to delete many index
     291              :  * tuples together.
     292              :  *
     293              :  * See nbtree/README for details on the design of nbtree bottom-up deletion.
     294              :  * See access/tableam.h for a description of how we're expected to cooperate
     295              :  * with the tableam.
     296              :  *
     297              :  * Returns true on success, in which case caller can assume page split will be
     298              :  * avoided for a reasonable amount of time.  Returns false when caller should
     299              :  * deduplicate the page (if possible at all).
     300              :  *
     301              :  * Note: Occasionally we return true despite failing to delete enough items to
     302              :  * avoid a split.  This makes caller skip deduplication and go split the page
     303              :  * right away.  Our return value is always just advisory information.
     304              :  *
     305              :  * Note: Caller should have already deleted all existing items with their
     306              :  * LP_DEAD bits set.
     307              :  */
bool
_bt_bottomupdel_pass(Relation rel, Buffer buf, Relation heapRel,
					 Size newitemsz)
{
	OffsetNumber offnum,
				minoff,
				maxoff;
	Page		page = BufferGetPage(buf);
	BTPageOpaque opaque = BTPageGetOpaque(page);
	BTDedupState state;
	TM_IndexDeleteOp delstate;
	bool		neverdedup;
	int			nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);

	/* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
	newitemsz += sizeof(ItemIdData);

	/*
	 * Initialize deduplication state.  We reuse the dedup machinery here only
	 * to group equal tuples into intervals; no posting lists are formed, so
	 * maxpostingsize is effectively unlimited (a whole block).
	 */
	state = palloc_object(BTDedupStateData);
	state->deduplicate = true;
	state->nmaxitems = 0;
	state->maxpostingsize = BLCKSZ; /* We're not really deduplicating */
	state->base = NULL;
	state->baseoff = InvalidOffsetNumber;
	state->basetupsize = 0;
	state->htids = palloc(state->maxpostingsize);
	state->nhtids = 0;
	state->nitems = 0;
	state->phystupsize = 0;
	state->nintervals = 0;

	/*
	 * Initialize tableam state that describes bottom-up index deletion
	 * operation.
	 *
	 * We'll go on to ask the tableam to search for TIDs whose index tuples we
	 * can safely delete.  The tableam will search until our leaf page space
	 * target is satisfied, or until the cost of continuing with the tableam
	 * operation seems too high.  It focuses its efforts on TIDs associated
	 * with duplicate index tuples that we mark "promising".
	 *
	 * This space target is a little arbitrary.  The tableam must be able to
	 * keep the costs and benefits in balance.  We provide the tableam with
	 * exhaustive information about what might work, without directly
	 * concerning ourselves with avoiding work during the tableam call.  Our
	 * role in costing the bottom-up deletion process is strictly advisory.
	 */
	delstate.irel = rel;
	delstate.iblknum = BufferGetBlockNumber(buf);
	delstate.bottomup = true;
	delstate.bottomupfreespace = Max(BLCKSZ / 16, newitemsz);
	delstate.ndeltids = 0;
	delstate.deltids = palloc_array(TM_IndexDelete, MaxTIDsPerBTreePage);
	delstate.status = palloc_array(TM_IndexStatus, MaxTIDsPerBTreePage);

	minoff = P_FIRSTDATAKEY(opaque);
	maxoff = PageGetMaxOffsetNumber(page);
	for (offnum = minoff;
		 offnum <= maxoff;
		 offnum = OffsetNumberNext(offnum))
	{
		ItemId		itemid = PageGetItemId(page, offnum);
		IndexTuple	itup = (IndexTuple) PageGetItem(page, itemid);

		/* Caller must already have removed all LP_DEAD items */
		Assert(!ItemIdIsDead(itemid));

		if (offnum == minoff)
		{
			/* itup starts first pending interval */
			_bt_dedup_start_pending(state, itup, offnum);
		}
		else if (_bt_keep_natts_fast(rel, state->base, itup) > nkeyatts &&
				 _bt_dedup_save_htid(state, itup))
		{
			/* Tuple is equal; just added its TIDs to pending interval */
		}
		else
		{
			/* Finalize interval -- move its TIDs to delete state */
			_bt_bottomupdel_finish_pending(page, state, &delstate);

			/* itup starts new pending interval */
			_bt_dedup_start_pending(state, itup, offnum);
		}
	}
	/* Finalize final interval -- move its TIDs to delete state */
	_bt_bottomupdel_finish_pending(page, state, &delstate);

	/*
	 * We don't give up now in the event of having few (or even zero)
	 * promising tuples for the tableam because it's not up to us as the index
	 * AM to manage costs (note that the tableam might have heuristics of its
	 * own that work out what to do).  We should at least avoid having our
	 * caller do a useless deduplication pass after we return in the event of
	 * zero promising tuples, though.
	 */
	neverdedup = false;
	if (state->nintervals == 0)
		neverdedup = true;

	pfree(state->htids);
	pfree(state);

	/* Ask tableam which TIDs are deletable, then physically delete them */
	_bt_delitems_delete_check(rel, buf, heapRel, &delstate);

	pfree(delstate.deltids);
	pfree(delstate.status);

	/* Report "success" to caller unconditionally to avoid deduplication */
	if (neverdedup)
		return true;

	/* Don't dedup when we won't end up back here any time soon anyway */
	return PageGetExactFreeSpace(page) >= Max(BLCKSZ / 24, newitemsz);
}
     424              : 
     425              : /*
     426              :  * Create a new pending posting list tuple based on caller's base tuple.
     427              :  *
     428              :  * Every tuple processed by deduplication either becomes the base tuple for a
     429              :  * posting list, or gets its heap TID(s) accepted into a pending posting list.
     430              :  * A tuple that starts out as the base tuple for a posting list will only
     431              :  * actually be rewritten within _bt_dedup_finish_pending() when it turns out
     432              :  * that there are duplicates that can be merged into the base tuple.
     433              :  */
     434              : void
     435      6127829 : _bt_dedup_start_pending(BTDedupState state, IndexTuple base,
     436              :                         OffsetNumber baseoff)
     437              : {
     438              :     Assert(state->nhtids == 0);
     439              :     Assert(state->nitems == 0);
     440              :     Assert(!BTreeTupleIsPivot(base));
     441              : 
     442              :     /*
     443              :      * Copy heap TID(s) from new base tuple for new candidate posting list
     444              :      * into working state's array
     445              :      */
     446      6127829 :     if (!BTreeTupleIsPosting(base))
     447              :     {
     448      5078222 :         memcpy(state->htids, &base->t_tid, sizeof(ItemPointerData));
     449      5078222 :         state->nhtids = 1;
     450      5078222 :         state->basetupsize = IndexTupleSize(base);
     451              :     }
     452              :     else
     453              :     {
     454              :         int         nposting;
     455              : 
     456      1049607 :         nposting = BTreeTupleGetNPosting(base);
     457      1049607 :         memcpy(state->htids, BTreeTupleGetPosting(base),
     458              :                sizeof(ItemPointerData) * nposting);
     459      1049607 :         state->nhtids = nposting;
     460              :         /* basetupsize should not include existing posting list */
     461      1049607 :         state->basetupsize = BTreeTupleGetPostingOffset(base);
     462              :     }
     463              : 
     464              :     /*
     465              :      * Save new base tuple itself -- it'll be needed if we actually create a
     466              :      * new posting list from new pending posting list.
     467              :      *
     468              :      * Must maintain physical size of all existing tuples (including line
     469              :      * pointer overhead) so that we can calculate space savings on page.
     470              :      */
     471      6127829 :     state->nitems = 1;
     472      6127829 :     state->base = base;
     473      6127829 :     state->baseoff = baseoff;
     474      6127829 :     state->phystupsize = MAXALIGN(IndexTupleSize(base)) + sizeof(ItemIdData);
     475              :     /* Also save baseoff in pending state for interval */
     476      6127829 :     state->intervals[state->nintervals].baseoff = state->baseoff;
     477      6127829 : }
     478              : 
     479              : /*
     480              :  * Save itup heap TID(s) into pending posting list where possible.
     481              :  *
     482              :  * Returns bool indicating if the pending posting list managed by state now
     483              :  * includes itup's heap TID(s).
     484              :  */
     485              : bool
     486      1404717 : _bt_dedup_save_htid(BTDedupState state, IndexTuple itup)
     487              : {
     488              :     int         nhtids;
     489              :     ItemPointer htids;
     490              :     Size        mergedtupsz;
     491              : 
     492              :     Assert(!BTreeTupleIsPivot(itup));
     493              : 
     494      1404717 :     if (!BTreeTupleIsPosting(itup))
     495              :     {
     496      1398101 :         nhtids = 1;
     497      1398101 :         htids = &itup->t_tid;
     498              :     }
     499              :     else
     500              :     {
     501         6616 :         nhtids = BTreeTupleGetNPosting(itup);
     502         6616 :         htids = BTreeTupleGetPosting(itup);
     503              :     }
     504              : 
     505              :     /*
     506              :      * Don't append (have caller finish pending posting list as-is) if
     507              :      * appending heap TID(s) from itup would put us over maxpostingsize limit.
     508              :      *
     509              :      * This calculation needs to match the code used within _bt_form_posting()
     510              :      * for new posting list tuples.
     511              :      */
     512      1404717 :     mergedtupsz = MAXALIGN(state->basetupsize +
     513              :                            (state->nhtids + nhtids) * sizeof(ItemPointerData));
     514              : 
     515      1404717 :     if (mergedtupsz > state->maxpostingsize)
     516              :     {
     517              :         /*
     518              :          * Count this as an oversized item for single value strategy, though
     519              :          * only when there are 50 TIDs in the final posting list tuple.  This
     520              :          * limit (which is fairly arbitrary) avoids confusion about how many
     521              :          * 1/6 of a page tuples have been encountered/created by the current
     522              :          * deduplication pass.
     523              :          *
     524              :          * Note: We deliberately don't consider which deduplication pass
     525              :          * merged together tuples to create this item (could be a previous
     526              :          * deduplication pass, or current pass).  See _bt_do_singleval()
     527              :          * comments.
     528              :          */
     529        11501 :         if (state->nhtids > 50)
     530        11009 :             state->nmaxitems++;
     531              : 
     532        11501 :         return false;
     533              :     }
     534              : 
     535              :     /*
     536              :      * Save heap TIDs to pending posting list tuple -- itup can be merged into
     537              :      * pending posting list
     538              :      */
     539      1393216 :     state->nitems++;
     540      1393216 :     memcpy(state->htids + state->nhtids, htids,
     541              :            sizeof(ItemPointerData) * nhtids);
     542      1393216 :     state->nhtids += nhtids;
     543      1393216 :     state->phystupsize += MAXALIGN(IndexTupleSize(itup)) + sizeof(ItemIdData);
     544              : 
     545      1393216 :     return true;
     546              : }
     547              : 
     548              : /*
     549              :  * Finalize pending posting list tuple, and add it to the page.  Final tuple
     550              :  * is based on saved base tuple, and saved list of heap TIDs.
     551              :  *
     552              :  * Returns space saving from deduplicating to make a new posting list tuple.
     553              :  * Note that this includes line pointer overhead.  This is zero in the case
     554              :  * where no deduplication was possible.
     555              :  */
     556              : Size
     557      3264422 : _bt_dedup_finish_pending(Page newpage, BTDedupState state)
     558              : {
     559              :     OffsetNumber tupoff;
     560              :     Size        tuplesz;
     561              :     Size        spacesaving;
     562              : 
     563              :     Assert(state->nitems > 0);
     564              :     Assert(state->nitems <= state->nhtids);
     565              :     Assert(state->intervals[state->nintervals].baseoff == state->baseoff);
     566              : 
     567      3264422 :     tupoff = OffsetNumberNext(PageGetMaxOffsetNumber(newpage));
     568      3264422 :     if (state->nitems == 1)
     569              :     {
     570              :         /* Use original, unchanged base tuple */
     571      3051822 :         tuplesz = IndexTupleSize(state->base);
     572              :         Assert(tuplesz == MAXALIGN(IndexTupleSize(state->base)));
     573              :         Assert(tuplesz <= BTMaxItemSize);
     574      3051822 :         if (PageAddItem(newpage, state->base, tuplesz, tupoff, false, false) == InvalidOffsetNumber)
     575            0 :             elog(ERROR, "deduplication failed to add tuple to page");
     576              : 
     577      3051822 :         spacesaving = 0;
     578              :     }
     579              :     else
     580              :     {
     581              :         IndexTuple  final;
     582              : 
     583              :         /* Form a tuple with a posting list */
     584       212600 :         final = _bt_form_posting(state->base, state->htids, state->nhtids);
     585       212600 :         tuplesz = IndexTupleSize(final);
     586              :         Assert(tuplesz <= state->maxpostingsize);
     587              : 
     588              :         /* Save final number of items for posting list */
     589       212600 :         state->intervals[state->nintervals].nitems = state->nitems;
     590              : 
     591              :         Assert(tuplesz == MAXALIGN(IndexTupleSize(final)));
     592              :         Assert(tuplesz <= BTMaxItemSize);
     593       212600 :         if (PageAddItem(newpage, final, tuplesz, tupoff, false, false) == InvalidOffsetNumber)
     594            0 :             elog(ERROR, "deduplication failed to add tuple to page");
     595              : 
     596       212600 :         pfree(final);
     597       212600 :         spacesaving = state->phystupsize - (tuplesz + sizeof(ItemIdData));
     598              :         /* Increment nintervals, since we wrote a new posting list tuple */
     599       212600 :         state->nintervals++;
     600              :         Assert(spacesaving > 0 && spacesaving < BLCKSZ);
     601              :     }
     602              : 
     603              :     /* Reset state for next pending posting list */
     604      3264422 :     state->nhtids = 0;
     605      3264422 :     state->nitems = 0;
     606      3264422 :     state->phystupsize = 0;
     607              : 
     608      3264422 :     return spacesaving;
     609              : }
     610              : 
     611              : /*
     612              :  * Finalize interval during bottom-up index deletion.
     613              :  *
     614              :  * During a bottom-up pass we expect that TIDs will be recorded in dedup state
     615              :  * first, and then get moved over to delstate (in variable-sized batches) by
     616              :  * calling here.  Call here happens when the number of TIDs in a dedup
     617              :  * interval is known, and interval gets finalized (i.e. when caller sees next
     618              :  * tuple on the page is not a duplicate, or when caller runs out of tuples to
     619              :  * process from leaf page).
     620              :  *
     621              :  * This is where bottom-up deletion determines and remembers which entries are
     622              :  * duplicates.  This will be important information to the tableam delete
     623              :  * infrastructure later on.  Plain index tuple duplicates are marked
     624              :  * "promising" here, per tableam contract.
     625              :  *
     626              :  * Our approach to marking entries whose TIDs come from posting lists is more
     627              :  * complicated.  Posting lists can only be formed by a deduplication pass (or
     628              :  * during an index build), so recent version churn affecting the pointed-to
     629              :  * logical rows is not particularly likely.  We may still give a weak signal
     630              :  * about posting list tuples' entries (by marking just one of its TIDs/entries
     631              :  * promising), though this is only a possibility in the event of further
     632              :  * duplicate index tuples in final interval that covers posting list tuple (as
     633              :  * in the plain tuple case).  A weak signal/hint will be useful to the tableam
     634              :  * when it has no stronger signal to go with for the deletion operation as a
     635              :  * whole.
     636              :  *
     637              :  * The heuristics we use work well in practice because we only need to give
     638              :  * the tableam the right _general_ idea about where to look.  Garbage tends to
     639              :  * naturally get concentrated in relatively few table blocks with workloads
     640              :  * that bottom-up deletion targets.  The tableam cannot possibly rank all
     641              :  * available table blocks sensibly based on the hints we provide, but that's
     642              :  * okay -- only the extremes matter.  The tableam just needs to be able to
     643              :  * predict which few table blocks will have the most tuples that are safe to
     644              :  * delete for each deletion operation, with low variance across related
     645              :  * deletion operations.
     646              :  */
static void
_bt_bottomupdel_finish_pending(Page page, BTDedupState state,
							   TM_IndexDeleteOp *delstate)
{
	/* Interval counts as duplicates when it spans more than one tuple */
	bool		dupinterval = (state->nitems > 1);

	Assert(state->nitems > 0);
	Assert(state->nitems <= state->nhtids);
	Assert(state->intervals[state->nintervals].baseoff == state->baseoff);

	/* Transfer every TID from the interval's tuples into caller's delstate */
	for (int i = 0; i < state->nitems; i++)
	{
		OffsetNumber offnum = state->baseoff + i;
		ItemId		itemid = PageGetItemId(page, offnum);
		IndexTuple	itup = (IndexTuple) PageGetItem(page, itemid);
		/* Next unused slots in caller's parallel deltids/status arrays */
		TM_IndexDelete *ideltid = &delstate->deltids[delstate->ndeltids];
		TM_IndexStatus *istatus = &delstate->status[delstate->ndeltids];

		if (!BTreeTupleIsPosting(itup))
		{
			/* Simple case: A plain non-pivot tuple */
			ideltid->tid = itup->t_tid;
			ideltid->id = delstate->ndeltids;
			istatus->idxoffnum = offnum;
			istatus->knowndeletable = false;	/* for now */
			istatus->promising = dupinterval;	/* simple rule */
			istatus->freespace = ItemIdGetLength(itemid) + sizeof(ItemIdData);

			delstate->ndeltids++;
		}
		else
		{
			/*
			 * Complicated case: A posting list tuple.
			 *
			 * We make the conservative assumption that there can only be at
			 * most one affected logical row per posting list tuple.  There
			 * will be at most one promising entry in deltids to represent
			 * this presumed lone logical row.  Note that this isn't even
			 * considered unless the posting list tuple is also in an interval
			 * of duplicates -- this complicated rule is just a variant of the
			 * simple rule used to decide if plain index tuples are promising.
			 */
			int			nitem = BTreeTupleGetNPosting(itup);
			bool		firstpromising = false;
			bool		lastpromising = false;

			Assert(_bt_posting_valid(itup));

			if (dupinterval)
			{
				/*
				 * Complicated rule: either the first or last TID in the
				 * posting list gets marked promising (if any at all)
				 */
				BlockNumber minblocklist,
							midblocklist,
							maxblocklist;
				ItemPointer mintid,
							midtid,
							maxtid;

				/* Posting list TIDs are in ascending order, so min/mid/max */
				mintid = BTreeTupleGetHeapTID(itup);
				midtid = BTreeTupleGetPostingN(itup, nitem / 2);
				maxtid = BTreeTupleGetMaxHeapTID(itup);
				minblocklist = ItemPointerGetBlockNumber(mintid);
				midblocklist = ItemPointerGetBlockNumber(midtid);
				maxblocklist = ItemPointerGetBlockNumber(maxtid);

				/* Only entry with predominant table block can be promising */
				firstpromising = (minblocklist == midblocklist);
				lastpromising = (!firstpromising &&
								 midblocklist == maxblocklist);
			}

			/* Add one deltids/status entry per posting list TID */
			for (int p = 0; p < nitem; p++)
			{
				ItemPointer htid = BTreeTupleGetPostingN(itup, p);

				ideltid->tid = *htid;
				ideltid->id = delstate->ndeltids;
				istatus->idxoffnum = offnum;
				istatus->knowndeletable = false;	/* for now */
				istatus->promising = false;
				if ((firstpromising && p == 0) ||
					(lastpromising && p == nitem - 1))
					istatus->promising = true;
				istatus->freespace = sizeof(ItemPointerData);	/* at worst */

				ideltid++;
				istatus++;
				delstate->ndeltids++;
			}
		}
	}

	if (dupinterval)
	{
		/* Record the duplicate interval (only dup intervals are recorded) */
		state->intervals[state->nintervals].nitems = state->nitems;
		state->nintervals++;
	}

	/* Reset state for next interval */
	state->nhtids = 0;
	state->nitems = 0;
	state->phystupsize = 0;
}
     754              : 
     755              : /*
     756              :  * Determine if page non-pivot tuples (data items) are all duplicates of the
     757              :  * same value -- if they are, deduplication's "single value" strategy should
     758              :  * be applied.  The general goal of this strategy is to ensure that
     759              :  * nbtsplitloc.c (which uses its own single value strategy) will find a useful
     760              :  * split point as further duplicates are inserted, and successive rightmost
     761              :  * page splits occur among pages that store the same duplicate value.  When
     762              :  * the page finally splits, it should end up BTREE_SINGLEVAL_FILLFACTOR% full,
     763              :  * just like it would if deduplication were disabled.
     764              :  *
     765              :  * We expect that affected workloads will require _several_ single value
     766              :  * strategy deduplication passes (over a page that only stores duplicates)
     767              :  * before the page is finally split.  The first deduplication pass should only
     768              :  * find regular non-pivot tuples.  Later deduplication passes will find
     769              :  * existing maxpostingsize-capped posting list tuples, which must be skipped
     770              :  * over.  The penultimate pass is generally the first pass that actually
     771              :  * reaches _bt_singleval_fillfactor(), and so will deliberately leave behind a
     772              :  * few untouched non-pivot tuples.  The final deduplication pass won't free
     773              :  * any space -- it will skip over everything without merging anything (it
     774              :  * retraces the steps of the penultimate pass).
     775              :  *
     776              :  * Fortunately, having several passes isn't too expensive.  Each pass (after
     777              :  * the first pass) won't spend many cycles on the large posting list tuples
     778              :  * left by previous passes.  Each pass will find a large contiguous group of
     779              :  * smaller duplicate tuples to merge together at the end of the page.
     780              :  */
     781              : static bool
     782        12807 : _bt_do_singleval(Relation rel, Page page, BTDedupState state,
     783              :                  OffsetNumber minoff, IndexTuple newitem)
     784              : {
     785        12807 :     int         nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
     786              :     ItemId      itemid;
     787              :     IndexTuple  itup;
     788              : 
     789        12807 :     itemid = PageGetItemId(page, minoff);
     790        12807 :     itup = (IndexTuple) PageGetItem(page, itemid);
     791              : 
     792        12807 :     if (_bt_keep_natts_fast(rel, newitem, itup) > nkeyatts)
     793              :     {
     794         1119 :         itemid = PageGetItemId(page, PageGetMaxOffsetNumber(page));
     795         1119 :         itup = (IndexTuple) PageGetItem(page, itemid);
     796              : 
     797         1119 :         if (_bt_keep_natts_fast(rel, newitem, itup) > nkeyatts)
     798          680 :             return true;
     799              :     }
     800              : 
     801        12127 :     return false;
     802              : }
     803              : 
     804              : /*
     805              :  * Lower maxpostingsize when using "single value" strategy, to avoid a sixth
     806              :  * and final maxpostingsize-capped tuple.  The sixth and final posting list
     807              :  * tuple will end up somewhat smaller than the first five.  (Note: The first
     808              :  * five tuples could actually just be very large duplicate tuples that
     809              :  * couldn't be merged together at all.  Deduplication will simply not modify
     810              :  * the page when that happens.)
     811              :  *
     812              :  * When there are six posting lists on the page (after current deduplication
     813              :  * pass goes on to create/observe a sixth very large tuple), caller should end
     814              :  * its deduplication pass.  It isn't useful to try to deduplicate items that
     815              :  * are supposed to end up on the new right sibling page following the
     816              :  * anticipated page split.  A future deduplication pass of future right
     817              :  * sibling page might take care of it.  (This is why the first single value
     818              :  * strategy deduplication pass for a given leaf page will generally find only
     819              :  * plain non-pivot tuples -- see _bt_do_singleval() comments.)
     820              :  */
     821              : static void
     822          325 : _bt_singleval_fillfactor(Page page, BTDedupState state, Size newitemsz)
     823              : {
     824              :     Size        leftfree;
     825              :     int         reduction;
     826              : 
     827              :     /* This calculation needs to match nbtsplitloc.c */
     828          325 :     leftfree = PageGetPageSize(page) - SizeOfPageHeaderData -
     829              :         MAXALIGN(sizeof(BTPageOpaqueData));
     830              :     /* Subtract size of new high key (includes pivot heap TID space) */
     831          325 :     leftfree -= newitemsz + MAXALIGN(sizeof(ItemPointerData));
     832              : 
     833              :     /*
     834              :      * Reduce maxpostingsize by an amount equal to target free space on left
     835              :      * half of page
     836              :      */
     837          325 :     reduction = leftfree * ((100 - BTREE_SINGLEVAL_FILLFACTOR) / 100.0);
     838          325 :     if (state->maxpostingsize > reduction)
     839          325 :         state->maxpostingsize -= reduction;
     840              :     else
     841            0 :         state->maxpostingsize = 0;
     842          325 : }
     843              : 
     844              : /*
     845              :  * Build a posting list tuple based on caller's "base" index tuple and list of
     846              :  * heap TIDs.  When nhtids == 1, builds a standard non-pivot tuple without a
     847              :  * posting list. (Posting list tuples can never have a single heap TID, partly
     848              :  * because that ensures that deduplication always reduces final MAXALIGN()'d
     849              :  * size of entire tuple.)
     850              :  *
     851              :  * Convention is that posting list starts at a MAXALIGN()'d offset (rather
     852              :  * than a SHORTALIGN()'d offset), in line with the approach taken when
     853              :  * appending a heap TID to new pivot tuple/high key during suffix truncation.
     854              :  * This sometimes wastes a little space that was only needed as alignment
     855              :  * padding in the original tuple.  Following this convention simplifies the
     856              :  * space accounting used when deduplicating a page (the same convention
     857              :  * simplifies the accounting for choosing a point to split a page at).
     858              :  *
     859              :  * Note: Caller's "htids" array must be unique and already in ascending TID
     860              :  * order.  Any existing heap TIDs from "base" won't automatically appear in
     861              :  * returned posting list tuple (they must be included in htids array.)
     862              :  */
     863              : IndexTuple
     864       261662 : _bt_form_posting(IndexTuple base, const ItemPointerData *htids, int nhtids)
     865              : {
     866              :     uint32      keysize,
     867              :                 newsize;
     868              :     IndexTuple  itup;
     869              : 
     870       261662 :     if (BTreeTupleIsPosting(base))
     871        72060 :         keysize = BTreeTupleGetPostingOffset(base);
     872              :     else
     873       189602 :         keysize = IndexTupleSize(base);
     874              : 
     875              :     Assert(!BTreeTupleIsPivot(base));
     876              :     Assert(nhtids > 0 && nhtids <= PG_UINT16_MAX);
     877              :     Assert(keysize == MAXALIGN(keysize));
     878              : 
     879              :     /* Determine final size of new tuple */
     880       261662 :     if (nhtids > 1)
     881       233580 :         newsize = MAXALIGN(keysize +
     882              :                            nhtids * sizeof(ItemPointerData));
     883              :     else
     884        28082 :         newsize = keysize;
     885              : 
     886              :     Assert(newsize <= INDEX_SIZE_MASK);
     887              :     Assert(newsize == MAXALIGN(newsize));
     888              : 
     889              :     /* Allocate memory using palloc0() (matches index_form_tuple()) */
     890       261662 :     itup = palloc0(newsize);
     891       261662 :     memcpy(itup, base, keysize);
     892       261662 :     itup->t_info &= ~INDEX_SIZE_MASK;
     893       261662 :     itup->t_info |= newsize;
     894       261662 :     if (nhtids > 1)
     895              :     {
     896              :         /* Form posting list tuple */
     897       233580 :         BTreeTupleSetPosting(itup, nhtids, keysize);
     898       233580 :         memcpy(BTreeTupleGetPosting(itup), htids,
     899              :                sizeof(ItemPointerData) * nhtids);
     900              :         Assert(_bt_posting_valid(itup));
     901              :     }
     902              :     else
     903              :     {
     904              :         /* Form standard non-pivot tuple */
     905        28082 :         itup->t_info &= ~INDEX_ALT_TID_MASK;
     906        28082 :         ItemPointerCopy(htids, &itup->t_tid);
     907              :         Assert(ItemPointerIsValid(&itup->t_tid));
     908              :     }
     909              : 
     910       261662 :     return itup;
     911              : }
     912              : 
/*
 * Generate a replacement tuple by "updating" a posting list tuple so that it
 * no longer has TIDs that need to be deleted.
 *
 * Used by both VACUUM and index deletion.  Caller's vacposting argument
 * points to the existing posting list tuple to be updated.
 *
 * On return, caller's vacposting argument will point to final "updated"
 * tuple, which will be palloc()'d in caller's memory context.
 */
void
_bt_update_posting(BTVacuumPosting vacposting)
{
	IndexTuple	origtuple = vacposting->itup;
	uint32		keysize,
				newsize;
	IndexTuple	itup;
	int			nhtids;
	int			ui,
				d;
	ItemPointer htids;

	/* Number of TIDs that survive the deletions */
	nhtids = BTreeTupleGetNPosting(origtuple) - vacposting->ndeletedtids;

	Assert(_bt_posting_valid(origtuple));
	Assert(nhtids > 0 && nhtids < BTreeTupleGetNPosting(origtuple));

	/*
	 * Determine final size of new tuple.
	 *
	 * This calculation needs to match the code used within _bt_form_posting()
	 * for new posting list tuples.  We avoid calling _bt_form_posting() here
	 * to save ourselves a second memory allocation for a htids workspace.
	 */
	keysize = BTreeTupleGetPostingOffset(origtuple);
	if (nhtids > 1)
		newsize = MAXALIGN(keysize +
						   nhtids * sizeof(ItemPointerData));
	else
		newsize = keysize;

	Assert(newsize <= INDEX_SIZE_MASK);
	Assert(newsize == MAXALIGN(newsize));

	/* Allocate memory using palloc0() (matches index_form_tuple()) */
	itup = palloc0(newsize);
	memcpy(itup, origtuple, keysize);
	itup->t_info &= ~INDEX_SIZE_MASK;
	itup->t_info |= newsize;

	if (nhtids > 1)
	{
		/* Form posting list tuple */
		BTreeTupleSetPosting(itup, nhtids, keysize);
		htids = BTreeTupleGetPosting(itup);
	}
	else
	{
		/* Form standard non-pivot tuple; lone survivor goes in t_tid */
		itup->t_info &= ~INDEX_ALT_TID_MASK;
		htids = &itup->t_tid;
	}

	/*
	 * Copy surviving TIDs from origtuple into the new tuple.  "d" walks
	 * vacposting's sorted deletetids array; any origtuple index matching
	 * deletetids[d] is skipped, everything else is appended at htids[ui].
	 */
	ui = 0;
	d = 0;
	for (int i = 0; i < BTreeTupleGetNPosting(origtuple); i++)
	{
		if (d < vacposting->ndeletedtids && vacposting->deletetids[d] == i)
		{
			d++;
			continue;
		}
		htids[ui++] = *BTreeTupleGetPostingN(origtuple, i);
	}
	Assert(ui == nhtids);
	Assert(d == vacposting->ndeletedtids);
	Assert(nhtids == 1 || _bt_posting_valid(itup));
	Assert(nhtids > 1 || ItemPointerIsValid(&itup->t_tid));

	/* vacposting arg's itup will now point to updated version */
	vacposting->itup = itup;
}
     995              : 
     996              : /*
     997              :  * Prepare for a posting list split by swapping heap TID in newitem with heap
     998              :  * TID from original posting list (the 'oposting' heap TID located at offset
     999              :  * 'postingoff').  Modifies newitem, so caller should pass their own private
    1000              :  * copy that can safely be modified.
    1001              :  *
    1002              :  * Returns new posting list tuple, which is palloc()'d in caller's context.
    1003              :  * This is guaranteed to be the same size as 'oposting'.  Modified newitem is
    1004              :  * what caller actually inserts. (This happens inside the same critical
    1005              :  * section that performs an in-place update of old posting list using new
    1006              :  * posting list returned here.)
    1007              :  *
    1008              :  * While the keys from newitem and oposting must be opclass equal, and must
    1009              :  * generate identical output when run through the underlying type's output
    1010              :  * function, it doesn't follow that their representations match exactly.
    1011              :  * Caller must avoid assuming that there can't be representational differences
    1012              :  * that make datums from oposting bigger or smaller than the corresponding
    1013              :  * datums from newitem.  For example, differences in TOAST input state might
    1014              :  * break a faulty assumption about tuple size (the executor is entitled to
    1015              :  * apply TOAST compression based on its own criteria).  It also seems possible
    1016              :  * that further representational variation will be introduced in the future,
    1017              :  * in order to support nbtree features like page-level prefix compression.
    1018              :  *
    1019              :  * See nbtree/README for details on the design of posting list splits.
    1020              :  */
    1021              : IndexTuple
    1022        19270 : _bt_swap_posting(IndexTuple newitem, IndexTuple oposting, int postingoff)
    1023              : {
    1024              :     int         nhtids;
    1025              :     char       *replacepos;
    1026              :     char       *replaceposright;
    1027              :     Size        nmovebytes;
    1028              :     IndexTuple  nposting;
    1029              : 
    1030        19270 :     nhtids = BTreeTupleGetNPosting(oposting);
    1031              :     Assert(_bt_posting_valid(oposting));
    1032              : 
    1033              :     /*
    1034              :      * The postingoff argument originated as a _bt_binsrch_posting() return
    1035              :      * value.  It will be 0 in the event of corruption that makes a leaf page
    1036              :      * contain a non-pivot tuple that's somehow identical to newitem (no two
    1037              :      * non-pivot tuples should ever have the same TID).  This has been known
    1038              :      * to happen in the field from time to time.
    1039              :      *
    1040              :      * Perform a basic sanity check to catch this case now.
    1041              :      */
    1042        19270 :     if (!(postingoff > 0 && postingoff < nhtids))
    1043            0 :         elog(ERROR, "posting list tuple with %d items cannot be split at offset %d",
    1044              :              nhtids, postingoff);
    1045              : 
    1046              :     /*
    1047              :      * Move item pointers in posting list to make a gap for the new item's
    1048              :      * heap TID.  We shift TIDs one place to the right, losing original
    1049              :      * rightmost TID. (nmovebytes must not include TIDs to the left of
    1050              :      * postingoff, nor the existing rightmost/max TID that gets overwritten.)
    1051              :      */
    1052        19270 :     nposting = CopyIndexTuple(oposting);
    1053        19270 :     replacepos = (char *) BTreeTupleGetPostingN(nposting, postingoff);
    1054        19270 :     replaceposright = (char *) BTreeTupleGetPostingN(nposting, postingoff + 1);
    1055        19270 :     nmovebytes = (nhtids - postingoff - 1) * sizeof(ItemPointerData);
    1056        19270 :     memmove(replaceposright, replacepos, nmovebytes);
    1057              : 
    1058              :     /* Fill the gap at postingoff with TID of new item (original new TID) */
    1059              :     Assert(!BTreeTupleIsPivot(newitem) && !BTreeTupleIsPosting(newitem));
    1060        19270 :     ItemPointerCopy(&newitem->t_tid, (ItemPointer) replacepos);
    1061              : 
    1062              :     /* Now copy oposting's rightmost/max TID into new item (final new TID) */
    1063        19270 :     ItemPointerCopy(BTreeTupleGetMaxHeapTID(oposting), &newitem->t_tid);
    1064              : 
    1065              :     Assert(ItemPointerCompare(BTreeTupleGetMaxHeapTID(nposting),
    1066              :                               BTreeTupleGetHeapTID(newitem)) < 0);
    1067              :     Assert(_bt_posting_valid(nposting));
    1068              : 
    1069        19270 :     return nposting;
    1070              : }
    1071              : 
    1072              : /*
    1073              :  * Verify posting list invariants for "posting", which must be a posting list
    1074              :  * tuple.  Used within assertions.
    1075              :  */
    1076              : #ifdef USE_ASSERT_CHECKING
    1077              : static bool
    1078              : _bt_posting_valid(IndexTuple posting)
    1079              : {
    1080              :     ItemPointerData last;
    1081              :     ItemPointer htid;
    1082              : 
    1083              :     if (!BTreeTupleIsPosting(posting) || BTreeTupleGetNPosting(posting) < 2)
    1084              :         return false;
    1085              : 
    1086              :     /* Remember first heap TID for loop */
    1087              :     ItemPointerCopy(BTreeTupleGetHeapTID(posting), &last);
    1088              :     if (!ItemPointerIsValid(&last))
    1089              :         return false;
    1090              : 
    1091              :     /* Iterate, starting from second TID */
    1092              :     for (int i = 1; i < BTreeTupleGetNPosting(posting); i++)
    1093              :     {
    1094              :         htid = BTreeTupleGetPostingN(posting, i);
    1095              : 
    1096              :         if (!ItemPointerIsValid(htid))
    1097              :             return false;
    1098              :         if (ItemPointerCompare(htid, &last) <= 0)
    1099              :             return false;
    1100              :         ItemPointerCopy(htid, &last);
    1101              :     }
    1102              : 
    1103              :     return true;
    1104              : }
    1105              : #endif
        

Generated by: LCOV version 2.0-1