LCOV - code coverage report
Current view: top level - src/backend/access/nbtree - nbtdedup.c (source / functions)
Test: PostgreSQL 13beta1
Date: 2020-06-01 07:06:57
                  Hit    Total    Coverage
Lines:            193      204      94.6 %
Functions:          9        9     100.0 %

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * nbtdedup.c
       4             :  *    Deduplicate items in Postgres btrees.
       5             :  *
       6             :  * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
       7             :  * Portions Copyright (c) 1994, Regents of the University of California
       8             :  *
       9             :  *
      10             :  * IDENTIFICATION
      11             :  *    src/backend/access/nbtree/nbtdedup.c
      12             :  *
      13             :  *-------------------------------------------------------------------------
      14             :  */
      15             : #include "postgres.h"
      16             : 
      17             : #include "access/nbtree.h"
      18             : #include "access/nbtxlog.h"
      19             : #include "miscadmin.h"
      20             : #include "utils/rel.h"
      21             : 
      22             : static bool _bt_do_singleval(Relation rel, Page page, BTDedupState state,
      23             :                              OffsetNumber minoff, IndexTuple newitem);
      24             : static void _bt_singleval_fillfactor(Page page, BTDedupState state,
      25             :                                      Size newitemsz);
      26             : #ifdef USE_ASSERT_CHECKING
      27             : static bool _bt_posting_valid(IndexTuple posting);
      28             : #endif
      29             : 
      30             : /*
      31             :  * Deduplicate items on a leaf page.  The page will have to be split by caller
      32             :  * if we cannot successfully free at least newitemsz (we also need space for
      33             :  * newitem's line pointer, which isn't included in caller's newitemsz).
      34             :  *
      35             :  * The general approach taken here is to perform as much deduplication as
      36             :  * possible to free as much space as possible.  Note, however, that "single
      37             :  * value" strategy is sometimes used for !checkingunique callers, in which
      38             :  * case deduplication will leave a few tuples untouched at the end of the
      39             :  * page.  The general idea is to prepare the page for an anticipated page
      40             :  * split that uses nbtsplitloc.c's "single value" strategy to determine a
      41             :  * split point.  (There is no reason to deduplicate items that will end up on
      42             :  * the right half of the page after the anticipated page split; better to
      43             :  * handle those if and when the anticipated right half page gets its own
      44             :  * deduplication pass, following further inserts of duplicates.)
      45             :  *
      46             :  * This function should be called during insertion, when the page doesn't have
      47             :  * enough space to fit an incoming newitem.  If the BTP_HAS_GARBAGE page flag
      48             :  * was set, caller should have removed any LP_DEAD items by calling
      49             :  * _bt_vacuum_one_page() before calling here.  We may still have to kill
      50             :  * LP_DEAD items here when the page's BTP_HAS_GARBAGE hint is falsely unset,
      51             :  * but that should be rare.  Also, _bt_vacuum_one_page() won't unset the
      52             :  * BTP_HAS_GARBAGE flag when it finds no LP_DEAD items, so a successful
      53             :  * deduplication pass will always clear it, just to keep things tidy.
      54             :  */
      55             : void
      56        2936 : _bt_dedup_one_page(Relation rel, Buffer buf, Relation heapRel,
      57             :                    IndexTuple newitem, Size newitemsz, bool checkingunique)
      58             : {
      59             :     OffsetNumber offnum,
      60             :                 minoff,
      61             :                 maxoff;
      62        2936 :     Page        page = BufferGetPage(buf);
      63             :     BTPageOpaque opaque;
      64             :     Page        newpage;
      65        2936 :     int         newpagendataitems = 0;
      66             :     OffsetNumber deletable[MaxIndexTuplesPerPage];
      67             :     BTDedupState state;
      68        2936 :     int         ndeletable = 0;
      69        2936 :     Size        pagesaving = 0;
      70        2936 :     bool        singlevalstrat = false;
      71        2936 :     int         nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
      72             : 
      73             :     /*
      74             :      * We can't assume that there are no LP_DEAD items.  For one thing, VACUUM
      75             :      * will clear the BTP_HAS_GARBAGE hint without reliably removing items
      76             :      * that are marked LP_DEAD.  We don't want to unnecessarily unset LP_DEAD
      77             :      * bits when deduplicating items.  Allowing it would be correct, though
      78             :      * wasteful.
      79             :      */
      80        2936 :     opaque = (BTPageOpaque) PageGetSpecialPointer(page);
      81        2936 :     minoff = P_FIRSTDATAKEY(opaque);
      82        2936 :     maxoff = PageGetMaxOffsetNumber(page);
      83      490496 :     for (offnum = minoff;
      84             :          offnum <= maxoff;
      85      487560 :          offnum = OffsetNumberNext(offnum))
      86             :     {
      87      487560 :         ItemId      itemid = PageGetItemId(page, offnum);
      88             : 
      89      487560 :         if (ItemIdIsDead(itemid))
      90           0 :             deletable[ndeletable++] = offnum;
      91             :     }
      92             : 
      93        2936 :     if (ndeletable > 0)
      94             :     {
      95           0 :         _bt_delitems_delete(rel, buf, deletable, ndeletable, heapRel);
      96             : 
      97             :         /*
      98             :          * Return when a split will be avoided.  This is equivalent to
      99             :          * avoiding a split using the usual _bt_vacuum_one_page() path.
     100             :          */
     101           0 :         if (PageGetFreeSpace(page) >= newitemsz)
     102         360 :             return;
     103             : 
     104             :         /*
     105             :          * Reconsider number of items on page, in case _bt_delitems_delete()
     106             :          * managed to delete an item or two
     107             :          */
     108           0 :         minoff = P_FIRSTDATAKEY(opaque);
     109           0 :         maxoff = PageGetMaxOffsetNumber(page);
     110             :     }
     111             : 
     112             :     /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
     113        2936 :     newitemsz += sizeof(ItemIdData);
     114             : 
     115             :     /*
     116             :      * By here, it's clear that deduplication will definitely be attempted.
     117             :      * Initialize deduplication state.
     118             :      *
     119             :      * It would be possible for maxpostingsize (limit on posting list tuple
     120             :      * size) to be set to one third of the page.  However, it seems like a
     121             :      * good idea to limit the size of posting lists to one sixth of a page.
     122             :      * That ought to leave us with a good split point when pages full of
     123             :      * duplicates can be split several times.
     124             :      */
     125        2936 :     state = (BTDedupState) palloc(sizeof(BTDedupStateData));
     126        2936 :     state->deduplicate = true;
     127        2936 :     state->maxpostingsize = Min(BTMaxItemSize(page) / 2, INDEX_SIZE_MASK);
     128             :     /* Metadata about base tuple of current pending posting list */
     129        2936 :     state->base = NULL;
     130        2936 :     state->baseoff = InvalidOffsetNumber;
     131        2936 :     state->basetupsize = 0;
     132             :     /* Metadata about current pending posting list TIDs */
     133        2936 :     state->htids = palloc(state->maxpostingsize);
     134        2936 :     state->nhtids = 0;
     135        2936 :     state->nitems = 0;
     136             :     /* Size of all physical tuples to be replaced by pending posting list */
     137        2936 :     state->phystupsize = 0;
     138             :     /* nintervals should be initialized to zero */
     139        2936 :     state->nintervals = 0;
     140             : 
     141             :     /* Determine if "single value" strategy should be used */
     142        2936 :     if (!checkingunique)
     143        2640 :         singlevalstrat = _bt_do_singleval(rel, page, state, minoff, newitem);
     144             : 
     145             :     /*
     146             :      * Deduplicate items from page, and write them to newpage.
     147             :      *
     148             :      * Copy the original page's LSN into newpage copy.  This will become the
     149             :      * updated version of the page.  We need this because XLogInsert will
     150             :      * examine the LSN and possibly dump it in a page image.
     151             :      */
     152        2936 :     newpage = PageGetTempPageCopySpecial(page);
     153        2936 :     PageSetLSN(newpage, PageGetLSN(page));
     154             : 
     155             :     /* Copy high key, if any */
     156        2936 :     if (!P_RIGHTMOST(opaque))
     157             :     {
     158        2078 :         ItemId      hitemid = PageGetItemId(page, P_HIKEY);
     159        2078 :         Size        hitemsz = ItemIdGetLength(hitemid);
     160        2078 :         IndexTuple  hitem = (IndexTuple) PageGetItem(page, hitemid);
     161             : 
     162        2078 :         if (PageAddItem(newpage, (Item) hitem, hitemsz, P_HIKEY,
     163             :                         false, false) == InvalidOffsetNumber)
     164           0 :             elog(ERROR, "deduplication failed to add highkey");
     165             :     }
     166             : 
     167      490496 :     for (offnum = minoff;
     168             :          offnum <= maxoff;
     169      487560 :          offnum = OffsetNumberNext(offnum))
     170             :     {
     171      487560 :         ItemId      itemid = PageGetItemId(page, offnum);
     172      487560 :         IndexTuple  itup = (IndexTuple) PageGetItem(page, itemid);
     173             : 
     174             :         Assert(!ItemIdIsDead(itemid));
     175             : 
     176      487560 :         if (offnum == minoff)
     177             :         {
     178             :             /*
     179             :              * No previous/base tuple for the data item -- use the data item
     180             :              * as base tuple of pending posting list
     181             :              */
     182        2936 :             _bt_dedup_start_pending(state, itup, offnum);
     183             :         }
     184      969210 :         else if (state->deduplicate &&
     185      605814 :                  _bt_keep_natts_fast(rel, state->base, itup) > nkeyatts &&
     186      121228 :                  _bt_dedup_save_htid(state, itup))
     187             :         {
     188             :             /*
     189             :              * Tuple is equal to base tuple of pending posting list.  Heap
     190             :              * TID(s) for itup have been saved in state.
     191             :              */
     192             :         }
     193             :         else
     194             :         {
     195             :             /*
     196             :              * Tuple is not equal to pending posting list tuple, or
     197             :              * _bt_dedup_save_htid() opted to not merge current item into
     198             :              * pending posting list for some other reason (e.g., adding more
     199             :              * TIDs would have caused posting list to exceed current
     200             :              * maxpostingsize).
     201             :              *
     202             :              * If state contains pending posting list with more than one item,
     203             :              * form new posting tuple, and actually update the page.  Else
     204             :              * reset the state and move on without modifying the page.
     205             :              */
     206      365066 :             pagesaving += _bt_dedup_finish_pending(newpage, state);
     207      365066 :             newpagendataitems++;
     208             : 
     209      365066 :             if (singlevalstrat)
     210             :             {
     211             :                 /*
     212             :                  * Single value strategy's extra steps.
     213             :                  *
     214             :                  * Lower maxpostingsize for sixth and final item that might be
     215             :                  * deduplicated by current deduplication pass.  When sixth
     216             :                  * item formed/observed, stop deduplicating items.
     217             :                  *
     218             :                  * Note: It's possible that this will be reached even when
     219             :                  * current deduplication pass has yet to merge together some
     220             :                  * existing items.  It doesn't matter whether or not the
     221             :                  * current call generated the maxpostingsize-capped duplicate
     222             :                  * tuples at the start of the page.
     223             :                  */
     224         350 :                 if (newpagendataitems == 5)
     225          16 :                     _bt_singleval_fillfactor(page, state, newitemsz);
     226         334 :                 else if (newpagendataitems == 6)
     227             :                 {
     228           8 :                     state->deduplicate = false;
     229           8 :                     singlevalstrat = false; /* won't be back here */
     230             :                 }
     231             :             }
     232             : 
     233             :             /* itup starts new pending posting list */
     234      365066 :             _bt_dedup_start_pending(state, itup, offnum);
     235             :         }
     236             :     }
     237             : 
     238             :     /* Handle the last item */
     239        2936 :     pagesaving += _bt_dedup_finish_pending(newpage, state);
     240        2936 :     newpagendataitems++;
     241             : 
     242             :     /*
     243             :      * If no items suitable for deduplication were found, newpage must be
     244             :      * exactly the same as the original page, so just return from function.
     245             :      *
      246             :      * We could determine whether or not to proceed on the basis of the space
     247             :      * savings being sufficient to avoid an immediate page split instead.  We
     248             :      * don't do that because there is some small value in nbtsplitloc.c always
     249             :      * operating against a page that is fully deduplicated (apart from
     250             :      * newitem).  Besides, most of the cost has already been paid.
     251             :      */
     252        2936 :     if (state->nintervals == 0)
     253             :     {
     254             :         /* cannot leak memory here */
     255         360 :         pfree(newpage);
     256         360 :         pfree(state->htids);
     257         360 :         pfree(state);
     258         360 :         return;
     259             :     }
     260             : 
     261             :     /*
     262             :      * By here, it's clear that deduplication will definitely go ahead.
     263             :      *
     264             :      * Clear the BTP_HAS_GARBAGE page flag in the unlikely event that it is
     265             :      * still falsely set, just to keep things tidy.  (We can't rely on
     266             :      * _bt_vacuum_one_page() having done this already, and we can't rely on a
     267             :      * page split or VACUUM getting to it in the near future.)
     268             :      */
     269        2576 :     if (P_HAS_GARBAGE(opaque))
     270             :     {
     271           0 :         BTPageOpaque nopaque = (BTPageOpaque) PageGetSpecialPointer(newpage);
     272             : 
     273           0 :         nopaque->btpo_flags &= ~BTP_HAS_GARBAGE;
     274             :     }
     275             : 
     276        2576 :     START_CRIT_SECTION();
     277             : 
     278        2576 :     PageRestoreTempPage(newpage, page);
     279        2576 :     MarkBufferDirty(buf);
     280             : 
     281             :     /* XLOG stuff */
     282        2576 :     if (RelationNeedsWAL(rel))
     283             :     {
     284             :         XLogRecPtr  recptr;
     285             :         xl_btree_dedup xlrec_dedup;
     286             : 
     287        2492 :         xlrec_dedup.nintervals = state->nintervals;
     288             : 
     289        2492 :         XLogBeginInsert();
     290        2492 :         XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
     291        2492 :         XLogRegisterData((char *) &xlrec_dedup, SizeOfBtreeDedup);
     292             : 
     293             :         /*
     294             :          * The intervals array is not in the buffer, but pretend that it is.
     295             :          * When XLogInsert stores the whole buffer, the array need not be
     296             :          * stored too.
     297             :          */
     298        2492 :         XLogRegisterBufData(0, (char *) state->intervals,
     299        2492 :                             state->nintervals * sizeof(BTDedupInterval));
     300             : 
     301        2492 :         recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DEDUP);
     302             : 
     303        2492 :         PageSetLSN(page, recptr);
     304             :     }
     305             : 
     306        2576 :     END_CRIT_SECTION();
     307             : 
     308             :     /* Local space accounting should agree with page accounting */
     309             :     Assert(pagesaving < newitemsz || PageGetExactFreeSpace(page) >= newitemsz);
     310             : 
     311             :     /* cannot leak memory here */
     312        2576 :     pfree(state->htids);
     313        2576 :     pfree(state);
     314             : }
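
/*
 * A standalone sketch (not part of nbtdedup.c) of the grouping pass that
 * _bt_dedup_one_page() performs above: walk the items in key order, extend
 * a pending group while keys stay equal, and flush the group as a posting
 * list only when it has more than one member.  ExItem and the int keys are
 * illustrative stand-ins for IndexTuple and real key comparisons.
 */
#include <stdio.h>

typedef struct { int key; int tid; } ExItem;

int
main(void)
{
    /* duplicates are adjacent, as on a sorted leaf page */
    ExItem items[] = {{1, 10}, {1, 11}, {1, 12}, {2, 20}, {3, 30}, {3, 31}};
    int    nitems = (int) (sizeof(items) / sizeof(items[0]));
    int    base = 0;            /* start of pending group */

    for (int i = 1; i <= nitems; i++)
    {
        if (i < nitems && items[i].key == items[base].key)
            continue;           /* analogous to _bt_dedup_save_htid() */

        /* analogous to _bt_dedup_finish_pending() */
        if (i - base > 1)
            printf("posting list for key %d with %d TIDs\n",
                   items[base].key, i - base);
        else
            printf("key %d kept as a plain tuple\n", items[base].key);

        base = i;               /* analogous to _bt_dedup_start_pending() */
    }
    return 0;
}
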
     315             : 
     316             : /*
     317             :  * Create a new pending posting list tuple based on caller's base tuple.
     318             :  *
     319             :  * Every tuple processed by deduplication either becomes the base tuple for a
     320             :  * posting list, or gets its heap TID(s) accepted into a pending posting list.
     321             :  * A tuple that starts out as the base tuple for a posting list will only
     322             :  * actually be rewritten within _bt_dedup_finish_pending() when it turns out
     323             :  * that there are duplicates that can be merged into the base tuple.
     324             :  */
     325             : void
     326     6444242 : _bt_dedup_start_pending(BTDedupState state, IndexTuple base,
     327             :                         OffsetNumber baseoff)
     328             : {
     329             :     Assert(state->nhtids == 0);
     330             :     Assert(state->nitems == 0);
     331             :     Assert(!BTreeTupleIsPivot(base));
     332             : 
     333             :     /*
     334             :      * Copy heap TID(s) from new base tuple for new candidate posting list
     335             :      * into working state's array
     336             :      */
     337     6444242 :     if (!BTreeTupleIsPosting(base))
     338             :     {
     339     6200402 :         memcpy(state->htids, &base->t_tid, sizeof(ItemPointerData));
     340     6200402 :         state->nhtids = 1;
     341     6200402 :         state->basetupsize = IndexTupleSize(base);
     342             :     }
     343             :     else
     344             :     {
     345             :         int         nposting;
     346             : 
     347      243840 :         nposting = BTreeTupleGetNPosting(base);
     348      243840 :         memcpy(state->htids, BTreeTupleGetPosting(base),
     349             :                sizeof(ItemPointerData) * nposting);
     350      243840 :         state->nhtids = nposting;
     351             :         /* basetupsize should not include existing posting list */
     352      243840 :         state->basetupsize = BTreeTupleGetPostingOffset(base);
     353             :     }
     354             : 
     355             :     /*
     356             :      * Save new base tuple itself -- it'll be needed if we actually create a
     357             :      * new posting list from new pending posting list.
     358             :      *
     359             :      * Must maintain physical size of all existing tuples (including line
     360             :      * pointer overhead) so that we can calculate space savings on page.
     361             :      */
     362     6444242 :     state->nitems = 1;
     363     6444242 :     state->base = base;
     364     6444242 :     state->baseoff = baseoff;
     365     6444242 :     state->phystupsize = MAXALIGN(IndexTupleSize(base)) + sizeof(ItemIdData);
     366             :     /* Also save baseoff in pending state for interval */
     367     6444242 :     state->intervals[state->nintervals].baseoff = state->baseoff;
     368     6444242 : }
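
/*
 * A worked example (not part of nbtdedup.c) of the basetupsize rule in
 * _bt_dedup_start_pending() above: when the base tuple is already a posting
 * list, basetupsize counts only the key portion (the posting offset),
 * because any new posting list replaces the old TID array wholesale.  The
 * sizes, 6-byte TIDs, and 8-byte EX_MAXALIGN are illustrative values for a
 * 64-bit build.
 */
#include <stdio.h>

#define EX_MAXALIGN(x) (((x) + 7) & ~((size_t) 7))
#define EX_TIDSZ 6              /* sizeof(ItemPointerData) */

int
main(void)
{
    size_t keysize = 16;        /* MAXALIGN'd key portion */

    /* plain base tuple: one header TID, basetupsize is the whole tuple */
    printf("plain base:   basetupsize = %zu, nhtids = 1\n", keysize);

    /* posting base with 3 TIDs: the tuple grows, basetupsize does not */
    size_t tupsz = EX_MAXALIGN(keysize + 3 * EX_TIDSZ);

    printf("posting base: tuple = %zu bytes, basetupsize = %zu, nhtids = 3\n",
           tupsz, keysize);
    return 0;
}
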
     369             : 
     370             : /*
     371             :  * Save itup heap TID(s) into pending posting list where possible.
     372             :  *
     373             :  * Returns bool indicating if the pending posting list managed by state now
     374             :  * includes itup's heap TID(s).
     375             :  */
     376             : bool
     377      869610 : _bt_dedup_save_htid(BTDedupState state, IndexTuple itup)
     378             : {
     379             :     int         nhtids;
     380             :     ItemPointer htids;
     381             :     Size        mergedtupsz;
     382             : 
     383             :     Assert(!BTreeTupleIsPivot(itup));
     384             : 
     385      869610 :     if (!BTreeTupleIsPosting(itup))
     386             :     {
     387      866928 :         nhtids = 1;
     388      866928 :         htids = &itup->t_tid;
     389             :     }
     390             :     else
     391             :     {
     392        2682 :         nhtids = BTreeTupleGetNPosting(itup);
     393        2682 :         htids = BTreeTupleGetPosting(itup);
     394             :     }
     395             : 
     396             :     /*
     397             :      * Don't append (have caller finish pending posting list as-is) if
     398             :      * appending heap TID(s) from itup would put us over maxpostingsize limit.
     399             :      *
     400             :      * This calculation needs to match the code used within _bt_form_posting()
     401             :      * for new posting list tuples.
     402             :      */
     403      869610 :     mergedtupsz = MAXALIGN(state->basetupsize +
     404             :                            (state->nhtids + nhtids) * sizeof(ItemPointerData));
     405             : 
     406      869610 :     if (mergedtupsz > state->maxpostingsize)
     407        6262 :         return false;
     408             : 
     409             :     /*
     410             :      * Save heap TIDs to pending posting list tuple -- itup can be merged into
     411             :      * pending posting list
     412             :      */
     413      863348 :     state->nitems++;
     414      863348 :     memcpy(state->htids + state->nhtids, htids,
     415             :            sizeof(ItemPointerData) * nhtids);
     416      863348 :     state->nhtids += nhtids;
     417      863348 :     state->phystupsize += MAXALIGN(IndexTupleSize(itup)) + sizeof(ItemIdData);
     418             : 
     419      863348 :     return true;
     420             : }
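
/*
 * A worked example (not part of nbtdedup.c) of the mergedtupsz check in
 * _bt_dedup_save_htid() above: keep admitting one-TID duplicates until the
 * MAXALIGN'd merged size would exceed maxpostingsize.  EX_MAXALIGN, the
 * 6-byte TID size, and the 1356-byte cap (roughly BTMaxItemSize(page) / 2
 * on an 8K page) are illustrative values, not the real macros.
 */
#include <stdio.h>

#define EX_MAXALIGN(x) (((x) + 7) & ~((size_t) 7))
#define EX_TIDSZ 6              /* sizeof(ItemPointerData) */

int
main(void)
{
    size_t basetupsize = 16;    /* key portion of base tuple, MAXALIGN'd */
    size_t maxpostingsize = 1356;
    int    nhtids = 1;          /* TIDs already in pending posting list */

    for (;;)
    {
        size_t merged = EX_MAXALIGN(basetupsize + (nhtids + 1) * EX_TIDSZ);

        if (merged > maxpostingsize)
            break;              /* caller finishes pending list as-is */
        nhtids++;               /* duplicate's TID fits; merge it */
    }
    printf("cap of %zu bytes admits %d heap TIDs\n", maxpostingsize, nhtids);
    return 0;
}
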
     421             : 
     422             : /*
     423             :  * Finalize pending posting list tuple, and add it to the page.  Final tuple
     424             :  * is based on saved base tuple, and saved list of heap TIDs.
     425             :  *
     426             :  * Returns space saving from deduplicating to make a new posting list tuple.
     427             :  * Note that this includes line pointer overhead.  This is zero in the case
     428             :  * where no deduplication was possible.
     429             :  */
     430             : Size
     431      438310 : _bt_dedup_finish_pending(Page newpage, BTDedupState state)
     432             : {
     433             :     OffsetNumber tupoff;
     434             :     Size        tuplesz;
     435             :     Size        spacesaving;
     436             : 
     437             :     Assert(state->nitems > 0);
     438             :     Assert(state->nitems <= state->nhtids);
     439             :     Assert(state->intervals[state->nintervals].baseoff == state->baseoff);
     440             : 
     441      438310 :     tupoff = OffsetNumberNext(PageGetMaxOffsetNumber(newpage));
     442      438310 :     if (state->nitems == 1)
     443             :     {
     444             :         /* Use original, unchanged base tuple */
     445      358106 :         tuplesz = IndexTupleSize(state->base);
     446      358106 :         if (PageAddItem(newpage, (Item) state->base, tuplesz, tupoff,
     447             :                         false, false) == InvalidOffsetNumber)
     448           0 :             elog(ERROR, "deduplication failed to add tuple to page");
     449             : 
     450      358106 :         spacesaving = 0;
     451             :     }
     452             :     else
     453             :     {
     454             :         IndexTuple  final;
     455             : 
     456             :         /* Form a tuple with a posting list */
     457       80204 :         final = _bt_form_posting(state->base, state->htids, state->nhtids);
     458       80204 :         tuplesz = IndexTupleSize(final);
     459             :         Assert(tuplesz <= state->maxpostingsize);
     460             : 
     461             :         /* Save final number of items for posting list */
     462       80204 :         state->intervals[state->nintervals].nitems = state->nitems;
     463             : 
     464             :         Assert(tuplesz == MAXALIGN(IndexTupleSize(final)));
     465       80204 :         if (PageAddItem(newpage, (Item) final, tuplesz, tupoff, false,
     466             :                         false) == InvalidOffsetNumber)
     467           0 :             elog(ERROR, "deduplication failed to add tuple to page");
     468             : 
     469       80204 :         pfree(final);
     470       80204 :         spacesaving = state->phystupsize - (tuplesz + sizeof(ItemIdData));
     471             :         /* Increment nintervals, since we wrote a new posting list tuple */
     472       80204 :         state->nintervals++;
     473             :         Assert(spacesaving > 0 && spacesaving < BLCKSZ);
     474             :     }
     475             : 
     476             :     /* Reset state for next pending posting list */
     477      438310 :     state->nhtids = 0;
     478      438310 :     state->nitems = 0;
     479      438310 :     state->phystupsize = 0;
     480             : 
     481      438310 :     return spacesaving;
     482             : }
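
/*
 * A worked example (not part of nbtdedup.c) of the space accounting in
 * _bt_dedup_finish_pending() above, merging five equal 16-byte tuples into
 * one posting list tuple.  The 6-byte TID, 4-byte line pointer, and 8-byte
 * EX_MAXALIGN are illustrative values for a 64-bit build.
 */
#include <stdio.h>

#define EX_MAXALIGN(x) (((x) + 7) & ~((size_t) 7))
#define EX_TIDSZ 6              /* sizeof(ItemPointerData) */
#define EX_LPSZ  4              /* sizeof(ItemIdData), the line pointer */

int
main(void)
{
    size_t keysize = 16;        /* MAXALIGN'd key portion of each dup */
    int    ndups = 5;

    /* physical footprint of the five originals, line pointers included */
    size_t phystupsize = ndups * (EX_MAXALIGN(keysize) + EX_LPSZ);

    /* one replacement posting tuple holding all five heap TIDs */
    size_t tuplesz = EX_MAXALIGN(keysize + ndups * EX_TIDSZ);

    /* 100 - (48 + 4) = 48 bytes freed on the page */
    printf("spacesaving = %zu - %zu = %zu bytes\n",
           phystupsize, tuplesz + EX_LPSZ,
           phystupsize - (tuplesz + EX_LPSZ));
    return 0;
}
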
     483             : 
     484             : /*
     485             :  * Determine if page non-pivot tuples (data items) are all duplicates of the
     486             :  * same value -- if they are, deduplication's "single value" strategy should
     487             :  * be applied.  The general goal of this strategy is to ensure that
     488             :  * nbtsplitloc.c (which uses its own single value strategy) will find a useful
     489             :  * split point as further duplicates are inserted, and successive rightmost
     490             :  * page splits occur among pages that store the same duplicate value.  When
     491             :  * the page finally splits, it should end up BTREE_SINGLEVAL_FILLFACTOR% full,
     492             :  * just like it would if deduplication were disabled.
     493             :  *
     494             :  * We expect that affected workloads will require _several_ single value
     495             :  * strategy deduplication passes (over a page that only stores duplicates)
     496             :  * before the page is finally split.  The first deduplication pass should only
     497             :  * find regular non-pivot tuples.  Later deduplication passes will find
     498             :  * existing maxpostingsize-capped posting list tuples, which must be skipped
     499             :  * over.  The penultimate pass is generally the first pass that actually
     500             :  * reaches _bt_singleval_fillfactor(), and so will deliberately leave behind a
     501             :  * few untouched non-pivot tuples.  The final deduplication pass won't free
     502             :  * any space -- it will skip over everything without merging anything (it
     503             :  * retraces the steps of the penultimate pass).
     504             :  *
     505             :  * Fortunately, having several passes isn't too expensive.  Each pass (after
     506             :  * the first pass) won't spend many cycles on the large posting list tuples
     507             :  * left by previous passes.  Each pass will find a large contiguous group of
     508             :  * smaller duplicate tuples to merge together at the end of the page.
     509             :  *
     510             :  * Note: We deliberately don't bother checking if the high key is a distinct
     511             :  * value (prior to the TID tiebreaker column) before proceeding, unlike
     512             :  * nbtsplitloc.c.  Its single value strategy only gets applied on the
     513             :  * rightmost page of duplicates of the same value (other leaf pages full of
     514             :  * duplicates will get a simple 50:50 page split instead of splitting towards
     515             :  * the end of the page).  There is little point in making the same distinction
     516             :  * here.
     517             :  */
     518             : static bool
     519        2640 : _bt_do_singleval(Relation rel, Page page, BTDedupState state,
     520             :                  OffsetNumber minoff, IndexTuple newitem)
     521             : {
     522        2640 :     int         nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
     523             :     ItemId      itemid;
     524             :     IndexTuple  itup;
     525             : 
     526        2640 :     itemid = PageGetItemId(page, minoff);
     527        2640 :     itup = (IndexTuple) PageGetItem(page, itemid);
     528             : 
     529        2640 :     if (_bt_keep_natts_fast(rel, newitem, itup) > nkeyatts)
     530             :     {
     531         222 :         itemid = PageGetItemId(page, PageGetMaxOffsetNumber(page));
     532         222 :         itup = (IndexTuple) PageGetItem(page, itemid);
     533             : 
     534         222 :         if (_bt_keep_natts_fast(rel, newitem, itup) > nkeyatts)
     535         158 :             return true;
     536             :     }
     537             : 
     538        2482 :     return false;
     539             : }
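
/*
 * A standalone sketch (not part of nbtdedup.c) of the test performed by
 * _bt_do_singleval() above: because leaf items are kept in sorted order,
 * newitem matching both the first and the last data item implies that every
 * item in between matches too.  The int keys and ex_do_singleval() are
 * illustrative stand-ins for index tuples and _bt_keep_natts_fast().
 */
#include <stdbool.h>
#include <stdio.h>

static bool
ex_do_singleval(const int *keys, int nkeys, int newkey)
{
    return nkeys > 0 && keys[0] == newkey && keys[nkeys - 1] == newkey;
}

int
main(void)
{
    int page[] = {7, 7, 7, 7, 7};   /* every data item is a duplicate */

    printf("single value strategy: %s\n",
           ex_do_singleval(page, 5, 7) ? "yes" : "no");
    return 0;
}
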
     540             : 
     541             : /*
     542             :  * Lower maxpostingsize when using "single value" strategy, to avoid a sixth
     543             :  * and final maxpostingsize-capped tuple.  The sixth and final posting list
     544             :  * tuple will end up somewhat smaller than the first five.  (Note: The first
     545             :  * five tuples could actually just be very large duplicate tuples that
     546             :  * couldn't be merged together at all.  Deduplication will simply not modify
     547             :  * the page when that happens.)
     548             :  *
     549             :  * When there are six posting lists on the page (after current deduplication
     550             :  * pass goes on to create/observe a sixth very large tuple), caller should end
     551             :  * its deduplication pass.  It isn't useful to try to deduplicate items that
     552             :  * are supposed to end up on the new right sibling page following the
     553             :  * anticipated page split.  A future deduplication pass of future right
     554             :  * sibling page might take care of it.  (This is why the first single value
     555             :  * strategy deduplication pass for a given leaf page will generally find only
     556             :  * plain non-pivot tuples -- see _bt_do_singleval() comments.)
     557             :  */
     558             : static void
     559          16 : _bt_singleval_fillfactor(Page page, BTDedupState state, Size newitemsz)
     560             : {
     561             :     Size        leftfree;
     562             :     int         reduction;
     563             : 
     564             :     /* This calculation needs to match nbtsplitloc.c */
     565          16 :     leftfree = PageGetPageSize(page) - SizeOfPageHeaderData -
     566             :         MAXALIGN(sizeof(BTPageOpaqueData));
     567             :     /* Subtract size of new high key (includes pivot heap TID space) */
     568          16 :     leftfree -= newitemsz + MAXALIGN(sizeof(ItemPointerData));
     569             : 
     570             :     /*
     571             :      * Reduce maxpostingsize by an amount equal to target free space on left
     572             :      * half of page
     573             :      */
     574          16 :     reduction = leftfree * ((100 - BTREE_SINGLEVAL_FILLFACTOR) / 100.0);
     575          16 :     if (state->maxpostingsize > reduction)
     576          16 :         state->maxpostingsize -= reduction;
     577             :     else
     578           0 :         state->maxpostingsize = 0;
     579          16 : }
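
/*
 * A worked example (not part of nbtdedup.c) of the reduction computed by
 * _bt_singleval_fillfactor() above.  The 8192-byte page, 24-byte page
 * header, 16-byte special space, 96% fillfactor, and 24-byte newitemsz are
 * illustrative values for a 64-bit build with 8K pages.
 */
#include <stdio.h>

int
main(void)
{
    size_t pagesize   = 8192;
    size_t pageheader = 24;     /* SizeOfPageHeaderData */
    size_t special    = 16;     /* MAXALIGN(sizeof(BTPageOpaqueData)) */
    size_t newitemsz  = 24;     /* incoming tuple plus line pointer */
    size_t pivottid   = 8;      /* MAXALIGN(sizeof(ItemPointerData)) */
    int    fillfactor = 96;     /* BTREE_SINGLEVAL_FILLFACTOR */

    size_t leftfree  = pagesize - pageheader - special
                       - (newitemsz + pivottid);
    int    reduction = (int) (leftfree * ((100 - fillfactor) / 100.0));

    /* leftfree = 8120, reduction = 324 bytes off maxpostingsize */
    printf("leftfree = %zu, reduction = %d bytes\n", leftfree, reduction);
    return 0;
}
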
     580             : 
     581             : /*
     582             :  * Build a posting list tuple based on caller's "base" index tuple and list of
     583             :  * heap TIDs.  When nhtids == 1, builds a standard non-pivot tuple without a
     584             :  * posting list. (Posting list tuples can never have a single heap TID, partly
     585             :  * because that ensures that deduplication always reduces final MAXALIGN()'d
     586             :  * size of entire tuple.)
     587             :  *
     588             :  * Convention is that posting list starts at a MAXALIGN()'d offset (rather
     589             :  * than a SHORTALIGN()'d offset), in line with the approach taken when
     590             :  * appending a heap TID to new pivot tuple/high key during suffix truncation.
     591             :  * This sometimes wastes a little space that was only needed as alignment
     592             :  * padding in the original tuple.  Following this convention simplifies the
     593             :  * space accounting used when deduplicating a page (the same convention
     594             :  * simplifies the accounting for choosing a point to split a page at).
     595             :  *
     596             :  * Note: Caller's "htids" array must be unique and already in ascending TID
     597             :  * order.  Any existing heap TIDs from "base" won't automatically appear in
     598             :  * returned posting list tuple (they must be included in htids array.)
     599             :  */
     600             : IndexTuple
     601       95758 : _bt_form_posting(IndexTuple base, ItemPointer htids, int nhtids)
     602             : {
     603             :     uint32      keysize,
     604             :                 newsize;
     605             :     IndexTuple  itup;
     606             : 
     607       95758 :     if (BTreeTupleIsPosting(base))
     608       44786 :         keysize = BTreeTupleGetPostingOffset(base);
     609             :     else
     610       50972 :         keysize = IndexTupleSize(base);
     611             : 
     612             :     Assert(!BTreeTupleIsPivot(base));
     613             :     Assert(nhtids > 0 && nhtids <= PG_UINT16_MAX);
     614             :     Assert(keysize == MAXALIGN(keysize));
     615             : 
     616             :     /* Determine final size of new tuple */
     617       95758 :     if (nhtids > 1)
     618       91822 :         newsize = MAXALIGN(keysize +
     619             :                            nhtids * sizeof(ItemPointerData));
     620             :     else
     621        3936 :         newsize = keysize;
     622             : 
     623             :     Assert(newsize <= INDEX_SIZE_MASK);
     624             :     Assert(newsize == MAXALIGN(newsize));
     625             : 
     626             :     /* Allocate memory using palloc0() (matches index_form_tuple()) */
     627       95758 :     itup = palloc0(newsize);
     628       95758 :     memcpy(itup, base, keysize);
     629       95758 :     itup->t_info &= ~INDEX_SIZE_MASK;
     630       95758 :     itup->t_info |= newsize;
     631       95758 :     if (nhtids > 1)
     632             :     {
     633             :         /* Form posting list tuple */
     634       91822 :         BTreeTupleSetPosting(itup, nhtids, keysize);
     635       91822 :         memcpy(BTreeTupleGetPosting(itup), htids,
     636             :                sizeof(ItemPointerData) * nhtids);
     637             :         Assert(_bt_posting_valid(itup));
     638             :     }
     639             :     else
     640             :     {
     641             :         /* Form standard non-pivot tuple */
     642        3936 :         itup->t_info &= ~INDEX_ALT_TID_MASK;
     643        3936 :         ItemPointerCopy(htids, &itup->t_tid);
     644             :         Assert(ItemPointerIsValid(&itup->t_tid));
     645             :     }
     646             : 
     647       95758 :     return itup;
     648             : }
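
/*
 * A standalone sketch (not part of nbtdedup.c) of the posting list layout
 * built by _bt_form_posting() above: key bytes first, then the sorted heap
 * TID array starting at a MAXALIGN'd offset.  ExTID and EX_MAXALIGN are
 * simplified stand-ins for ItemPointerData and the real MAXALIGN macro,
 * not PostgreSQL's on-disk structures.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define EX_MAXALIGN(x) (((x) + 7) & ~((size_t) 7))

typedef struct { unsigned short blkhi, blklo, off; } ExTID; /* 6 bytes */

int
main(void)
{
    size_t keysize = 16;        /* MAXALIGN'd key portion */
    ExTID  htids[] = {{0, 1, 1}, {0, 1, 2}, {0, 2, 7}};  /* ascending */
    int    nhtids = 3;

    /* nhtids > 1, so append the TID array after the key bytes */
    size_t newsize = EX_MAXALIGN(keysize + nhtids * sizeof(ExTID));
    char  *tuple = calloc(1, newsize);  /* palloc0() analogue */

    memcpy(tuple + keysize, htids, nhtids * sizeof(ExTID));
    printf("key %zu bytes + %d TIDs -> %zu-byte posting tuple\n",
           keysize, nhtids, newsize);
    free(tuple);
    return 0;
}
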
     649             : 
     650             : /*
     651             :  * Generate a replacement tuple by "updating" a posting list tuple so that it
     652             :  * no longer has TIDs that need to be deleted.
     653             :  *
     654             :  * Used by VACUUM.  Caller's vacposting argument points to the existing
     655             :  * posting list tuple to be updated.
     656             :  *
     657             :  * On return, caller's vacposting argument will point to final "updated"
     658             :  * tuple, which will be palloc()'d in caller's memory context.
     659             :  */
     660             : void
     661        6396 : _bt_update_posting(BTVacuumPosting vacposting)
     662             : {
     663        6396 :     IndexTuple  origtuple = vacposting->itup;
     664             :     uint32      keysize,
     665             :                 newsize;
     666             :     IndexTuple  itup;
     667             :     int         nhtids;
     668             :     int         ui,
     669             :                 d;
     670             :     ItemPointer htids;
     671             : 
     672        6396 :     nhtids = BTreeTupleGetNPosting(origtuple) - vacposting->ndeletedtids;
     673             : 
     674             :     Assert(_bt_posting_valid(origtuple));
     675             :     Assert(nhtids > 0 && nhtids < BTreeTupleGetNPosting(origtuple));
     676             : 
     677             :     /*
     678             :      * Determine final size of new tuple.
     679             :      *
     680             :      * This calculation needs to match the code used within _bt_form_posting()
     681             :      * for new posting list tuples.  We avoid calling _bt_form_posting() here
     682             :      * to save ourselves a second memory allocation for a htids workspace.
     683             :      */
     684        6396 :     keysize = BTreeTupleGetPostingOffset(origtuple);
     685        6396 :     if (nhtids > 1)
     686         522 :         newsize = MAXALIGN(keysize +
     687             :                            nhtids * sizeof(ItemPointerData));
     688             :     else
     689        5874 :         newsize = keysize;
     690             : 
     691             :     Assert(newsize <= INDEX_SIZE_MASK);
     692             :     Assert(newsize == MAXALIGN(newsize));
     693             : 
     694             :     /* Allocate memory using palloc0() (matches index_form_tuple()) */
     695        6396 :     itup = palloc0(newsize);
     696        6396 :     memcpy(itup, origtuple, keysize);
     697        6396 :     itup->t_info &= ~INDEX_SIZE_MASK;
     698        6396 :     itup->t_info |= newsize;
     699             : 
     700        6396 :     if (nhtids > 1)
     701             :     {
     702             :         /* Form posting list tuple */
     703         522 :         BTreeTupleSetPosting(itup, nhtids, keysize);
     704         522 :         htids = BTreeTupleGetPosting(itup);
     705             :     }
     706             :     else
     707             :     {
     708             :         /* Form standard non-pivot tuple */
     709        5874 :         itup->t_info &= ~INDEX_ALT_TID_MASK;
     710        5874 :         htids = &itup->t_tid;
     711             :     }
     712             : 
     713        6396 :     ui = 0;
     714        6396 :     d = 0;
     715       31222 :     for (int i = 0; i < BTreeTupleGetNPosting(origtuple); i++)
     716             :     {
     717       24826 :         if (d < vacposting->ndeletedtids && vacposting->deletetids[d] == i)
     718             :         {
     719       16868 :             d++;
     720       16868 :             continue;
     721             :         }
     722        7958 :         htids[ui++] = *BTreeTupleGetPostingN(origtuple, i);
     723             :     }
     724             :     Assert(ui == nhtids);
     725             :     Assert(d == vacposting->ndeletedtids);
     726             :     Assert(nhtids == 1 || _bt_posting_valid(itup));
     727             :     Assert(nhtids > 1 || ItemPointerIsValid(&itup->t_tid));
     728             : 
     729             :     /* vacposting arg's itup will now point to updated version */
     730        6396 :     vacposting->itup = itup;
     731        6396 : }
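
/*
 * A standalone sketch (not part of nbtdedup.c) of the TID-filtering loop in
 * _bt_update_posting() above: walk the original posting list once, skipping
 * the ascending indexes listed in deletetids[].  Plain int arrays stand in
 * for heap TIDs and for the BTVacuumPosting bookkeeping.
 */
#include <stdio.h>

int
main(void)
{
    int orig[]       = {100, 101, 102, 103, 104};  /* original TIDs */
    int deletetids[] = {1, 3};  /* positions to drop, ascending */
    int ndeletedtids = 2;
    int kept[5];
    int ui = 0, d = 0;          /* output and delete-list cursors */

    for (int i = 0; i < 5; i++)
    {
        if (d < ndeletedtids && deletetids[d] == i)
        {
            d++;                /* this TID is being vacuumed away */
            continue;
        }
        kept[ui++] = orig[i];   /* surviving TID, order preserved */
    }

    /* prints: kept 3 TIDs: 100 102 104 */
    printf("kept %d TIDs:", ui);
    for (int i = 0; i < ui; i++)
        printf(" %d", kept[i]);
    printf("\n");
    return 0;
}
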
     732             : 
     733             : /*
     734             :  * Prepare for a posting list split by swapping heap TID in newitem with heap
     735             :  * TID from original posting list (the 'oposting' heap TID located at offset
     736             :  * 'postingoff').  Modifies newitem, so caller should pass their own private
     737             :  * copy that can safely be modified.
     738             :  *
     739             :  * Returns new posting list tuple, which is palloc()'d in caller's context.
     740             :  * This is guaranteed to be the same size as 'oposting'.  Modified newitem is
     741             :  * what caller actually inserts. (This happens inside the same critical
     742             :  * section that performs an in-place update of old posting list using new
     743             :  * posting list returned here.)
     744             :  *
     745             :  * While the keys from newitem and oposting must be opclass equal, and must
     746             :  * generate identical output when run through the underlying type's output
     747             :  * function, it doesn't follow that their representations match exactly.
     748             :  * Caller must avoid assuming that there can't be representational differences
     749             :  * that make datums from oposting bigger or smaller than the corresponding
     750             :  * datums from newitem.  For example, differences in TOAST input state might
     751             :  * break a faulty assumption about tuple size (the executor is entitled to
     752             :  * apply TOAST compression based on its own criteria).  It also seems possible
     753             :  * that further representational variation will be introduced in the future,
     754             :  * in order to support nbtree features like page-level prefix compression.
     755             :  *
     756             :  * See nbtree/README for details on the design of posting list splits.
     757             :  */
     758             : IndexTuple
     759        3976 : _bt_swap_posting(IndexTuple newitem, IndexTuple oposting, int postingoff)
     760             : {
     761             :     int         nhtids;
     762             :     char       *replacepos;
     763             :     char       *replaceposright;
     764             :     Size        nmovebytes;
     765             :     IndexTuple  nposting;
     766             : 
     767        3976 :     nhtids = BTreeTupleGetNPosting(oposting);
     768             :     Assert(_bt_posting_valid(oposting));
     769             :     Assert(postingoff > 0 && postingoff < nhtids);
     770             : 
     771             :     /*
     772             :      * Move item pointers in posting list to make a gap for the new item's
     773             :      * heap TID.  We shift TIDs one place to the right, losing original
     774             :      * rightmost TID. (nmovebytes must not include TIDs to the left of
     775             :      * postingoff, nor the existing rightmost/max TID that gets overwritten.)
     776             :      */
     777        3976 :     nposting = CopyIndexTuple(oposting);
     778        3976 :     replacepos = (char *) BTreeTupleGetPostingN(nposting, postingoff);
     779        3976 :     replaceposright = (char *) BTreeTupleGetPostingN(nposting, postingoff + 1);
     780        3976 :     nmovebytes = (nhtids - postingoff - 1) * sizeof(ItemPointerData);
     781        3976 :     memmove(replaceposright, replacepos, nmovebytes);
     782             : 
     783             :     /* Fill the gap at postingoff with TID of new item (original new TID) */
     784             :     Assert(!BTreeTupleIsPivot(newitem) && !BTreeTupleIsPosting(newitem));
     785        3976 :     ItemPointerCopy(&newitem->t_tid, (ItemPointer) replacepos);
     786             : 
     787             :     /* Now copy oposting's rightmost/max TID into new item (final new TID) */
     788        3976 :     ItemPointerCopy(BTreeTupleGetMaxHeapTID(oposting), &newitem->t_tid);
     789             : 
     790             :     Assert(ItemPointerCompare(BTreeTupleGetMaxHeapTID(nposting),
     791             :                               BTreeTupleGetHeapTID(newitem)) < 0);
     792             :     Assert(_bt_posting_valid(nposting));
     793             : 
     794        3976 :     return nposting;
     795             : }
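
/*
 * A standalone sketch (not part of nbtdedup.c) of the swap performed by
 * _bt_swap_posting() above, on a plain int array: open a gap at postingoff
 * with memmove() (dropping the old rightmost TID), place the new item's TID
 * in the gap, and hand the old maximum back to the new item.  ints stand in
 * for 6-byte ItemPointerData values.
 */
#include <stdio.h>
#include <string.h>

int
main(void)
{
    int posting[]  = {10, 20, 30, 40};  /* existing posting list TIDs */
    int nhtids     = 4;
    int newtid     = 25;        /* falls between 20 and 30 */
    int postingoff = 2;         /* first TID greater than newtid */

    int oldmax = posting[nhtids - 1];

    /* shift TIDs at postingoff one slot right; old rightmost is lost */
    memmove(&posting[postingoff + 1], &posting[postingoff],
            (nhtids - postingoff - 1) * sizeof(int));
    posting[postingoff] = newtid;       /* gap gets the incoming TID */
    newtid = oldmax;                    /* newitem now carries old max */

    /* prints: posting: 10 20 25 30  newitem TID: 40 */
    printf("posting:");
    for (int i = 0; i < nhtids; i++)
        printf(" %d", posting[i]);
    printf("  newitem TID: %d\n", newtid);
    return 0;
}
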
     796             : 
     797             : /*
     798             :  * Verify posting list invariants for "posting", which must be a posting list
     799             :  * tuple.  Used within assertions.
     800             :  */
     801             : #ifdef USE_ASSERT_CHECKING
     802             : static bool
     803             : _bt_posting_valid(IndexTuple posting)
     804             : {
     805             :     ItemPointerData last;
     806             :     ItemPointer htid;
     807             : 
     808             :     if (!BTreeTupleIsPosting(posting) || BTreeTupleGetNPosting(posting) < 2)
     809             :         return false;
     810             : 
     811             :     /* Remember first heap TID for loop */
     812             :     ItemPointerCopy(BTreeTupleGetHeapTID(posting), &last);
     813             :     if (!ItemPointerIsValid(&last))
     814             :         return false;
     815             : 
     816             :     /* Iterate, starting from second TID */
     817             :     for (int i = 1; i < BTreeTupleGetNPosting(posting); i++)
     818             :     {
     819             :         htid = BTreeTupleGetPostingN(posting, i);
     820             : 
     821             :         if (!ItemPointerIsValid(htid))
     822             :             return false;
     823             :         if (ItemPointerCompare(htid, &last) <= 0)
     824             :             return false;
     825             :         ItemPointerCopy(htid, &last);
     826             :     }
     827             : 
     828             :     return true;
     829             : }
     830             : #endif

Generated by: LCOV version 1.13