LCOV - code coverage report
Current view: top level - src/backend/access/heap - pruneheap.c (source / functions) Coverage Total Hit
Test: PostgreSQL 19devel Lines: 91.3 % 642 586
Test Date: 2026-04-02 11:16:07 Functions: 96.4 % 28 27
Legend: Lines:     hit not hit

            Line data    Source code
       1              : /*-------------------------------------------------------------------------
       2              :  *
       3              :  * pruneheap.c
       4              :  *    heap page pruning and HOT-chain management code
       5              :  *
       6              :  * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
       7              :  * Portions Copyright (c) 1994, Regents of the University of California
       8              :  *
       9              :  *
      10              :  * IDENTIFICATION
      11              :  *    src/backend/access/heap/pruneheap.c
      12              :  *
      13              :  *-------------------------------------------------------------------------
      14              :  */
      15              : #include "postgres.h"
      16              : 
      17              : #include "access/heapam.h"
      18              : #include "access/heapam_xlog.h"
      19              : #include "access/htup_details.h"
      20              : #include "access/multixact.h"
      21              : #include "access/transam.h"
      22              : #include "access/visibilitymap.h"
      23              : #include "access/xlog.h"
      24              : #include "access/xloginsert.h"
      25              : #include "commands/vacuum.h"
      26              : #include "executor/instrument.h"
      27              : #include "miscadmin.h"
      28              : #include "pgstat.h"
      29              : #include "storage/bufmgr.h"
      30              : #include "utils/rel.h"
      31              : #include "utils/snapmgr.h"
      32              : 
      33              : /* Working data for heap_page_prune_and_freeze() and subroutines */
      34              : typedef struct
      35              : {
      36              :     /*-------------------------------------------------------
      37              :      * Arguments passed to heap_page_prune_and_freeze()
      38              :      *-------------------------------------------------------
      39              :      */
      40              : 
      41              :     /* tuple visibility test, initialized for the relation */
      42              :     GlobalVisState *vistest;
      43              :     /* whether or not dead items can be set LP_UNUSED during pruning */
      44              :     bool        mark_unused_now;
      45              :     /* whether to attempt freezing tuples */
      46              :     bool        attempt_freeze;
      47              :     /* whether to attempt setting the VM */
      48              :     bool        attempt_set_vm;
      49              :     struct VacuumCutoffs *cutoffs;
      50              :     Relation    relation;
      51              : 
      52              :     /*
      53              :      * Keep the buffer, block, and page handy so that helpers needing to
      54              :      * access them don't need to make repeated calls to BufferGetBlockNumber()
      55              :      * and BufferGetPage().
      56              :      */
      57              :     BlockNumber block;
      58              :     Buffer      buffer;
      59              :     Page        page;
      60              : 
      61              :     /*-------------------------------------------------------
      62              :      * Fields describing what to do to the page
      63              :      *-------------------------------------------------------
      64              :      */
      65              :     TransactionId new_prune_xid;    /* new prune hint value */
      66              :     TransactionId latest_xid_removed;
      67              :     int         nredirected;    /* numbers of entries in arrays below */
      68              :     int         ndead;
      69              :     int         nunused;
      70              :     int         nfrozen;
      71              :     /* arrays that accumulate indexes of items to be changed */
      72              :     OffsetNumber redirected[MaxHeapTuplesPerPage * 2];
      73              :     OffsetNumber nowdead[MaxHeapTuplesPerPage];
      74              :     OffsetNumber nowunused[MaxHeapTuplesPerPage];
      75              :     HeapTupleFreeze frozen[MaxHeapTuplesPerPage];
      76              : 
      77              :     /*
      78              :      * set_all_visible and set_all_frozen indicate if the all-visible and
      79              :      * all-frozen bits in the visibility map can be set for this page after
      80              :      * pruning. They are only tracked when the caller requests VM updates
      81              :      * (attempt_set_vm); otherwise they remain false throughout.
      82              :      *
      83              :      * NOTE: set_all_visible and set_all_frozen initially don't include
      84              :      * LP_DEAD items. That's convenient for heap_page_prune_and_freeze() to
      85              :      * use them to decide whether to opportunistically freeze the page or not.
      86              :      * The set_all_visible and set_all_frozen values ultimately used to set
      87              :      * the VM are adjusted to include LP_DEAD items after we determine whether
      88              :      * or not to opportunistically freeze.
      89              :      */
      90              :     bool        set_all_visible;
      91              :     bool        set_all_frozen;
      92              : 
      93              :     /*-------------------------------------------------------
      94              :      * Working state for HOT chain processing
      95              :      *-------------------------------------------------------
      96              :      */
      97              : 
      98              :     /*
      99              :      * 'root_items' contains offsets of all LP_REDIRECT line pointers and
     100              :      * normal non-HOT tuples.  They can be stand-alone items or the first item
     101              :      * in a HOT chain.  'heaponly_items' contains heap-only tuples which can
     102              :      * only be removed as part of a HOT chain.
     103              :      */
     104              :     int         nroot_items;
     105              :     OffsetNumber root_items[MaxHeapTuplesPerPage];
     106              :     int         nheaponly_items;
     107              :     OffsetNumber heaponly_items[MaxHeapTuplesPerPage];
     108              : 
     109              :     /*
     110              :      * processed[offnum] is true if item at offnum has been processed.
     111              :      *
     112              :      * This needs to be MaxHeapTuplesPerPage + 1 long as FirstOffsetNumber is
     113              :      * 1. Otherwise every access would need to subtract 1.
     114              :      */
     115              :     bool        processed[MaxHeapTuplesPerPage + 1];
     116              : 
     117              :     /*
     118              :      * Tuple visibility is only computed once for each tuple, for correctness
     119              :      * and efficiency reasons; see comment in heap_page_prune_and_freeze() for
     120              :      * details.  This is of type int8[], instead of HTSV_Result[], so we can
     121              :      * use -1 to indicate no visibility has been computed, e.g. for LP_DEAD
     122              :      * items.
     123              :      *
     124              :      * This needs to be MaxHeapTuplesPerPage + 1 long as FirstOffsetNumber is
     125              :      * 1. Otherwise every access would need to subtract 1.
     126              :      */
     127              :     int8        htsv[MaxHeapTuplesPerPage + 1];
     128              : 
     129              :     /*-------------------------------------------------------
     130              :      * Working state for freezing
     131              :      *-------------------------------------------------------
     132              :      */
     133              :     HeapPageFreeze pagefrz;
     134              : 
     135              :     /*-------------------------------------------------------
     136              :      * Working state for visibility map processing
     137              :      *-------------------------------------------------------
     138              :      */
     139              : 
     140              :     /*
     141              :      * Caller must provide a pinned vmbuffer corresponding to the heap block
     142              :      * passed to heap_page_prune_and_freeze(). We will fix any corruption
     143              :      * found in the VM and set the VM if the page is all-visible/all-frozen.
     144              :      */
     145              :     Buffer      vmbuffer;
     146              : 
     147              :     /*
     148              :      * The state of the VM bits at the beginning of pruning and the state they
     149              :      * will be in at the end.
     150              :      */
     151              :     uint8       old_vmbits;
     152              :     uint8       new_vmbits;
     153              : 
     154              :     /* The newest xmin of live tuples on the page */
     155              :     TransactionId newest_live_xid;
     156              : 
     157              :     /*-------------------------------------------------------
     158              :      * Information about what was done
     159              :      *
     160              :      * These fields are not used by pruning itself for the most part, but are
     161              :      * used to collect information about what was pruned and what state the
     162              :      * page is in after pruning, for the benefit of the caller.  They are
     163              :      * copied to the caller's PruneFreezeResult at the end.
     164              :      * -------------------------------------------------------
     165              :      */
     166              : 
     167              :     int         ndeleted;       /* Number of tuples deleted from the page */
     168              : 
     169              :     /* Number of live and recently dead tuples, after pruning */
     170              :     int         live_tuples;
     171              :     int         recently_dead_tuples;
     172              : 
     173              :     /* Whether or not the page makes rel truncation unsafe */
     174              :     bool        hastup;
     175              : 
     176              :     /*
     177              :      * LP_DEAD items on the page after pruning.  Includes existing LP_DEAD
     178              :      * items
     179              :      */
     180              :     int         lpdead_items;   /* number of items in the array */
     181              :     OffsetNumber *deadoffsets;  /* points directly to presult->deadoffsets */
     182              : } PruneState;
     183              : 
     184              : /*
     185              :  * Type of visibility map corruption detected on a heap page and its
     186              :  * associated VM page. Passed to heap_page_fix_vm_corruption() so the caller
     187              :  * can specify what it found rather than having the function rederive the
     188              :  * corruption from page state.
     189              :  */
     190              : typedef enum VMCorruptionType
     191              : {
     192              :     /* VM bits are set but the heap page-level PD_ALL_VISIBLE flag is not */
     193              :     VM_CORRUPT_MISSING_PAGE_HINT,
     194              :     /* LP_DEAD line pointers found on a page marked all-visible */
     195              :     VM_CORRUPT_LPDEAD,
     196              :     /* Tuple not visible to all transactions on a page marked all-visible */
     197              :     VM_CORRUPT_TUPLE_VISIBILITY,
     198              : } VMCorruptionType;
     199              : 
     200              : /* Local functions */
     201              : static void prune_freeze_setup(PruneFreezeParams *params,
     202              :                                TransactionId *new_relfrozen_xid,
     203              :                                MultiXactId *new_relmin_mxid,
     204              :                                PruneFreezeResult *presult,
     205              :                                PruneState *prstate);
     206              : static void heap_page_fix_vm_corruption(PruneState *prstate,
     207              :                                         OffsetNumber offnum,
     208              :                                         VMCorruptionType ctype);
     209              : static void prune_freeze_fast_path(PruneState *prstate,
     210              :                                    PruneFreezeResult *presult);
     211              : static void prune_freeze_plan(PruneState *prstate,
     212              :                               OffsetNumber *off_loc);
     213              : static HTSV_Result heap_prune_satisfies_vacuum(PruneState *prstate,
     214              :                                                HeapTuple tup);
     215              : static inline HTSV_Result htsv_get_valid_status(int status);
     216              : static void heap_prune_chain(OffsetNumber maxoff,
     217              :                              OffsetNumber rootoffnum, PruneState *prstate);
     218              : static void heap_prune_record_prunable(PruneState *prstate, TransactionId xid,
     219              :                                        OffsetNumber offnum);
     220              : static void heap_prune_record_redirect(PruneState *prstate,
     221              :                                        OffsetNumber offnum, OffsetNumber rdoffnum,
     222              :                                        bool was_normal);
     223              : static void heap_prune_record_dead(PruneState *prstate, OffsetNumber offnum,
     224              :                                    bool was_normal);
     225              : static void heap_prune_record_dead_or_unused(PruneState *prstate, OffsetNumber offnum,
     226              :                                              bool was_normal);
     227              : static void heap_prune_record_unused(PruneState *prstate, OffsetNumber offnum, bool was_normal);
     228              : 
     229              : static void heap_prune_record_unchanged_lp_unused(PruneState *prstate, OffsetNumber offnum);
     230              : static void heap_prune_record_unchanged_lp_normal(PruneState *prstate, OffsetNumber offnum);
     231              : static void heap_prune_record_unchanged_lp_dead(PruneState *prstate, OffsetNumber offnum);
     232              : static void heap_prune_record_unchanged_lp_redirect(PruneState *prstate, OffsetNumber offnum);
     233              : 
     234              : static void page_verify_redirects(Page page);
     235              : 
     236              : static bool heap_page_will_freeze(bool did_tuple_hint_fpi, bool do_prune, bool do_hint_prune,
     237              :                                   PruneState *prstate);
     238              : static bool heap_page_will_set_vm(PruneState *prstate, PruneReason reason,
     239              :                                   bool do_prune, bool do_freeze);
     240              : 
     241              : 
     242              : /*
     243              :  * Optionally prune and repair fragmentation in the specified page.
     244              :  *
     245              :  * This is an opportunistic function.  It will perform housekeeping
     246              :  * only if the page heuristically looks like a candidate for pruning and we
     247              :  * can acquire buffer cleanup lock without blocking.
     248              :  *
     249              :  * Note: this is called quite often.  It's important that it fall out quickly
     250              :  * if there's not any use in pruning.
     251              :  *
     252              :  * Caller must have pin on the buffer, and must *not* have a lock on it.
     253              :  *
     254              :  * This function may pin *vmbuffer. It's passed by reference so the caller can
     255              :  * reuse the pin across calls, avoiding repeated pin/unpin cycles. If we find
     256              :  * VM corruption during pruning, we will fix it. Caller is responsible for
     257              :  * unpinning *vmbuffer.
     258              :  *
     259              :  * rel_read_only is true if we determined at plan time that the query does not
     260              :  * modify the relation. It is counterproductive to set the VM if the query
     261              :  * will immediately clear it.
     262              :  *
     263              :  * As noted in ScanRelIsReadOnly(), INSERT ... SELECT from the same table will
     264              :  * report the scan relation as read-only. This is usually harmless in
     265              :  * practice. It is useful to set scanned pages all-visible that won't be
     266              :  * inserted into. Pages it does insert to will rarely meet the criteria for
     267              :  * pruning, and those that do are likely to contain in-progress inserts which
     268              :  * make the page not fully all-visible.
     269              :  */
     270              : void
     271     21107923 : heap_page_prune_opt(Relation relation, Buffer buffer, Buffer *vmbuffer,
     272              :                     bool rel_read_only)
     273              : {
     274     21107923 :     Page        page = BufferGetPage(buffer);
     275              :     TransactionId prune_xid;
     276              :     GlobalVisState *vistest;
     277              :     Size        minfree;
     278              : 
     279              :     /*
     280              :      * We can't write WAL in recovery mode, so there's no point trying to
     281              :      * clean the page. The primary will likely issue a cleaning WAL record
     282              :      * soon anyway, so this is no particular loss.
     283              :      */
     284     21107923 :     if (RecoveryInProgress())
     285       241635 :         return;
     286              : 
     287              :     /*
     288              :      * First check whether there's any chance there's something to prune,
     289              :      * determining the appropriate horizon is a waste if there's no prune_xid
     290              :      * (i.e. no updates/deletes left potentially dead tuples around and no
     291              :      * inserts inserted new tuples that may be visible to all).
     292              :      */
     293     20866288 :     prune_xid = PageGetPruneXid(page);
     294     20866288 :     if (!TransactionIdIsValid(prune_xid))
     295      9425118 :         return;
     296              : 
     297              :     /*
     298              :      * Check whether prune_xid indicates that there may be dead rows that can
     299              :      * be cleaned up.
     300              :      */
     301     11441170 :     vistest = GlobalVisTestFor(relation);
     302              : 
     303     11441170 :     if (!GlobalVisTestIsRemovableXid(vistest, prune_xid, true))
     304      8984204 :         return;
     305              : 
     306              :     /*
     307              :      * We prune when a previous UPDATE failed to find enough space on the page
     308              :      * for a new tuple version, or when free space falls below the relation's
     309              :      * fill-factor target (but not less than 10%).
     310              :      *
     311              :      * Checking free space here is questionable since we aren't holding any
     312              :      * lock on the buffer; in the worst case we could get a bogus answer. It's
     313              :      * unlikely to be *seriously* wrong, though, since reading either pd_lower
     314              :      * or pd_upper is probably atomic.  Avoiding taking a lock seems more
     315              :      * important than sometimes getting a wrong answer in what is after all
     316              :      * just a heuristic estimate.
     317              :      */
     318      2456966 :     minfree = RelationGetTargetPageFreeSpace(relation,
     319              :                                              HEAP_DEFAULT_FILLFACTOR);
     320      2456966 :     minfree = Max(minfree, BLCKSZ / 10);
     321              : 
     322      2456966 :     if (PageIsFull(page) || PageGetHeapFreeSpace(page) < minfree)
     323              :     {
     324              :         /* OK, try to get exclusive buffer lock */
     325       107597 :         if (!ConditionalLockBufferForCleanup(buffer))
     326         1996 :             return;
     327              : 
     328              :         /*
     329              :          * Now that we have buffer lock, get accurate information about the
     330              :          * page's free space, and recheck the heuristic about whether to
     331              :          * prune.
     332              :          */
     333       105601 :         if (PageIsFull(page) || PageGetHeapFreeSpace(page) < minfree)
     334              :         {
     335              :             OffsetNumber dummy_off_loc;
     336              :             PruneFreezeResult presult;
     337              :             PruneFreezeParams params;
     338              : 
     339       105601 :             visibilitymap_pin(relation, BufferGetBlockNumber(buffer),
     340              :                               vmbuffer);
     341              : 
     342       105601 :             params.relation = relation;
     343       105601 :             params.buffer = buffer;
     344       105601 :             params.vmbuffer = *vmbuffer;
     345       105601 :             params.reason = PRUNE_ON_ACCESS;
     346       105601 :             params.vistest = vistest;
     347       105601 :             params.cutoffs = NULL;
     348              : 
     349              :             /*
     350              :              * We don't pass the HEAP_PAGE_PRUNE_MARK_UNUSED_NOW option
     351              :              * regardless of whether or not the relation has indexes, since we
     352              :              * cannot safely determine that during on-access pruning with the
     353              :              * current implementation.
     354              :              */
     355       105601 :             params.options = HEAP_PAGE_PRUNE_ALLOW_FAST_PATH;
     356       105601 :             if (rel_read_only)
     357        37506 :                 params.options |= HEAP_PAGE_PRUNE_SET_VM;
     358              : 
     359       105601 :             heap_page_prune_and_freeze(&params, &presult, &dummy_off_loc,
     360              :                                        NULL, NULL);
     361              : 
     362              :             /*
     363              :              * Report the number of tuples reclaimed to pgstats.  This is
     364              :              * presult.ndeleted minus the number of newly-LP_DEAD-set items.
     365              :              *
     366              :              * We derive the number of dead tuples like this to avoid totally
     367              :              * forgetting about items that were set to LP_DEAD, since they
     368              :              * still need to be cleaned up by VACUUM.  We only want to count
     369              :              * heap-only tuples that just became LP_UNUSED in our report,
     370              :              * which don't.
     371              :              *
     372              :              * VACUUM doesn't have to compensate in the same way when it
     373              :              * tracks ndeleted, since it will set the same LP_DEAD items to
     374              :              * LP_UNUSED separately.
     375              :              */
     376       105601 :             if (presult.ndeleted > presult.nnewlpdead)
     377        20684 :                 pgstat_update_heap_dead_tuples(relation,
     378        20684 :                                                presult.ndeleted - presult.nnewlpdead);
     379              :         }
     380              : 
     381              :         /* And release buffer lock */
     382       105601 :         LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
     383              : 
     384              :         /*
     385              :          * We avoid reuse of any free space created on the page by unrelated
     386              :          * UPDATEs/INSERTs by opting to not update the FSM at this point.  The
     387              :          * free space should be reused by UPDATEs to *this* page.
     388              :          */
     389              :     }
     390              : }
     391              : 
     392              : /*
     393              :  * Helper for heap_page_prune_and_freeze() to initialize the PruneState using
     394              :  * the provided parameters.
     395              :  *
     396              :  * params, new_relfrozen_xid, new_relmin_mxid, and presult are input
     397              :  * parameters and are not modified by this function. Only prstate is modified.
     398              :  */
     399              : static void
     400       507791 : prune_freeze_setup(PruneFreezeParams *params,
     401              :                    TransactionId *new_relfrozen_xid,
     402              :                    MultiXactId *new_relmin_mxid,
     403              :                    PruneFreezeResult *presult,
     404              :                    PruneState *prstate)
     405              : {
     406              :     /* Copy parameters to prstate */
     407       507791 :     prstate->vistest = params->vistest;
     408       507791 :     prstate->mark_unused_now =
     409       507791 :         (params->options & HEAP_PAGE_PRUNE_MARK_UNUSED_NOW) != 0;
     410              : 
     411              :     /* cutoffs must be provided if we will attempt freezing */
     412              :     Assert(!(params->options & HEAP_PAGE_PRUNE_FREEZE) || params->cutoffs);
     413       507791 :     prstate->attempt_freeze = (params->options & HEAP_PAGE_PRUNE_FREEZE) != 0;
     414       507791 :     prstate->attempt_set_vm = (params->options & HEAP_PAGE_PRUNE_SET_VM) != 0;
     415       507791 :     prstate->cutoffs = params->cutoffs;
     416       507791 :     prstate->relation = params->relation;
     417       507791 :     prstate->block = BufferGetBlockNumber(params->buffer);
     418       507791 :     prstate->buffer = params->buffer;
     419       507791 :     prstate->page = BufferGetPage(params->buffer);
     420              : 
     421              :     Assert(BufferIsValid(params->vmbuffer));
     422       507791 :     prstate->vmbuffer = params->vmbuffer;
     423       507791 :     prstate->new_vmbits = 0;
     424       507791 :     prstate->old_vmbits = visibilitymap_get_status(prstate->relation,
     425              :                                                    prstate->block,
     426              :                                                    &prstate->vmbuffer);
     427              : 
     428              :     /*
     429              :      * Our strategy is to scan the page and make lists of items to change,
     430              :      * then apply the changes within a critical section.  This keeps as much
     431              :      * logic as possible out of the critical section, and also ensures that
     432              :      * WAL replay will work the same as the normal case.
     433              :      *
     434              :      * First, initialize the new pd_prune_xid value to zero (indicating no
     435              :      * prunable tuples).  If we find any tuples which may soon become
     436              :      * prunable, we will save the lowest relevant XID in new_prune_xid. Also
     437              :      * initialize the rest of our working state.
     438              :      */
     439       507791 :     prstate->new_prune_xid = InvalidTransactionId;
     440       507791 :     prstate->latest_xid_removed = InvalidTransactionId;
     441       507791 :     prstate->nredirected = prstate->ndead = prstate->nunused = 0;
     442       507791 :     prstate->nfrozen = 0;
     443       507791 :     prstate->nroot_items = 0;
     444       507791 :     prstate->nheaponly_items = 0;
     445              : 
     446              :     /* initialize page freezing working state */
     447       507791 :     prstate->pagefrz.freeze_required = false;
     448       507791 :     prstate->pagefrz.FreezePageConflictXid = InvalidTransactionId;
     449       507791 :     if (prstate->attempt_freeze)
     450              :     {
     451              :         Assert(new_relfrozen_xid && new_relmin_mxid);
     452       402190 :         prstate->pagefrz.FreezePageRelfrozenXid = *new_relfrozen_xid;
     453       402190 :         prstate->pagefrz.NoFreezePageRelfrozenXid = *new_relfrozen_xid;
     454       402190 :         prstate->pagefrz.FreezePageRelminMxid = *new_relmin_mxid;
     455       402190 :         prstate->pagefrz.NoFreezePageRelminMxid = *new_relmin_mxid;
     456              :     }
     457              :     else
     458              :     {
     459              :         Assert(!new_relfrozen_xid && !new_relmin_mxid);
     460       105601 :         prstate->pagefrz.FreezePageRelminMxid = InvalidMultiXactId;
     461       105601 :         prstate->pagefrz.NoFreezePageRelminMxid = InvalidMultiXactId;
     462       105601 :         prstate->pagefrz.FreezePageRelfrozenXid = InvalidTransactionId;
     463       105601 :         prstate->pagefrz.NoFreezePageRelfrozenXid = InvalidTransactionId;
     464              :     }
     465              : 
     466       507791 :     prstate->ndeleted = 0;
     467       507791 :     prstate->live_tuples = 0;
     468       507791 :     prstate->recently_dead_tuples = 0;
     469       507791 :     prstate->hastup = false;
     470       507791 :     prstate->lpdead_items = 0;
     471              : 
     472              :     /*
     473              :      * deadoffsets are filled in during pruning but are only used to populate
     474              :      * PruneFreezeResult->deadoffsets. To avoid needing two copies of the
     475              :      * array, just save a pointer to the result offsets array in the
     476              :      * PruneState.
     477              :      */
     478       507791 :     prstate->deadoffsets = presult->deadoffsets;
     479              : 
     480              :     /*
     481              :      * We track whether the page will be all-visible/all-frozen at the end of
     482              :      * pruning and freezing. While examining tuple visibility, we'll set
     483              :      * set_all_visible to false if there are tuples on the page not visible to
     484              :      * all running and future transactions. If setting the VM is enabled for
     485              :      * this scan, we will do so if the page ends up being all-visible.
     486              :      *
     487              :      * We also keep track of the newest live XID, which is used to calculate
     488              :      * the snapshot conflict horizon for a WAL record setting the VM.
     489              :      */
     490       507791 :     prstate->set_all_visible = prstate->attempt_set_vm;
     491       507791 :     prstate->newest_live_xid = InvalidTransactionId;
     492              : 
     493              :     /*
     494              :      * Currently, only VACUUM performs freezing, but other callers may in the
     495              :      * future. We must initialize set_all_frozen based on whether or not the
     496              :      * caller passed HEAP_PAGE_PRUNE_FREEZE, because if they did not, we won't
     497              :      * call heap_prepare_freeze_tuple() for each tuple, and set_all_frozen
     498              :      * will never be cleared for tuples that need freezing. This would lead to
     499              :      * incorrectly setting the visibility map all-frozen for this page. We
     500              :      * can't set the page all-frozen in the VM if the caller didn't pass
     501              :      * HEAP_PAGE_PRUNE_SET_VM.
     502              :      *
     503              :      * When freezing is not required (no XIDs/MXIDs older than the freeze
     504              :      * cutoff), we may still choose to "opportunistically" freeze if doing so
     505              :      * would make the page all-frozen.
     506              :      *
     507              :      * We will not be able to freeze the whole page at the end of vacuum if
     508              :      * there are tuples present that are not visible to everyone or if there
     509              :      * are dead tuples which will not be removable. However, dead tuples that
     510              :      * will be removed by the end of vacuum should not prevent this
     511              :      * opportunistic freezing.
     512              :      *
     513              :      * Therefore, we do not clear set_all_visible and set_all_frozen when we
     514              :      * encounter LP_DEAD items. Instead, we correct them after deciding
     515              :      * whether to freeze, but before updating the VM, to avoid setting the VM
     516              :      * bits incorrectly.
     517              :      */
     518       507791 :     prstate->set_all_frozen = prstate->attempt_freeze && prstate->attempt_set_vm;
     519       507791 : }
     520              : 
     521              : /*
     522              :  * Helper for heap_page_prune_and_freeze(). Iterates over every tuple on the
     523              :  * page, examines its visibility information, and determines the appropriate
     524              :  * action for each tuple. All tuples are processed and classified during this
     525              :  * phase, but no modifications are made to the page until the later execution
     526              :  * stage.
     527              :  *
     528              :  * *off_loc is used for error callback and cleared before returning.
     529              :  */
     530              : static void
     531       336771 : prune_freeze_plan(PruneState *prstate, OffsetNumber *off_loc)
     532              : {
     533       336771 :     Page        page = prstate->page;
     534       336771 :     BlockNumber blockno = prstate->block;
     535       336771 :     OffsetNumber maxoff = PageGetMaxOffsetNumber(prstate->page);
     536              :     OffsetNumber offnum;
     537              :     HeapTupleData tup;
     538              : 
     539       336771 :     tup.t_tableOid = RelationGetRelid(prstate->relation);
     540              : 
     541              :     /*
     542              :      * Determine HTSV for all tuples, and queue them up for processing as HOT
     543              :      * chain roots or as heap-only items.
     544              :      *
     545              :      * Determining HTSV only once for each tuple is required for correctness,
     546              :      * to deal with cases where running HTSV twice could result in different
     547              :      * results.  For example, RECENTLY_DEAD can turn to DEAD if another
     548              :      * checked item causes GlobalVisTestIsRemovableFullXid() to update the
     549              :      * horizon, or INSERT_IN_PROGRESS can change to DEAD if the inserting
     550              :      * transaction aborts.
     551              :      *
     552              :      * It's also good for performance. Most commonly tuples within a page are
     553              :      * stored at decreasing offsets (while the items are stored at increasing
     554              :      * offsets). When processing all tuples on a page this leads to reading
     555              :      * memory at decreasing offsets within a page, with a variable stride.
     556              :      * That's hard for CPU prefetchers to deal with. Processing the items in
     557              :      * reverse order (and thus the tuples in increasing order) increases
     558              :      * prefetching efficiency significantly / decreases the number of cache
     559              :      * misses.
     560              :      */
     561       336771 :     for (offnum = maxoff;
     562     20616561 :          offnum >= FirstOffsetNumber;
     563     20279790 :          offnum = OffsetNumberPrev(offnum))
     564              :     {
     565     20279790 :         ItemId      itemid = PageGetItemId(page, offnum);
     566              :         HeapTupleHeader htup;
     567              : 
     568              :         /*
     569              :          * Set the offset number so that we can display it along with any
     570              :          * error that occurred while processing this tuple.
     571              :          */
     572     20279790 :         *off_loc = offnum;
     573              : 
     574     20279790 :         prstate->processed[offnum] = false;
     575     20279790 :         prstate->htsv[offnum] = -1;
     576              : 
     577              :         /* Nothing to do if slot doesn't contain a tuple */
     578     20279790 :         if (!ItemIdIsUsed(itemid))
     579              :         {
     580       135740 :             heap_prune_record_unchanged_lp_unused(prstate, offnum);
     581       135740 :             continue;
     582              :         }
     583              : 
     584     20144050 :         if (ItemIdIsDead(itemid))
     585              :         {
     586              :             /*
     587              :              * If the caller set mark_unused_now true, we can set dead line
     588              :              * pointers LP_UNUSED now.
     589              :              */
     590      1555675 :             if (unlikely(prstate->mark_unused_now))
     591         2067 :                 heap_prune_record_unused(prstate, offnum, false);
     592              :             else
     593      1553608 :                 heap_prune_record_unchanged_lp_dead(prstate, offnum);
     594      1555675 :             continue;
     595              :         }
     596              : 
     597     18588375 :         if (ItemIdIsRedirected(itemid))
     598              :         {
     599              :             /* This is the start of a HOT chain */
     600       198029 :             prstate->root_items[prstate->nroot_items++] = offnum;
     601       198029 :             continue;
     602              :         }
     603              : 
     604              :         Assert(ItemIdIsNormal(itemid));
     605              : 
     606              :         /*
     607              :          * Get the tuple's visibility status and queue it up for processing.
     608              :          */
     609     18390346 :         htup = (HeapTupleHeader) PageGetItem(page, itemid);
     610     18390346 :         tup.t_data = htup;
     611     18390346 :         tup.t_len = ItemIdGetLength(itemid);
     612     18390346 :         ItemPointerSet(&tup.t_self, blockno, offnum);
     613              : 
     614     18390346 :         prstate->htsv[offnum] = heap_prune_satisfies_vacuum(prstate, &tup);
     615              : 
     616     18390346 :         if (!HeapTupleHeaderIsHeapOnly(htup))
     617     18050731 :             prstate->root_items[prstate->nroot_items++] = offnum;
     618              :         else
     619       339615 :             prstate->heaponly_items[prstate->nheaponly_items++] = offnum;
     620              :     }
     621              : 
     622              :     /*
     623              :      * Process HOT chains.
     624              :      *
     625              :      * We added the items to the array starting from 'maxoff', so by
     626              :      * processing the array in reverse order, we process the items in
     627              :      * ascending offset number order.  The order doesn't matter for
     628              :      * correctness, but some quick micro-benchmarking suggests that this is
     629              :      * faster.  (Earlier PostgreSQL versions, which scanned all the items on
     630              :      * the page instead of using the root_items array, also did it in
     631              :      * ascending offset number order.)
     632              :      */
     633     18585531 :     for (int i = prstate->nroot_items - 1; i >= 0; i--)
     634              :     {
     635     18248760 :         offnum = prstate->root_items[i];
     636              : 
     637              :         /* Ignore items already processed as part of an earlier chain */
     638     18248760 :         if (prstate->processed[offnum])
     639            0 :             continue;
     640              : 
     641              :         /* see preceding loop */
     642     18248760 :         *off_loc = offnum;
     643              : 
     644              :         /* Process this item or chain of items */
     645     18248760 :         heap_prune_chain(maxoff, offnum, prstate);
     646              :     }
     647              : 
     648              :     /*
     649              :      * Process any heap-only tuples that were not already processed as part of
     650              :      * a HOT chain.
     651              :      */
     652       676386 :     for (int i = prstate->nheaponly_items - 1; i >= 0; i--)
     653              :     {
     654       339615 :         offnum = prstate->heaponly_items[i];
     655              : 
     656       339615 :         if (prstate->processed[offnum])
     657       322655 :             continue;
     658              : 
     659              :         /* see preceding loop */
     660        16960 :         *off_loc = offnum;
     661              : 
     662              :         /*
     663              :          * If the tuple is DEAD and doesn't chain to anything else, mark it
     664              :          * unused.  (If it does chain, we can only remove it as part of
     665              :          * pruning its chain.)
     666              :          *
     667              :          * We need this primarily to handle aborted HOT updates, that is,
     668              :          * XMIN_INVALID heap-only tuples.  Those might not be linked to by any
     669              :          * chain, since the parent tuple might be re-updated before any
     670              :          * pruning occurs.  So we have to be able to reap them separately from
     671              :          * chain-pruning.  (Note that HeapTupleHeaderIsHotUpdated will never
     672              :          * return true for an XMIN_INVALID tuple, so this code will work even
     673              :          * when there were sequential updates within the aborted transaction.)
     674              :          */
     675        16960 :         if (prstate->htsv[offnum] == HEAPTUPLE_DEAD)
     676              :         {
     677         3244 :             ItemId      itemid = PageGetItemId(page, offnum);
     678         3244 :             HeapTupleHeader htup = (HeapTupleHeader) PageGetItem(page, itemid);
     679              : 
     680         3244 :             if (likely(!HeapTupleHeaderIsHotUpdated(htup)))
     681              :             {
     682         3244 :                 HeapTupleHeaderAdvanceConflictHorizon(htup,
     683              :                                                       &prstate->latest_xid_removed);
     684         3244 :                 heap_prune_record_unused(prstate, offnum, true);
     685              :             }
     686              :             else
     687              :             {
     688              :                 /*
     689              :                  * This tuple should've been processed and removed as part of
     690              :                  * a HOT chain, so something's wrong.  To preserve evidence,
     691              :                  * we don't dare to remove it.  We cannot leave behind a DEAD
     692              :                  * tuple either, because that will cause VACUUM to error out.
     693              :                  * Throwing an error with a distinct error message seems like
     694              :                  * the least bad option.
     695              :                  */
     696            0 :                 elog(ERROR, "dead heap-only tuple (%u, %d) is not linked to from any HOT chain",
     697              :                      blockno, offnum);
     698              :             }
     699              :         }
     700              :         else
     701        13716 :             heap_prune_record_unchanged_lp_normal(prstate, offnum);
     702              :     }
     703              : 
     704              :     /* We should now have processed every tuple exactly once  */
     705              : #ifdef USE_ASSERT_CHECKING
     706              :     for (offnum = FirstOffsetNumber;
     707              :          offnum <= maxoff;
     708              :          offnum = OffsetNumberNext(offnum))
     709              :     {
     710              :         *off_loc = offnum;
     711              : 
     712              :         Assert(prstate->processed[offnum]);
     713              :     }
     714              : #endif
     715              : 
     716              :     /* Clear the offset information once we have processed the given page. */
     717       336771 :     *off_loc = InvalidOffsetNumber;
     718       336771 : }
     719              : 
     720              : /*
     721              :  * Decide whether to proceed with freezing according to the freeze plans
     722              :  * prepared for the current heap buffer. If freezing is chosen, this function
     723              :  * performs several pre-freeze checks.
     724              :  *
     725              :  * The values of do_prune, do_hint_prune, and did_tuple_hint_fpi must be
     726              :  * determined before calling this function.
     727              :  *
     728              :  * prstate is both an input and output parameter.
     729              :  *
     730              :  * Returns true if we should apply the freeze plans and freeze tuples on the
     731              :  * page, and false otherwise.
     732              :  */
     733              : static bool
     734       336771 : heap_page_will_freeze(bool did_tuple_hint_fpi,
     735              :                       bool do_prune,
     736              :                       bool do_hint_prune,
     737              :                       PruneState *prstate)
     738              : {
     739       336771 :     bool        do_freeze = false;
     740              : 
     741              :     /*
     742              :      * If the caller specified we should not attempt to freeze any tuples,
     743              :      * validate that everything is in the right state and return.
     744              :      */
     745       336771 :     if (!prstate->attempt_freeze)
     746              :     {
     747              :         Assert(!prstate->set_all_frozen && prstate->nfrozen == 0);
     748       105503 :         return false;
     749              :     }
     750              : 
     751       231268 :     if (prstate->pagefrz.freeze_required)
     752              :     {
     753              :         /*
     754              :          * heap_prepare_freeze_tuple indicated that at least one XID/MXID from
     755              :          * before FreezeLimit/MultiXactCutoff is present.  Must freeze to
     756              :          * advance relfrozenxid/relminmxid.
     757              :          */
     758        22143 :         do_freeze = true;
     759              :     }
     760              :     else
     761              :     {
     762              :         /*
     763              :          * Opportunistically freeze the page if we are generating an FPI
     764              :          * anyway and if doing so means that we can set the page all-frozen
     765              :          * afterwards (might not happen until VACUUM's final heap pass).
     766              :          *
     767              :          * XXX: Previously, we knew if pruning emitted an FPI by checking
     768              :          * pgWalUsage.wal_fpi before and after pruning.  Once the freeze and
     769              :          * prune records were combined, this heuristic couldn't be used
     770              :          * anymore.  The opportunistic freeze heuristic must be improved;
     771              :          * however, for now, try to approximate the old logic.
     772              :          */
     773       209125 :         if (prstate->set_all_frozen && prstate->nfrozen > 0)
     774              :         {
     775              :             Assert(prstate->set_all_visible);
     776              : 
     777              :             /*
     778              :              * Freezing would make the page all-frozen.  Have already emitted
     779              :              * an FPI or will do so anyway?
     780              :              */
     781        19111 :             if (RelationNeedsWAL(prstate->relation))
     782              :             {
     783        17453 :                 if (did_tuple_hint_fpi)
     784         1195 :                     do_freeze = true;
     785        16258 :                 else if (do_prune)
     786              :                 {
     787         1885 :                     if (XLogCheckBufferNeedsBackup(prstate->buffer))
     788          668 :                         do_freeze = true;
     789              :                 }
     790        14373 :                 else if (do_hint_prune)
     791              :                 {
     792        16146 :                     if (XLogHintBitIsNeeded() &&
     793         8073 :                         XLogCheckBufferNeedsBackup(prstate->buffer))
     794         1349 :                         do_freeze = true;
     795              :                 }
     796              :             }
     797              :         }
     798              :     }
     799              : 
     800       231268 :     if (do_freeze)
     801              :     {
     802              :         /*
     803              :          * Validate the tuples we will be freezing before entering the
     804              :          * critical section.
     805              :          */
     806        25355 :         heap_pre_freeze_checks(prstate->buffer, prstate->frozen, prstate->nfrozen);
     807              :         Assert(TransactionIdPrecedes(prstate->pagefrz.FreezePageConflictXid,
     808              :                                      prstate->cutoffs->OldestXmin));
     809              :     }
     810       205913 :     else if (prstate->nfrozen > 0)
     811              :     {
     812              :         /*
     813              :          * The page contained some tuples that were not already frozen, and we
     814              :          * chose not to freeze them now.  The page won't be all-frozen then.
     815              :          */
     816              :         Assert(!prstate->pagefrz.freeze_required);
     817              : 
     818        16469 :         prstate->set_all_frozen = false;
     819        16469 :         prstate->nfrozen = 0;    /* avoid miscounts in instrumentation */
     820              :     }
     821              :     else
     822              :     {
     823              :         /*
     824              :          * We have no freeze plans to execute.  The page might already be
     825              :          * all-frozen (perhaps only following pruning), though.  Such pages
     826              :          * can be marked all-frozen in the VM by our caller, even though none
     827              :          * of its tuples were newly frozen here.
     828              :          */
     829              :     }
     830              : 
     831       231268 :     return do_freeze;
     832              : }
     833              : 
     834              : /*
     835              :  * Emit a warning about and fix visibility map corruption on the given page.
     836              :  *
     837              :  * The caller specifies the type of corruption it has already detected via
     838              :  * corruption_type, so that we can emit the appropriate warning. All cases
     839              :  * result in the VM bits being cleared; corruption types where PD_ALL_VISIBLE
     840              :  * is incorrectly set also clear PD_ALL_VISIBLE.
     841              :  *
     842              :  * Must be called while holding an exclusive lock on the heap buffer. Dead
     843              :  * items and not all-visible tuples must have been discovered under that same
     844              :  * lock. Although we do not hold a lock on the VM buffer, it is pinned, and
     845              :  * the heap buffer is exclusively locked, ensuring that no other backend can
     846              :  * update the VM bits corresponding to this heap page.
     847              :  *
     848              :  * This function makes changes to the VM and, potentially, the heap page, but
     849              :  * it does not need to be done in a critical section.
     850              :  */
     851              : static void
     852            0 : heap_page_fix_vm_corruption(PruneState *prstate, OffsetNumber offnum,
     853              :                             VMCorruptionType corruption_type)
     854              : {
     855            0 :     const char *relname = RelationGetRelationName(prstate->relation);
     856            0 :     bool        do_clear_vm = false;
     857            0 :     bool        do_clear_heap = false;
     858              : 
     859              :     Assert(BufferIsLockedByMeInMode(prstate->buffer, BUFFER_LOCK_EXCLUSIVE));
     860              : 
     861            0 :     switch (corruption_type)
     862              :     {
     863            0 :         case VM_CORRUPT_LPDEAD:
     864            0 :             ereport(WARNING,
     865              :                     (errcode(ERRCODE_DATA_CORRUPTED),
     866              :                      errmsg("dead line pointer found on page marked all-visible"),
     867              :                      errcontext("relation \"%s\", page %u, tuple %u",
     868              :                                 relname, prstate->block, offnum)));
     869            0 :             do_clear_vm = true;
     870            0 :             do_clear_heap = true;
     871            0 :             break;
     872              : 
     873            0 :         case VM_CORRUPT_TUPLE_VISIBILITY:
     874              : 
     875              :             /*
     876              :              * A HEAPTUPLE_LIVE tuple on an all-visible page can appear to not
     877              :              * be visible to everyone when
     878              :              * GetOldestNonRemovableTransactionId() returns a conservative
     879              :              * value that's older than the real safe xmin. That is not
     880              :              * corruption -- the PD_ALL_VISIBLE flag is still correct.
     881              :              *
     882              :              * However, dead tuple versions, in-progress inserts, and
     883              :              * in-progress deletes should never appear on a page marked
     884              :              * all-visible. That indicates real corruption. PD_ALL_VISIBLE
     885              :              * should have been cleared by the DML operation that deleted or
     886              :              * inserted the tuple.
     887              :              */
     888            0 :             ereport(WARNING,
     889              :                     (errcode(ERRCODE_DATA_CORRUPTED),
     890              :                      errmsg("tuple not visible to all transactions found on page marked all-visible"),
     891              :                      errcontext("relation \"%s\", page %u, tuple %u",
     892              :                                 relname, prstate->block, offnum)));
     893            0 :             do_clear_vm = true;
     894            0 :             do_clear_heap = true;
     895            0 :             break;
     896              : 
     897            0 :         case VM_CORRUPT_MISSING_PAGE_HINT:
     898              : 
     899              :             /*
     900              :              * As of PostgreSQL 9.2, the visibility map bit should never be
     901              :              * set if the page-level bit is clear. However, for vacuum, it's
     902              :              * possible that the bit got cleared after
     903              :              * heap_vac_scan_next_block() was called, so we must recheck now
     904              :              * that we have the buffer lock before concluding that the VM is
     905              :              * corrupt.
     906              :              */
     907              :             Assert(!PageIsAllVisible(prstate->page));
     908              :             Assert(prstate->old_vmbits & VISIBILITYMAP_VALID_BITS);
     909            0 :             ereport(WARNING,
     910              :                     (errcode(ERRCODE_DATA_CORRUPTED),
     911              :                      errmsg("page is not marked all-visible but visibility map bit is set"),
     912              :                      errcontext("relation \"%s\", page %u",
     913              :                                 relname, prstate->block)));
     914            0 :             do_clear_vm = true;
     915            0 :             break;
     916              :     }
     917              : 
     918              :     Assert(do_clear_heap || do_clear_vm);
     919              : 
     920              :     /* Avoid marking the buffer dirty if PD_ALL_VISIBLE is already clear */
     921            0 :     if (do_clear_heap)
     922              :     {
     923              :         Assert(PageIsAllVisible(prstate->page));
     924            0 :         PageClearAllVisible(prstate->page);
     925            0 :         MarkBufferDirtyHint(prstate->buffer, true);
     926              :     }
     927              : 
     928            0 :     if (do_clear_vm)
     929              :     {
     930            0 :         visibilitymap_clear(prstate->relation, prstate->block,
     931              :                             prstate->vmbuffer,
     932              :                             VISIBILITYMAP_VALID_BITS);
     933            0 :         prstate->old_vmbits = 0;
     934              :     }
     935            0 : }
     936              : 
     937              : /*
     938              :  * Decide whether to set the visibility map bits (all-visible and all-frozen)
     939              :  * for the current page using information from the PruneState and VM.
     940              :  *
     941              :  * This function does not actually set the VM bits or page-level visibility
     942              :  * hint, PD_ALL_VISIBLE.
     943              :  *
     944              :  * This should be called only after do_freeze has been decided (and do_prune
     945              :  * has been set), as these factor into our heuristic-based decision.
     946              :  *
     947              :  * Returns true if one or both VM bits should be set and false otherwise.
     948              :  */
     949              : static bool
     950       336771 : heap_page_will_set_vm(PruneState *prstate, PruneReason reason,
     951              :                       bool do_prune, bool do_freeze)
     952              : {
     953       336771 :     if (!prstate->attempt_set_vm)
     954        68095 :         return false;
     955              : 
     956       268676 :     if (!prstate->set_all_visible)
     957       196115 :         return false;
     958              : 
     959              :     /*
     960              :      * If this is an on-access call and we're not actually pruning, avoid
     961              :      * setting the visibility map if it would newly dirty the heap page or, if
     962              :      * the page is already dirty, if doing so would require including a
     963              :      * full-page image (FPI) of the heap page in the WAL.
     964              :      */
     965        72561 :     if (reason == PRUNE_ON_ACCESS && !do_prune && !do_freeze &&
     966        28844 :         (!BufferIsDirty(prstate->buffer) || XLogCheckBufferNeedsBackup(prstate->buffer)))
     967              :     {
     968        15578 :         prstate->set_all_visible = prstate->set_all_frozen = false;
     969        15578 :         return false;
     970              :     }
     971              : 
     972        56983 :     prstate->new_vmbits = VISIBILITYMAP_ALL_VISIBLE;
     973              : 
     974        56983 :     if (prstate->set_all_frozen)
     975        31646 :         prstate->new_vmbits |= VISIBILITYMAP_ALL_FROZEN;
     976              : 
     977        56983 :     if (prstate->new_vmbits == prstate->old_vmbits)
     978              :     {
     979         1716 :         prstate->new_vmbits = 0;
     980         1716 :         return false;
     981              :     }
     982              : 
     983        55267 :     return true;
     984              : }
     985              : 
     986              : /*
     987              :  * If the page is already all-frozen, or already all-visible and freezing
     988              :  * won't be attempted, there is no remaining work and we can use the fast path
     989              :  * to avoid the expensive overhead of heap_page_prune_and_freeze().
     990              :  *
     991              :  * This can happen when the page has a stale prune hint, or if VACUUM is
     992              :  * scanning an already all-frozen page due to SKIP_PAGES_THRESHOLD.
     993              :  *
     994              :  * The caller must already have examined the visibility map and saved the
     995              :  * status of the page's VM bits in prstate->old_vmbits. Caller must hold a
     996              :  * content lock on the heap page since it will examine line pointers.
     997              :  *
     998              :  * Before calling prune_freeze_fast_path(), the caller should first
     999              :  * check for and fix any discrepancy between the page-level visibility hint
    1000              :  * and the visibility map. Otherwise, the fast path will always prevent us
    1001              :  * from getting them in sync. Note that if there are tuples on the page that
    1002              :  * are not visible to all but the VM is incorrectly marked
    1003              :  * all-visible/all-frozen, we will not get the chance to fix that corruption
    1004              :  * when using the fast path.
    1005              :  */
    1006              : static void
    1007       171020 : prune_freeze_fast_path(PruneState *prstate, PruneFreezeResult *presult)
    1008              : {
    1009       171020 :     OffsetNumber maxoff = PageGetMaxOffsetNumber(prstate->page);
    1010       171020 :     Page        page = prstate->page;
    1011              : 
    1012              :     Assert((prstate->old_vmbits & VISIBILITYMAP_ALL_FROZEN) ||
    1013              :            ((prstate->old_vmbits & VISIBILITYMAP_ALL_VISIBLE) &&
    1014              :             !prstate->attempt_freeze));
    1015              : 
    1016              :     /* We'll fill in presult for the caller */
    1017       171020 :     memset(presult, 0, sizeof(PruneFreezeResult));
    1018              : 
    1019              :     /* Clear any stale prune hint */
    1020       171020 :     if (TransactionIdIsValid(PageGetPruneXid(page)))
    1021              :     {
    1022           98 :         PageClearPrunable(page);
    1023           98 :         MarkBufferDirtyHint(prstate->buffer, true);
    1024              :     }
    1025              : 
    1026       171020 :     if (PageIsEmpty(page))
    1027            0 :         return;
    1028              : 
    1029              :     /*
    1030              :      * Since the page is all-visible, a count of the normal ItemIds on the
    1031              :      * page should be sufficient for vacuum's live tuple count.
    1032              :      */
    1033       171020 :     for (OffsetNumber off = FirstOffsetNumber;
    1034      9929724 :          off <= maxoff;
    1035      9758704 :          off = OffsetNumberNext(off))
    1036              :     {
    1037      9758704 :         ItemId      lp = PageGetItemId(page, off);
    1038              : 
    1039      9758704 :         if (!ItemIdIsUsed(lp))
    1040       158431 :             continue;
    1041              : 
    1042      9600273 :         presult->hastup = true;
    1043              : 
    1044      9600273 :         if (ItemIdIsNormal(lp))
    1045      9430598 :             prstate->live_tuples++;
    1046              :     }
    1047              : 
    1048       171020 :     presult->live_tuples = prstate->live_tuples;
    1049              : }
    1050              : 
    1051              : /*
    1052              :  * Prune and repair fragmentation and potentially freeze tuples on the
    1053              :  * specified page. If the page's visibility status has changed, update it in
    1054              :  * the VM.
    1055              :  *
    1056              :  * Caller must have pin and buffer cleanup lock on the page.  Note that we
    1057              :  * don't update the FSM information for page on caller's behalf.  Caller might
    1058              :  * also need to account for a reduction in the length of the line pointer
    1059              :  * array following array truncation by us.
    1060              :  *
    1061              :  * params contains the input parameters used to control freezing and pruning
    1062              :  * behavior. See the definition of PruneFreezeParams for more on what each
    1063              :  * parameter does.
    1064              :  *
    1065              :  * If the HEAP_PAGE_PRUNE_FREEZE option is set in params, we will freeze
    1066              :  * tuples if it's required in order to advance relfrozenxid / relminmxid, or
    1067              :  * if it's considered advantageous for overall system performance to do so
    1068              :  * now.  The 'params.cutoffs', 'presult', 'new_relfrozen_xid' and
    1069              :  * 'new_relmin_mxid' arguments are required when freezing.
    1070              :  *
    1071              :  * A vmbuffer corresponding to the heap page is also passed and if the page is
    1072              :  * found to be all-visible/all-frozen, we will set it in the VM.
    1073              :  *
    1074              :  * presult contains output parameters needed by callers, such as the number of
    1075              :  * tuples removed and the offsets of dead items on the page after pruning.
    1076              :  * heap_page_prune_and_freeze() is responsible for initializing it.  Required
    1077              :  * by all callers.
    1078              :  *
    1079              :  * off_loc is the offset location required by the caller to use in error
    1080              :  * callback.
    1081              :  *
    1082              :  * new_relfrozen_xid and new_relmin_mxid must be provided by the caller if the
    1083              :  * HEAP_PAGE_PRUNE_FREEZE option is set in params.  On entry, they contain the
    1084              :  * oldest XID and multi-XID seen on the relation so far.  They will be updated
    1085              :  * with the oldest values present on the page after pruning.  After processing
    1086              :  * the whole relation, VACUUM can use these values as the new
    1087              :  * relfrozenxid/relminmxid for the relation.
    1088              :  */
    1089              : void
    1090       507791 : heap_page_prune_and_freeze(PruneFreezeParams *params,
    1091              :                            PruneFreezeResult *presult,
    1092              :                            OffsetNumber *off_loc,
    1093              :                            TransactionId *new_relfrozen_xid,
    1094              :                            MultiXactId *new_relmin_mxid)
    1095              : {
    1096              :     PruneState  prstate;
    1097              :     bool        do_freeze;
    1098              :     bool        do_prune;
    1099              :     bool        do_hint_prune;
    1100              :     bool        do_set_vm;
    1101              :     bool        did_tuple_hint_fpi;
    1102       507791 :     int64       fpi_before = pgWalUsage.wal_fpi;
    1103              :     TransactionId conflict_xid;
    1104              : 
    1105              :     /* Initialize prstate */
    1106       507791 :     prune_freeze_setup(params,
    1107              :                        new_relfrozen_xid, new_relmin_mxid,
    1108              :                        presult, &prstate);
    1109              : 
    1110              :     /*
    1111              :      * If the VM is set but PD_ALL_VISIBLE is clear, fix that corruption
    1112              :      * before pruning and freezing so that the page and VM start out in a
    1113              :      * consistent state.
    1114              :      */
    1115       507791 :     if ((prstate.old_vmbits & VISIBILITYMAP_VALID_BITS) &&
    1116       175923 :         !PageIsAllVisible(prstate.page))
    1117            0 :         heap_page_fix_vm_corruption(&prstate, InvalidOffsetNumber,
    1118              :                                     VM_CORRUPT_MISSING_PAGE_HINT);
    1119              : 
    1120              :     /*
    1121              :      * If the page is already all-frozen, or already all-visible when freezing
    1122              :      * is not being attempted, take the fast path, skipping pruning and
    1123              :      * freezing code entirely. This must be done after fixing any discrepancy
    1124              :      * between the page-level visibility hint and the VM, since that may have
    1125              :      * cleared old_vmbits.
    1126              :      */
    1127       507791 :     if ((params->options & HEAP_PAGE_PRUNE_ALLOW_FAST_PATH) != 0 &&
    1128       506498 :         ((prstate.old_vmbits & VISIBILITYMAP_ALL_FROZEN) ||
    1129       335478 :          ((prstate.old_vmbits & VISIBILITYMAP_ALL_VISIBLE) &&
    1130         4444 :           !prstate.attempt_freeze)))
    1131              :     {
    1132       171020 :         prune_freeze_fast_path(&prstate, presult);
    1133       171020 :         return;
    1134              :     }
    1135              : 
    1136              :     /*
    1137              :      * Examine all line pointers and tuple visibility information to determine
    1138              :      * which line pointers should change state and which tuples may be frozen.
    1139              :      * Prepare queue of state changes to later be executed in a critical
    1140              :      * section.
    1141              :      */
    1142       336771 :     prune_freeze_plan(&prstate, off_loc);
    1143              : 
    1144              :     /*
    1145              :      * After processing all the live tuples on the page, if the newest xmin
    1146              :      * amongst them may be considered running by any snapshot, the page cannot
    1147              :      * be all-visible. This should be done before determining whether or not
    1148              :      * to opportunistically freeze.
    1149              :      */
    1150       336771 :     if (prstate.set_all_visible &&
    1151       177230 :         TransactionIdIsNormal(prstate.newest_live_xid) &&
    1152        75619 :         GlobalVisTestXidConsideredRunning(prstate.vistest,
    1153              :                                           prstate.newest_live_xid,
    1154              :                                           true))
    1155         3349 :         prstate.set_all_visible = prstate.set_all_frozen = false;
    1156              : 
    1157              :     /*
    1158              :      * If checksums are enabled, calling heap_prune_satisfies_vacuum() while
    1159              :      * checking tuple visibility information in prune_freeze_plan() may have
    1160              :      * caused an FPI to be emitted.
    1161              :      */
    1162       336771 :     did_tuple_hint_fpi = fpi_before != pgWalUsage.wal_fpi;
    1163              : 
    1164       992859 :     do_prune = prstate.nredirected > 0 ||
    1165       614020 :         prstate.ndead > 0 ||
    1166       277249 :         prstate.nunused > 0;
    1167              : 
    1168              :     /*
    1169              :      * Even if we don't prune anything, if we found a new value for the
    1170              :      * pd_prune_xid field or the page was marked full, we will update the hint
    1171              :      * bit.
    1172              :      */
    1173       529225 :     do_hint_prune = PageGetPruneXid(prstate.page) != prstate.new_prune_xid ||
    1174       192454 :         PageIsFull(prstate.page);
    1175              : 
    1176              :     /*
    1177              :      * Decide if we want to go ahead with freezing according to the freeze
    1178              :      * plans we prepared, or not.
    1179              :      */
    1180       336771 :     do_freeze = heap_page_will_freeze(did_tuple_hint_fpi,
    1181              :                                       do_prune,
    1182              :                                       do_hint_prune,
    1183              :                                       &prstate);
    1184              : 
    1185              :     /*
    1186              :      * While scanning the line pointers, we did not clear
    1187              :      * set_all_visible/set_all_frozen when encountering LP_DEAD items because
    1188              :      * we wanted the decision whether or not to freeze the page to be
    1189              :      * unaffected by the short-term presence of LP_DEAD items.  These LP_DEAD
    1190              :      * items are effectively assumed to be LP_UNUSED items in the making.  It
    1191              :      * doesn't matter which vacuum heap pass (initial pass or final pass) ends
    1192              :      * up setting the page all-frozen, as long as the ongoing VACUUM does it.
    1193              :      *
    1194              :      * Now that we finished determining whether or not to freeze the page,
    1195              :      * update set_all_visible and set_all_frozen so that they reflect the true
    1196              :      * state of the page for setting PD_ALL_VISIBLE and VM bits.
    1197              :      */
    1198       336771 :     if (prstate.lpdead_items > 0)
    1199        61661 :         prstate.set_all_visible = prstate.set_all_frozen = false;
    1200              : 
    1201              :     Assert(!prstate.set_all_frozen || prstate.set_all_visible);
    1202              :     Assert(!prstate.set_all_visible || prstate.attempt_set_vm);
    1203              :     Assert(!prstate.set_all_visible || (prstate.lpdead_items == 0));
    1204              : 
    1205       336771 :     do_set_vm = heap_page_will_set_vm(&prstate, params->reason, do_prune, do_freeze);
    1206              : 
    1207              :     /*
    1208              :      * new_vmbits should be 0 regardless of whether or not the page is
    1209              :      * all-visible if we do not intend to set the VM.
    1210              :      */
    1211              :     Assert(do_set_vm || prstate.new_vmbits == 0);
    1212              : 
    1213              :     /*
    1214              :      * The snapshot conflict horizon for the whole record is the most
    1215              :      * conservative (newest) horizon required by any change in the record.
    1216              :      */
    1217       336771 :     conflict_xid = InvalidTransactionId;
    1218       336771 :     if (do_set_vm)
    1219        55267 :         conflict_xid = prstate.newest_live_xid;
    1220       336771 :     if (do_freeze && TransactionIdFollows(prstate.pagefrz.FreezePageConflictXid, conflict_xid))
    1221         4269 :         conflict_xid = prstate.pagefrz.FreezePageConflictXid;
    1222       336771 :     if (do_prune && TransactionIdFollows(prstate.latest_xid_removed, conflict_xid))
    1223        52704 :         conflict_xid = prstate.latest_xid_removed;
    1224              : 
    1225              :     /* Lock vmbuffer before entering a critical section */
    1226       336771 :     if (do_set_vm)
    1227        55267 :         LockBuffer(prstate.vmbuffer, BUFFER_LOCK_EXCLUSIVE);
    1228              : 
    1229              :     /* Any error while applying the changes is critical */
    1230       336771 :     START_CRIT_SECTION();
    1231              : 
    1232       336771 :     if (do_hint_prune)
    1233              :     {
    1234              :         /*
    1235              :          * Update the page's pd_prune_xid field to either zero, or the lowest
    1236              :          * XID of any soon-prunable tuple.
    1237              :          */
    1238       144383 :         ((PageHeader) prstate.page)->pd_prune_xid = prstate.new_prune_xid;
    1239              : 
    1240              :         /*
    1241              :          * Also clear the "page is full" flag, since there's no point in
    1242              :          * repeating the prune/defrag process until something else happens to
    1243              :          * the page.
    1244              :          */
    1245       144383 :         PageClearFull(prstate.page);
    1246              : 
    1247              :         /*
    1248              :          * If that's all we had to do to the page, this is a non-WAL-logged
    1249              :          * hint. If we are going to freeze or prune the page or set
    1250              :          * PD_ALL_VISIBLE, we will mark the buffer dirty below.
    1251              :          *
    1252              :          * Setting PD_ALL_VISIBLE is fully WAL-logged because it is forbidden
    1253              :          * for the VM to be set and PD_ALL_VISIBLE to be clear.
    1254              :          */
    1255       144383 :         if (!do_freeze && !do_prune && !do_set_vm)
    1256        49972 :             MarkBufferDirtyHint(prstate.buffer, true);
    1257              :     }
    1258              : 
    1259       336771 :     if (do_prune || do_freeze || do_set_vm)
    1260              :     {
    1261              :         /* Apply the planned item changes and repair page fragmentation. */
    1262       117096 :         if (do_prune)
    1263              :         {
    1264        59959 :             heap_page_prune_execute(prstate.buffer, false,
    1265              :                                     prstate.redirected, prstate.nredirected,
    1266              :                                     prstate.nowdead, prstate.ndead,
    1267              :                                     prstate.nowunused, prstate.nunused);
    1268              :         }
    1269              : 
    1270       117096 :         if (do_freeze)
    1271        25355 :             heap_freeze_prepared_tuples(prstate.buffer, prstate.frozen, prstate.nfrozen);
    1272              : 
    1273              :         /* Set the visibility map and page visibility hint */
    1274       117096 :         if (do_set_vm)
    1275              :         {
    1276              :             /*
    1277              :              * While it is valid for PD_ALL_VISIBLE to be set when the
    1278              :              * corresponding VM bit is clear, we strongly prefer to keep them
    1279              :              * in sync.
    1280              :              *
    1281              :              * The heap buffer must be marked dirty before adding it to the
    1282              :              * WAL chain when setting the VM. We don't worry about
    1283              :              * unnecessarily dirtying the heap buffer if PD_ALL_VISIBLE is
    1284              :              * already set, though. It is extremely rare to have a clean heap
    1285              :              * buffer with PD_ALL_VISIBLE already set and the VM bits clear,
    1286              :              * so there is no point in optimizing it.
    1287              :              */
    1288        55267 :             PageSetAllVisible(prstate.page);
    1289        55267 :             PageClearPrunable(prstate.page);
    1290        55267 :             visibilitymap_set(prstate.block, prstate.vmbuffer, prstate.new_vmbits,
    1291        55267 :                               prstate.relation->rd_locator);
    1292              :         }
    1293              : 
    1294       117096 :         MarkBufferDirty(prstate.buffer);
    1295              : 
    1296              :         /*
    1297              :          * Emit a WAL XLOG_HEAP2_PRUNE* record showing what we did
    1298              :          */
    1299       117096 :         if (RelationNeedsWAL(prstate.relation))
    1300              :         {
    1301       168073 :             log_heap_prune_and_freeze(prstate.relation, prstate.buffer,
    1302              :                                       do_set_vm ? prstate.vmbuffer : InvalidBuffer,
    1303        53676 :                                       do_set_vm ? prstate.new_vmbits : 0,
    1304              :                                       conflict_xid,
    1305              :                                       true, /* cleanup lock */
    1306              :                                       params->reason,
    1307              :                                       prstate.frozen, prstate.nfrozen,
    1308              :                                       prstate.redirected, prstate.nredirected,
    1309              :                                       prstate.nowdead, prstate.ndead,
    1310              :                                       prstate.nowunused, prstate.nunused);
    1311              :         }
    1312              :     }
    1313              : 
    1314       336771 :     END_CRIT_SECTION();
    1315              : 
    1316       336771 :     if (do_set_vm)
    1317        55267 :         LockBuffer(prstate.vmbuffer, BUFFER_LOCK_UNLOCK);
    1318              : 
    1319              :     /*
    1320              :      * During its second pass over the heap, VACUUM calls
    1321              :      * heap_page_would_be_all_visible() to determine whether a page is
    1322              :      * all-visible and all-frozen. The logic here is similar. After completing
    1323              :      * pruning and freezing, use an assertion to verify that our results
    1324              :      * remain consistent with heap_page_would_be_all_visible(). It's also a
    1325              :      * valuable cross-check of the page state after pruning and freezing.
    1326              :      */
    1327              : #ifdef USE_ASSERT_CHECKING
    1328              :     if (prstate.set_all_visible)
    1329              :     {
    1330              :         TransactionId debug_cutoff;
    1331              :         bool        debug_all_frozen;
    1332              : 
    1333              :         Assert(prstate.lpdead_items == 0);
    1334              : 
    1335              :         Assert(heap_page_is_all_visible(prstate.relation, prstate.buffer,
    1336              :                                         prstate.vistest,
    1337              :                                         &debug_all_frozen,
    1338              :                                         &debug_cutoff, off_loc));
    1339              : 
    1340              :         Assert(!TransactionIdIsValid(debug_cutoff) ||
    1341              :                debug_cutoff == prstate.newest_live_xid);
    1342              : 
    1343              :         /*
    1344              :          * It's possible the page is composed entirely of frozen tuples but is
    1345              :          * not set all-frozen in the VM and did not pass
    1346              :          * HEAP_PAGE_PRUNE_FREEZE. In this case, it's possible
    1347              :          * heap_page_is_all_visible() finds the page completely frozen, even
    1348              :          * though prstate.set_all_frozen is false.
    1349              :          */
    1350              :         Assert(!prstate.set_all_frozen || debug_all_frozen);
    1351              :     }
    1352              : #endif
    1353              : 
    1354              :     /* Copy information back for caller */
    1355       336771 :     presult->ndeleted = prstate.ndeleted;
    1356       336771 :     presult->nnewlpdead = prstate.ndead;
    1357       336771 :     presult->nfrozen = prstate.nfrozen;
    1358       336771 :     presult->live_tuples = prstate.live_tuples;
    1359       336771 :     presult->recently_dead_tuples = prstate.recently_dead_tuples;
    1360       336771 :     presult->hastup = prstate.hastup;
    1361              : 
    1362       336771 :     presult->lpdead_items = prstate.lpdead_items;
    1363              :     /* the presult->deadoffsets array was already filled in */
    1364              : 
    1365       336771 :     presult->newly_all_visible = false;
    1366       336771 :     presult->newly_all_frozen = false;
    1367       336771 :     presult->newly_all_visible_frozen = false;
    1368       336771 :     if (do_set_vm)
    1369              :     {
    1370        55267 :         if ((prstate.old_vmbits & VISIBILITYMAP_ALL_VISIBLE) == 0)
    1371              :         {
    1372        52080 :             presult->newly_all_visible = true;
    1373        52080 :             if (prstate.set_all_frozen)
    1374        28001 :                 presult->newly_all_visible_frozen = true;
    1375              :         }
    1376         3187 :         else if ((prstate.old_vmbits & VISIBILITYMAP_ALL_FROZEN) == 0 &&
    1377         3187 :                  prstate.set_all_frozen)
    1378         3187 :             presult->newly_all_frozen = true;
    1379              :     }
    1380              : 
    1381       336771 :     if (prstate.attempt_freeze)
    1382              :     {
    1383       231268 :         if (presult->nfrozen > 0)
    1384              :         {
    1385        25355 :             *new_relfrozen_xid = prstate.pagefrz.FreezePageRelfrozenXid;
    1386        25355 :             *new_relmin_mxid = prstate.pagefrz.FreezePageRelminMxid;
    1387              :         }
    1388              :         else
    1389              :         {
    1390       205913 :             *new_relfrozen_xid = prstate.pagefrz.NoFreezePageRelfrozenXid;
    1391       205913 :             *new_relmin_mxid = prstate.pagefrz.NoFreezePageRelminMxid;
    1392              :         }
    1393              :     }
    1394              : }
    1395              : 
    1396              : 
    1397              : /*
    1398              :  * Perform visibility checks for heap pruning.
    1399              :  */
    1400              : static HTSV_Result
    1401     18390346 : heap_prune_satisfies_vacuum(PruneState *prstate, HeapTuple tup)
    1402              : {
    1403              :     HTSV_Result res;
    1404              :     TransactionId dead_after;
    1405              : 
    1406     18390346 :     res = HeapTupleSatisfiesVacuumHorizon(tup, prstate->buffer, &dead_after);
    1407              : 
    1408     18390346 :     if (res != HEAPTUPLE_RECENTLY_DEAD)
    1409     16270951 :         return res;
    1410              : 
    1411              :     /*
    1412              :      * For VACUUM, we must be sure to prune tuples with xmax older than
    1413              :      * OldestXmin -- a visibility cutoff determined at the beginning of
    1414              :      * vacuuming the relation. OldestXmin is used for freezing determination
    1415              :      * and we cannot freeze dead tuples' xmaxes.
    1416              :      */
    1417      2119395 :     if (prstate->cutoffs &&
    1418      1129891 :         TransactionIdIsValid(prstate->cutoffs->OldestXmin) &&
    1419      1129891 :         NormalTransactionIdPrecedes(dead_after, prstate->cutoffs->OldestXmin))
    1420       804542 :         return HEAPTUPLE_DEAD;
    1421              : 
    1422              :     /*
    1423              :      * Determine whether or not the tuple is considered dead when compared
    1424              :      * with the provided GlobalVisState. On-access pruning does not provide
    1425              :      * VacuumCutoffs. And for vacuum, even if the tuple's xmax is not older
    1426              :      * than OldestXmin, GlobalVisTestIsRemovableXid() could find the row dead
    1427              :      * if the GlobalVisState has been updated since the beginning of vacuuming
    1428              :      * the relation.
    1429              :      */
    1430      1314853 :     if (GlobalVisTestIsRemovableXid(prstate->vistest, dead_after, true))
    1431       937203 :         return HEAPTUPLE_DEAD;
    1432              : 
    1433       377650 :     return res;
    1434              : }
    1435              : 
    1436              : 
    1437              : /*
    1438              :  * Pruning calculates tuple visibility once and saves the results in an array
    1439              :  * of int8.  See PruneState.htsv for details.  This helper function is meant
    1440              :  * to guard against examining visibility status array members which have not
    1441              :  * yet been computed.
    1442              :  */
    1443              : static inline HTSV_Result
    1444     18373386 : htsv_get_valid_status(int status)
    1445              : {
    1446              :     Assert(status >= HEAPTUPLE_DEAD &&
    1447              :            status <= HEAPTUPLE_DELETE_IN_PROGRESS);
    1448     18373386 :     return (HTSV_Result) status;
    1449              : }
    1450              : 
    1451              : /*
    1452              :  * Prune specified line pointer or a HOT chain originating at line pointer.
    1453              :  *
    1454              :  * Tuple visibility information is provided in prstate->htsv.
    1455              :  *
    1456              :  * If the item is an index-referenced tuple (i.e. not a heap-only tuple),
    1457              :  * the HOT chain is pruned by removing all DEAD tuples at the start of the HOT
    1458              :  * chain.  We also prune any RECENTLY_DEAD tuples preceding a DEAD tuple.
    1459              :  * This is OK because a RECENTLY_DEAD tuple preceding a DEAD tuple is really
    1460              :  * DEAD, our visibility test is just too coarse to detect it.
    1461              :  *
    1462              :  * Pruning must never leave behind a DEAD tuple that still has tuple storage.
    1463              :  * VACUUM isn't prepared to deal with that case.
    1464              :  *
    1465              :  * The root line pointer is redirected to the tuple immediately after the
    1466              :  * latest DEAD tuple.  If all tuples in the chain are DEAD, the root line
    1467              :  * pointer is marked LP_DEAD.  (This includes the case of a DEAD simple
    1468              :  * tuple, which we treat as a chain of length 1.)
    1469              :  *
    1470              :  * We don't actually change the page here. We just add entries to the arrays in
    1471              :  * prstate showing the changes to be made.  Items to be redirected are added
    1472              :  * to the redirected[] array (two entries per redirection); items to be set to
    1473              :  * LP_DEAD state are added to nowdead[]; and items to be set to LP_UNUSED
    1474              :  * state are added to nowunused[].  We perform bookkeeping of live tuples,
    1475              :  * visibility etc. based on what the page will look like after the changes
    1476              :  * applied.  All that bookkeeping is performed in the heap_prune_record_*()
    1477              :  * subroutines.  The division of labor is that heap_prune_chain() decides the
    1478              :  * fate of each tuple, ie. whether it's going to be removed, redirected or
    1479              :  * left unchanged, and the heap_prune_record_*() subroutines update PruneState
    1480              :  * based on that outcome.
    1481              :  */
    1482              : static void
    1483     18248760 : heap_prune_chain(OffsetNumber maxoff, OffsetNumber rootoffnum,
    1484              :                  PruneState *prstate)
    1485              : {
    1486     18248760 :     TransactionId priorXmax = InvalidTransactionId;
    1487              :     ItemId      rootlp;
    1488              :     OffsetNumber offnum;
    1489              :     OffsetNumber chainitems[MaxHeapTuplesPerPage];
    1490     18248760 :     Page        page = prstate->page;
    1491              : 
    1492              :     /*
    1493              :      * After traversing the HOT chain, ndeadchain is the index in chainitems
    1494              :      * of the first live successor after the last dead item.
    1495              :      */
    1496     18248760 :     int         ndeadchain = 0,
    1497     18248760 :                 nchain = 0;
    1498              : 
    1499     18248760 :     rootlp = PageGetItemId(page, rootoffnum);
    1500              : 
    1501              :     /* Start from the root tuple */
    1502     18248760 :     offnum = rootoffnum;
    1503              : 
    1504              :     /* while not end of the chain */
    1505              :     for (;;)
    1506       322655 :     {
    1507              :         HeapTupleHeader htup;
    1508              :         ItemId      lp;
    1509              : 
    1510              :         /* Sanity check (pure paranoia) */
    1511     18571415 :         if (offnum < FirstOffsetNumber)
    1512            0 :             break;
    1513              : 
    1514              :         /*
    1515              :          * An offset past the end of page's line pointer array is possible
    1516              :          * when the array was truncated (original item must have been unused)
    1517              :          */
    1518     18571415 :         if (offnum > maxoff)
    1519            0 :             break;
    1520              : 
    1521              :         /* If item is already processed, stop --- it must not be same chain */
    1522     18571415 :         if (prstate->processed[offnum])
    1523            0 :             break;
    1524              : 
    1525     18571415 :         lp = PageGetItemId(page, offnum);
    1526              : 
    1527              :         /*
    1528              :          * Unused item obviously isn't part of the chain. Likewise, a dead
    1529              :          * line pointer can't be part of the chain.  Both of those cases were
    1530              :          * already marked as processed.
    1531              :          */
    1532              :         Assert(ItemIdIsUsed(lp));
    1533              :         Assert(!ItemIdIsDead(lp));
    1534              : 
    1535              :         /*
    1536              :          * If we are looking at the redirected root line pointer, jump to the
    1537              :          * first normal tuple in the chain.  If we find a redirect somewhere
    1538              :          * else, stop --- it must not be same chain.
    1539              :          */
    1540     18571415 :         if (ItemIdIsRedirected(lp))
    1541              :         {
    1542       198029 :             if (nchain > 0)
    1543            0 :                 break;          /* not at start of chain */
    1544       198029 :             chainitems[nchain++] = offnum;
    1545       198029 :             offnum = ItemIdGetRedirect(rootlp);
    1546       198029 :             continue;
    1547              :         }
    1548              : 
    1549              :         Assert(ItemIdIsNormal(lp));
    1550              : 
    1551     18373386 :         htup = (HeapTupleHeader) PageGetItem(page, lp);
    1552              : 
    1553              :         /*
    1554              :          * Check the tuple XMIN against prior XMAX, if any
    1555              :          */
    1556     18498012 :         if (TransactionIdIsValid(priorXmax) &&
    1557       124626 :             !TransactionIdEquals(HeapTupleHeaderGetXmin(htup), priorXmax))
    1558            0 :             break;
    1559              : 
    1560              :         /*
    1561              :          * OK, this tuple is indeed a member of the chain.
    1562              :          */
    1563     18373386 :         chainitems[nchain++] = offnum;
    1564              : 
    1565     18373386 :         switch (htsv_get_valid_status(prstate->htsv[offnum]))
    1566              :         {
    1567      1813001 :             case HEAPTUPLE_DEAD:
    1568              : 
    1569              :                 /* Remember the last DEAD tuple seen */
    1570      1813001 :                 ndeadchain = nchain;
    1571      1813001 :                 HeapTupleHeaderAdvanceConflictHorizon(htup,
    1572              :                                                       &prstate->latest_xid_removed);
    1573              :                 /* Advance to next chain member */
    1574      1813001 :                 break;
    1575              : 
    1576       377650 :             case HEAPTUPLE_RECENTLY_DEAD:
    1577              : 
    1578              :                 /*
    1579              :                  * We don't need to advance the conflict horizon for
    1580              :                  * RECENTLY_DEAD tuples, even if we are removing them.  This
    1581              :                  * is because we only remove RECENTLY_DEAD tuples if they
    1582              :                  * precede a DEAD tuple, and the DEAD tuple must have been
    1583              :                  * inserted by a newer transaction than the RECENTLY_DEAD
    1584              :                  * tuple by virtue of being later in the chain.  We will have
    1585              :                  * advanced the conflict horizon for the DEAD tuple.
    1586              :                  */
    1587              : 
    1588              :                 /*
    1589              :                  * Advance past RECENTLY_DEAD tuples just in case there's a
    1590              :                  * DEAD one after them.  We have to make sure that we don't
    1591              :                  * miss any DEAD tuples, since DEAD tuples that still have
    1592              :                  * tuple storage after pruning will confuse VACUUM.
    1593              :                  */
    1594       377650 :                 break;
    1595              : 
    1596     16182735 :             case HEAPTUPLE_DELETE_IN_PROGRESS:
    1597              :             case HEAPTUPLE_LIVE:
    1598              :             case HEAPTUPLE_INSERT_IN_PROGRESS:
    1599     16182735 :                 goto process_chain;
    1600              : 
    1601            0 :             default:
    1602            0 :                 elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
    1603              :                 goto process_chain;
    1604              :         }
    1605              : 
    1606              :         /*
    1607              :          * If the tuple is not HOT-updated, then we are at the end of this
    1608              :          * HOT-update chain.
    1609              :          */
    1610      2190651 :         if (!HeapTupleHeaderIsHotUpdated(htup))
    1611      2066025 :             goto process_chain;
    1612              : 
    1613              :         /* HOT implies it can't have moved to different partition */
    1614              :         Assert(!HeapTupleHeaderIndicatesMovedPartitions(htup));
    1615              : 
    1616              :         /*
    1617              :          * Advance to next chain member.
    1618              :          */
    1619              :         Assert(ItemPointerGetBlockNumber(&htup->t_ctid) == prstate->block);
    1620       124626 :         offnum = ItemPointerGetOffsetNumber(&htup->t_ctid);
    1621       124626 :         priorXmax = HeapTupleHeaderGetUpdateXid(htup);
    1622              :     }
    1623              : 
    1624            0 :     if (ItemIdIsRedirected(rootlp) && nchain < 2)
    1625              :     {
    1626              :         /*
    1627              :          * We found a redirect item that doesn't point to a valid follow-on
    1628              :          * item.  This can happen if the loop in heap_page_prune_and_freeze()
    1629              :          * caused us to visit the dead successor of a redirect item before
    1630              :          * visiting the redirect item.  We can clean up by setting the
    1631              :          * redirect item to LP_DEAD state or LP_UNUSED if the caller
    1632              :          * indicated.
    1633              :          */
    1634            0 :         heap_prune_record_dead_or_unused(prstate, rootoffnum, false);
    1635            0 :         return;
    1636              :     }
    1637              : 
    1638            0 : process_chain:
    1639              : 
    1640     18248760 :     if (ndeadchain == 0)
    1641              :     {
    1642              :         /*
    1643              :          * No DEAD tuple was found, so the chain is entirely composed of
    1644              :          * normal, unchanged tuples.  Leave it alone.
    1645              :          */
    1646     16482526 :         int         i = 0;
    1647              : 
    1648     16482526 :         if (ItemIdIsRedirected(rootlp))
    1649              :         {
    1650       177457 :             heap_prune_record_unchanged_lp_redirect(prstate, rootoffnum);
    1651       177457 :             i++;
    1652              :         }
    1653     32970629 :         for (; i < nchain; i++)
    1654     16488103 :             heap_prune_record_unchanged_lp_normal(prstate, chainitems[i]);
    1655              :     }
    1656      1766234 :     else if (ndeadchain == nchain)
    1657              :     {
    1658              :         /*
    1659              :          * The entire chain is dead.  Mark the root line pointer LP_DEAD, and
    1660              :          * fully remove the other tuples in the chain.
    1661              :          */
    1662      1695345 :         heap_prune_record_dead_or_unused(prstate, rootoffnum, ItemIdIsNormal(rootlp));
    1663      1740981 :         for (int i = 1; i < nchain; i++)
    1664        45636 :             heap_prune_record_unused(prstate, chainitems[i], true);
    1665              :     }
    1666              :     else
    1667              :     {
    1668              :         /*
    1669              :          * We found a DEAD tuple in the chain.  Redirect the root line pointer
    1670              :          * to the first non-DEAD tuple, and mark as unused each intermediate
    1671              :          * item that we are able to remove from the chain.
    1672              :          */
    1673        70889 :         heap_prune_record_redirect(prstate, rootoffnum, chainitems[ndeadchain],
    1674        70889 :                                    ItemIdIsNormal(rootlp));
    1675        92592 :         for (int i = 1; i < ndeadchain; i++)
    1676        21703 :             heap_prune_record_unused(prstate, chainitems[i], true);
    1677              : 
    1678              :         /* the rest of tuples in the chain are normal, unchanged tuples */
    1679       143171 :         for (int i = ndeadchain; i < nchain; i++)
    1680        72282 :             heap_prune_record_unchanged_lp_normal(prstate, chainitems[i]);
    1681              :     }
    1682              : }
    1683              : 
    1684              : /* Record lowest soon-prunable XID */
    1685              : static void
    1686      4594615 : heap_prune_record_prunable(PruneState *prstate, TransactionId xid,
    1687              :                            OffsetNumber offnum)
    1688              : {
    1689              :     /*
    1690              :      * This should exactly match the PageSetPrunable macro.  We can't store
    1691              :      * directly into the page header yet, so we update working state.
    1692              :      */
    1693              :     Assert(TransactionIdIsNormal(xid));
    1694      8997679 :     if (!TransactionIdIsValid(prstate->new_prune_xid) ||
    1695      4403064 :         TransactionIdPrecedes(xid, prstate->new_prune_xid))
    1696       193132 :         prstate->new_prune_xid = xid;
    1697              : 
    1698              :     /*
    1699              :      * It's incorrect for a page to be marked all-visible if it contains
    1700              :      * prunable items.
    1701              :      */
    1702      4594615 :     if (PageIsAllVisible(prstate->page))
    1703            0 :         heap_page_fix_vm_corruption(prstate, offnum,
    1704              :                                     VM_CORRUPT_TUPLE_VISIBILITY);
    1705      4594615 : }
    1706              : 
    1707              : /* Record line pointer to be redirected */
    1708              : static void
    1709        70889 : heap_prune_record_redirect(PruneState *prstate,
    1710              :                            OffsetNumber offnum, OffsetNumber rdoffnum,
    1711              :                            bool was_normal)
    1712              : {
    1713              :     Assert(!prstate->processed[offnum]);
    1714        70889 :     prstate->processed[offnum] = true;
    1715              : 
    1716              :     /*
    1717              :      * Do not mark the redirect target here.  It needs to be counted
    1718              :      * separately as an unchanged tuple.
    1719              :      */
    1720              : 
    1721              :     Assert(prstate->nredirected < MaxHeapTuplesPerPage);
    1722        70889 :     prstate->redirected[prstate->nredirected * 2] = offnum;
    1723        70889 :     prstate->redirected[prstate->nredirected * 2 + 1] = rdoffnum;
    1724              : 
    1725        70889 :     prstate->nredirected++;
    1726              : 
    1727              :     /*
    1728              :      * If the root entry had been a normal tuple, we are deleting it, so count
    1729              :      * it in the result.  But changing a redirect (even to DEAD state) doesn't
    1730              :      * count.
    1731              :      */
    1732        70889 :     if (was_normal)
    1733        62546 :         prstate->ndeleted++;
    1734              : 
    1735        70889 :     prstate->hastup = true;
    1736        70889 : }
    1737              : 
    1738              : /* Record line pointer to be marked dead */
    1739              : static void
    1740      1661511 : heap_prune_record_dead(PruneState *prstate, OffsetNumber offnum,
    1741              :                        bool was_normal)
    1742              : {
    1743              :     Assert(!prstate->processed[offnum]);
    1744      1661511 :     prstate->processed[offnum] = true;
    1745              : 
    1746              :     Assert(prstate->ndead < MaxHeapTuplesPerPage);
    1747      1661511 :     prstate->nowdead[prstate->ndead] = offnum;
    1748      1661511 :     prstate->ndead++;
    1749              : 
    1750              :     /*
    1751              :      * Deliberately delay unsetting set_all_visible and set_all_frozen until
    1752              :      * later during pruning. Removable dead tuples shouldn't preclude freezing
    1753              :      * the page.
    1754              :      */
    1755              : 
    1756              :     /* Record the dead offset for vacuum */
    1757      1661511 :     prstate->deadoffsets[prstate->lpdead_items++] = offnum;
    1758              : 
    1759              :     /*
    1760              :      * If the root entry had been a normal tuple, we are deleting it, so count
    1761              :      * it in the result.  But changing a redirect (even to DEAD state) doesn't
    1762              :      * count.
    1763              :      */
    1764      1661511 :     if (was_normal)
    1765      1649282 :         prstate->ndeleted++;
    1766      1661511 : }
    1767              : 
    1768              : /*
    1769              :  * Depending on whether or not the caller set mark_unused_now to true, record that a
    1770              :  * line pointer should be marked LP_DEAD or LP_UNUSED. There are other cases in
    1771              :  * which we will mark line pointers LP_UNUSED, but we will not mark line
    1772              :  * pointers LP_DEAD if mark_unused_now is true.
    1773              :  */
    1774              : static void
    1775      1695345 : heap_prune_record_dead_or_unused(PruneState *prstate, OffsetNumber offnum,
    1776              :                                  bool was_normal)
    1777              : {
    1778              :     /*
    1779              :      * If the caller set mark_unused_now to true, we can remove dead tuples
    1780              :      * during pruning instead of marking their line pointers dead. Set this
    1781              :      * tuple's line pointer LP_UNUSED. We hint that this option is less
    1782              :      * likely.
    1783              :      */
    1784      1695345 :     if (unlikely(prstate->mark_unused_now))
    1785        33834 :         heap_prune_record_unused(prstate, offnum, was_normal);
    1786              :     else
    1787      1661511 :         heap_prune_record_dead(prstate, offnum, was_normal);
    1788              : 
    1789              :     /*
    1790              :      * It's incorrect for the page to be set all-visible if it contains dead
    1791              :      * items. Fix that on the heap page and check the VM for corruption as
    1792              :      * well. Do that here rather than in heap_prune_record_dead() so we also
    1793              :      * cover tuples that are directly marked LP_UNUSED via mark_unused_now.
    1794              :      */
    1795      1695345 :     if (PageIsAllVisible(prstate->page))
    1796            0 :         heap_page_fix_vm_corruption(prstate, offnum, VM_CORRUPT_LPDEAD);
    1797      1695345 : }
    1798              : 
    1799              : /* Record line pointer to be marked unused */
    1800              : static void
    1801       106484 : heap_prune_record_unused(PruneState *prstate, OffsetNumber offnum, bool was_normal)
    1802              : {
    1803              :     Assert(!prstate->processed[offnum]);
    1804       106484 :     prstate->processed[offnum] = true;
    1805              : 
    1806              :     Assert(prstate->nunused < MaxHeapTuplesPerPage);
    1807       106484 :     prstate->nowunused[prstate->nunused] = offnum;
    1808       106484 :     prstate->nunused++;
    1809              : 
    1810              :     /*
    1811              :      * If the root entry had been a normal tuple, we are deleting it, so count
    1812              :      * it in the result.  But changing a redirect (even to DEAD state) doesn't
    1813              :      * count.
    1814              :      */
    1815       106484 :     if (was_normal)
    1816       104417 :         prstate->ndeleted++;
    1817       106484 : }
    1818              : 
    1819              : /*
    1820              :  * Record an unused line pointer that is left unchanged.
    1821              :  */
    1822              : static void
    1823       135740 : heap_prune_record_unchanged_lp_unused(PruneState *prstate, OffsetNumber offnum)
    1824              : {
    1825              :     Assert(!prstate->processed[offnum]);
    1826       135740 :     prstate->processed[offnum] = true;
    1827       135740 : }
    1828              : 
    1829              : /*
    1830              :  * Record line pointer that is left unchanged.  We consider freezing it, and
    1831              :  * update bookkeeping of tuple counts and page visibility.
    1832              :  */
    1833              : static void
    1834     16574101 : heap_prune_record_unchanged_lp_normal(PruneState *prstate, OffsetNumber offnum)
    1835              : {
    1836              :     HeapTupleHeader htup;
    1837              :     TransactionId xmin;
    1838     16574101 :     Page        page = prstate->page;
    1839              : 
    1840              :     Assert(!prstate->processed[offnum]);
    1841     16574101 :     prstate->processed[offnum] = true;
    1842              : 
    1843     16574101 :     prstate->hastup = true;      /* the page is not empty */
    1844              : 
    1845              :     /*
    1846              :      * The criteria for counting a tuple as live in this block need to match
    1847              :      * what analyze.c's acquire_sample_rows() does, otherwise VACUUM and
    1848              :      * ANALYZE may produce wildly different reltuples values, e.g. when there
    1849              :      * are many recently-dead tuples.
    1850              :      *
    1851              :      * The logic here is a bit simpler than acquire_sample_rows(), as VACUUM
    1852              :      * can't run inside a transaction block, which makes some cases impossible
    1853              :      * (e.g. in-progress insert from the same transaction).
    1854              :      *
    1855              :      * HEAPTUPLE_DEAD are handled by the other heap_prune_record_*()
    1856              :      * subroutines.  They don't count dead items like acquire_sample_rows()
    1857              :      * does, because we assume that all dead items will become LP_UNUSED
    1858              :      * before VACUUM finishes.  This difference is only superficial.  VACUUM
    1859              :      * effectively agrees with ANALYZE about DEAD items, in the end.  VACUUM
    1860              :      * won't remember LP_DEAD items, but only because they're not supposed to
    1861              :      * be left behind when it is done. (Cases where we bypass index vacuuming
    1862              :      * will violate this optimistic assumption, but the overall impact of that
    1863              :      * should be negligible.)
    1864              :      */
    1865     16574101 :     htup = (HeapTupleHeader) PageGetItem(page, PageGetItemId(page, offnum));
    1866              : 
    1867     16574101 :     switch (prstate->htsv[offnum])
    1868              :     {
    1869     11979486 :         case HEAPTUPLE_LIVE:
    1870              : 
    1871              :             /*
    1872              :              * Count it as live.  Not only is this natural, but it's also what
    1873              :              * acquire_sample_rows() does.
    1874              :              */
    1875     11979486 :             prstate->live_tuples++;
    1876              : 
    1877              :             /*
    1878              :              * Is the tuple definitely visible to all transactions?
    1879              :              *
    1880              :              * NB: Like with per-tuple hint bits, we can't set the
    1881              :              * PD_ALL_VISIBLE flag if the inserter committed asynchronously.
    1882              :              * See SetHintBits for more info.  Check that the tuple is hinted
    1883              :              * xmin-committed because of that.
    1884              :              */
    1885     11979486 :             if (!HeapTupleHeaderXminCommitted(htup))
    1886              :             {
    1887        49487 :                 prstate->set_all_visible = false;
    1888        49487 :                 prstate->set_all_frozen = false;
    1889        49487 :                 break;
    1890              :             }
    1891              : 
    1892              :             /*
    1893              :              * The inserter definitely committed. But we don't know if it is
    1894              :              * old enough that everyone sees it as committed. Later, after
    1895              :              * processing all the tuples on the page, we'll check if there is
    1896              :              * any snapshot that still considers the newest xid on the page to
    1897              :              * be running. If so, we don't consider the page all-visible.
    1898              :              */
    1899     11929999 :             xmin = HeapTupleHeaderGetXmin(htup);
    1900              : 
    1901              :             /* Track newest xmin on page. */
    1902     11929999 :             if (TransactionIdFollows(xmin, prstate->newest_live_xid) &&
    1903              :                 TransactionIdIsNormal(xmin))
    1904       539235 :                 prstate->newest_live_xid = xmin;
    1905              : 
    1906     11929999 :             break;
    1907              : 
    1908       377650 :         case HEAPTUPLE_RECENTLY_DEAD:
    1909       377650 :             prstate->recently_dead_tuples++;
    1910       377650 :             prstate->set_all_visible = false;
    1911       377650 :             prstate->set_all_frozen = false;
    1912              : 
    1913              :             /*
    1914              :              * This tuple will soon become DEAD.  Update the hint field so
    1915              :              * that the page is reconsidered for pruning in future.
    1916              :              */
    1917       377650 :             heap_prune_record_prunable(prstate,
    1918              :                                        HeapTupleHeaderGetUpdateXid(htup),
    1919              :                                        offnum);
    1920       377650 :             break;
    1921              : 
    1922       130009 :         case HEAPTUPLE_INSERT_IN_PROGRESS:
    1923              : 
    1924              :             /*
    1925              :              * We do not count these rows as live, because we expect the
    1926              :              * inserting transaction to update the counters at commit, and we
    1927              :              * assume that will happen only after we report our results.  This
    1928              :              * assumption is a bit shaky, but it is what acquire_sample_rows()
    1929              :              * does, so be consistent.
    1930              :              */
    1931       130009 :             prstate->set_all_visible = false;
    1932       130009 :             prstate->set_all_frozen = false;
    1933              : 
    1934              :             /*
    1935              :              * Though there is nothing "prunable" on the page, we maintain
    1936              :              * pd_prune_xid for inserts so that we have the opportunity to
    1937              :              * mark them all-visible during the next round of pruning.
    1938              :              */
    1939       130009 :             heap_prune_record_prunable(prstate,
    1940              :                                        HeapTupleHeaderGetXmin(htup),
    1941              :                                        offnum);
    1942       130009 :             break;
    1943              : 
    1944      4086956 :         case HEAPTUPLE_DELETE_IN_PROGRESS:
    1945              : 
    1946              :             /*
    1947              :              * This an expected case during concurrent vacuum.  Count such
    1948              :              * rows as live.  As above, we assume the deleting transaction
    1949              :              * will commit and update the counters after we report.
    1950              :              */
    1951      4086956 :             prstate->live_tuples++;
    1952      4086956 :             prstate->set_all_visible = false;
    1953      4086956 :             prstate->set_all_frozen = false;
    1954              : 
    1955              :             /*
    1956              :              * This tuple may soon become DEAD.  Update the hint field so that
    1957              :              * the page is reconsidered for pruning in future.
    1958              :              */
    1959      4086956 :             heap_prune_record_prunable(prstate,
    1960              :                                        HeapTupleHeaderGetUpdateXid(htup),
    1961              :                                        offnum);
    1962      4086956 :             break;
    1963              : 
    1964            0 :         default:
    1965              : 
    1966              :             /*
    1967              :              * DEAD tuples should've been passed to heap_prune_record_dead()
    1968              :              * or heap_prune_record_unused() instead.
    1969              :              */
    1970            0 :             elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result %d",
    1971              :                  prstate->htsv[offnum]);
    1972              :             break;
    1973              :     }
    1974              : 
    1975              :     /* Consider freezing any normal tuples which will not be removed */
    1976     16574101 :     if (prstate->attempt_freeze)
    1977              :     {
    1978              :         bool        totally_frozen;
    1979              : 
    1980      9795534 :         if ((heap_prepare_freeze_tuple(htup,
    1981      9795534 :                                        prstate->cutoffs,
    1982              :                                        &prstate->pagefrz,
    1983      9795534 :                                        &prstate->frozen[prstate->nfrozen],
    1984              :                                        &totally_frozen)))
    1985              :         {
    1986              :             /* Save prepared freeze plan for later */
    1987      2708353 :             prstate->frozen[prstate->nfrozen++].offset = offnum;
    1988              :         }
    1989              : 
    1990              :         /*
    1991              :          * If any tuple isn't either totally frozen already or eligible to
    1992              :          * become totally frozen (according to its freeze plan), then the page
    1993              :          * definitely cannot be set all-frozen in the visibility map later on.
    1994              :          */
    1995      9795534 :         if (!totally_frozen)
    1996      4828508 :             prstate->set_all_frozen = false;
    1997              :     }
    1998     16574101 : }
    1999              : 
    2000              : 
    2001              : /*
    2002              :  * Record line pointer that was already LP_DEAD and is left unchanged.
    2003              :  */
    2004              : static void
    2005      1553608 : heap_prune_record_unchanged_lp_dead(PruneState *prstate, OffsetNumber offnum)
    2006              : {
    2007              :     Assert(!prstate->processed[offnum]);
    2008      1553608 :     prstate->processed[offnum] = true;
    2009              : 
    2010              :     /*
    2011              :      * Deliberately don't set hastup for LP_DEAD items.  We make the soft
    2012              :      * assumption that any LP_DEAD items encountered here will become
    2013              :      * LP_UNUSED later on, before count_nondeletable_pages is reached.  If we
    2014              :      * don't make this assumption then rel truncation will only happen every
    2015              :      * other VACUUM, at most.  Besides, VACUUM must treat
    2016              :      * hastup/nonempty_pages as provisional no matter how LP_DEAD items are
    2017              :      * handled (handled here, or handled later on).
    2018              :      *
    2019              :      * Similarly, don't unset set_all_visible and set_all_frozen until later,
    2020              :      * at the end of heap_page_prune_and_freeze().  This will allow us to
    2021              :      * attempt to freeze the page after pruning.  As long as we unset it
    2022              :      * before updating the visibility map, this will be correct.
    2023              :      */
    2024              : 
    2025              :     /* Record the dead offset for vacuum */
    2026      1553608 :     prstate->deadoffsets[prstate->lpdead_items++] = offnum;
    2027              : 
    2028              :     /*
    2029              :      * It's incorrect for a page to be marked all-visible if it contains dead
    2030              :      * items.
    2031              :      */
    2032      1553608 :     if (PageIsAllVisible(prstate->page))
    2033            0 :         heap_page_fix_vm_corruption(prstate, offnum, VM_CORRUPT_LPDEAD);
    2034      1553608 : }
    2035              : 
    2036              : /*
    2037              :  * Record LP_REDIRECT that is left unchanged.
    2038              :  */
    2039              : static void
    2040       177457 : heap_prune_record_unchanged_lp_redirect(PruneState *prstate, OffsetNumber offnum)
    2041              : {
    2042              :     /*
    2043              :      * A redirect line pointer doesn't count as a live tuple.
    2044              :      *
    2045              :      * If we leave a redirect line pointer in place, there will be another
    2046              :      * tuple on the page that it points to.  We will do the bookkeeping for
    2047              :      * that separately.  So we have nothing to do here, except remember that
    2048              :      * we processed this item.
    2049              :      */
    2050              :     Assert(!prstate->processed[offnum]);
    2051       177457 :     prstate->processed[offnum] = true;
    2052       177457 : }
    2053              : 
    2054              : /*
    2055              :  * Perform the actual page changes needed by heap_page_prune_and_freeze().
    2056              :  *
    2057              :  * If 'lp_truncate_only' is set, we are merely marking LP_DEAD line pointers
    2058              :  * as unused, not redirecting or removing anything else.  The
    2059              :  * PageRepairFragmentation() call is skipped in that case.
    2060              :  *
    2061              :  * If 'lp_truncate_only' is not set, the caller must hold a cleanup lock on
    2062              :  * the buffer.  If it is set, an ordinary exclusive lock suffices.
    2063              :  */
    2064              : void
    2065        70146 : heap_page_prune_execute(Buffer buffer, bool lp_truncate_only,
    2066              :                         OffsetNumber *redirected, int nredirected,
    2067              :                         OffsetNumber *nowdead, int ndead,
    2068              :                         OffsetNumber *nowunused, int nunused)
    2069              : {
    2070        70146 :     Page        page = BufferGetPage(buffer);
    2071              :     OffsetNumber *offnum;
    2072              :     HeapTupleHeader htup PG_USED_FOR_ASSERTS_ONLY;
    2073              : 
    2074              :     /* Shouldn't be called unless there's something to do */
    2075              :     Assert(nredirected > 0 || ndead > 0 || nunused > 0);
    2076              : 
    2077              :     /* If 'lp_truncate_only', we can only remove already-dead line pointers */
    2078              :     Assert(!lp_truncate_only || (nredirected == 0 && ndead == 0));
    2079              : 
    2080              :     /* Update all redirected line pointers */
    2081        70146 :     offnum = redirected;
    2082       160350 :     for (int i = 0; i < nredirected; i++)
    2083              :     {
    2084        90204 :         OffsetNumber fromoff = *offnum++;
    2085        90204 :         OffsetNumber tooff = *offnum++;
    2086        90204 :         ItemId      fromlp = PageGetItemId(page, fromoff);
    2087              :         ItemId      tolp PG_USED_FOR_ASSERTS_ONLY;
    2088              : 
    2089              : #ifdef USE_ASSERT_CHECKING
    2090              : 
    2091              :         /*
    2092              :          * Any existing item that we set as an LP_REDIRECT (any 'from' item)
    2093              :          * must be the first item from a HOT chain.  If the item has tuple
    2094              :          * storage then it can't be a heap-only tuple.  Otherwise we are just
    2095              :          * maintaining an existing LP_REDIRECT from an existing HOT chain that
    2096              :          * has been pruned at least once before now.
    2097              :          */
    2098              :         if (!ItemIdIsRedirected(fromlp))
    2099              :         {
    2100              :             Assert(ItemIdHasStorage(fromlp) && ItemIdIsNormal(fromlp));
    2101              : 
    2102              :             htup = (HeapTupleHeader) PageGetItem(page, fromlp);
    2103              :             Assert(!HeapTupleHeaderIsHeapOnly(htup));
    2104              :         }
    2105              :         else
    2106              :         {
    2107              :             /* We shouldn't need to redundantly set the redirect */
    2108              :             Assert(ItemIdGetRedirect(fromlp) != tooff);
    2109              :         }
    2110              : 
    2111              :         /*
    2112              :          * The item that we're about to set as an LP_REDIRECT (the 'from'
    2113              :          * item) will point to an existing item (the 'to' item) that is
    2114              :          * already a heap-only tuple.  There can be at most one LP_REDIRECT
    2115              :          * item per HOT chain.
    2116              :          *
    2117              :          * We need to keep around an LP_REDIRECT item (after original
    2118              :          * non-heap-only root tuple gets pruned away) so that it's always
    2119              :          * possible for VACUUM to easily figure out what TID to delete from
    2120              :          * indexes when an entire HOT chain becomes dead.  A heap-only tuple
    2121              :          * can never become LP_DEAD; an LP_REDIRECT item or a regular heap
    2122              :          * tuple can.
    2123              :          *
    2124              :          * This check may miss problems, e.g. the target of a redirect could
    2125              :          * be marked as unused subsequently. The page_verify_redirects() check
    2126              :          * below will catch such problems.
    2127              :          */
    2128              :         tolp = PageGetItemId(page, tooff);
    2129              :         Assert(ItemIdHasStorage(tolp) && ItemIdIsNormal(tolp));
    2130              :         htup = (HeapTupleHeader) PageGetItem(page, tolp);
    2131              :         Assert(HeapTupleHeaderIsHeapOnly(htup));
    2132              : #endif
    2133              : 
    2134        90204 :         ItemIdSetRedirect(fromlp, tooff);
    2135              :     }
    2136              : 
    2137              :     /* Update all now-dead line pointers */
    2138        70146 :     offnum = nowdead;
    2139      2006290 :     for (int i = 0; i < ndead; i++)
    2140              :     {
    2141      1936144 :         OffsetNumber off = *offnum++;
    2142      1936144 :         ItemId      lp = PageGetItemId(page, off);
    2143              : 
    2144              : #ifdef USE_ASSERT_CHECKING
    2145              : 
    2146              :         /*
    2147              :          * An LP_DEAD line pointer must be left behind when the original item
    2148              :          * (which is dead to everybody) could still be referenced by a TID in
    2149              :          * an index.  This should never be necessary with any individual
    2150              :          * heap-only tuple item, though. (It's not clear how much of a problem
    2151              :          * that would be, but there is no reason to allow it.)
    2152              :          */
    2153              :         if (ItemIdHasStorage(lp))
    2154              :         {
    2155              :             Assert(ItemIdIsNormal(lp));
    2156              :             htup = (HeapTupleHeader) PageGetItem(page, lp);
    2157              :             Assert(!HeapTupleHeaderIsHeapOnly(htup));
    2158              :         }
    2159              :         else
    2160              :         {
    2161              :             /* Whole HOT chain becomes dead */
    2162              :             Assert(ItemIdIsRedirected(lp));
    2163              :         }
    2164              : #endif
    2165              : 
    2166      1936144 :         ItemIdSetDead(lp);
    2167              :     }
    2168              : 
    2169              :     /* Update all now-unused line pointers */
    2170        70146 :     offnum = nowunused;
    2171       378301 :     for (int i = 0; i < nunused; i++)
    2172              :     {
    2173       308155 :         OffsetNumber off = *offnum++;
    2174       308155 :         ItemId      lp = PageGetItemId(page, off);
    2175              : 
    2176              : #ifdef USE_ASSERT_CHECKING
    2177              : 
    2178              :         if (lp_truncate_only)
    2179              :         {
    2180              :             /* Setting LP_DEAD to LP_UNUSED in vacuum's second pass */
    2181              :             Assert(ItemIdIsDead(lp) && !ItemIdHasStorage(lp));
    2182              :         }
    2183              :         else
    2184              :         {
    2185              :             /*
    2186              :              * When heap_page_prune_and_freeze() was called, mark_unused_now
    2187              :              * may have been passed as true, which allows would-be LP_DEAD
    2188              :              * items to be made LP_UNUSED instead.  This is only possible if
    2189              :              * the relation has no indexes.  If there are any dead items, then
    2190              :              * mark_unused_now was not true and every item being marked
    2191              :              * LP_UNUSED must refer to a heap-only tuple.
    2192              :              */
    2193              :             if (ndead > 0)
    2194              :             {
    2195              :                 Assert(ItemIdHasStorage(lp) && ItemIdIsNormal(lp));
    2196              :                 htup = (HeapTupleHeader) PageGetItem(page, lp);
    2197              :                 Assert(HeapTupleHeaderIsHeapOnly(htup));
    2198              :             }
    2199              :             else
    2200              :                 Assert(ItemIdIsUsed(lp));
    2201              :         }
    2202              : 
    2203              : #endif
    2204              : 
    2205       308155 :         ItemIdSetUnused(lp);
    2206              :     }
    2207              : 
    2208        70146 :     if (lp_truncate_only)
    2209         2070 :         PageTruncateLinePointerArray(page);
    2210              :     else
    2211              :     {
    2212              :         /*
    2213              :          * Finally, repair any fragmentation, and update the page's hint bit
    2214              :          * about whether it has free pointers.
    2215              :          */
    2216        68076 :         PageRepairFragmentation(page);
    2217              : 
    2218              :         /*
    2219              :          * Now that the page has been modified, assert that redirect items
    2220              :          * still point to valid targets.
    2221              :          */
    2222        68076 :         page_verify_redirects(page);
    2223              :     }
    2224        70146 : }
    2225              : 
    2226              : 
    2227              : /*
    2228              :  * If built with assertions, verify that all LP_REDIRECT items point to a
    2229              :  * valid item.
    2230              :  *
    2231              :  * One way that bugs related to HOT pruning show is redirect items pointing to
    2232              :  * removed tuples. It's not trivial to reliably check that marking an item
    2233              :  * unused will not orphan a redirect item during heap_prune_chain() /
    2234              :  * heap_page_prune_execute(), so we additionally check the whole page after
    2235              :  * pruning. Without this check such bugs would typically only cause asserts
    2236              :  * later, potentially well after the corruption has been introduced.
    2237              :  *
    2238              :  * Also check comments in heap_page_prune_execute()'s redirection loop.
    2239              :  */
    2240              : static void
    2241        68076 : page_verify_redirects(Page page)
    2242              : {
    2243              : #ifdef USE_ASSERT_CHECKING
    2244              :     OffsetNumber offnum;
    2245              :     OffsetNumber maxoff;
    2246              : 
    2247              :     maxoff = PageGetMaxOffsetNumber(page);
    2248              :     for (offnum = FirstOffsetNumber;
    2249              :          offnum <= maxoff;
    2250              :          offnum = OffsetNumberNext(offnum))
    2251              :     {
    2252              :         ItemId      itemid = PageGetItemId(page, offnum);
    2253              :         OffsetNumber targoff;
    2254              :         ItemId      targitem;
    2255              :         HeapTupleHeader htup;
    2256              : 
    2257              :         if (!ItemIdIsRedirected(itemid))
    2258              :             continue;
    2259              : 
    2260              :         targoff = ItemIdGetRedirect(itemid);
    2261              :         targitem = PageGetItemId(page, targoff);
    2262              : 
    2263              :         Assert(ItemIdIsUsed(targitem));
    2264              :         Assert(ItemIdIsNormal(targitem));
    2265              :         Assert(ItemIdHasStorage(targitem));
    2266              :         htup = (HeapTupleHeader) PageGetItem(page, targitem);
    2267              :         Assert(HeapTupleHeaderIsHeapOnly(htup));
    2268              :     }
    2269              : #endif
    2270        68076 : }
    2271              : 
    2272              : 
    2273              : /*
    2274              :  * For all items in this page, find their respective root line pointers.
    2275              :  * If item k is part of a HOT-chain with root at item j, then we set
    2276              :  * root_offsets[k - 1] = j.
    2277              :  *
    2278              :  * The passed-in root_offsets array must have MaxHeapTuplesPerPage entries.
    2279              :  * Unused entries are filled with InvalidOffsetNumber (zero).
    2280              :  *
    2281              :  * The function must be called with at least share lock on the buffer, to
    2282              :  * prevent concurrent prune operations.
    2283              :  *
    2284              :  * Note: The information collected here is valid only as long as the caller
    2285              :  * holds a pin on the buffer. Once pin is released, a tuple might be pruned
    2286              :  * and reused by a completely unrelated tuple.
    2287              :  */
    2288              : void
    2289       134232 : heap_get_root_tuples(Page page, OffsetNumber *root_offsets)
    2290              : {
    2291              :     OffsetNumber offnum,
    2292              :                 maxoff;
    2293              : 
    2294       134232 :     MemSet(root_offsets, InvalidOffsetNumber,
    2295              :            MaxHeapTuplesPerPage * sizeof(OffsetNumber));
    2296              : 
    2297       134232 :     maxoff = PageGetMaxOffsetNumber(page);
    2298     10806332 :     for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum))
    2299              :     {
    2300     10672100 :         ItemId      lp = PageGetItemId(page, offnum);
    2301              :         HeapTupleHeader htup;
    2302              :         OffsetNumber nextoffnum;
    2303              :         TransactionId priorXmax;
    2304              : 
    2305              :         /* skip unused and dead items */
    2306     10672100 :         if (!ItemIdIsUsed(lp) || ItemIdIsDead(lp))
    2307         9258 :             continue;
    2308              : 
    2309     10662842 :         if (ItemIdIsNormal(lp))
    2310              :         {
    2311     10659026 :             htup = (HeapTupleHeader) PageGetItem(page, lp);
    2312              : 
    2313              :             /*
    2314              :              * Check if this tuple is part of a HOT-chain rooted at some other
    2315              :              * tuple. If so, skip it for now; we'll process it when we find
    2316              :              * its root.
    2317              :              */
    2318     10659026 :             if (HeapTupleHeaderIsHeapOnly(htup))
    2319         4156 :                 continue;
    2320              : 
    2321              :             /*
    2322              :              * This is either a plain tuple or the root of a HOT-chain.
    2323              :              * Remember it in the mapping.
    2324              :              */
    2325     10654870 :             root_offsets[offnum - 1] = offnum;
    2326              : 
    2327              :             /* If it's not the start of a HOT-chain, we're done with it */
    2328     10654870 :             if (!HeapTupleHeaderIsHotUpdated(htup))
    2329     10654606 :                 continue;
    2330              : 
    2331              :             /* Set up to scan the HOT-chain */
    2332          264 :             nextoffnum = ItemPointerGetOffsetNumber(&htup->t_ctid);
    2333          264 :             priorXmax = HeapTupleHeaderGetUpdateXid(htup);
    2334              :         }
    2335              :         else
    2336              :         {
    2337              :             /* Must be a redirect item. We do not set its root_offsets entry */
    2338              :             Assert(ItemIdIsRedirected(lp));
    2339              :             /* Set up to scan the HOT-chain */
    2340         3816 :             nextoffnum = ItemIdGetRedirect(lp);
    2341         3816 :             priorXmax = InvalidTransactionId;
    2342              :         }
    2343              : 
    2344              :         /*
    2345              :          * Now follow the HOT-chain and collect other tuples in the chain.
    2346              :          *
    2347              :          * Note: Even though this is a nested loop, the complexity of the
    2348              :          * function is O(N) because a tuple in the page should be visited not
    2349              :          * more than twice, once in the outer loop and once in HOT-chain
    2350              :          * chases.
    2351              :          */
    2352              :         for (;;)
    2353              :         {
    2354              :             /* Sanity check (pure paranoia) */
    2355         4152 :             if (offnum < FirstOffsetNumber)
    2356            0 :                 break;
    2357              : 
    2358              :             /*
    2359              :              * An offset past the end of page's line pointer array is possible
    2360              :              * when the array was truncated
    2361              :              */
    2362         4152 :             if (offnum > maxoff)
    2363            0 :                 break;
    2364              : 
    2365         4152 :             lp = PageGetItemId(page, nextoffnum);
    2366              : 
    2367              :             /* Check for broken chains */
    2368         4152 :             if (!ItemIdIsNormal(lp))
    2369            0 :                 break;
    2370              : 
    2371         4152 :             htup = (HeapTupleHeader) PageGetItem(page, lp);
    2372              : 
    2373         4488 :             if (TransactionIdIsValid(priorXmax) &&
    2374          336 :                 !TransactionIdEquals(priorXmax, HeapTupleHeaderGetXmin(htup)))
    2375            0 :                 break;
    2376              : 
    2377              :             /* Remember the root line pointer for this item */
    2378         4152 :             root_offsets[nextoffnum - 1] = offnum;
    2379              : 
    2380              :             /* Advance to next chain member, if any */
    2381         4152 :             if (!HeapTupleHeaderIsHotUpdated(htup))
    2382         4080 :                 break;
    2383              : 
    2384              :             /* HOT implies it can't have moved to different partition */
    2385              :             Assert(!HeapTupleHeaderIndicatesMovedPartitions(htup));
    2386              : 
    2387           72 :             nextoffnum = ItemPointerGetOffsetNumber(&htup->t_ctid);
    2388           72 :             priorXmax = HeapTupleHeaderGetUpdateXid(htup);
    2389              :         }
    2390              :     }
    2391       134232 : }
    2392              : 
    2393              : 
    2394              : /*
    2395              :  * Compare fields that describe actions required to freeze tuple with caller's
    2396              :  * open plan.  If everything matches then the frz tuple plan is equivalent to
    2397              :  * caller's plan.
    2398              :  */
    2399              : static inline bool
    2400      1190337 : heap_log_freeze_eq(xlhp_freeze_plan *plan, HeapTupleFreeze *frz)
    2401              : {
    2402      1190337 :     if (plan->xmax == frz->xmax &&
    2403      1189042 :         plan->t_infomask2 == frz->t_infomask2 &&
    2404      1188287 :         plan->t_infomask == frz->t_infomask &&
    2405      1185287 :         plan->frzflags == frz->frzflags)
    2406      1185287 :         return true;
    2407              : 
    2408              :     /* Caller must call heap_log_freeze_new_plan again for frz */
    2409         5050 :     return false;
    2410              : }
    2411              : 
    2412              : /*
    2413              :  * Comparator used to deduplicate the freeze plans used in WAL records.
    2414              :  */
    2415              : static int
    2416      1604477 : heap_log_freeze_cmp(const void *arg1, const void *arg2)
    2417              : {
    2418      1604477 :     const HeapTupleFreeze *frz1 = arg1;
    2419      1604477 :     const HeapTupleFreeze *frz2 = arg2;
    2420              : 
    2421      1604477 :     if (frz1->xmax < frz2->xmax)
    2422        13169 :         return -1;
    2423      1591308 :     else if (frz1->xmax > frz2->xmax)
    2424        14212 :         return 1;
    2425              : 
    2426      1577096 :     if (frz1->t_infomask2 < frz2->t_infomask2)
    2427         5159 :         return -1;
    2428      1571937 :     else if (frz1->t_infomask2 > frz2->t_infomask2)
    2429         4584 :         return 1;
    2430              : 
    2431      1567353 :     if (frz1->t_infomask < frz2->t_infomask)
    2432        13456 :         return -1;
    2433      1553897 :     else if (frz1->t_infomask > frz2->t_infomask)
    2434        17118 :         return 1;
    2435              : 
    2436      1536779 :     if (frz1->frzflags < frz2->frzflags)
    2437            0 :         return -1;
    2438      1536779 :     else if (frz1->frzflags > frz2->frzflags)
    2439            0 :         return 1;
    2440              : 
    2441              :     /*
    2442              :      * heap_log_freeze_eq would consider these tuple-wise plans to be equal.
    2443              :      * (So the tuples will share a single canonical freeze plan.)
    2444              :      *
    2445              :      * We tiebreak on page offset number to keep each freeze plan's page
    2446              :      * offset number array individually sorted. (Unnecessary, but be tidy.)
    2447              :      */
    2448      1536779 :     if (frz1->offset < frz2->offset)
    2449      1319958 :         return -1;
    2450       216821 :     else if (frz1->offset > frz2->offset)
    2451       216821 :         return 1;
    2452              : 
    2453              :     Assert(false);
    2454            0 :     return 0;
    2455              : }
    2456              : 
    2457              : /*
    2458              :  * Start new plan initialized using tuple-level actions.  At least one tuple
    2459              :  * will have steps required to freeze described by caller's plan during REDO.
    2460              :  */
    2461              : static inline void
    2462        30402 : heap_log_freeze_new_plan(xlhp_freeze_plan *plan, HeapTupleFreeze *frz)
    2463              : {
    2464        30402 :     plan->xmax = frz->xmax;
    2465        30402 :     plan->t_infomask2 = frz->t_infomask2;
    2466        30402 :     plan->t_infomask = frz->t_infomask;
    2467        30402 :     plan->frzflags = frz->frzflags;
    2468        30402 :     plan->ntuples = 1;           /* for now */
    2469        30402 : }
    2470              : 
    2471              : /*
    2472              :  * Deduplicate tuple-based freeze plans so that each distinct set of
    2473              :  * processing steps is only stored once in the WAL record.
    2474              :  * Called during original execution of freezing (for logged relations).
    2475              :  *
    2476              :  * Return value is number of plans set in *plans_out for caller.  Also writes
    2477              :  * an array of offset numbers into *offsets_out output argument for caller
    2478              :  * (actually there is one array per freeze plan, but that's not of immediate
    2479              :  * concern to our caller).
    2480              :  */
    2481              : static int
    2482        25352 : heap_log_freeze_plan(HeapTupleFreeze *tuples, int ntuples,
    2483              :                      xlhp_freeze_plan *plans_out,
    2484              :                      OffsetNumber *offsets_out)
    2485              : {
    2486        25352 :     int         nplans = 0;
    2487              : 
    2488              :     /* Sort tuple-based freeze plans in the order required to deduplicate */
    2489        25352 :     qsort(tuples, ntuples, sizeof(HeapTupleFreeze), heap_log_freeze_cmp);
    2490              : 
    2491      1241041 :     for (int i = 0; i < ntuples; i++)
    2492              :     {
    2493      1215689 :         HeapTupleFreeze *frz = tuples + i;
    2494              : 
    2495      1215689 :         if (i == 0)
    2496              :         {
    2497              :             /* New canonical freeze plan starting with first tup */
    2498        25352 :             heap_log_freeze_new_plan(plans_out, frz);
    2499        25352 :             nplans++;
    2500              :         }
    2501      1190337 :         else if (heap_log_freeze_eq(plans_out, frz))
    2502              :         {
    2503              :             /* tup matches open canonical plan -- include tup in it */
    2504              :             Assert(offsets_out[i - 1] < frz->offset);
    2505      1185287 :             plans_out->ntuples++;
    2506              :         }
    2507              :         else
    2508              :         {
    2509              :             /* Tup doesn't match current plan -- done with it now */
    2510         5050 :             plans_out++;
    2511              : 
    2512              :             /* New canonical freeze plan starting with this tup */
    2513         5050 :             heap_log_freeze_new_plan(plans_out, frz);
    2514         5050 :             nplans++;
    2515              :         }
    2516              : 
    2517              :         /*
    2518              :          * Save page offset number in dedicated buffer in passing.
    2519              :          *
    2520              :          * REDO routine relies on the record's offset numbers array grouping
    2521              :          * offset numbers by freeze plan.  The sort order within each grouping
    2522              :          * is ascending offset number order, just to keep things tidy.
    2523              :          */
    2524      1215689 :         offsets_out[i] = frz->offset;
    2525              :     }
    2526              : 
    2527              :     Assert(nplans > 0 && nplans <= ntuples);
    2528              : 
    2529        25352 :     return nplans;
    2530              : }
    2531              : 
    2532              : /*
    2533              :  * Write an XLOG_HEAP2_PRUNE* WAL record
    2534              :  *
    2535              :  * This is used for several different page maintenance operations:
    2536              :  *
    2537              :  * - Page pruning, in VACUUM's 1st pass or on access: Some items are
    2538              :  *   redirected, some marked dead, and some removed altogether.
    2539              :  *
    2540              :  * - Freezing: Items are marked as 'frozen'.
    2541              :  *
    2542              :  * - Vacuum, 2nd pass: Items that are already LP_DEAD are marked as unused.
    2543              :  *
    2544              :  * They have enough commonalities that we use a single WAL record for them
    2545              :  * all.
    2546              :  *
    2547              :  * If replaying the record requires a cleanup lock, pass cleanup_lock = true.
    2548              :  * Replaying 'redirected' or 'dead' items always requires a cleanup lock, but
    2549              :  * replaying 'unused' items depends on whether they were all previously marked
    2550              :  * as dead.
    2551              :  *
    2552              :  * If the VM is being updated, vmflags will contain the bits to set. In this
    2553              :  * case, vmbuffer should already have been updated and marked dirty and should
    2554              :  * still be pinned and locked.
    2555              :  *
    2556              :  * Note: This function scribbles on the 'frozen' array.
    2557              :  *
    2558              :  * Note: This is called in a critical section, so careful what you do here.
    2559              :  */
    2560              : void
    2561       129195 : log_heap_prune_and_freeze(Relation relation, Buffer buffer,
    2562              :                           Buffer vmbuffer, uint8 vmflags,
    2563              :                           TransactionId conflict_xid,
    2564              :                           bool cleanup_lock,
    2565              :                           PruneReason reason,
    2566              :                           HeapTupleFreeze *frozen, int nfrozen,
    2567              :                           OffsetNumber *redirected, int nredirected,
    2568              :                           OffsetNumber *dead, int ndead,
    2569              :                           OffsetNumber *unused, int nunused)
    2570              : {
    2571              :     xl_heap_prune xlrec;
    2572              :     XLogRecPtr  recptr;
    2573              :     uint8       info;
    2574              :     uint8       regbuf_flags_heap;
    2575              : 
    2576       129195 :     Page        heap_page = BufferGetPage(buffer);
    2577              : 
    2578              :     /* The following local variables hold data registered in the WAL record: */
    2579              :     xlhp_freeze_plan plans[MaxHeapTuplesPerPage];
    2580              :     xlhp_freeze_plans freeze_plans;
    2581              :     xlhp_prune_items redirect_items;
    2582              :     xlhp_prune_items dead_items;
    2583              :     xlhp_prune_items unused_items;
    2584              :     OffsetNumber frz_offsets[MaxHeapTuplesPerPage];
    2585       129195 :     bool        do_prune = nredirected > 0 || ndead > 0 || nunused > 0;
    2586       129195 :     bool        do_set_vm = vmflags & VISIBILITYMAP_VALID_BITS;
    2587       129195 :     bool        heap_fpi_allowed = true;
    2588              : 
    2589              :     Assert((vmflags & VISIBILITYMAP_VALID_BITS) == vmflags);
    2590              : 
    2591       129195 :     xlrec.flags = 0;
    2592       129195 :     regbuf_flags_heap = REGBUF_STANDARD;
    2593              : 
    2594              :     /*
    2595              :      * We can avoid an FPI of the heap page if the only modification we are
    2596              :      * making to it is to set PD_ALL_VISIBLE and checksums/wal_log_hints are
    2597              :      * disabled.
    2598              :      *
    2599              :      * However, if the page has never been WAL-logged (LSN is invalid), we
    2600              :      * must force an FPI regardless.  This can happen when another backend
    2601              :      * extends the heap, initializes the page, and then fails before WAL-
    2602              :      * logging it.  Since heap extension is not WAL-logged, recovery might try
    2603              :      * to replay our record and find that the page isn't initialized, which
    2604              :      * would cause a PANIC.
    2605              :      */
    2606       129195 :     if (!XLogRecPtrIsValid(PageGetLSN(heap_page)))
    2607            0 :         regbuf_flags_heap |= REGBUF_FORCE_IMAGE;
    2608       129195 :     else if (!do_prune && nfrozen == 0 && (!do_set_vm || !XLogHintBitIsNeeded()))
    2609              :     {
    2610         1332 :         regbuf_flags_heap |= REGBUF_NO_IMAGE;
    2611         1332 :         heap_fpi_allowed = false;
    2612              :     }
    2613              : 
    2614              :     /*
    2615              :      * Prepare data for the buffer.  The arrays are not actually in the
    2616              :      * buffer, but we pretend that they are.  When XLogInsert stores a full
    2617              :      * page image, the arrays can be omitted.
    2618              :      */
    2619       129195 :     XLogBeginInsert();
    2620       129195 :     XLogRegisterBuffer(0, buffer, regbuf_flags_heap);
    2621              : 
    2622       129195 :     if (do_set_vm)
    2623        68362 :         XLogRegisterBuffer(1, vmbuffer, 0);
    2624              : 
    2625       129195 :     if (nfrozen > 0)
    2626              :     {
    2627              :         int         nplans;
    2628              : 
    2629        25352 :         xlrec.flags |= XLHP_HAS_FREEZE_PLANS;
    2630              : 
    2631              :         /*
    2632              :          * Prepare deduplicated representation for use in the WAL record. This
    2633              :          * destructively sorts frozen tuples array in-place.
    2634              :          */
    2635        25352 :         nplans = heap_log_freeze_plan(frozen, nfrozen, plans, frz_offsets);
    2636              : 
    2637        25352 :         freeze_plans.nplans = nplans;
    2638        25352 :         XLogRegisterBufData(0, &freeze_plans,
    2639              :                             offsetof(xlhp_freeze_plans, plans));
    2640        25352 :         XLogRegisterBufData(0, plans,
    2641              :                             sizeof(xlhp_freeze_plan) * nplans);
    2642              :     }
    2643       129195 :     if (nredirected > 0)
    2644              :     {
    2645        17447 :         xlrec.flags |= XLHP_HAS_REDIRECTIONS;
    2646              : 
    2647        17447 :         redirect_items.ntargets = nredirected;
    2648        17447 :         XLogRegisterBufData(0, &redirect_items,
    2649              :                             offsetof(xlhp_prune_items, data));
    2650        17447 :         XLogRegisterBufData(0, redirected,
    2651              :                             sizeof(OffsetNumber[2]) * nredirected);
    2652              :     }
    2653       129195 :     if (ndead > 0)
    2654              :     {
    2655        46337 :         xlrec.flags |= XLHP_HAS_DEAD_ITEMS;
    2656              : 
    2657        46337 :         dead_items.ntargets = ndead;
    2658        46337 :         XLogRegisterBufData(0, &dead_items,
    2659              :                             offsetof(xlhp_prune_items, data));
    2660        46337 :         XLogRegisterBufData(0, dead,
    2661              :                             sizeof(OffsetNumber) * ndead);
    2662              :     }
    2663       129195 :     if (nunused > 0)
    2664              :     {
    2665        28886 :         xlrec.flags |= XLHP_HAS_NOW_UNUSED_ITEMS;
    2666              : 
    2667        28886 :         unused_items.ntargets = nunused;
    2668        28886 :         XLogRegisterBufData(0, &unused_items,
    2669              :                             offsetof(xlhp_prune_items, data));
    2670        28886 :         XLogRegisterBufData(0, unused,
    2671              :                             sizeof(OffsetNumber) * nunused);
    2672              :     }
    2673       129195 :     if (nfrozen > 0)
    2674        25352 :         XLogRegisterBufData(0, frz_offsets,
    2675              :                             sizeof(OffsetNumber) * nfrozen);
    2676              : 
    2677              :     /*
    2678              :      * Prepare the main xl_heap_prune record.  We already set the XLHP_HAS_*
    2679              :      * flag above.
    2680              :      */
    2681       129195 :     if (vmflags & VISIBILITYMAP_ALL_VISIBLE)
    2682              :     {
    2683        68362 :         xlrec.flags |= XLHP_VM_ALL_VISIBLE;
    2684        68362 :         if (vmflags & VISIBILITYMAP_ALL_FROZEN)
    2685        42027 :             xlrec.flags |= XLHP_VM_ALL_FROZEN;
    2686              :     }
    2687       129195 :     if (RelationIsAccessibleInLogicalDecoding(relation))
    2688          593 :         xlrec.flags |= XLHP_IS_CATALOG_REL;
    2689       129195 :     if (TransactionIdIsValid(conflict_xid))
    2690       102366 :         xlrec.flags |= XLHP_HAS_CONFLICT_HORIZON;
    2691       129195 :     if (cleanup_lock)
    2692       114397 :         xlrec.flags |= XLHP_CLEANUP_LOCK;
    2693              :     else
    2694              :     {
    2695              :         Assert(nredirected == 0 && ndead == 0);
    2696              :         /* also, any items in 'unused' must've been LP_DEAD previously */
    2697              :     }
    2698       129195 :     XLogRegisterData(&xlrec, SizeOfHeapPrune);
    2699       129195 :     if (TransactionIdIsValid(conflict_xid))
    2700       102366 :         XLogRegisterData(&conflict_xid, sizeof(TransactionId));
    2701              : 
    2702       129195 :     switch (reason)
    2703              :     {
    2704        59629 :         case PRUNE_ON_ACCESS:
    2705        59629 :             info = XLOG_HEAP2_PRUNE_ON_ACCESS;
    2706        59629 :             break;
    2707        54768 :         case PRUNE_VACUUM_SCAN:
    2708        54768 :             info = XLOG_HEAP2_PRUNE_VACUUM_SCAN;
    2709        54768 :             break;
    2710        14798 :         case PRUNE_VACUUM_CLEANUP:
    2711        14798 :             info = XLOG_HEAP2_PRUNE_VACUUM_CLEANUP;
    2712        14798 :             break;
    2713            0 :         default:
    2714            0 :             elog(ERROR, "unrecognized prune reason: %d", (int) reason);
    2715              :             break;
    2716              :     }
    2717       129195 :     recptr = XLogInsert(RM_HEAP2_ID, info);
    2718              : 
    2719       129195 :     if (do_set_vm)
    2720              :     {
    2721              :         Assert(BufferIsDirty(vmbuffer));
    2722        68362 :         PageSetLSN(BufferGetPage(vmbuffer), recptr);
    2723              :     }
    2724              : 
    2725              :     /*
    2726              :      * If we explicitly skip an FPI, we must not stamp the heap page with this
    2727              :      * record's LSN. Recovery skips records <= the stamped LSN, so this could
    2728              :      * lead to skipping an earlier FPI needed to repair a torn page.
    2729              :      */
    2730       129195 :     if (heap_fpi_allowed)
    2731              :     {
    2732              :         Assert(BufferIsDirty(buffer));
    2733       127863 :         PageSetLSN(heap_page, recptr);
    2734              :     }
    2735       129195 : }
        

Generated by: LCOV version 2.0-1