Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * pruneheap.c
4 : * heap page pruning and HOT-chain management code
5 : *
6 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : *
10 : * IDENTIFICATION
11 : * src/backend/access/heap/pruneheap.c
12 : *
13 : *-------------------------------------------------------------------------
14 : */
15 : #include "postgres.h"
16 :
17 : #include "access/heapam.h"
18 : #include "access/heapam_xlog.h"
19 : #include "access/htup_details.h"
20 : #include "access/multixact.h"
21 : #include "access/transam.h"
22 : #include "access/visibilitymapdefs.h"
23 : #include "access/xlog.h"
24 : #include "access/xloginsert.h"
25 : #include "commands/vacuum.h"
26 : #include "executor/instrument.h"
27 : #include "miscadmin.h"
28 : #include "pgstat.h"
29 : #include "storage/bufmgr.h"
30 : #include "utils/rel.h"
31 : #include "utils/snapmgr.h"
32 :
33 : /* Working data for heap_page_prune_and_freeze() and subroutines */
34 : typedef struct
35 : {
36 : /*-------------------------------------------------------
37 : * Arguments passed to heap_page_prune_and_freeze()
38 : *-------------------------------------------------------
39 : */
40 :
41 : /* tuple visibility test, initialized for the relation */
42 : GlobalVisState *vistest;
43 : /* whether or not dead items can be set LP_UNUSED during pruning */
44 : bool mark_unused_now;
45 : /* whether to attempt freezing tuples */
46 : bool attempt_freeze;
47 : struct VacuumCutoffs *cutoffs;
48 :
49 : /*-------------------------------------------------------
50 : * Fields describing what to do to the page
51 : *-------------------------------------------------------
52 : */
53 : TransactionId new_prune_xid; /* new prune hint value */
54 : TransactionId latest_xid_removed;
55 : int nredirected; /* numbers of entries in arrays below */
56 : int ndead;
57 : int nunused;
58 : int nfrozen;
59 : /* arrays that accumulate indexes of items to be changed */
60 : OffsetNumber redirected[MaxHeapTuplesPerPage * 2];
61 : OffsetNumber nowdead[MaxHeapTuplesPerPage];
62 : OffsetNumber nowunused[MaxHeapTuplesPerPage];
63 : HeapTupleFreeze frozen[MaxHeapTuplesPerPage];
64 :
65 : /*-------------------------------------------------------
66 : * Working state for HOT chain processing
67 : *-------------------------------------------------------
68 : */
69 :
70 : /*
71 : * 'root_items' contains offsets of all LP_REDIRECT line pointers and
72 : * normal non-HOT tuples. They can be stand-alone items or the first item
73 : * in a HOT chain. 'heaponly_items' contains heap-only tuples which can
74 : * only be removed as part of a HOT chain.
75 : */
76 : int nroot_items;
77 : OffsetNumber root_items[MaxHeapTuplesPerPage];
78 : int nheaponly_items;
79 : OffsetNumber heaponly_items[MaxHeapTuplesPerPage];
80 :
81 : /*
82 : * processed[offnum] is true if the item at offnum has been processed.
83 : *
84 : * This needs to be MaxHeapTuplesPerPage + 1 long as FirstOffsetNumber is
85 : * 1. Otherwise every access would need to subtract 1.
86 : */
87 : bool processed[MaxHeapTuplesPerPage + 1];
88 :
89 : /*
90 : * Tuple visibility is only computed once for each tuple, for correctness
91 : * and efficiency reasons; see comment in heap_page_prune_and_freeze() for
92 : * details. This is of type int8[], instead of HTSV_Result[], so we can
93 : * use -1 to indicate no visibility has been computed, e.g. for LP_DEAD
94 : * items.
95 : *
96 : * This needs to be MaxHeapTuplesPerPage + 1 long as FirstOffsetNumber is
97 : * 1. Otherwise every access would need to subtract 1.
98 : */
99 : int8 htsv[MaxHeapTuplesPerPage + 1];
100 :
101 : /*
102 : * Freezing-related state.
103 : */
104 : HeapPageFreeze pagefrz;
105 :
106 : /*-------------------------------------------------------
107 : * Information about what was done
108 : *
109 : * These fields are not used by pruning itself for the most part, but are
110 : * used to collect information about what was pruned and what state the
111 : * page is in after pruning, for the benefit of the caller. They are
112 : * copied to the caller's PruneFreezeResult at the end.
113 : *-------------------------------------------------------
114 : */
115 :
116 : int ndeleted; /* Number of tuples deleted from the page */
117 :
118 : /* Number of live and recently dead tuples, after pruning */
119 : int live_tuples;
120 : int recently_dead_tuples;
121 :
122 : /* Whether or not the page makes rel truncation unsafe */
123 : bool hastup;
124 :
125 : /*
126 : * LP_DEAD items on the page after pruning. Includes existing LP_DEAD
127 : * items
128 : */
129 : int lpdead_items; /* number of items in the array */
130 : OffsetNumber *deadoffsets; /* points directly to presult->deadoffsets */
131 :
132 : /*
133 : * The snapshot conflict horizon used when freezing tuples. The final
134 : * snapshot conflict horizon for the record may be newer if pruning
135 : * removes newer transaction IDs.
136 : */
137 : TransactionId frz_conflict_horizon;
138 :
139 : /*
140 : * all_visible and all_frozen indicate if the all-visible and all-frozen
141 : * bits in the visibility map can be set for this page after pruning.
142 : *
143 : * visibility_cutoff_xid is the newest xmin of live tuples on the page.
144 : * The caller can use it as the conflict horizon, when setting the VM
145 : * bits. It is only valid if we froze some tuples, and all_frozen is
146 : * true.
147 : *
148 : * NOTE: all_visible and all_frozen initially don't include LP_DEAD items.
149 : * That's convenient for heap_page_prune_and_freeze() to use them to
150 : * decide whether to freeze the page or not. The all_visible and
151 : * all_frozen values returned to the caller are adjusted to include
152 : * LP_DEAD items after we determine whether to opportunistically freeze.
153 : */
154 : bool all_visible;
155 : bool all_frozen;
156 : TransactionId visibility_cutoff_xid;
157 : } PruneState;
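Two conventions in this struct are worth making concrete: processed[] and htsv[] are indexed directly by 1-based offset number, and -1 in htsv[] marks entries whose visibility was never computed (e.g. LP_DEAD or LP_UNUSED slots). A minimal sketch of a guard helper, hypothetical and not part of this file:

    /* Hypothetical: has visibility been computed for this offset? */
    static inline bool
    htsv_is_computed(const PruneState *prstate, OffsetNumber offnum)
    {
        Assert(offnum >= FirstOffsetNumber &&
               offnum <= MaxHeapTuplesPerPage);
        return prstate->htsv[offnum] != -1;
    }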
158 :
159 : /* Local functions */
160 : static HTSV_Result heap_prune_satisfies_vacuum(PruneState *prstate,
161 : HeapTuple tup,
162 : Buffer buffer);
163 : static inline HTSV_Result htsv_get_valid_status(int status);
164 : static void heap_prune_chain(Page page, BlockNumber blockno, OffsetNumber maxoff,
165 : OffsetNumber rootoffnum, PruneState *prstate);
166 : static void heap_prune_record_prunable(PruneState *prstate, TransactionId xid);
167 : static void heap_prune_record_redirect(PruneState *prstate,
168 : OffsetNumber offnum, OffsetNumber rdoffnum,
169 : bool was_normal);
170 : static void heap_prune_record_dead(PruneState *prstate, OffsetNumber offnum,
171 : bool was_normal);
172 : static void heap_prune_record_dead_or_unused(PruneState *prstate, OffsetNumber offnum,
173 : bool was_normal);
174 : static void heap_prune_record_unused(PruneState *prstate, OffsetNumber offnum, bool was_normal);
175 :
176 : static void heap_prune_record_unchanged_lp_unused(Page page, PruneState *prstate, OffsetNumber offnum);
177 : static void heap_prune_record_unchanged_lp_normal(Page page, PruneState *prstate, OffsetNumber offnum);
178 : static void heap_prune_record_unchanged_lp_dead(Page page, PruneState *prstate, OffsetNumber offnum);
179 : static void heap_prune_record_unchanged_lp_redirect(PruneState *prstate, OffsetNumber offnum);
180 :
181 : static void page_verify_redirects(Page page);
182 :
183 : static bool heap_page_will_freeze(Relation relation, Buffer buffer,
184 : bool did_tuple_hint_fpi, bool do_prune, bool do_hint_prune,
185 : PruneState *prstate);
186 :
187 :
188 : /*
189 : * Optionally prune and repair fragmentation in the specified page.
190 : *
191 : * This is an opportunistic function. It will perform housekeeping
192 : * only if the page heuristically looks like a candidate for pruning and we
193 : * can acquire buffer cleanup lock without blocking.
194 : *
195 : * Note: this is called quite often. It's important that it fall out quickly
196 : * if there's not any use in pruning.
197 : *
198 : * Caller must have pin on the buffer, and must *not* have a lock on it.
199 : */
200 : void
201 32695268 : heap_page_prune_opt(Relation relation, Buffer buffer)
202 : {
203 32695268 : Page page = BufferGetPage(buffer);
204 : TransactionId prune_xid;
205 : GlobalVisState *vistest;
206 : Size minfree;
207 :
208 : /*
209 : * We can't write WAL in recovery mode, so there's no point trying to
210 : * clean the page. The primary will likely issue a cleaning WAL record
211 : * soon anyway, so this is no particular loss.
212 : */
213 32695268 : if (RecoveryInProgress())
214 435870 : return;
215 :
216 : /*
217 : * First check whether there's any chance there's something to prune;
218 : * determining the appropriate horizon is a waste if there's no prune_xid
219 : * (i.e. no updates/deletes have left potentially dead tuples around).
220 : */
221 32259398 : prune_xid = ((PageHeader) page)->pd_prune_xid;
222 32259398 : if (!TransactionIdIsValid(prune_xid))
223 16764888 : return;
224 :
225 : /*
226 : * Check whether prune_xid indicates that there may be dead rows that can
227 : * be cleaned up.
228 : */
229 15494510 : vistest = GlobalVisTestFor(relation);
230 :
231 15494510 : if (!GlobalVisTestIsRemovableXid(vistest, prune_xid))
232 13014116 : return;
233 :
234 : /*
235 : * We prune when a previous UPDATE failed to find enough space on the page
236 : * for a new tuple version, or when free space falls below the relation's
237 : * fill-factor target (but not less than 10%).
238 : *
239 : * Checking free space here is questionable since we aren't holding any
240 : * lock on the buffer; in the worst case we could get a bogus answer. It's
241 : * unlikely to be *seriously* wrong, though, since reading either pd_lower
242 : * or pd_upper is probably atomic. Avoiding taking a lock seems more
243 : * important than sometimes getting a wrong answer in what is after all
244 : * just a heuristic estimate.
245 : */
246 2480394 : minfree = RelationGetTargetPageFreeSpace(relation,
247 : HEAP_DEFAULT_FILLFACTOR);
248 2480394 : minfree = Max(minfree, BLCKSZ / 10);
249 :
250 2480394 : if (PageIsFull(page) || PageGetHeapFreeSpace(page) < minfree)
251 : {
252 : /* OK, try to get exclusive buffer lock */
253 83922 : if (!ConditionalLockBufferForCleanup(buffer))
254 1138 : return;
255 :
256 : /*
257 : * Now that we have buffer lock, get accurate information about the
258 : * page's free space, and recheck the heuristic about whether to
259 : * prune.
260 : */
261 82784 : if (PageIsFull(page) || PageGetHeapFreeSpace(page) < minfree)
262 : {
263 : OffsetNumber dummy_off_loc;
264 : PruneFreezeResult presult;
265 :
266 : /*
267 : * We don't pass the HEAP_PAGE_PRUNE_MARK_UNUSED_NOW option
268 : * regardless of whether or not the relation has indexes, since we
269 : * cannot safely determine that during on-access pruning with the
270 : * current implementation.
271 : */
272 82784 : PruneFreezeParams params = {
273 : .relation = relation,
274 : .buffer = buffer,
275 : .reason = PRUNE_ON_ACCESS,
276 : .options = 0,
277 : .vistest = vistest,
278 : .cutoffs = NULL,
279 : };
280 :
281 82784 : heap_page_prune_and_freeze(¶ms, &presult, &dummy_off_loc,
282 : NULL, NULL);
283 :
284 : /*
285 : * Report the number of tuples reclaimed to pgstats. This is
286 : * presult.ndeleted minus the number of newly-LP_DEAD-set items.
287 : *
288 : * We derive the number of dead tuples like this to avoid totally
289 : * forgetting about items that were set to LP_DEAD, since they
290 : * still need to be cleaned up by VACUUM. We only want to count
291 : * heap-only tuples that just became LP_UNUSED in our report,
292 : * which don't need such cleanup.
293 : *
294 : * VACUUM doesn't have to compensate in the same way when it
295 : * tracks ndeleted, since it will set the same LP_DEAD items to
296 : * LP_UNUSED separately.
297 : */
298 82784 : if (presult.ndeleted > presult.nnewlpdead)
299 37918 : pgstat_update_heap_dead_tuples(relation,
300 37918 : presult.ndeleted - presult.nnewlpdead);
301 : }
302 :
303 : /* And release buffer lock */
304 82784 : LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
305 :
306 : /*
307 : * We avoid reuse of any free space created on the page by unrelated
308 : * UPDATEs/INSERTs by opting to not update the FSM at this point. The
309 : * free space should be reused by UPDATEs to *this* page.
310 : */
311 : }
312 : }
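To make the caller contract above concrete ("pin on the buffer, no lock"), here is a hedged sketch of an on-access call site; the surrounding code is illustrative, not copied from a real caller:

    Buffer      buf = ReadBuffer(rel, blkno);

    /* Pin held, no lock yet: prune opportunistically before reading */
    heap_page_prune_opt(rel, buf);

    LockBuffer(buf, BUFFER_LOCK_SHARE);
    /* ... examine tuples on the page ... */
    LockBuffer(buf, BUFFER_LOCK_UNLOCK);
    ReleaseBuffer(buf);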
313 :
314 : /*
315 : * Decide whether to proceed with freezing according to the freeze plans
316 : * prepared for the given heap buffer. If freezing is chosen, this function
317 : * performs several pre-freeze checks.
318 : *
319 : * The values of do_prune, do_hint_prune, and did_tuple_hint_fpi must be
320 : * determined before calling this function.
321 : *
322 : * prstate is both an input and output parameter.
323 : *
324 : * Returns true if we should apply the freeze plans and freeze tuples on the
325 : * page, and false otherwise.
326 : */
327 : static bool
328 1278294 : heap_page_will_freeze(Relation relation, Buffer buffer,
329 : bool did_tuple_hint_fpi,
330 : bool do_prune,
331 : bool do_hint_prune,
332 : PruneState *prstate)
333 : {
334 1278294 : bool do_freeze = false;
335 :
336 : /*
337 : * If the caller specified we should not attempt to freeze any tuples,
338 : * validate that everything is in the right state and return.
339 : */
340 1278294 : if (!prstate->attempt_freeze)
341 : {
342 : Assert(!prstate->all_frozen && prstate->nfrozen == 0);
343 : Assert(prstate->lpdead_items == 0 || !prstate->all_visible);
344 82784 : return false;
345 : }
346 :
347 1195510 : if (prstate->pagefrz.freeze_required)
348 : {
349 : /*
350 : * heap_prepare_freeze_tuple indicated that at least one XID/MXID from
351 : * before FreezeLimit/MultiXactCutoff is present. Must freeze to
352 : * advance relfrozenxid/relminmxid.
353 : */
354 41638 : do_freeze = true;
355 : }
356 : else
357 : {
358 : /*
359 : * Opportunistically freeze the page if we are generating an FPI
360 : * anyway and if doing so means that we can set the page all-frozen
361 : * afterwards (might not happen until VACUUM's final heap pass).
362 : *
363 : * XXX: Previously, we knew if pruning emitted an FPI by checking
364 : * pgWalUsage.wal_fpi before and after pruning. Once the freeze and
365 : * prune records were combined, this heuristic couldn't be used
366 : * anymore. The opportunistic freeze heuristic must be improved;
367 : * however, for now, try to approximate the old logic.
368 : */
369 1153872 : if (prstate->all_frozen && prstate->nfrozen > 0)
370 : {
371 : Assert(prstate->all_visible);
372 :
373 : /*
374 : * Freezing would make the page all-frozen. Have we already
375 : * emitted an FPI, or will we do so anyway?
376 : */
377 30388 : if (RelationNeedsWAL(relation))
378 : {
379 27080 : if (did_tuple_hint_fpi)
380 2626 : do_freeze = true;
381 24454 : else if (do_prune)
382 : {
383 2744 : if (XLogCheckBufferNeedsBackup(buffer))
384 1760 : do_freeze = true;
385 : }
386 21710 : else if (do_hint_prune)
387 : {
388 12 : if (XLogHintBitIsNeeded() && XLogCheckBufferNeedsBackup(buffer))
389 6 : do_freeze = true;
390 : }
391 : }
392 : }
393 : }
394 :
395 1195510 : if (do_freeze)
396 : {
397 : /*
398 : * Validate the tuples we will be freezing before entering the
399 : * critical section.
400 : */
401 46030 : heap_pre_freeze_checks(buffer, prstate->frozen, prstate->nfrozen);
402 :
403 : /*
404 : * Calculate what the snapshot conflict horizon should be for a record
405 : * freezing tuples. We can use the visibility_cutoff_xid as our cutoff
406 : * for conflicts when the whole page is eligible to become all-frozen
407 : * in the VM once we're done with it. Otherwise, we generate a
408 : * conservative cutoff by stepping back from OldestXmin.
409 : */
410 46030 : if (prstate->all_frozen)
411 41476 : prstate->frz_conflict_horizon = prstate->visibility_cutoff_xid;
412 : else
413 : {
414 : /* Avoids false conflicts when hot_standby_feedback in use */
415 4554 : prstate->frz_conflict_horizon = prstate->cutoffs->OldestXmin;
416 4554 : TransactionIdRetreat(prstate->frz_conflict_horizon);
417 : }
418 : }
419 1149480 : else if (prstate->nfrozen > 0)
420 : {
421 : /*
422 : * The page contained some tuples that were not already frozen, and we
423 : * chose not to freeze them now, so the page won't be all-frozen.
424 : */
425 : Assert(!prstate->pagefrz.freeze_required);
426 :
427 27842 : prstate->all_frozen = false;
428 27842 : prstate->nfrozen = 0; /* avoid miscounts in instrumentation */
429 : }
430 : else
431 : {
432 : /*
433 : * We have no freeze plans to execute. The page might already be
434 : * all-frozen (perhaps only following pruning), though. Such pages
435 : * can be marked all-frozen in the VM by our caller, even though none
436 : * of their tuples were newly frozen here.
437 : */
438 : }
439 :
440 1195510 : return do_freeze;
441 : }
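Condensed into a single expression, the decision implemented above amounts to the following boolean (a restatement for clarity, not code from this file):

    do_freeze = prstate->pagefrz.freeze_required ||
        (prstate->all_frozen && prstate->nfrozen > 0 &&
         RelationNeedsWAL(relation) &&
         (did_tuple_hint_fpi ||
          (do_prune && XLogCheckBufferNeedsBackup(buffer)) ||
          (do_hint_prune && XLogHintBitIsNeeded() &&
           XLogCheckBufferNeedsBackup(buffer))));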
442 :
443 :
444 : /*
445 : * Prune and repair fragmentation and potentially freeze tuples on the
446 : * specified page.
447 : *
448 : * Caller must have pin and buffer cleanup lock on the page. Note that we
449 : * don't update the FSM information for the page on caller's behalf. Caller might
450 : * also need to account for a reduction in the length of the line pointer
451 : * array following array truncation by us.
452 : *
453 : * params contains the input parameters used to control freezing and pruning
454 : * behavior. See the definition of PruneFreezeParams for more on what each
455 : * parameter does.
456 : *
457 : * If the HEAP_PAGE_PRUNE_FREEZE option is set in params, we will freeze
458 : * tuples if it's required in order to advance relfrozenxid / relminmxid, or
459 : * if it's considered advantageous for overall system performance to do so
460 : * now. The 'params.cutoffs', 'presult', 'new_relfrozen_xid' and
461 : * 'new_relmin_mxid' arguments are required when freezing. When
462 : * HEAP_PAGE_PRUNE_FREEZE option is passed, we also set presult->all_visible
463 : * and presult->all_frozen after determining whether or not to
464 : * opportunistically freeze, to indicate if the VM bits can be set. They are
465 : * always set to false when the HEAP_PAGE_PRUNE_FREEZE option is not passed,
466 : * because at the moment only callers that also freeze need that information.
467 : *
468 : * presult contains output parameters needed by callers, such as the number of
469 : * tuples removed and the offsets of dead items on the page after pruning.
470 : * heap_page_prune_and_freeze() is responsible for initializing it. Required
471 : * by all callers.
472 : *
473 : * off_loc is the offset location required by the caller to use in error
474 : * callback.
475 : *
476 : * new_relfrozen_xid and new_relmin_mxid must be provided by the caller if the
477 : * HEAP_PAGE_PRUNE_FREEZE option is set in params. On entry, they contain the
478 : * oldest XID and multi-XID seen on the relation so far. They will be updated
479 : * with the oldest values present on the page after pruning. After processing the
480 : * whole relation, VACUUM can use these values as the new
481 : * relfrozenxid/relminmxid for the relation.
482 : */
483 : void
484 1278294 : heap_page_prune_and_freeze(PruneFreezeParams *params,
485 : PruneFreezeResult *presult,
486 : OffsetNumber *off_loc,
487 : TransactionId *new_relfrozen_xid,
488 : MultiXactId *new_relmin_mxid)
489 : {
490 1278294 : Buffer buffer = params->buffer;
491 1278294 : Page page = BufferGetPage(buffer);
492 1278294 : BlockNumber blockno = BufferGetBlockNumber(buffer);
493 : OffsetNumber offnum,
494 : maxoff;
495 : PruneState prstate;
496 : HeapTupleData tup;
497 : bool do_freeze;
498 : bool do_prune;
499 : bool do_hint_prune;
500 : bool did_tuple_hint_fpi;
501 1278294 : int64 fpi_before = pgWalUsage.wal_fpi;
502 :
503 : /* Copy parameters to prstate */
504 1278294 : prstate.vistest = params->vistest;
505 1278294 : prstate.mark_unused_now =
506 1278294 : (params->options & HEAP_PAGE_PRUNE_MARK_UNUSED_NOW) != 0;
507 1278294 : prstate.attempt_freeze = (params->options & HEAP_PAGE_PRUNE_FREEZE) != 0;
508 1278294 : prstate.cutoffs = params->cutoffs;
509 :
510 : /*
511 : * Our strategy is to scan the page and make lists of items to change,
512 : * then apply the changes within a critical section. This keeps as much
513 : * logic as possible out of the critical section, and also ensures that
514 : * WAL replay will work the same as the normal case.
515 : *
516 : * First, initialize the new pd_prune_xid value to zero (indicating no
517 : * prunable tuples). If we find any tuples which may soon become
518 : * prunable, we will save the lowest relevant XID in new_prune_xid. Also
519 : * initialize the rest of our working state.
520 : */
521 1278294 : prstate.new_prune_xid = InvalidTransactionId;
522 1278294 : prstate.latest_xid_removed = InvalidTransactionId;
523 1278294 : prstate.nredirected = prstate.ndead = prstate.nunused = prstate.nfrozen = 0;
524 1278294 : prstate.nroot_items = 0;
525 1278294 : prstate.nheaponly_items = 0;
526 :
527 : /* initialize page freezing working state */
528 1278294 : prstate.pagefrz.freeze_required = false;
529 1278294 : if (prstate.attempt_freeze)
530 : {
531 : Assert(new_relfrozen_xid && new_relmin_mxid);
532 1195510 : prstate.pagefrz.FreezePageRelfrozenXid = *new_relfrozen_xid;
533 1195510 : prstate.pagefrz.NoFreezePageRelfrozenXid = *new_relfrozen_xid;
534 1195510 : prstate.pagefrz.FreezePageRelminMxid = *new_relmin_mxid;
535 1195510 : prstate.pagefrz.NoFreezePageRelminMxid = *new_relmin_mxid;
536 : }
537 : else
538 : {
539 : Assert(new_relfrozen_xid == NULL && new_relmin_mxid == NULL);
540 82784 : prstate.pagefrz.FreezePageRelminMxid = InvalidMultiXactId;
541 82784 : prstate.pagefrz.NoFreezePageRelminMxid = InvalidMultiXactId;
542 82784 : prstate.pagefrz.FreezePageRelfrozenXid = InvalidTransactionId;
543 82784 : prstate.pagefrz.NoFreezePageRelfrozenXid = InvalidTransactionId;
544 : }
545 :
546 1278294 : prstate.ndeleted = 0;
547 1278294 : prstate.live_tuples = 0;
548 1278294 : prstate.recently_dead_tuples = 0;
549 1278294 : prstate.hastup = false;
550 1278294 : prstate.lpdead_items = 0;
551 1278294 : prstate.deadoffsets = presult->deadoffsets;
552 1278294 : prstate.frz_conflict_horizon = InvalidTransactionId;
553 :
554 : /*
555 : * Caller may update the VM after we're done. We can keep track of
556 : * whether the page will be all-visible and all-frozen after pruning and
557 : * freezing to help the caller do that.
558 : *
559 : * Currently, only VACUUM sets the VM bits. To save effort, we only do
560 : * the bookkeeping if the caller needs it. Currently, that's tied to
561 : * HEAP_PAGE_PRUNE_FREEZE, but it could be a separate flag if you wanted
562 : * to update the VM bits without also freezing or freeze without also
563 : * setting the VM bits.
564 : *
565 : * In addition to telling the caller whether it can set the VM bit, we
566 : * also use 'all_visible' and 'all_frozen' for our own decision-making. If
567 : * the whole page would become frozen, we consider opportunistically
568 : * freezing tuples. We will not be able to freeze the whole page if there
569 : * are tuples present that are not visible to everyone or if there are
570 : * dead tuples which are not yet removable. However, dead tuples which
571 : * will be removed by the end of vacuuming should not preclude us from
572 : * opportunistically freezing. Because of that, we do not immediately
573 : * clear all_visible and all_frozen when we see LP_DEAD items. We fix
574 : * that after scanning the line pointers. We must correct all_visible and
575 : * all_frozen before we return them to the caller, so that the caller
576 : * doesn't set the VM bits incorrectly.
577 : */
578 1278294 : if (prstate.attempt_freeze)
579 : {
580 1195510 : prstate.all_visible = true;
581 1195510 : prstate.all_frozen = true;
582 : }
583 : else
584 : {
585 : /*
586 : * Initializing to false allows skipping the work to update them in
587 : * heap_prune_record_unchanged_lp_normal().
588 : */
589 82784 : prstate.all_visible = false;
590 82784 : prstate.all_frozen = false;
591 : }
592 :
593 : /*
594 : * The visibility cutoff xid is the newest xmin of live tuples on the
595 : * page. In the common case, this will be set as the conflict horizon the
596 : * caller can use for updating the VM. If, at the end of freezing and
597 : * pruning, the page is all-frozen, there is no possibility that any
598 : * running transaction on the standby does not see tuples on the page as
599 : * all-visible, so the conflict horizon remains InvalidTransactionId.
600 : */
601 1278294 : prstate.visibility_cutoff_xid = InvalidTransactionId;
602 :
603 1278294 : maxoff = PageGetMaxOffsetNumber(page);
604 1278294 : tup.t_tableOid = RelationGetRelid(params->relation);
605 :
606 : /*
607 : * Determine HTSV for all tuples, and queue them up for processing as HOT
608 : * chain roots or as heap-only items.
609 : *
610 : * Determining HTSV only once for each tuple is required for correctness,
611 : * to deal with cases where running HTSV twice could result in different
612 : * results. For example, RECENTLY_DEAD can turn to DEAD if another
613 : * checked item causes GlobalVisTestIsRemovableFullXid() to update the
614 : * horizon, or INSERT_IN_PROGRESS can change to DEAD if the inserting
615 : * transaction aborts.
616 : *
617 : * It's also good for performance. Most commonly tuples within a page are
618 : * stored at decreasing offsets (while the items are stored at increasing
619 : * offsets). When processing all tuples on a page this leads to reading
620 : * memory at decreasing offsets within a page, with a variable stride.
621 : * That's hard for CPU prefetchers to deal with. Processing the items in
622 : * reverse order (and thus the tuples in increasing order) increases
623 : * prefetching efficiency significantly / decreases the number of cache
624 : * misses.
625 : */
626 1278294 : for (offnum = maxoff;
627 62139652 : offnum >= FirstOffsetNumber;
628 60861358 : offnum = OffsetNumberPrev(offnum))
629 : {
630 60861358 : ItemId itemid = PageGetItemId(page, offnum);
631 : HeapTupleHeader htup;
632 :
633 : /*
634 : * Set the offset number so that we can display it along with any
635 : * error that occurred while processing this tuple.
636 : */
637 60861358 : *off_loc = offnum;
638 :
639 60861358 : prstate.processed[offnum] = false;
640 60861358 : prstate.htsv[offnum] = -1;
641 :
642 : /* Nothing to do if slot doesn't contain a tuple */
643 60861358 : if (!ItemIdIsUsed(itemid))
644 : {
645 618914 : heap_prune_record_unchanged_lp_unused(page, &prstate, offnum);
646 618914 : continue;
647 : }
648 :
649 60242444 : if (ItemIdIsDead(itemid))
650 : {
651 : /*
652 : * If the caller set mark_unused_now true, we can set dead line
653 : * pointers LP_UNUSED now.
654 : */
655 2112438 : if (unlikely(prstate.mark_unused_now))
656 1052 : heap_prune_record_unused(&prstate, offnum, false);
657 : else
658 2111386 : heap_prune_record_unchanged_lp_dead(page, &prstate, offnum);
659 2112438 : continue;
660 : }
661 :
662 58130006 : if (ItemIdIsRedirected(itemid))
663 : {
664 : /* This is the start of a HOT chain */
665 647138 : prstate.root_items[prstate.nroot_items++] = offnum;
666 647138 : continue;
667 : }
668 :
669 : Assert(ItemIdIsNormal(itemid));
670 :
671 : /*
672 : * Get the tuple's visibility status and queue it up for processing.
673 : */
674 57482868 : htup = (HeapTupleHeader) PageGetItem(page, itemid);
675 57482868 : tup.t_data = htup;
676 57482868 : tup.t_len = ItemIdGetLength(itemid);
677 57482868 : ItemPointerSet(&tup.t_self, blockno, offnum);
678 :
679 57482868 : prstate.htsv[offnum] = heap_prune_satisfies_vacuum(&prstate, &tup,
680 : buffer);
681 :
682 57482868 : if (!HeapTupleHeaderIsHeapOnly(htup))
683 56597266 : prstate.root_items[prstate.nroot_items++] = offnum;
684 : else
685 885602 : prstate.heaponly_items[prstate.nheaponly_items++] = offnum;
686 : }
687 :
688 : /*
689 : * If checksums are enabled, heap_prune_satisfies_vacuum() may have caused
690 : * an FPI to be emitted.
691 : */
692 1278294 : did_tuple_hint_fpi = fpi_before != pgWalUsage.wal_fpi;
693 :
694 : /*
695 : * Process HOT chains.
696 : *
697 : * We added the items to the array starting from 'maxoff', so by
698 : * processing the array in reverse order, we process the items in
699 : * ascending offset number order. The order doesn't matter for
700 : * correctness, but some quick micro-benchmarking suggests that this is
701 : * faster. (Earlier PostgreSQL versions, which scanned all the items on
702 : * the page instead of using the root_items array, also did it in
703 : * ascending offset number order.)
704 : */
705 58522698 : for (int i = prstate.nroot_items - 1; i >= 0; i--)
706 : {
707 57244404 : offnum = prstate.root_items[i];
708 :
709 : /* Ignore items already processed as part of an earlier chain */
710 57244404 : if (prstate.processed[offnum])
711 0 : continue;
712 :
713 : /* see preceding loop */
714 57244404 : *off_loc = offnum;
715 :
716 : /* Process this item or chain of items */
717 57244404 : heap_prune_chain(page, blockno, maxoff, offnum, &prstate);
718 : }
719 :
720 : /*
721 : * Process any heap-only tuples that were not already processed as part of
722 : * a HOT chain.
723 : */
724 2163896 : for (int i = prstate.nheaponly_items - 1; i >= 0; i--)
725 : {
726 885602 : offnum = prstate.heaponly_items[i];
727 :
728 885602 : if (prstate.processed[offnum])
729 858646 : continue;
730 :
731 : /* see preceding loop */
732 26956 : *off_loc = offnum;
733 :
734 : /*
735 : * If the tuple is DEAD and doesn't chain to anything else, mark it
736 : * unused. (If it does chain, we can only remove it as part of
737 : * pruning its chain.)
738 : *
739 : * We need this primarily to handle aborted HOT updates, that is,
740 : * XMIN_INVALID heap-only tuples. Those might not be linked to by any
741 : * chain, since the parent tuple might be re-updated before any
742 : * pruning occurs. So we have to be able to reap them separately from
743 : * chain-pruning. (Note that HeapTupleHeaderIsHotUpdated will never
744 : * return true for an XMIN_INVALID tuple, so this code will work even
745 : * when there were sequential updates within the aborted transaction.)
746 : */
747 26956 : if (prstate.htsv[offnum] == HEAPTUPLE_DEAD)
748 : {
749 4322 : ItemId itemid = PageGetItemId(page, offnum);
750 4322 : HeapTupleHeader htup = (HeapTupleHeader) PageGetItem(page, itemid);
751 :
752 4322 : if (likely(!HeapTupleHeaderIsHotUpdated(htup)))
753 : {
754 4322 : HeapTupleHeaderAdvanceConflictHorizon(htup,
755 : &prstate.latest_xid_removed);
756 4322 : heap_prune_record_unused(&prstate, offnum, true);
757 : }
758 : else
759 : {
760 : /*
761 : * This tuple should've been processed and removed as part of
762 : * a HOT chain, so something's wrong. To preserve evidence,
763 : * we don't dare to remove it. We cannot leave behind a DEAD
764 : * tuple either, because that will cause VACUUM to error out.
765 : * Throwing an error with a distinct error message seems like
766 : * the least bad option.
767 : */
768 0 : elog(ERROR, "dead heap-only tuple (%u, %d) is not linked to from any HOT chain",
769 : blockno, offnum);
770 : }
771 : }
772 : else
773 22634 : heap_prune_record_unchanged_lp_normal(page, &prstate, offnum);
774 : }
775 :
776 : /* We should now have processed every tuple exactly once */
777 : #ifdef USE_ASSERT_CHECKING
778 : for (offnum = FirstOffsetNumber;
779 : offnum <= maxoff;
780 : offnum = OffsetNumberNext(offnum))
781 : {
782 : *off_loc = offnum;
783 :
784 : Assert(prstate.processed[offnum]);
785 : }
786 : #endif
787 :
788 : /* Clear the offset information once we have processed the given page. */
789 1278294 : *off_loc = InvalidOffsetNumber;
790 :
791 3801446 : do_prune = prstate.nredirected > 0 ||
792 2451836 : prstate.ndead > 0 ||
793 1173542 : prstate.nunused > 0;
794 :
795 : /*
796 : * Even if we don't prune anything, if we found a new value for the
797 : * pd_prune_xid field or the page was marked full, we will update the hint
798 : * bit.
799 : */
800 2451266 : do_hint_prune = ((PageHeader) page)->pd_prune_xid != prstate.new_prune_xid ||
801 1172972 : PageIsFull(page);
802 :
803 : /*
804 : * Decide if we want to go ahead with freezing according to the freeze
805 : * plans we prepared, or not.
806 : */
807 1278294 : do_freeze = heap_page_will_freeze(params->relation, buffer,
808 : did_tuple_hint_fpi,
809 : do_prune,
810 : do_hint_prune,
811 : &prstate);
812 :
813 : /*
814 : * While scanning the line pointers, we did not clear
815 : * all_visible/all_frozen when encountering LP_DEAD items because we
816 : * wanted the decision whether or not to freeze the page to be unaffected
817 : * by the short-term presence of LP_DEAD items. These LP_DEAD items are
818 : * effectively assumed to be LP_UNUSED items in the making. It doesn't
819 : * matter which vacuum heap pass (initial pass or final pass) ends up
820 : * setting the page all-frozen, as long as the ongoing VACUUM does it.
821 : *
822 : * Now that we finished determining whether or not to freeze the page,
823 : * update all_visible and all_frozen so that they reflect the true state
824 : * of the page for setting PD_ALL_VISIBLE and VM bits.
825 : */
826 1278294 : if (prstate.lpdead_items > 0)
827 108022 : prstate.all_visible = prstate.all_frozen = false;
828 :
829 : Assert(!prstate.all_frozen || prstate.all_visible);
830 :
831 : /* Any error while applying the changes is critical */
832 1278294 : START_CRIT_SECTION();
833 :
834 1278294 : if (do_hint_prune)
835 : {
836 : /*
837 : * Update the page's pd_prune_xid field to either zero, or the lowest
838 : * XID of any soon-prunable tuple.
839 : */
840 105410 : ((PageHeader) page)->pd_prune_xid = prstate.new_prune_xid;
841 :
842 : /*
843 : * Also clear the "page is full" flag, since there's no point in
844 : * repeating the prune/defrag process until something else happens to
845 : * the page.
846 : */
847 105410 : PageClearFull(page);
848 :
849 : /*
850 : * If that's all we had to do to the page, this is a non-WAL-logged
851 : * hint. If we are going to freeze or prune the page, we will mark
852 : * the buffer dirty below.
853 : */
854 105410 : if (!do_freeze && !do_prune)
855 304 : MarkBufferDirtyHint(buffer, true);
856 : }
857 :
858 1278294 : if (do_prune || do_freeze)
859 : {
860 : /* Apply the planned item changes and repair page fragmentation. */
861 147446 : if (do_prune)
862 : {
863 105514 : heap_page_prune_execute(buffer, false,
864 : prstate.redirected, prstate.nredirected,
865 : prstate.nowdead, prstate.ndead,
866 : prstate.nowunused, prstate.nunused);
867 : }
868 :
869 147446 : if (do_freeze)
870 46030 : heap_freeze_prepared_tuples(buffer, prstate.frozen, prstate.nfrozen);
871 :
872 147446 : MarkBufferDirty(buffer);
873 :
874 : /*
875 : * Emit a WAL XLOG_HEAP2_PRUNE* record showing what we did
876 : */
877 147446 : if (RelationNeedsWAL(params->relation))
878 : {
879 : /*
880 : * The snapshotConflictHorizon for the whole record should be the
881 : * most conservative of all the horizons calculated for any of the
882 : * possible modifications. If this record will prune tuples, any
883 : * standby transactions older than the youngest xmax among the
884 : * tuples it removes will conflict. If this record will freeze
885 : * tuples, any standby transactions with xids older than the
886 : * youngest xmin among the tuples it freezes will conflict.
888 : */
889 : TransactionId conflict_xid;
890 :
891 145694 : if (TransactionIdFollows(prstate.frz_conflict_horizon,
892 : prstate.latest_xid_removed))
893 42630 : conflict_xid = prstate.frz_conflict_horizon;
894 : else
895 103064 : conflict_xid = prstate.latest_xid_removed;
896 :
897 145694 : log_heap_prune_and_freeze(params->relation, buffer,
898 : InvalidBuffer, /* vmbuffer */
899 : 0, /* vmflags */
900 : conflict_xid,
901 : true, params->reason,
902 : prstate.frozen, prstate.nfrozen,
903 : prstate.redirected, prstate.nredirected,
904 : prstate.nowdead, prstate.ndead,
905 : prstate.nowunused, prstate.nunused);
906 : }
907 : }
908 :
909 1278294 : END_CRIT_SECTION();
910 :
911 : /* Copy information back for caller */
912 1278294 : presult->ndeleted = prstate.ndeleted;
913 1278294 : presult->nnewlpdead = prstate.ndead;
914 1278294 : presult->nfrozen = prstate.nfrozen;
915 1278294 : presult->live_tuples = prstate.live_tuples;
916 1278294 : presult->recently_dead_tuples = prstate.recently_dead_tuples;
917 1278294 : presult->all_visible = prstate.all_visible;
918 1278294 : presult->all_frozen = prstate.all_frozen;
919 1278294 : presult->hastup = prstate.hastup;
920 :
921 : /*
922 : * For callers planning to update the visibility map, the conflict horizon
923 : * for that record must be the newest xmin on the page. However, if the
924 : * page is completely frozen, there can be no conflict and the
925 : * vm_conflict_horizon should remain InvalidTransactionId. This includes
926 : * the case that we just froze all the tuples; the prune-freeze record
927 : * included the conflict XID already so the caller doesn't need it.
928 : */
929 1278294 : if (presult->all_frozen)
930 441838 : presult->vm_conflict_horizon = InvalidTransactionId;
931 : else
932 836456 : presult->vm_conflict_horizon = prstate.visibility_cutoff_xid;
933 :
934 1278294 : presult->lpdead_items = prstate.lpdead_items;
935 : /* the presult->deadoffsets array was already filled in */
936 :
937 1278294 : if (prstate.attempt_freeze)
938 : {
939 1195510 : if (presult->nfrozen > 0)
940 : {
941 46030 : *new_relfrozen_xid = prstate.pagefrz.FreezePageRelfrozenXid;
942 46030 : *new_relmin_mxid = prstate.pagefrz.FreezePageRelminMxid;
943 : }
944 : else
945 : {
946 1149480 : *new_relfrozen_xid = prstate.pagefrz.NoFreezePageRelfrozenXid;
947 1149480 : *new_relmin_mxid = prstate.pagefrz.NoFreezePageRelminMxid;
948 : }
949 : }
950 1278294 : }
951 :
952 :
953 : /*
954 : * Perform visibility checks for heap pruning.
955 : */
956 : static HTSV_Result
957 57482868 : heap_prune_satisfies_vacuum(PruneState *prstate, HeapTuple tup, Buffer buffer)
958 : {
959 : HTSV_Result res;
960 : TransactionId dead_after;
961 :
962 57482868 : res = HeapTupleSatisfiesVacuumHorizon(tup, buffer, &dead_after);
963 :
964 57482868 : if (res != HEAPTUPLE_RECENTLY_DEAD)
965 54006570 : return res;
966 :
967 : /*
968 : * For VACUUM, we must be sure to prune tuples with xmax older than
969 : * OldestXmin -- a visibility cutoff determined at the beginning of
970 : * vacuuming the relation. OldestXmin is used for freezing determination
971 : * and we cannot freeze dead tuples' xmaxes.
972 : */
973 3476298 : if (prstate->cutoffs &&
974 1954884 : TransactionIdIsValid(prstate->cutoffs->OldestXmin) &&
975 1954884 : NormalTransactionIdPrecedes(dead_after, prstate->cutoffs->OldestXmin))
976 1301700 : return HEAPTUPLE_DEAD;
977 :
978 : /*
979 : * Determine whether or not the tuple is considered dead when compared
980 : * with the provided GlobalVisState. On-access pruning does not provide
981 : * VacuumCutoffs. And for vacuum, even if the tuple's xmax is not older
982 : * than OldestXmin, GlobalVisTestIsRemovableXid() could find the row dead
983 : * if the GlobalVisState has been updated since the beginning of vacuuming
984 : * the relation.
985 : */
986 2174598 : if (GlobalVisTestIsRemovableXid(prstate->vistest, dead_after))
987 1463514 : return HEAPTUPLE_DEAD;
988 :
989 711084 : return res;
990 : }
991 :
992 :
993 : /*
994 : * Pruning calculates tuple visibility once and saves the results in an array
995 : * of int8. See PruneState.htsv for details. This helper function is meant
996 : * to guard against examining visibility status array members which have not
997 : * yet been computed.
998 : */
999 : static inline HTSV_Result
1000 57455912 : htsv_get_valid_status(int status)
1001 : {
1002 : Assert(status >= HEAPTUPLE_DEAD &&
1003 : status <= HEAPTUPLE_DELETE_IN_PROGRESS);
1004 57455912 : return (HTSV_Result) status;
1005 : }
1006 :
1007 : /*
1008 : * Prune specified line pointer or a HOT chain originating at line pointer.
1009 : *
1010 : * Tuple visibility information is provided in prstate->htsv.
1011 : *
1012 : * If the item is an index-referenced tuple (i.e. not a heap-only tuple),
1013 : * the HOT chain is pruned by removing all DEAD tuples at the start of the HOT
1014 : * chain. We also prune any RECENTLY_DEAD tuples preceding a DEAD tuple.
1015 : * This is OK because a RECENTLY_DEAD tuple preceding a DEAD tuple is really
1016 : * DEAD, our visibility test is just too coarse to detect it.
1017 : *
1018 : * Pruning must never leave behind a DEAD tuple that still has tuple storage.
1019 : * VACUUM isn't prepared to deal with that case.
1020 : *
1021 : * The root line pointer is redirected to the tuple immediately after the
1022 : * latest DEAD tuple. If all tuples in the chain are DEAD, the root line
1023 : * pointer is marked LP_DEAD. (This includes the case of a DEAD simple
1024 : * tuple, which we treat as a chain of length 1.)
1025 : *
1026 : * We don't actually change the page here. We just add entries to the arrays in
1027 : * prstate showing the changes to be made. Items to be redirected are added
1028 : * to the redirected[] array (two entries per redirection); items to be set to
1029 : * LP_DEAD state are added to nowdead[]; and items to be set to LP_UNUSED
1030 : * state are added to nowunused[]. We perform bookkeeping of live tuples,
1031 : * visibility etc. based on what the page will look like after the changes
1032 : * applied. All that bookkeeping is performed in the heap_prune_record_*()
1033 : * subroutines. The division of labor is that heap_prune_chain() decides the
1034 : * fate of each tuple, ie. whether it's going to be removed, redirected or
1035 : * left unchanged, and the heap_prune_record_*() subroutines update PruneState
1036 : * based on that outcome.
1037 : */
1038 : static void
1039 57244404 : heap_prune_chain(Page page, BlockNumber blockno, OffsetNumber maxoff,
1040 : OffsetNumber rootoffnum, PruneState *prstate)
1041 : {
1042 57244404 : TransactionId priorXmax = InvalidTransactionId;
1043 : ItemId rootlp;
1044 : OffsetNumber offnum;
1045 : OffsetNumber chainitems[MaxHeapTuplesPerPage];
1046 :
1047 : /*
1048 : * After traversing the HOT chain, ndeadchain is the index in chainitems
1049 : * of the first live successor after the last dead item.
1050 : */
1051 57244404 : int ndeadchain = 0,
1052 57244404 : nchain = 0;
1053 :
1054 57244404 : rootlp = PageGetItemId(page, rootoffnum);
1055 :
1056 : /* Start from the root tuple */
1057 57244404 : offnum = rootoffnum;
1058 :
1059 : /* while not end of the chain */
1060 : for (;;)
1061 858646 : {
1062 : HeapTupleHeader htup;
1063 : ItemId lp;
1064 :
1065 : /* Sanity check (pure paranoia) */
1066 58103050 : if (offnum < FirstOffsetNumber)
1067 0 : break;
1068 :
1069 : /*
1070 : * An offset past the end of the page's line pointer array is possible
1071 : * when the array was truncated (original item must have been unused)
1072 : */
1073 58103050 : if (offnum > maxoff)
1074 0 : break;
1075 :
1076 : /* If the item is already processed, stop --- it must not be the same chain */
1077 58103050 : if (prstate->processed[offnum])
1078 0 : break;
1079 :
1080 58103050 : lp = PageGetItemId(page, offnum);
1081 :
1082 : /*
1083 : * Unused item obviously isn't part of the chain. Likewise, a dead
1084 : * line pointer can't be part of the chain. Both of those cases were
1085 : * already marked as processed.
1086 : */
1087 : Assert(ItemIdIsUsed(lp));
1088 : Assert(!ItemIdIsDead(lp));
1089 :
1090 : /*
1091 : * If we are looking at the redirected root line pointer, jump to the
1092 : * first normal tuple in the chain. If we find a redirect somewhere
1093 : * else, stop --- it must not be the same chain.
1094 : */
1095 58103050 : if (ItemIdIsRedirected(lp))
1096 : {
1097 647138 : if (nchain > 0)
1098 0 : break; /* not at start of chain */
1099 647138 : chainitems[nchain++] = offnum;
1100 647138 : offnum = ItemIdGetRedirect(rootlp);
1101 647138 : continue;
1102 : }
1103 :
1104 : Assert(ItemIdIsNormal(lp));
1105 :
1106 57455912 : htup = (HeapTupleHeader) PageGetItem(page, lp);
1107 :
1108 : /*
1109 : * Check the tuple XMIN against prior XMAX, if any
1110 : */
1111 57667420 : if (TransactionIdIsValid(priorXmax) &&
1112 211508 : !TransactionIdEquals(HeapTupleHeaderGetXmin(htup), priorXmax))
1113 0 : break;
1114 :
1115 : /*
1116 : * OK, this tuple is indeed a member of the chain.
1117 : */
1118 57455912 : chainitems[nchain++] = offnum;
1119 :
1120 57455912 : switch (htsv_get_valid_status(prstate->htsv[offnum]))
1121 : {
1122 2850544 : case HEAPTUPLE_DEAD:
1123 :
1124 : /* Remember the last DEAD tuple seen */
1125 2850544 : ndeadchain = nchain;
1126 2850544 : HeapTupleHeaderAdvanceConflictHorizon(htup,
1127 : &prstate->latest_xid_removed);
1128 : /* Advance to next chain member */
1129 2850544 : break;
1130 :
1131 711084 : case HEAPTUPLE_RECENTLY_DEAD:
1132 :
1133 : /*
1134 : * We don't need to advance the conflict horizon for
1135 : * RECENTLY_DEAD tuples, even if we are removing them. This
1136 : * is because we only remove RECENTLY_DEAD tuples if they
1137 : * precede a DEAD tuple, and the DEAD tuple must have been
1138 : * inserted by a newer transaction than the RECENTLY_DEAD
1139 : * tuple by virtue of being later in the chain. We will have
1140 : * advanced the conflict horizon for the DEAD tuple.
1141 : */
1142 :
1143 : /*
1144 : * Advance past RECENTLY_DEAD tuples just in case there's a
1145 : * DEAD one after them. We have to make sure that we don't
1146 : * miss any DEAD tuples, since DEAD tuples that still have
1147 : * tuple storage after pruning will confuse VACUUM.
1148 : */
1149 711084 : break;
1150 :
1151 53894284 : case HEAPTUPLE_DELETE_IN_PROGRESS:
1152 : case HEAPTUPLE_LIVE:
1153 : case HEAPTUPLE_INSERT_IN_PROGRESS:
1154 53894284 : goto process_chain;
1155 :
1156 0 : default:
1157 0 : elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
1158 : goto process_chain;
1159 : }
1160 :
1161 : /*
1162 : * If the tuple is not HOT-updated, then we are at the end of this
1163 : * HOT-update chain.
1164 : */
1165 3561628 : if (!HeapTupleHeaderIsHotUpdated(htup))
1166 3350120 : goto process_chain;
1167 :
1168 : /* HOT implies it can't have moved to different partition */
1169 : Assert(!HeapTupleHeaderIndicatesMovedPartitions(htup));
1170 :
1171 : /*
1172 : * Advance to next chain member.
1173 : */
1174 : Assert(ItemPointerGetBlockNumber(&htup->t_ctid) == blockno);
1175 211508 : offnum = ItemPointerGetOffsetNumber(&htup->t_ctid);
1176 211508 : priorXmax = HeapTupleHeaderGetUpdateXid(htup);
1177 : }
1178 :
1179 0 : if (ItemIdIsRedirected(rootlp) && nchain < 2)
1180 : {
1181 : /*
1182 : * We found a redirect item that doesn't point to a valid follow-on
1183 : * item. This can happen if the loop in heap_page_prune_and_freeze()
1184 : * caused us to visit the dead successor of a redirect item before
1185 : * visiting the redirect item. We can clean up by setting the
1186 : * redirect item to LP_DEAD state or LP_UNUSED if the caller
1187 : * indicated.
1188 : */
1189 0 : heap_prune_record_dead_or_unused(prstate, rootoffnum, false);
1190 0 : return;
1191 : }
1192 :
1193 0 : process_chain:
1194 :
1195 57244404 : if (ndeadchain == 0)
1196 : {
1197 : /*
1198 : * No DEAD tuple was found, so the chain is entirely composed of
1199 : * normal, unchanged tuples. Leave it alone.
1200 : */
1201 54464286 : int i = 0;
1202 :
1203 54464286 : if (ItemIdIsRedirected(rootlp))
1204 : {
1205 612546 : heap_prune_record_unchanged_lp_redirect(prstate, rootoffnum);
1206 612546 : i++;
1207 : }
1208 108936086 : for (; i < nchain; i++)
1209 54471800 : heap_prune_record_unchanged_lp_normal(page, prstate, chainitems[i]);
1210 : }
1211 2780118 : else if (ndeadchain == nchain)
1212 : {
1213 : /*
1214 : * The entire chain is dead. Mark the root line pointer LP_DEAD, and
1215 : * fully remove the other tuples in the chain.
1216 : */
1217 2650308 : heap_prune_record_dead_or_unused(prstate, rootoffnum, ItemIdIsNormal(rootlp));
1218 2718216 : for (int i = 1; i < nchain; i++)
1219 67908 : heap_prune_record_unused(prstate, chainitems[i], true);
1220 : }
1221 : else
1222 : {
1223 : /*
1224 : * We found a DEAD tuple in the chain. Redirect the root line pointer
1225 : * to the first non-DEAD tuple, and mark as unused each intermediate
1226 : * item that we are able to remove from the chain.
1227 : */
1228 129810 : heap_prune_record_redirect(prstate, rootoffnum, chainitems[ndeadchain],
1229 129810 : ItemIdIsNormal(rootlp));
1230 166920 : for (int i = 1; i < ndeadchain; i++)
1231 37110 : heap_prune_record_unused(prstate, chainitems[i], true);
1232 :
1233 : /* the rest of tuples in the chain are normal, unchanged tuples */
1234 263378 : for (int i = ndeadchain; i < nchain; i++)
1235 133568 : heap_prune_record_unchanged_lp_normal(page, prstate, chainitems[i]);
1236 : }
1237 : }
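A worked example of the three outcomes above, for a hypothetical chain whose members have these precomputed htsv statuses:

    chainitems[0]   root (LP_REDIRECT)
    chainitems[1]   HEAPTUPLE_DEAD
    chainitems[2]   HEAPTUPLE_RECENTLY_DEAD  (precedes a DEAD tuple)
    chainitems[3]   HEAPTUPLE_DEAD           (last DEAD: ndeadchain = 4)
    chainitems[4]   HEAPTUPLE_LIVE           (loop exits via process_chain)

With nchain = 5 and 0 < ndeadchain < nchain, the root is redirected to chainitems[4] and chainitems[1..3] are marked LP_UNUSED; note that the RECENTLY_DEAD tuple is removed because it precedes a DEAD one. Had chainitems[4] also been DEAD (ndeadchain == nchain), the root would instead become LP_DEAD (or LP_UNUSED under mark_unused_now) and all other members LP_UNUSED.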
1238 :
1239 : /* Record lowest soon-prunable XID */
1240 : static void
1241 17916566 : heap_prune_record_prunable(PruneState *prstate, TransactionId xid)
1242 : {
1243 : /*
1244 : * This should exactly match the PageSetPrunable macro. We can't store
1245 : * directly into the page header yet, so we update working state.
1246 : */
1247 : Assert(TransactionIdIsNormal(xid));
1248 35129228 : if (!TransactionIdIsValid(prstate->new_prune_xid) ||
1249 17212662 : TransactionIdPrecedes(xid, prstate->new_prune_xid))
1250 705986 : prstate->new_prune_xid = xid;
1251 17916566 : }
1252 :
1253 : /* Record line pointer to be redirected */
1254 : static void
1255 129810 : heap_prune_record_redirect(PruneState *prstate,
1256 : OffsetNumber offnum, OffsetNumber rdoffnum,
1257 : bool was_normal)
1258 : {
1259 : Assert(!prstate->processed[offnum]);
1260 129810 : prstate->processed[offnum] = true;
1261 :
1262 : /*
1263 : * Do not mark the redirect target here. It needs to be counted
1264 : * separately as an unchanged tuple.
1265 : */
1266 :
1267 : Assert(prstate->nredirected < MaxHeapTuplesPerPage);
1268 129810 : prstate->redirected[prstate->nredirected * 2] = offnum;
1269 129810 : prstate->redirected[prstate->nredirected * 2 + 1] = rdoffnum;
1270 :
1271 129810 : prstate->nredirected++;
1272 :
1273 : /*
1274 : * If the root entry had been a normal tuple, we are deleting it, so count
1275 : * it in the result. But changing a redirect (even to DEAD state) doesn't
1276 : * count.
1277 : */
1278 129810 : if (was_normal)
1279 114546 : prstate->ndeleted++;
1280 :
1281 129810 : prstate->hastup = true;
1282 129810 : }
1283 :
1284 : /* Record line pointer to be marked dead */
1285 : static void
1286 2581234 : heap_prune_record_dead(PruneState *prstate, OffsetNumber offnum,
1287 : bool was_normal)
1288 : {
1289 : Assert(!prstate->processed[offnum]);
1290 2581234 : prstate->processed[offnum] = true;
1291 :
1292 : Assert(prstate->ndead < MaxHeapTuplesPerPage);
1293 2581234 : prstate->nowdead[prstate->ndead] = offnum;
1294 2581234 : prstate->ndead++;
1295 :
1296 : /*
1297 : * Deliberately delay unsetting all_visible and all_frozen until later
1298 : * during pruning. Removable dead tuples shouldn't preclude freezing the
1299 : * page.
1300 : */
1301 :
1302 : /* Record the dead offset for vacuum */
1303 2581234 : prstate->deadoffsets[prstate->lpdead_items++] = offnum;
1304 :
1305 : /*
1306 : * If the root entry had been a normal tuple, we are deleting it, so count
1307 : * it in the result. But changing a redirect (even to DEAD state) doesn't
1308 : * count.
1309 : */
1310 2581234 : if (was_normal)
1311 2561906 : prstate->ndeleted++;
1312 2581234 : }
1313 :
1314 : /*
1315 : * Depending on whether or not the caller set mark_unused_now to true, record that a
1316 : * line pointer should be marked LP_DEAD or LP_UNUSED. There are other cases in
1317 : * which we will mark line pointers LP_UNUSED, but we will not mark line
1318 : * pointers LP_DEAD if mark_unused_now is true.
1319 : */
1320 : static void
1321 2650308 : heap_prune_record_dead_or_unused(PruneState *prstate, OffsetNumber offnum,
1322 : bool was_normal)
1323 : {
1324 : /*
1325 : * If the caller set mark_unused_now to true, we can remove dead tuples
1326 : * during pruning instead of marking their line pointers dead. Set this
1327 : * tuple's line pointer LP_UNUSED. We hint that this option is less
1328 : * likely.
1329 : */
1330 2650308 : if (unlikely(prstate->mark_unused_now))
1331 69074 : heap_prune_record_unused(prstate, offnum, was_normal);
1332 : else
1333 2581234 : heap_prune_record_dead(prstate, offnum, was_normal);
1334 2650308 : }
1335 :
1336 : /* Record line pointer to be marked unused */
1337 : static void
1338 179466 : heap_prune_record_unused(PruneState *prstate, OffsetNumber offnum, bool was_normal)
1339 : {
1340 : Assert(!prstate->processed[offnum]);
1341 179466 : prstate->processed[offnum] = true;
1342 :
1343 : Assert(prstate->nunused < MaxHeapTuplesPerPage);
1344 179466 : prstate->nowunused[prstate->nunused] = offnum;
1345 179466 : prstate->nunused++;
1346 :
1347 : /*
1348 : * If the root entry had been a normal tuple, we are deleting it, so count
1349 : * it in the result. But changing a redirect (even to DEAD state) doesn't
1350 : * count.
1351 : */
1352 179466 : if (was_normal)
1353 178414 : prstate->ndeleted++;
1354 179466 : }
1355 :
1356 : /*
1357 : * Record an unused line pointer that is left unchanged.
1358 : */
1359 : static void
1360 618914 : heap_prune_record_unchanged_lp_unused(Page page, PruneState *prstate, OffsetNumber offnum)
1361 : {
1362 : Assert(!prstate->processed[offnum]);
1363 618914 : prstate->processed[offnum] = true;
1364 618914 : }
1365 :
1366 : /*
1367 : * Record line pointer that is left unchanged. We consider freezing it, and
1368 : * update bookkeeping of tuple counts and page visibility.
1369 : */
1370 : static void
1371 54628002 : heap_prune_record_unchanged_lp_normal(Page page, PruneState *prstate, OffsetNumber offnum)
1372 : {
1373 : HeapTupleHeader htup;
1374 :
1375 : Assert(!prstate->processed[offnum]);
1376 54628002 : prstate->processed[offnum] = true;
1377 :
1378 54628002 : prstate->hastup = true; /* the page is not empty */
1379 :
1380 : /*
1381 : * The criteria for counting a tuple as live in this block need to match
1382 : * what analyze.c's acquire_sample_rows() does, otherwise VACUUM and
1383 : * ANALYZE may produce wildly different reltuples values, e.g. when there
1384 : * are many recently-dead tuples.
1385 : *
1386 : * The logic here is a bit simpler than acquire_sample_rows(), as VACUUM
1387 : * can't run inside a transaction block, which makes some cases impossible
1388 : * (e.g. in-progress insert from the same transaction).
1389 : *
1390 : * HEAPTUPLE_DEAD are handled by the other heap_prune_record_*()
1391 : * subroutines. They don't count dead items like acquire_sample_rows()
1392 : * does, because we assume that all dead items will become LP_UNUSED
1393 : * before VACUUM finishes. This difference is only superficial. VACUUM
1394 : * effectively agrees with ANALYZE about DEAD items, in the end. VACUUM
1395 : * won't remember LP_DEAD items, but only because they're not supposed to
1396 : * be left behind when it is done. (Cases where we bypass index vacuuming
1397 : * will violate this optimistic assumption, but the overall impact of that
1398 : * should be negligible.)
1399 : */
1400 54628002 : htup = (HeapTupleHeader) PageGetItem(page, PageGetItemId(page, offnum));
1401 :
1402 54628002 : switch (prstate->htsv[offnum])
1403 : {
1404 36588222 : case HEAPTUPLE_LIVE:
1405 :
1406 : /*
1407 : * Count it as live. Not only is this natural, but it's also what
1408 : * acquire_sample_rows() does.
1409 : */
1410 36588222 : prstate->live_tuples++;
1411 :
1412 : /*
1413 : * Is the tuple definitely visible to all transactions?
1414 : *
1415 : * NB: Like with per-tuple hint bits, we can't set the
1416 : * PD_ALL_VISIBLE flag if the inserter committed asynchronously.
1417 : * See SetHintBits for more info. Check that the tuple is hinted
1418 : * xmin-committed because of that.
1419 : */
1420 36588222 : if (prstate->all_visible)
1421 : {
1422 : TransactionId xmin;
1423 :
1424 26584454 : if (!HeapTupleHeaderXminCommitted(htup))
1425 : {
1426 336 : prstate->all_visible = false;
1427 336 : prstate->all_frozen = false;
1428 336 : break;
1429 : }
1430 :
1431 : /*
1432 : * The inserter definitely committed. But is it old enough
1433 : * that everyone sees it as committed? A FrozenTransactionId
1434 : * is seen as committed to everyone. Otherwise, we check if
1435 : * there is a snapshot that considers this xid to still be
1436 : * running, and if so, we don't consider the page all-visible.
1437 : */
1438 26584118 : xmin = HeapTupleHeaderGetXmin(htup);
1439 :
1440 : /*
1441 : * For now always use prstate->cutoffs for this test, because
1442 : * we only update 'all_visible' and 'all_frozen' when freezing
1443 : * is requested. We could use GlobalVisTestIsRemovableXid
1444 : * instead, if a non-freezing caller wanted to set the VM bit.
1445 : */
1446 : Assert(prstate->cutoffs);
1447 26584118 : if (!TransactionIdPrecedes(xmin, prstate->cutoffs->OldestXmin))
1448 : {
1449 5964 : prstate->all_visible = false;
1450 5964 : prstate->all_frozen = false;
1451 5964 : break;
1452 : }
1453 :
1454 : /* Track newest xmin on page. */
1455 26578154 : if (TransactionIdFollows(xmin, prstate->visibility_cutoff_xid) &&
1456 : TransactionIdIsNormal(xmin))
1457 227376 : prstate->visibility_cutoff_xid = xmin;
1458 : }
1459 36581922 : break;
1460 :
1461 711084 : case HEAPTUPLE_RECENTLY_DEAD:
1462 711084 : prstate->recently_dead_tuples++;
1463 711084 : prstate->all_visible = false;
1464 711084 : prstate->all_frozen = false;
1465 :
1466 : /*
1467 : * This tuple will soon become DEAD. Update the hint field so
1468 : * that the page is reconsidered for pruning in future.
1469 : */
1470 711084 : heap_prune_record_prunable(prstate,
1471 : HeapTupleHeaderGetUpdateXid(htup));
1472 711084 : break;
1473 :
1474 123214 : case HEAPTUPLE_INSERT_IN_PROGRESS:
1475 :
1476 : /*
1477 : * We do not count these rows as live, because we expect the
1478 : * inserting transaction to update the counters at commit, and we
1479 : * assume that will happen only after we report our results. This
1480 : * assumption is a bit shaky, but it is what acquire_sample_rows()
1481 : * does, so be consistent.
1482 : */
1483 123214 : prstate->all_visible = false;
1484 123214 : prstate->all_frozen = false;
1485 :
1486 : /*
1487 : * If we wanted to optimize for aborts, we might consider marking
1488 : * the page prunable when we see INSERT_IN_PROGRESS. But we
1489 : * don't. See related decisions about when to mark the page
1490 : * prunable in heapam.c.
1491 : */
1492 123214 : break;
1493 :
1494 17205482 : case HEAPTUPLE_DELETE_IN_PROGRESS:
1495 :
1496 : /*
1497 : * This is an expected case during concurrent vacuum. Count such
1498 : * rows as live. As above, we assume the deleting transaction
1499 : * will commit and update the counters after we report.
1500 : */
1501 17205482 : prstate->live_tuples++;
1502 17205482 : prstate->all_visible = false;
1503 17205482 : prstate->all_frozen = false;
1504 :
1505 : /*
1506 : * This tuple may soon become DEAD. Update the hint field so that
1507 : * the page is reconsidered for pruning in future.
1508 : */
1509 17205482 : heap_prune_record_prunable(prstate,
1510 : HeapTupleHeaderGetUpdateXid(htup));
1511 17205482 : break;
1512 :
1513 0 : default:
1514 :
1515 : /*
1516 : * DEAD tuples should've been passed to heap_prune_record_dead()
1517 : * or heap_prune_record_unused() instead.
1518 : */
1519 0 : elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result %d",
1520 : prstate->htsv[offnum]);
1521 : break;
1522 : }
1523 :
1524 : /* Consider freezing any normal tuples which will not be removed */
1525 54628002 : if (prstate->attempt_freeze)
1526 : {
1527 : bool totally_frozen;
1528 :
1529 51698438 : if ((heap_prepare_freeze_tuple(htup,
1530 51698438 : prstate->cutoffs,
1531 : &prstate->pagefrz,
1532 51698438 : &prstate->frozen[prstate->nfrozen],
1533 : &totally_frozen)))
1534 : {
1535 : /* Save prepared freeze plan for later */
1536 4420416 : prstate->frozen[prstate->nfrozen++].offset = offnum;
1537 : }
1538 :
1539 : /*
1540 : * If any tuple isn't either totally frozen already or eligible to
1541 : * become totally frozen (according to its freeze plan), then the page
1542 : * definitely cannot be set all-frozen in the visibility map later on.
1543 : */
1544 51698438 : if (!totally_frozen)
1545 18475444 : prstate->all_frozen = false;
1546 : }
1547 54628002 : }
1548 :
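/*
 * Editor's sketch (not part of the PostgreSQL source): the all-visible xmin
 * test from the HEAPTUPLE_LIVE case above, isolated for illustration. The
 * helper name is hypothetical; 'oldest_xmin' stands in for
 * prstate->cutoffs->OldestXmin.
 */
static bool
xmin_seen_committed_by_all_sketch(HeapTupleHeader htup,
                                  TransactionId oldest_xmin)
{
    TransactionId xmin;

    /* Unhinted xmin: the inserter may have committed asynchronously */
    if (!HeapTupleHeaderXminCommitted(htup))
        return false;

    /*
     * FrozenTransactionId precedes every normal xid, so a frozen tuple
     * passes this test as well.
     */
    xmin = HeapTupleHeaderGetXmin(htup);
    return TransactionIdPrecedes(xmin, oldest_xmin);
}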
1549 :
1550 : /*
1551 : * Record line pointer that was already LP_DEAD and is left unchanged.
1552 : */
1553 : static void
1554 2111386 : heap_prune_record_unchanged_lp_dead(Page page, PruneState *prstate, OffsetNumber offnum)
1555 : {
1556 : Assert(!prstate->processed[offnum]);
1557 2111386 : prstate->processed[offnum] = true;
1558 :
1559 : /*
1560 : * Deliberately don't set hastup for LP_DEAD items. We make the soft
1561 : * assumption that any LP_DEAD items encountered here will become
1562 : * LP_UNUSED later on, before count_nondeletable_pages is reached. If we
1563 : * don't make this assumption then rel truncation will only happen every
1564 : * other VACUUM, at most. Besides, VACUUM must treat
1565 : * hastup/nonempty_pages as provisional no matter how LP_DEAD items are
1566 : * handled (handled here, or handled later on).
1567 : *
1568 : * Similarly, don't unset all_visible and all_frozen until later, at the
1569 : * end of heap_page_prune_and_freeze(). This will allow us to attempt to
1570 : * freeze the page after pruning. As long as we unset it before updating
1571 : * the visibility map, this will be correct.
1572 : */
1573 :
1574 : /* Record the dead offset for vacuum */
1575 2111386 : prstate->deadoffsets[prstate->lpdead_items++] = offnum;
1576 2111386 : }
1577 :
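/*
 * Editor's sketch (not part of the PostgreSQL source): the deadoffsets
 * collected above are later combined with the page's block number to form
 * the TIDs that VACUUM deletes from indexes. The function and parameter
 * names here are hypothetical.
 */
static void
deadoffsets_to_tids_sketch(BlockNumber blkno,
                           const OffsetNumber *deadoffsets, int lpdead_items,
                           ItemPointerData *dead_tids, int *ndead_tids)
{
    for (int i = 0; i < lpdead_items; i++)
        ItemPointerSet(&dead_tids[(*ndead_tids)++], blkno, deadoffsets[i]);
}
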
1578 : /*
1579 : * Record LP_REDIRECT that is left unchanged.
1580 : */
1581 : static void
1582 612546 : heap_prune_record_unchanged_lp_redirect(PruneState *prstate, OffsetNumber offnum)
1583 : {
1584 : /*
1585 : * A redirect line pointer doesn't count as a live tuple.
1586 : *
1587 : * If we leave a redirect line pointer in place, there will be another
1588 : * tuple on the page that it points to. We will do the bookkeeping for
1589 : * that separately. So we have nothing to do here, except remember that
1590 : * we processed this item.
1591 : */
1592 : Assert(!prstate->processed[offnum]);
1593 612546 : prstate->processed[offnum] = true;
1594 612546 : }
1595 :
1596 : /*
1597 : * Perform the actual page changes needed by heap_page_prune_and_freeze().
1598 : *
1599 : * If 'lp_truncate_only' is set, we are merely marking LP_DEAD line pointers
1600 : * as unused, not redirecting or removing anything else. The
1601 : * PageRepairFragmentation() call is skipped in that case.
1602 : *
1603 : * If 'lp_truncate_only' is not set, the caller must hold a cleanup lock on
1604 : * the buffer. If it is set, an ordinary exclusive lock suffices.
1605 : */
1606 : void
1607 120956 : heap_page_prune_execute(Buffer buffer, bool lp_truncate_only,
1608 : OffsetNumber *redirected, int nredirected,
1609 : OffsetNumber *nowdead, int ndead,
1610 : OffsetNumber *nowunused, int nunused)
1611 : {
1612 120956 : Page page = BufferGetPage(buffer);
1613 : OffsetNumber *offnum;
1614 : HeapTupleHeader htup PG_USED_FOR_ASSERTS_ONLY;
1615 :
1616 : /* Shouldn't be called unless there's something to do */
1617 : Assert(nredirected > 0 || ndead > 0 || nunused > 0);
1618 :
1619 : /* If 'lp_truncate_only', we can only remove already-dead line pointers */
1620 : Assert(!lp_truncate_only || (nredirected == 0 && ndead == 0));
1621 :
1622 : /* Update all redirected line pointers */
1623 120956 : offnum = redirected;
1624 287828 : for (int i = 0; i < nredirected; i++)
1625 : {
1626 166872 : OffsetNumber fromoff = *offnum++;
1627 166872 : OffsetNumber tooff = *offnum++;
1628 166872 : ItemId fromlp = PageGetItemId(page, fromoff);
1629 : ItemId tolp PG_USED_FOR_ASSERTS_ONLY;
1630 :
1631 : #ifdef USE_ASSERT_CHECKING
1632 :
1633 : /*
1634 : * Any existing item that we set as an LP_REDIRECT (any 'from' item)
1635 : * must be the first item from a HOT chain. If the item has tuple
1636 : * storage then it can't be a heap-only tuple. Otherwise we are just
1637 : * maintaining an existing LP_REDIRECT from an existing HOT chain that
1638 : * has been pruned at least once before now.
1639 : */
1640 : if (!ItemIdIsRedirected(fromlp))
1641 : {
1642 : Assert(ItemIdHasStorage(fromlp) && ItemIdIsNormal(fromlp));
1643 :
1644 : htup = (HeapTupleHeader) PageGetItem(page, fromlp);
1645 : Assert(!HeapTupleHeaderIsHeapOnly(htup));
1646 : }
1647 : else
1648 : {
1649 : /* We shouldn't need to redundantly set the redirect */
1650 : Assert(ItemIdGetRedirect(fromlp) != tooff);
1651 : }
1652 :
1653 : /*
1654 : * The item that we're about to set as an LP_REDIRECT (the 'from'
1655 : * item) will point to an existing item (the 'to' item) that is
1656 : * already a heap-only tuple. There can be at most one LP_REDIRECT
1657 : * item per HOT chain.
1658 : *
1659 : * We need to keep around an LP_REDIRECT item (after original
1660 : * non-heap-only root tuple gets pruned away) so that it's always
1661 : * possible for VACUUM to easily figure out what TID to delete from
1662 : * indexes when an entire HOT chain becomes dead. A heap-only tuple
1663 : * can never become LP_DEAD; an LP_REDIRECT item or a regular heap
1664 : * tuple can.
1665 : *
1666 : * This check may miss problems, e.g. the target of a redirect could
1667 : * be marked as unused subsequently. The page_verify_redirects() check
1668 : * below will catch such problems.
1669 : */
1670 : tolp = PageGetItemId(page, tooff);
1671 : Assert(ItemIdHasStorage(tolp) && ItemIdIsNormal(tolp));
1672 : htup = (HeapTupleHeader) PageGetItem(page, tolp);
1673 : Assert(HeapTupleHeaderIsHeapOnly(htup));
1674 : #endif
1675 :
1676 166872 : ItemIdSetRedirect(fromlp, tooff);
1677 : }
1678 :
1679 : /* Update all now-dead line pointers */
1680 120956 : offnum = nowdead;
1681 3082812 : for (int i = 0; i < ndead; i++)
1682 : {
1683 2961856 : OffsetNumber off = *offnum++;
1684 2961856 : ItemId lp = PageGetItemId(page, off);
1685 :
1686 : #ifdef USE_ASSERT_CHECKING
1687 :
1688 : /*
1689 : * An LP_DEAD line pointer must be left behind when the original item
1690 : * (which is dead to everybody) could still be referenced by a TID in
1691 : * an index. This should never be necessary with any individual
1692 : * heap-only tuple item, though. (It's not clear how much of a problem
1693 : * that would be, but there is no reason to allow it.)
1694 : */
1695 : if (ItemIdHasStorage(lp))
1696 : {
1697 : Assert(ItemIdIsNormal(lp));
1698 : htup = (HeapTupleHeader) PageGetItem(page, lp);
1699 : Assert(!HeapTupleHeaderIsHeapOnly(htup));
1700 : }
1701 : else
1702 : {
1703 : /* Whole HOT chain becomes dead */
1704 : Assert(ItemIdIsRedirected(lp));
1705 : }
1706 : #endif
1707 :
1708 2961856 : ItemIdSetDead(lp);
1709 : }
1710 :
1711 : /* Update all now-unused line pointers */
1712 120956 : offnum = nowunused;
1713 518028 : for (int i = 0; i < nunused; i++)
1714 : {
1715 397072 : OffsetNumber off = *offnum++;
1716 397072 : ItemId lp = PageGetItemId(page, off);
1717 :
1718 : #ifdef USE_ASSERT_CHECKING
1719 :
1720 : if (lp_truncate_only)
1721 : {
1722 : /* Setting LP_DEAD to LP_UNUSED in vacuum's second pass */
1723 : Assert(ItemIdIsDead(lp) && !ItemIdHasStorage(lp));
1724 : }
1725 : else
1726 : {
1727 : /*
1728 : * When heap_page_prune_and_freeze() was called, mark_unused_now
1729 : * may have been passed as true, which allows would-be LP_DEAD
1730 : * items to be made LP_UNUSED instead. This is only possible if
1731 : * the relation has no indexes. If there are any dead items, then
1732 : * mark_unused_now was not true and every item being marked
1733 : * LP_UNUSED must refer to a heap-only tuple.
1734 : */
1735 : if (ndead > 0)
1736 : {
1737 : Assert(ItemIdHasStorage(lp) && ItemIdIsNormal(lp));
1738 : htup = (HeapTupleHeader) PageGetItem(page, lp);
1739 : Assert(HeapTupleHeaderIsHeapOnly(htup));
1740 : }
1741 : else
1742 : Assert(ItemIdIsUsed(lp));
1743 : }
1744 :
1745 : #endif
1746 :
1747 397072 : ItemIdSetUnused(lp);
1748 : }
1749 :
1750 120956 : if (lp_truncate_only)
1751 2524 : PageTruncateLinePointerArray(page);
1752 : else
1753 : {
1754 : /*
1755 : * Finally, repair any fragmentation, and update the page's hint bit
1756 : * about whether it has free pointers.
1757 : */
1758 118432 : PageRepairFragmentation(page);
1759 :
1760 : /*
1761 : * Now that the page has been modified, assert that redirect items
1762 : * still point to valid targets.
1763 : */
1764 118432 : page_verify_redirects(page);
1765 : }
1766 120956 : }
1767 :
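/*
 * Editor's sketch (not part of the PostgreSQL source): the typical calling
 * pattern for heap_page_prune_execute(), loosely modeled on
 * heap_page_prune_and_freeze(). The function name is hypothetical, and the
 * real caller does additional bookkeeping (prune hint, visibility map,
 * stats) that is omitted here.
 */
static void
apply_prune_sketch(Relation relation, Buffer buffer, PruneState *prstate,
                   TransactionId conflict_xid, PruneReason reason)
{
    /* Page modification and WAL insertion must happen atomically */
    START_CRIT_SECTION();

    /* Apply the accumulated changes (assumed non-empty here) */
    heap_page_prune_execute(buffer, false,
                            prstate->redirected, prstate->nredirected,
                            prstate->nowdead, prstate->ndead,
                            prstate->nowunused, prstate->nunused);

    MarkBufferDirty(buffer);

    if (RelationNeedsWAL(relation))
        log_heap_prune_and_freeze(relation, buffer,
                                  InvalidBuffer, 0, /* no VM update */
                                  conflict_xid,
                                  true, /* replay requires cleanup lock */
                                  reason,
                                  prstate->frozen, prstate->nfrozen,
                                  prstate->redirected, prstate->nredirected,
                                  prstate->nowdead, prstate->ndead,
                                  prstate->nowunused, prstate->nunused);

    END_CRIT_SECTION();
}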
1768 :
1769 : /*
1770 : * If built with assertions, verify that all LP_REDIRECT items point to a
1771 : * valid item.
1772 : *
1773 : * One way that bugs related to HOT pruning show up is redirect items pointing
1774 : * to removed tuples. It's not trivial to reliably check that marking an item
1775 : * unused will not orphan a redirect item during heap_prune_chain() /
1776 : * heap_page_prune_execute(), so we additionally check the whole page after
1777 : * pruning. Without this check such bugs would typically only cause asserts
1778 : * later, potentially well after the corruption has been introduced.
1779 : *
1780 : * See also the comments in heap_page_prune_execute()'s redirection loop.
1781 : */
1782 : static void
1783 118432 : page_verify_redirects(Page page)
1784 : {
1785 : #ifdef USE_ASSERT_CHECKING
1786 : OffsetNumber offnum;
1787 : OffsetNumber maxoff;
1788 :
1789 : maxoff = PageGetMaxOffsetNumber(page);
1790 : for (offnum = FirstOffsetNumber;
1791 : offnum <= maxoff;
1792 : offnum = OffsetNumberNext(offnum))
1793 : {
1794 : ItemId itemid = PageGetItemId(page, offnum);
1795 : OffsetNumber targoff;
1796 : ItemId targitem;
1797 : HeapTupleHeader htup;
1798 :
1799 : if (!ItemIdIsRedirected(itemid))
1800 : continue;
1801 :
1802 : targoff = ItemIdGetRedirect(itemid);
1803 : targitem = PageGetItemId(page, targoff);
1804 :
1805 : Assert(ItemIdIsUsed(targitem));
1806 : Assert(ItemIdIsNormal(targitem));
1807 : Assert(ItemIdHasStorage(targitem));
1808 : htup = (HeapTupleHeader) PageGetItem(page, targitem);
1809 : Assert(HeapTupleHeaderIsHeapOnly(htup));
1810 : }
1811 : #endif
1812 118432 : }
1813 :
1814 :
1815 : /*
1816 : * For all items in this page, find their respective root line pointers.
1817 : * If item k is part of a HOT-chain with root at item j, then we set
1818 : * root_offsets[k - 1] = j.
1819 : *
1820 : * The passed-in root_offsets array must have MaxHeapTuplesPerPage entries.
1821 : * Unused entries are filled with InvalidOffsetNumber (zero).
1822 : *
1823 : * The function must be called with at least share lock on the buffer, to
1824 : * prevent concurrent prune operations.
1825 : *
1826 : * Note: The information collected here is valid only as long as the caller
1827 : * holds a pin on the buffer. Once the pin is released, a tuple might be
1828 : * pruned away and its line pointer reused for a completely unrelated tuple.
1829 : */
1830 : void
1831 223762 : heap_get_root_tuples(Page page, OffsetNumber *root_offsets)
1832 : {
1833 : OffsetNumber offnum,
1834 : maxoff;
1835 :
1836 223762 : MemSet(root_offsets, InvalidOffsetNumber,
1837 : MaxHeapTuplesPerPage * sizeof(OffsetNumber));
1838 :
1839 223762 : maxoff = PageGetMaxOffsetNumber(page);
1840 18058306 : for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum))
1841 : {
1842 17834544 : ItemId lp = PageGetItemId(page, offnum);
1843 : HeapTupleHeader htup;
1844 : OffsetNumber nextoffnum;
1845 : TransactionId priorXmax;
1846 :
1847 : /* skip unused and dead items */
1848 17834544 : if (!ItemIdIsUsed(lp) || ItemIdIsDead(lp))
1849 22102 : continue;
1850 :
1851 17812442 : if (ItemIdIsNormal(lp))
1852 : {
1853 17804066 : htup = (HeapTupleHeader) PageGetItem(page, lp);
1854 :
1855 : /*
1856 : * Check if this tuple is part of a HOT-chain rooted at some other
1857 : * tuple. If so, skip it for now; we'll process it when we find
1858 : * its root.
1859 : */
1860 17804066 : if (HeapTupleHeaderIsHeapOnly(htup))
1861 8952 : continue;
1862 :
1863 : /*
1864 : * This is either a plain tuple or the root of a HOT-chain.
1865 : * Remember it in the mapping.
1866 : */
1867 17795114 : root_offsets[offnum - 1] = offnum;
1868 :
1869 : /* If it's not the start of a HOT-chain, we're done with it */
1870 17795114 : if (!HeapTupleHeaderIsHotUpdated(htup))
1871 17794684 : continue;
1872 :
1873 : /* Set up to scan the HOT-chain */
1874 430 : nextoffnum = ItemPointerGetOffsetNumber(&htup->t_ctid);
1875 430 : priorXmax = HeapTupleHeaderGetUpdateXid(htup);
1876 : }
1877 : else
1878 : {
1879 : /* Must be a redirect item. We do not set its root_offsets entry */
1880 : Assert(ItemIdIsRedirected(lp));
1881 : /* Set up to scan the HOT-chain */
1882 8376 : nextoffnum = ItemIdGetRedirect(lp);
1883 8376 : priorXmax = InvalidTransactionId;
1884 : }
1885 :
1886 : /*
1887 : * Now follow the HOT-chain and collect other tuples in the chain.
1888 : *
1889 : * Note: Even though this is a nested loop, the complexity of the
1890 : * function is O(N), because each tuple on the page is visited at
1891 : * most twice: once by the outer loop and once during a HOT-chain
1892 : * chase.
1893 : */
1894 : for (;;)
1895 : {
1896 : /* Sanity check the chain link (pure paranoia) */
1897 8946 : if (nextoffnum < FirstOffsetNumber)
1898 0 : break;
1899 :
1900 : /*
1901 : * An offset past the end of page's line pointer array is possible
1902 : * when the array was truncated
1903 : */
1904 8946 : if (nextoffnum > maxoff)
1905 0 : break;
1906 :
1907 8946 : lp = PageGetItemId(page, nextoffnum);
1908 :
1909 : /* Check for broken chains */
1910 8946 : if (!ItemIdIsNormal(lp))
1911 0 : break;
1912 :
1913 8946 : htup = (HeapTupleHeader) PageGetItem(page, lp);
1914 :
1915 9516 : if (TransactionIdIsValid(priorXmax) &&
1916 570 : !TransactionIdEquals(priorXmax, HeapTupleHeaderGetXmin(htup)))
1917 0 : break;
1918 :
1919 : /* Remember the root line pointer for this item */
1920 8946 : root_offsets[nextoffnum - 1] = offnum;
1921 :
1922 : /* Advance to next chain member, if any */
1923 8946 : if (!HeapTupleHeaderIsHotUpdated(htup))
1924 8806 : break;
1925 :
1926 : /* HOT implies it can't have moved to different partition */
1927 : Assert(!HeapTupleHeaderIndicatesMovedPartitions(htup));
1928 :
1929 140 : nextoffnum = ItemPointerGetOffsetNumber(&htup->t_ctid);
1930 140 : priorXmax = HeapTupleHeaderGetUpdateXid(htup);
1931 : }
1932 : }
1933 223762 : }
1934 :
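/*
 * Editor's sketch (not part of the PostgreSQL source): resolving the root
 * line pointer of a heap-only tuple, as an index-build-style caller might.
 * The function name is hypothetical; per the comments above, the caller
 * must hold at least a share lock and keep the buffer pinned while the
 * result is used.
 */
static OffsetNumber
get_root_offset_sketch(Buffer buffer, OffsetNumber offnum)
{
    OffsetNumber root_offsets[MaxHeapTuplesPerPage];

    heap_get_root_tuples(BufferGetPage(buffer), root_offsets);

    /* InvalidOffsetNumber means no root was found (e.g. a broken chain) */
    return root_offsets[offnum - 1];
}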
1935 :
1936 : /*
1937 : * Compare the fields that describe the actions required to freeze a tuple
1938 : * with the caller's open plan. If everything matches then frz's tuple-level
1939 : * plan is equivalent to the caller's plan.
1940 : */
1941 : static inline bool
1942 1876124 : heap_log_freeze_eq(xlhp_freeze_plan *plan, HeapTupleFreeze *frz)
1943 : {
1944 1876124 : if (plan->xmax == frz->xmax &&
1945 1873534 : plan->t_infomask2 == frz->t_infomask2 &&
1946 1871726 : plan->t_infomask == frz->t_infomask &&
1947 1866560 : plan->frzflags == frz->frzflags)
1948 1866560 : return true;
1949 :
1950 : /* Caller must call heap_log_freeze_new_plan again for frz */
1951 9564 : return false;
1952 : }
1953 :
1954 : /*
1955 : * Comparator used to deduplicate the freeze plans used in WAL records.
1956 : */
1957 : static int
1958 2615354 : heap_log_freeze_cmp(const void *arg1, const void *arg2)
1959 : {
1960 2615354 : HeapTupleFreeze *frz1 = (HeapTupleFreeze *) arg1;
1961 2615354 : HeapTupleFreeze *frz2 = (HeapTupleFreeze *) arg2;
1962 :
1963 2615354 : if (frz1->xmax < frz2->xmax)
1964 26404 : return -1;
1965 2588950 : else if (frz1->xmax > frz2->xmax)
1966 28474 : return 1;
1967 :
1968 2560476 : if (frz1->t_infomask2 < frz2->t_infomask2)
1969 9086 : return -1;
1970 2551390 : else if (frz1->t_infomask2 > frz2->t_infomask2)
1971 10124 : return 1;
1972 :
1973 2541266 : if (frz1->t_infomask < frz2->t_infomask)
1974 22896 : return -1;
1975 2518370 : else if (frz1->t_infomask > frz2->t_infomask)
1976 31732 : return 1;
1977 :
1978 2486638 : if (frz1->frzflags < frz2->frzflags)
1979 0 : return -1;
1980 2486638 : else if (frz1->frzflags > frz2->frzflags)
1981 0 : return 1;
1982 :
1983 : /*
1984 : * heap_log_freeze_eq would consider these tuple-wise plans to be equal.
1985 : * (So the tuples will share a single canonical freeze plan.)
1986 : *
1987 : * We tiebreak on page offset number to keep each freeze plan's page
1988 : * offset number array individually sorted. (Unnecessary, but be tidy.)
1989 : */
1990 2486638 : if (frz1->offset < frz2->offset)
1991 2093612 : return -1;
1992 393026 : else if (frz1->offset > frz2->offset)
1993 393026 : return 1;
1994 :
1995 : Assert(false);
1996 0 : return 0;
1997 : }
1998 :
1999 : /*
2000 : * Start a new plan, initialized from this tuple's freeze actions. At least
2001 : * one tuple's freezing steps will be described by this plan during REDO.
2002 : */
2003 : static inline void
2004 55590 : heap_log_freeze_new_plan(xlhp_freeze_plan *plan, HeapTupleFreeze *frz)
2005 : {
2006 55590 : plan->xmax = frz->xmax;
2007 55590 : plan->t_infomask2 = frz->t_infomask2;
2008 55590 : plan->t_infomask = frz->t_infomask;
2009 55590 : plan->frzflags = frz->frzflags;
2010 55590 : plan->ntuples = 1; /* for now */
2011 55590 : }
2012 :
2013 : /*
2014 : * Deduplicate tuple-based freeze plans so that each distinct set of
2015 : * processing steps is only stored once in the WAL record.
2016 : * Called during original execution of freezing (for logged relations).
2017 : *
2018 : * The return value is the number of plans set in *plans_out. This also
2019 : * writes an array of offset numbers into the *offsets_out output argument
2020 : * (logically there is one array per freeze plan, but that's not of
2021 : * immediate concern to our caller).
2022 : */
2023 : static int
2024 46026 : heap_log_freeze_plan(HeapTupleFreeze *tuples, int ntuples,
2025 : xlhp_freeze_plan *plans_out,
2026 : OffsetNumber *offsets_out)
2027 : {
2028 46026 : int nplans = 0;
2029 :
2030 : /* Sort tuple-based freeze plans in the order required to deduplicate */
2031 46026 : qsort(tuples, ntuples, sizeof(HeapTupleFreeze), heap_log_freeze_cmp);
2032 :
2033 1968176 : for (int i = 0; i < ntuples; i++)
2034 : {
2035 1922150 : HeapTupleFreeze *frz = tuples + i;
2036 :
2037 1922150 : if (i == 0)
2038 : {
2039 : /* New canonical freeze plan starting with first tup */
2040 46026 : heap_log_freeze_new_plan(plans_out, frz);
2041 46026 : nplans++;
2042 : }
2043 1876124 : else if (heap_log_freeze_eq(plans_out, frz))
2044 : {
2045 : /* tup matches open canonical plan -- include tup in it */
2046 : Assert(offsets_out[i - 1] < frz->offset);
2047 1866560 : plans_out->ntuples++;
2048 : }
2049 : else
2050 : {
2051 : /* Tup doesn't match current plan -- done with it now */
2052 9564 : plans_out++;
2053 :
2054 : /* New canonical freeze plan starting with this tup */
2055 9564 : heap_log_freeze_new_plan(plans_out, frz);
2056 9564 : nplans++;
2057 : }
2058 :
2059 : /*
2060 : * Save page offset number in dedicated buffer in passing.
2061 : *
2062 : * REDO routine relies on the record's offset numbers array grouping
2063 : * offset numbers by freeze plan. The sort order within each grouping
2064 : * is ascending offset number order, just to keep things tidy.
2065 : */
2066 1922150 : offsets_out[i] = frz->offset;
2067 : }
2068 :
2069 : Assert(nplans > 0 && nplans <= ntuples);
2070 :
2071 46026 : return nplans;
2072 : }
2073 :
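/*
 * Editor's sketch (not part of the PostgreSQL source): how deduplication
 * behaves on made-up input. Four tuples whose freeze actions differ only
 * in xmax collapse into two plans, each grouping its offsets in ascending
 * order. The function name and all values are illustrative.
 */
static void
freeze_plan_dedup_example(void)
{
    HeapTupleFreeze tuples[4];
    xlhp_freeze_plan plans[4];
    OffsetNumber offsets[4];
    int nplans;

    memset(tuples, 0, sizeof(tuples));
    tuples[0].xmax = 100; tuples[0].offset = 3;
    tuples[1].xmax = 100; tuples[1].offset = 1;
    tuples[2].xmax = 200; tuples[2].offset = 2;
    tuples[3].xmax = 200; tuples[3].offset = 4;

    nplans = heap_log_freeze_plan(tuples, 4, plans, offsets);

    /* Two plans: xmax 100 covering offsets {1,3}, xmax 200 covering {2,4} */
    Assert(nplans == 2);
    Assert(plans[0].ntuples == 2 && plans[1].ntuples == 2);
    Assert(offsets[0] == 1 && offsets[1] == 3 &&
           offsets[2] == 2 && offsets[3] == 4);
}
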
2074 : /*
2075 : * Write an XLOG_HEAP2_PRUNE* WAL record
2076 : *
2077 : * This is used for several different page maintenance operations:
2078 : *
2079 : * - Page pruning, in VACUUM's 1st pass or on access: Some items are
2080 : * redirected, some marked dead, and some removed altogether.
2081 : *
2082 : * - Freezing: Items are marked as 'frozen'.
2083 : *
2084 : * - Vacuum, 2nd pass: Items that are already LP_DEAD are marked as unused.
2085 : *
2086 : * They have enough commonalities that we use a single WAL record for them
2087 : * all.
2088 : *
2089 : * If replaying the record requires a cleanup lock, pass cleanup_lock = true.
2090 : * Replaying 'redirected' or 'dead' items always requires a cleanup lock, but
2091 : * replaying 'unused' items depends on whether they were all previously marked
2092 : * as dead.
2093 : *
2094 : * If the VM is being updated, vmflags will contain the bits to set. In this
2095 : * case, vmbuffer should already have been updated and marked dirty and should
2096 : * still be pinned and locked.
2097 : *
2098 : * Note: This function scribbles on the 'frozen' array.
2099 : *
2100 : * Note: This is called in a critical section, so careful what you do here.
2101 : */
2102 : void
2103 173656 : log_heap_prune_and_freeze(Relation relation, Buffer buffer,
2104 : Buffer vmbuffer, uint8 vmflags,
2105 : TransactionId conflict_xid,
2106 : bool cleanup_lock,
2107 : PruneReason reason,
2108 : HeapTupleFreeze *frozen, int nfrozen,
2109 : OffsetNumber *redirected, int nredirected,
2110 : OffsetNumber *dead, int ndead,
2111 : OffsetNumber *unused, int nunused)
2112 : {
2113 : xl_heap_prune xlrec;
2114 : XLogRecPtr recptr;
2115 : uint8 info;
2116 : uint8 regbuf_flags_heap;
2117 :
2118 : /* The following local variables hold data registered in the WAL record: */
2119 : xlhp_freeze_plan plans[MaxHeapTuplesPerPage];
2120 : xlhp_freeze_plans freeze_plans;
2121 : xlhp_prune_items redirect_items;
2122 : xlhp_prune_items dead_items;
2123 : xlhp_prune_items unused_items;
2124 : OffsetNumber frz_offsets[MaxHeapTuplesPerPage];
2125 173656 : bool do_prune = nredirected > 0 || ndead > 0 || nunused > 0;
2126 173656 : bool do_set_vm = vmflags & VISIBILITYMAP_VALID_BITS;
2127 :
2128 : Assert((vmflags & VISIBILITYMAP_VALID_BITS) == vmflags);
2129 :
2130 173656 : xlrec.flags = 0;
2131 173656 : regbuf_flags_heap = REGBUF_STANDARD;
2132 :
2133 : /*
2134 : * We can avoid an FPI of the heap page if the only modification we are
2135 : * making to it is to set PD_ALL_VISIBLE and checksums/wal_log_hints are
2136 : * disabled. Note that if we explicitly skip an FPI, we must not stamp the
2137 : * heap page with this record's LSN. Recovery skips records <= the stamped
2138 : * LSN, so this could lead to skipping an earlier FPI needed to repair a
2139 : * torn page.
2140 : */
2141 173656 : if (!do_prune &&
2142 0 : nfrozen == 0 &&
2143 0 : (!do_set_vm || !XLogHintBitIsNeeded()))
2144 0 : regbuf_flags_heap |= REGBUF_NO_IMAGE;
2145 :
2146 : /*
2147 : * Prepare data for the buffer. The arrays are not actually in the
2148 : * buffer, but we pretend that they are. When XLogInsert stores a full
2149 : * page image, the arrays can be omitted.
2150 : */
2151 173656 : XLogBeginInsert();
2152 173656 : XLogRegisterBuffer(0, buffer, regbuf_flags_heap);
2153 :
2154 173656 : if (do_set_vm)
2155 27846 : XLogRegisterBuffer(1, vmbuffer, 0);
2156 :
2157 173656 : if (nfrozen > 0)
2158 : {
2159 : int nplans;
2160 :
2161 46026 : xlrec.flags |= XLHP_HAS_FREEZE_PLANS;
2162 :
2163 : /*
2164 : * Prepare deduplicated representation for use in the WAL record. This
2165 : * destructively sorts the frozen tuples array in place.
2166 : */
2167 46026 : nplans = heap_log_freeze_plan(frozen, nfrozen, plans, frz_offsets);
2168 :
2169 46026 : freeze_plans.nplans = nplans;
2170 46026 : XLogRegisterBufData(0, &freeze_plans,
2171 : offsetof(xlhp_freeze_plans, plans));
2172 46026 : XLogRegisterBufData(0, plans,
2173 : sizeof(xlhp_freeze_plan) * nplans);
2174 : }
2175 173656 : if (nredirected > 0)
2176 : {
2177 33424 : xlrec.flags |= XLHP_HAS_REDIRECTIONS;
2178 :
2179 33424 : redirect_items.ntargets = nredirected;
2180 33424 : XLogRegisterBufData(0, &redirect_items,
2181 : offsetof(xlhp_prune_items, data));
2182 33424 : XLogRegisterBufData(0, redirected,
2183 : sizeof(OffsetNumber[2]) * nredirected);
2184 : }
2185 173656 : if (ndead > 0)
2186 : {
2187 77890 : xlrec.flags |= XLHP_HAS_DEAD_ITEMS;
2188 :
2189 77890 : dead_items.ntargets = ndead;
2190 77890 : XLogRegisterBufData(0, &dead_items,
2191 : offsetof(xlhp_prune_items, data));
2192 77890 : XLogRegisterBufData(0, dead,
2193 : sizeof(OffsetNumber) * ndead);
2194 : }
2195 173656 : if (nunused > 0)
2196 : {
2197 50672 : xlrec.flags |= XLHP_HAS_NOW_UNUSED_ITEMS;
2198 :
2199 50672 : unused_items.ntargets = nunused;
2200 50672 : XLogRegisterBufData(0, &unused_items,
2201 : offsetof(xlhp_prune_items, data));
2202 50672 : XLogRegisterBufData(0, unused,
2203 : sizeof(OffsetNumber) * nunused);
2204 : }
2205 173656 : if (nfrozen > 0)
2206 46026 : XLogRegisterBufData(0, frz_offsets,
2207 : sizeof(OffsetNumber) * nfrozen);
2208 :
2209 : /*
2210 : * Prepare the main xl_heap_prune record. We already set the XLHP_HAS_*
2211 : * flags above.
2212 : */
2213 173656 : if (vmflags & VISIBILITYMAP_ALL_VISIBLE)
2214 : {
2215 27846 : xlrec.flags |= XLHP_VM_ALL_VISIBLE;
2216 27846 : if (vmflags & VISIBILITYMAP_ALL_FROZEN)
2217 21738 : xlrec.flags |= XLHP_VM_ALL_FROZEN;
2218 : }
2219 173656 : if (RelationIsAccessibleInLogicalDecoding(relation))
2220 1284 : xlrec.flags |= XLHP_IS_CATALOG_REL;
2221 173656 : if (TransactionIdIsValid(conflict_xid))
2222 142114 : xlrec.flags |= XLHP_HAS_CONFLICT_HORIZON;
2223 173656 : if (cleanup_lock)
2224 145694 : xlrec.flags |= XLHP_CLEANUP_LOCK;
2225 : else
2226 : {
2227 : Assert(nredirected == 0 && ndead == 0);
2228 : /* also, any items in 'unused' must've been LP_DEAD previously */
2229 : }
2230 173656 : XLogRegisterData(&xlrec, SizeOfHeapPrune);
2231 173656 : if (TransactionIdIsValid(conflict_xid))
2232 142114 : XLogRegisterData(&conflict_xid, sizeof(TransactionId));
2233 :
2234 173656 : switch (reason)
2235 : {
2236 82336 : case PRUNE_ON_ACCESS:
2237 82336 : info = XLOG_HEAP2_PRUNE_ON_ACCESS;
2238 82336 : break;
2239 63358 : case PRUNE_VACUUM_SCAN:
2240 63358 : info = XLOG_HEAP2_PRUNE_VACUUM_SCAN;
2241 63358 : break;
2242 27962 : case PRUNE_VACUUM_CLEANUP:
2243 27962 : info = XLOG_HEAP2_PRUNE_VACUUM_CLEANUP;
2244 27962 : break;
2245 0 : default:
2246 0 : elog(ERROR, "unrecognized prune reason: %d", (int) reason);
2247 : break;
2248 : }
2249 173656 : recptr = XLogInsert(RM_HEAP2_ID, info);
2250 :
2251 173656 : if (do_set_vm)
2252 : {
2253 : Assert(BufferIsDirty(vmbuffer));
2254 27846 : PageSetLSN(BufferGetPage(vmbuffer), recptr);
2255 : }
2256 :
2257 : /*
2258 : * See comment at the top of the function about regbuf_flags_heap for
2259 : * details on when we can advance the page LSN.
2260 : */
2261 173656 : if (do_prune || nfrozen > 0 || (do_set_vm && XLogHintBitIsNeeded()))
2262 : {
2263 : Assert(BufferIsDirty(buffer));
2264 173656 : PageSetLSN(BufferGetPage(buffer), recptr);
2265 : }
2266 173656 : }
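
/*
 * Editor's note (not part of the PostgreSQL source): a summary of the data
 * registered above, in registration order.
 *
 * Main data:
 *   xl_heap_prune                      (always; flags say what follows)
 *   TransactionId conflict_xid         (if XLHP_HAS_CONFLICT_HORIZON)
 *
 * Block 0 (heap page) data:
 *   xlhp_freeze_plans + plans[]        (if XLHP_HAS_FREEZE_PLANS)
 *   xlhp_prune_items + redirected[]    (if XLHP_HAS_REDIRECTIONS)
 *   xlhp_prune_items + dead[]          (if XLHP_HAS_DEAD_ITEMS)
 *   xlhp_prune_items + unused[]        (if XLHP_HAS_NOW_UNUSED_ITEMS)
 *   OffsetNumber frz_offsets[]         (if XLHP_HAS_FREEZE_PLANS)
 *
 * XLogInsert may omit all of the block 0 data when it stores a full-page
 * image of the heap page.
 */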