Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * pruneheap.c
4 : * heap page pruning and HOT-chain management code
5 : *
6 : * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : *
10 : * IDENTIFICATION
11 : * src/backend/access/heap/pruneheap.c
12 : *
13 : *-------------------------------------------------------------------------
14 : */
15 : #include "postgres.h"
16 :
17 : #include "access/heapam.h"
18 : #include "access/heapam_xlog.h"
19 : #include "access/htup_details.h"
20 : #include "access/multixact.h"
21 : #include "access/transam.h"
22 : #include "access/visibilitymap.h"
23 : #include "access/xlog.h"
24 : #include "access/xloginsert.h"
25 : #include "commands/vacuum.h"
26 : #include "executor/instrument.h"
27 : #include "miscadmin.h"
28 : #include "pgstat.h"
29 : #include "storage/bufmgr.h"
30 : #include "utils/rel.h"
31 : #include "utils/snapmgr.h"
32 :
/* Working data for heap_page_prune_and_freeze() and subroutines */
typedef struct
{
	/*-------------------------------------------------------
	 * Arguments passed to heap_page_prune_and_freeze()
	 *-------------------------------------------------------
	 */

	/* tuple visibility test, initialized for the relation */
	GlobalVisState *vistest;
	/* whether or not dead items can be set LP_UNUSED during pruning */
	bool		mark_unused_now;
	/* whether to attempt freezing tuples */
	bool		attempt_freeze;
	/* freeze cutoffs; must be provided by the caller when attempt_freeze */
	struct VacuumCutoffs *cutoffs;
	/* the relation whose page is being pruned */
	Relation	relation;

	/*
	 * Keep the buffer, block, and page handy so that helpers needing to
	 * access them don't need to make repeated calls to BufferGetBlockNumber()
	 * and BufferGetPage().
	 */
	BlockNumber block;
	Buffer		buffer;
	Page		page;

	/*-------------------------------------------------------
	 * Fields describing what to do to the page
	 *-------------------------------------------------------
	 */
	TransactionId new_prune_xid;	/* new prune hint value */
	/* newest XID among removed tuples; used for the recovery conflict horizon */
	TransactionId latest_xid_removed;
	int			nredirected;	/* numbers of entries in arrays below */
	int			ndead;
	int			nunused;
	int			nfrozen;

	/*
	 * arrays that accumulate indexes of items to be changed; 'redirected'
	 * stores (from, to) offset pairs, hence twice the size
	 */
	OffsetNumber redirected[MaxHeapTuplesPerPage * 2];
	OffsetNumber nowdead[MaxHeapTuplesPerPage];
	OffsetNumber nowunused[MaxHeapTuplesPerPage];
	HeapTupleFreeze frozen[MaxHeapTuplesPerPage];

	/*
	 * set_all_visible and set_all_frozen indicate if the all-visible and
	 * all-frozen bits in the visibility map can be set for this page after
	 * pruning.
	 *
	 * NOTE: set_all_visible and set_all_frozen initially don't include
	 * LP_DEAD items.  That's convenient for heap_page_prune_and_freeze() to
	 * use them to decide whether to opportunistically freeze the page or not.
	 * The set_all_visible and set_all_frozen values ultimately used to set
	 * the VM are adjusted to include LP_DEAD items after we determine whether
	 * or not to opportunistically freeze.
	 */
	bool		set_all_visible;
	bool		set_all_frozen;

	/*-------------------------------------------------------
	 * Working state for HOT chain processing
	 *-------------------------------------------------------
	 */

	/*
	 * 'root_items' contains offsets of all LP_REDIRECT line pointers and
	 * normal non-HOT tuples.  They can be stand-alone items or the first item
	 * in a HOT chain.  'heaponly_items' contains heap-only tuples which can
	 * only be removed as part of a HOT chain.
	 */
	int			nroot_items;
	OffsetNumber root_items[MaxHeapTuplesPerPage];
	int			nheaponly_items;
	OffsetNumber heaponly_items[MaxHeapTuplesPerPage];

	/*
	 * processed[offnum] is true if item at offnum has been processed.
	 *
	 * This needs to be MaxHeapTuplesPerPage + 1 long as FirstOffsetNumber is
	 * 1. Otherwise every access would need to subtract 1.
	 */
	bool		processed[MaxHeapTuplesPerPage + 1];

	/*
	 * Tuple visibility is only computed once for each tuple, for correctness
	 * and efficiency reasons; see comment in heap_page_prune_and_freeze() for
	 * details.  This is of type int8[], instead of HTSV_Result[], so we can
	 * use -1 to indicate no visibility has been computed, e.g. for LP_DEAD
	 * items.
	 *
	 * This needs to be MaxHeapTuplesPerPage + 1 long as FirstOffsetNumber is
	 * 1. Otherwise every access would need to subtract 1.
	 */
	int8		htsv[MaxHeapTuplesPerPage + 1];

	/*-------------------------------------------------------
	 * Working state for freezing
	 *-------------------------------------------------------
	 */
	HeapPageFreeze pagefrz;

	/*-------------------------------------------------------
	 * Working state for visibility map processing
	 *-------------------------------------------------------
	 */

	/*
	 * Caller must provide a pinned vmbuffer corresponding to the heap block
	 * passed to heap_page_prune_and_freeze().  We will fix any corruption
	 * found in the VM and set the VM if the page is all-visible/all-frozen.
	 */
	Buffer		vmbuffer;

	/*
	 * The state of the VM bits at the beginning of pruning and the state they
	 * will be in at the end.
	 */
	uint8		old_vmbits;
	uint8		new_vmbits;

	/* The newest xmin of live tuples on the page */
	TransactionId newest_live_xid;

	/*-------------------------------------------------------
	 * Information about what was done
	 *
	 * These fields are not used by pruning itself for the most part, but are
	 * used to collect information about what was pruned and what state the
	 * page is in after pruning, for the benefit of the caller.  They are
	 * copied to the caller's PruneFreezeResult at the end.
	 * -------------------------------------------------------
	 */

	int			ndeleted;		/* Number of tuples deleted from the page */

	/* Number of live and recently dead tuples, after pruning */
	int			live_tuples;
	int			recently_dead_tuples;

	/* Whether or not the page makes rel truncation unsafe */
	bool		hastup;

	/*
	 * LP_DEAD items on the page after pruning.  Includes existing LP_DEAD
	 * items
	 */
	int			lpdead_items;	/* number of items in the array */
	OffsetNumber *deadoffsets;	/* points directly to presult->deadoffsets */
} PruneState;
180 :
/*
 * Type of visibility map corruption detected on a heap page and its
 * associated VM page.  Passed to heap_page_fix_vm_corruption() so the caller
 * can specify what it found rather than having the function rederive the
 * corruption from page state.  Each case implies the VM bits must be
 * cleared; some also require clearing the page-level hint.
 */
typedef enum VMCorruptionType
{
	/* VM bits are set but the heap page-level PD_ALL_VISIBLE flag is not */
	VM_CORRUPT_MISSING_PAGE_HINT,
	/* LP_DEAD line pointers found on a page marked all-visible */
	VM_CORRUPT_LPDEAD,
	/* Tuple not visible to all transactions on a page marked all-visible */
	VM_CORRUPT_TUPLE_VISIBILITY,
} VMCorruptionType;
196 :
197 : /* Local functions */
198 : static void prune_freeze_setup(PruneFreezeParams *params,
199 : TransactionId *new_relfrozen_xid,
200 : MultiXactId *new_relmin_mxid,
201 : PruneFreezeResult *presult,
202 : PruneState *prstate);
203 : static void heap_page_fix_vm_corruption(PruneState *prstate,
204 : OffsetNumber offnum,
205 : VMCorruptionType ctype);
206 : static void prune_freeze_fast_path(PruneState *prstate,
207 : PruneFreezeResult *presult);
208 : static void prune_freeze_plan(PruneState *prstate,
209 : OffsetNumber *off_loc);
210 : static HTSV_Result heap_prune_satisfies_vacuum(PruneState *prstate,
211 : HeapTuple tup);
212 : static inline HTSV_Result htsv_get_valid_status(int status);
213 : static void heap_prune_chain(OffsetNumber maxoff,
214 : OffsetNumber rootoffnum, PruneState *prstate);
215 : static void heap_prune_record_prunable(PruneState *prstate, TransactionId xid,
216 : OffsetNumber offnum);
217 : static void heap_prune_record_redirect(PruneState *prstate,
218 : OffsetNumber offnum, OffsetNumber rdoffnum,
219 : bool was_normal);
220 : static void heap_prune_record_dead(PruneState *prstate, OffsetNumber offnum,
221 : bool was_normal);
222 : static void heap_prune_record_dead_or_unused(PruneState *prstate, OffsetNumber offnum,
223 : bool was_normal);
224 : static void heap_prune_record_unused(PruneState *prstate, OffsetNumber offnum, bool was_normal);
225 :
226 : static void heap_prune_record_unchanged_lp_unused(PruneState *prstate, OffsetNumber offnum);
227 : static void heap_prune_record_unchanged_lp_normal(PruneState *prstate, OffsetNumber offnum);
228 : static void heap_prune_record_unchanged_lp_dead(PruneState *prstate, OffsetNumber offnum);
229 : static void heap_prune_record_unchanged_lp_redirect(PruneState *prstate, OffsetNumber offnum);
230 :
231 : static void page_verify_redirects(Page page);
232 :
233 : static bool heap_page_will_freeze(bool did_tuple_hint_fpi, bool do_prune, bool do_hint_prune,
234 : PruneState *prstate);
235 : static bool heap_page_will_set_vm(PruneState *prstate, PruneReason reason);
236 :
237 :
238 : /*
239 : * Optionally prune and repair fragmentation in the specified page.
240 : *
241 : * This is an opportunistic function. It will perform housekeeping
242 : * only if the page heuristically looks like a candidate for pruning and we
243 : * can acquire buffer cleanup lock without blocking.
244 : *
245 : * Note: this is called quite often. It's important that it fall out quickly
246 : * if there's not any use in pruning.
247 : *
248 : * Caller must have pin on the buffer, and must *not* have a lock on it.
249 : *
250 : * This function may pin *vmbuffer. It's passed by reference so the caller can
251 : * reuse the pin across calls, avoiding repeated pin/unpin cycles. If we find
252 : * VM corruption during pruning, we will fix it. Caller is responsible for
253 : * unpinning *vmbuffer.
254 : */
255 : void
256 21392803 : heap_page_prune_opt(Relation relation, Buffer buffer, Buffer *vmbuffer)
257 : {
258 21392803 : Page page = BufferGetPage(buffer);
259 : TransactionId prune_xid;
260 : GlobalVisState *vistest;
261 : Size minfree;
262 :
263 : /*
264 : * We can't write WAL in recovery mode, so there's no point trying to
265 : * clean the page. The primary will likely issue a cleaning WAL record
266 : * soon anyway, so this is no particular loss.
267 : */
268 21392803 : if (RecoveryInProgress())
269 240445 : return;
270 :
271 : /*
272 : * First check whether there's any chance there's something to prune,
273 : * determining the appropriate horizon is a waste if there's no prune_xid
274 : * (i.e. no updates/deletes left potentially dead tuples around).
275 : */
276 21152358 : prune_xid = PageGetPruneXid(page);
277 21152358 : if (!TransactionIdIsValid(prune_xid))
278 10879194 : return;
279 :
280 : /*
281 : * Check whether prune_xid indicates that there may be dead rows that can
282 : * be cleaned up.
283 : */
284 10273164 : vistest = GlobalVisTestFor(relation);
285 :
286 10273164 : if (!GlobalVisTestIsRemovableXid(vistest, prune_xid, true))
287 8530369 : return;
288 :
289 : /*
290 : * We prune when a previous UPDATE failed to find enough space on the page
291 : * for a new tuple version, or when free space falls below the relation's
292 : * fill-factor target (but not less than 10%).
293 : *
294 : * Checking free space here is questionable since we aren't holding any
295 : * lock on the buffer; in the worst case we could get a bogus answer. It's
296 : * unlikely to be *seriously* wrong, though, since reading either pd_lower
297 : * or pd_upper is probably atomic. Avoiding taking a lock seems more
298 : * important than sometimes getting a wrong answer in what is after all
299 : * just a heuristic estimate.
300 : */
301 1742795 : minfree = RelationGetTargetPageFreeSpace(relation,
302 : HEAP_DEFAULT_FILLFACTOR);
303 1742795 : minfree = Max(minfree, BLCKSZ / 10);
304 :
305 1742795 : if (PageIsFull(page) || PageGetHeapFreeSpace(page) < minfree)
306 : {
307 : /* OK, try to get exclusive buffer lock */
308 45480 : if (!ConditionalLockBufferForCleanup(buffer))
309 511 : return;
310 :
311 : /*
312 : * Now that we have buffer lock, get accurate information about the
313 : * page's free space, and recheck the heuristic about whether to
314 : * prune.
315 : */
316 44969 : if (PageIsFull(page) || PageGetHeapFreeSpace(page) < minfree)
317 : {
318 : OffsetNumber dummy_off_loc;
319 : PruneFreezeResult presult;
320 : PruneFreezeParams params;
321 :
322 44969 : visibilitymap_pin(relation, BufferGetBlockNumber(buffer),
323 : vmbuffer);
324 :
325 44969 : params.relation = relation;
326 44969 : params.buffer = buffer;
327 44969 : params.vmbuffer = *vmbuffer;
328 44969 : params.reason = PRUNE_ON_ACCESS;
329 44969 : params.vistest = vistest;
330 44969 : params.cutoffs = NULL;
331 :
332 : /*
333 : * We don't pass the HEAP_PAGE_PRUNE_MARK_UNUSED_NOW option
334 : * regardless of whether or not the relation has indexes, since we
335 : * cannot safely determine that during on-access pruning with the
336 : * current implementation.
337 : */
338 44969 : params.options = HEAP_PAGE_PRUNE_ALLOW_FAST_PATH;
339 :
340 44969 : heap_page_prune_and_freeze(¶ms, &presult, &dummy_off_loc,
341 : NULL, NULL);
342 :
343 : /*
344 : * Report the number of tuples reclaimed to pgstats. This is
345 : * presult.ndeleted minus the number of newly-LP_DEAD-set items.
346 : *
347 : * We derive the number of dead tuples like this to avoid totally
348 : * forgetting about items that were set to LP_DEAD, since they
349 : * still need to be cleaned up by VACUUM. We only want to count
350 : * heap-only tuples that just became LP_UNUSED in our report,
351 : * which don't.
352 : *
353 : * VACUUM doesn't have to compensate in the same way when it
354 : * tracks ndeleted, since it will set the same LP_DEAD items to
355 : * LP_UNUSED separately.
356 : */
357 44969 : if (presult.ndeleted > presult.nnewlpdead)
358 20253 : pgstat_update_heap_dead_tuples(relation,
359 20253 : presult.ndeleted - presult.nnewlpdead);
360 : }
361 :
362 : /* And release buffer lock */
363 44969 : LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
364 :
365 : /*
366 : * We avoid reuse of any free space created on the page by unrelated
367 : * UPDATEs/INSERTs by opting to not update the FSM at this point. The
368 : * free space should be reused by UPDATEs to *this* page.
369 : */
370 : }
371 : }
372 :
/*
 * Helper for heap_page_prune_and_freeze() to initialize the PruneState using
 * the provided parameters.
 *
 * params, new_relfrozen_xid, new_relmin_mxid, and presult are input
 * parameters and are not modified by this function.  Only prstate is modified.
 */
static void
prune_freeze_setup(PruneFreezeParams *params,
				   TransactionId *new_relfrozen_xid,
				   MultiXactId *new_relmin_mxid,
				   PruneFreezeResult *presult,
				   PruneState *prstate)
{
	/* Copy parameters to prstate */
	prstate->vistest = params->vistest;
	prstate->mark_unused_now =
		(params->options & HEAP_PAGE_PRUNE_MARK_UNUSED_NOW) != 0;

	/* cutoffs must be provided if we will attempt freezing */
	Assert(!(params->options & HEAP_PAGE_PRUNE_FREEZE) || params->cutoffs);
	prstate->attempt_freeze = (params->options & HEAP_PAGE_PRUNE_FREEZE) != 0;
	prstate->cutoffs = params->cutoffs;
	prstate->relation = params->relation;
	prstate->block = BufferGetBlockNumber(params->buffer);
	prstate->buffer = params->buffer;
	prstate->page = BufferGetPage(params->buffer);

	/* Caller must have pinned the VM buffer for this heap block */
	Assert(BufferIsValid(params->vmbuffer));
	prstate->vmbuffer = params->vmbuffer;
	prstate->new_vmbits = 0;
	prstate->old_vmbits = visibilitymap_get_status(prstate->relation,
												   prstate->block,
												   &prstate->vmbuffer);

	/*
	 * Our strategy is to scan the page and make lists of items to change,
	 * then apply the changes within a critical section.  This keeps as much
	 * logic as possible out of the critical section, and also ensures that
	 * WAL replay will work the same as the normal case.
	 *
	 * First, initialize the new pd_prune_xid value to zero (indicating no
	 * prunable tuples).  If we find any tuples which may soon become
	 * prunable, we will save the lowest relevant XID in new_prune_xid.  Also
	 * initialize the rest of our working state.
	 */
	prstate->new_prune_xid = InvalidTransactionId;
	prstate->latest_xid_removed = InvalidTransactionId;
	prstate->nredirected = prstate->ndead = prstate->nunused = 0;
	prstate->nfrozen = 0;
	prstate->nroot_items = 0;
	prstate->nheaponly_items = 0;

	/* initialize page freezing working state */
	prstate->pagefrz.freeze_required = false;
	prstate->pagefrz.FreezePageConflictXid = InvalidTransactionId;
	if (prstate->attempt_freeze)
	{
		/* caller-supplied trackers seed both freeze and no-freeze outcomes */
		Assert(new_relfrozen_xid && new_relmin_mxid);
		prstate->pagefrz.FreezePageRelfrozenXid = *new_relfrozen_xid;
		prstate->pagefrz.NoFreezePageRelfrozenXid = *new_relfrozen_xid;
		prstate->pagefrz.FreezePageRelminMxid = *new_relmin_mxid;
		prstate->pagefrz.NoFreezePageRelminMxid = *new_relmin_mxid;
	}
	else
	{
		Assert(!new_relfrozen_xid && !new_relmin_mxid);
		prstate->pagefrz.FreezePageRelminMxid = InvalidMultiXactId;
		prstate->pagefrz.NoFreezePageRelminMxid = InvalidMultiXactId;
		prstate->pagefrz.FreezePageRelfrozenXid = InvalidTransactionId;
		prstate->pagefrz.NoFreezePageRelfrozenXid = InvalidTransactionId;
	}

	/* zero the per-page counters reported back to the caller */
	prstate->ndeleted = 0;
	prstate->live_tuples = 0;
	prstate->recently_dead_tuples = 0;
	prstate->hastup = false;
	prstate->lpdead_items = 0;

	/*
	 * deadoffsets are filled in during pruning but are only used to populate
	 * PruneFreezeResult->deadoffsets.  To avoid needing two copies of the
	 * array, just save a pointer to the result offsets array in the
	 * PruneState.
	 */
	prstate->deadoffsets = presult->deadoffsets;

	/*
	 * We track whether the page will be all-visible/all-frozen at the end of
	 * pruning and freezing.  While examining tuple visibility, we'll set
	 * set_all_visible to false if there are tuples on the page not visible to
	 * all running and future transactions.  set_all_visible is always
	 * maintained but only VACUUM will set the VM if the page ends up being
	 * all-visible.
	 *
	 * We also keep track of the newest live XID, which is used to calculate
	 * the snapshot conflict horizon for a WAL record setting the VM.
	 */
	prstate->set_all_visible = true;
	prstate->newest_live_xid = InvalidTransactionId;

	/*
	 * Currently, only VACUUM performs freezing, but other callers may in the
	 * future.  We must initialize set_all_frozen based on whether or not the
	 * caller passed HEAP_PAGE_PRUNE_FREEZE, because if they did not, we won't
	 * call heap_prepare_freeze_tuple() for each tuple, and set_all_frozen
	 * will never be cleared for tuples that need freezing.  This would lead to
	 * incorrectly setting the visibility map all-frozen for this page.
	 *
	 * When freezing is not required (no XIDs/MXIDs older than the freeze
	 * cutoff), we may still choose to "opportunistically" freeze if doing so
	 * would make the page all-frozen.
	 *
	 * We will not be able to freeze the whole page at the end of vacuum if
	 * there are tuples present that are not visible to everyone or if there
	 * are dead tuples which will not be removable.  However, dead tuples that
	 * will be removed by the end of vacuum should not prevent this
	 * opportunistic freezing.
	 *
	 * Therefore, we do not clear set_all_visible and set_all_frozen when we
	 * encounter LP_DEAD items.  Instead, we correct them after deciding
	 * whether to freeze, but before updating the VM, to avoid setting the VM
	 * bits incorrectly.
	 */
	prstate->set_all_frozen = prstate->attempt_freeze;
}
499 :
/*
 * Helper for heap_page_prune_and_freeze().  Iterates over every tuple on the
 * page, examines its visibility information, and determines the appropriate
 * action for each tuple.  All tuples are processed and classified during this
 * phase, but no modifications are made to the page until the later execution
 * stage.
 *
 * *off_loc is used for error callback and cleared before returning.
 */
static void
prune_freeze_plan(PruneState *prstate, OffsetNumber *off_loc)
{
	Page		page = prstate->page;
	BlockNumber blockno = prstate->block;
	OffsetNumber maxoff = PageGetMaxOffsetNumber(prstate->page);
	OffsetNumber offnum;
	HeapTupleData tup;

	tup.t_tableOid = RelationGetRelid(prstate->relation);

	/*
	 * Determine HTSV for all tuples, and queue them up for processing as HOT
	 * chain roots or as heap-only items.
	 *
	 * Determining HTSV only once for each tuple is required for correctness,
	 * to deal with cases where running HTSV twice could result in different
	 * results.  For example, RECENTLY_DEAD can turn to DEAD if another
	 * checked item causes GlobalVisTestIsRemovableFullXid() to update the
	 * horizon, or INSERT_IN_PROGRESS can change to DEAD if the inserting
	 * transaction aborts.
	 *
	 * It's also good for performance.  Most commonly tuples within a page are
	 * stored at decreasing offsets (while the items are stored at increasing
	 * offsets).  When processing all tuples on a page this leads to reading
	 * memory at decreasing offsets within a page, with a variable stride.
	 * That's hard for CPU prefetchers to deal with.  Processing the items in
	 * reverse order (and thus the tuples in increasing order) increases
	 * prefetching efficiency significantly / decreases the number of cache
	 * misses.
	 */
	for (offnum = maxoff;
		 offnum >= FirstOffsetNumber;
		 offnum = OffsetNumberPrev(offnum))
	{
		ItemId		itemid = PageGetItemId(page, offnum);
		HeapTupleHeader htup;

		/*
		 * Set the offset number so that we can display it along with any
		 * error that occurred while processing this tuple.
		 */
		*off_loc = offnum;

		prstate->processed[offnum] = false;
		prstate->htsv[offnum] = -1;

		/* Nothing to do if slot doesn't contain a tuple */
		if (!ItemIdIsUsed(itemid))
		{
			heap_prune_record_unchanged_lp_unused(prstate, offnum);
			continue;
		}

		if (ItemIdIsDead(itemid))
		{
			/*
			 * If the caller set mark_unused_now true, we can set dead line
			 * pointers LP_UNUSED now.
			 */
			if (unlikely(prstate->mark_unused_now))
				heap_prune_record_unused(prstate, offnum, false);
			else
				heap_prune_record_unchanged_lp_dead(prstate, offnum);
			continue;
		}

		if (ItemIdIsRedirected(itemid))
		{
			/* This is the start of a HOT chain */
			prstate->root_items[prstate->nroot_items++] = offnum;
			continue;
		}

		Assert(ItemIdIsNormal(itemid));

		/*
		 * Get the tuple's visibility status and queue it up for processing.
		 */
		htup = (HeapTupleHeader) PageGetItem(page, itemid);
		tup.t_data = htup;
		tup.t_len = ItemIdGetLength(itemid);
		ItemPointerSet(&tup.t_self, blockno, offnum);

		prstate->htsv[offnum] = heap_prune_satisfies_vacuum(prstate, &tup);

		if (!HeapTupleHeaderIsHeapOnly(htup))
			prstate->root_items[prstate->nroot_items++] = offnum;
		else
			prstate->heaponly_items[prstate->nheaponly_items++] = offnum;
	}

	/*
	 * Process HOT chains.
	 *
	 * We added the items to the array starting from 'maxoff', so by
	 * processing the array in reverse order, we process the items in
	 * ascending offset number order.  The order doesn't matter for
	 * correctness, but some quick micro-benchmarking suggests that this is
	 * faster.  (Earlier PostgreSQL versions, which scanned all the items on
	 * the page instead of using the root_items array, also did it in
	 * ascending offset number order.)
	 */
	for (int i = prstate->nroot_items - 1; i >= 0; i--)
	{
		offnum = prstate->root_items[i];

		/* Ignore items already processed as part of an earlier chain */
		if (prstate->processed[offnum])
			continue;

		/* see preceding loop */
		*off_loc = offnum;

		/* Process this item or chain of items */
		heap_prune_chain(maxoff, offnum, prstate);
	}

	/*
	 * Process any heap-only tuples that were not already processed as part of
	 * a HOT chain.
	 */
	for (int i = prstate->nheaponly_items - 1; i >= 0; i--)
	{
		offnum = prstate->heaponly_items[i];

		if (prstate->processed[offnum])
			continue;

		/* see preceding loop */
		*off_loc = offnum;

		/*
		 * If the tuple is DEAD and doesn't chain to anything else, mark it
		 * unused.  (If it does chain, we can only remove it as part of
		 * pruning its chain.)
		 *
		 * We need this primarily to handle aborted HOT updates, that is,
		 * XMIN_INVALID heap-only tuples.  Those might not be linked to by any
		 * chain, since the parent tuple might be re-updated before any
		 * pruning occurs.  So we have to be able to reap them separately from
		 * chain-pruning.  (Note that HeapTupleHeaderIsHotUpdated will never
		 * return true for an XMIN_INVALID tuple, so this code will work even
		 * when there were sequential updates within the aborted transaction.)
		 */
		if (prstate->htsv[offnum] == HEAPTUPLE_DEAD)
		{
			ItemId		itemid = PageGetItemId(page, offnum);
			HeapTupleHeader htup = (HeapTupleHeader) PageGetItem(page, itemid);

			if (likely(!HeapTupleHeaderIsHotUpdated(htup)))
			{
				HeapTupleHeaderAdvanceConflictHorizon(htup,
													  &prstate->latest_xid_removed);
				heap_prune_record_unused(prstate, offnum, true);
			}
			else
			{
				/*
				 * This tuple should've been processed and removed as part of
				 * a HOT chain, so something's wrong.  To preserve evidence,
				 * we don't dare to remove it.  We cannot leave behind a DEAD
				 * tuple either, because that will cause VACUUM to error out.
				 * Throwing an error with a distinct error message seems like
				 * the least bad option.
				 */
				elog(ERROR, "dead heap-only tuple (%u, %d) is not linked to from any HOT chain",
					 blockno, offnum);
			}
		}
		else
			heap_prune_record_unchanged_lp_normal(prstate, offnum);
	}

	/* We should now have processed every tuple exactly once */
#ifdef USE_ASSERT_CHECKING
	for (offnum = FirstOffsetNumber;
		 offnum <= maxoff;
		 offnum = OffsetNumberNext(offnum))
	{
		*off_loc = offnum;

		Assert(prstate->processed[offnum]);
	}
#endif

	/* Clear the offset information once we have processed the given page. */
	*off_loc = InvalidOffsetNumber;
}
698 :
/*
 * Decide whether to proceed with freezing according to the freeze plans
 * prepared for the current heap buffer.  If freezing is chosen, this function
 * performs several pre-freeze checks.
 *
 * The values of do_prune, do_hint_prune, and did_tuple_hint_fpi must be
 * determined before calling this function.
 *
 * prstate is both an input and output parameter.
 *
 * Returns true if we should apply the freeze plans and freeze tuples on the
 * page, and false otherwise.
 */
static bool
heap_page_will_freeze(bool did_tuple_hint_fpi,
					  bool do_prune,
					  bool do_hint_prune,
					  PruneState *prstate)
{
	bool		do_freeze = false;

	/*
	 * If the caller specified we should not attempt to freeze any tuples,
	 * validate that everything is in the right state and return.
	 */
	if (!prstate->attempt_freeze)
	{
		Assert(!prstate->set_all_frozen && prstate->nfrozen == 0);
		return false;
	}

	if (prstate->pagefrz.freeze_required)
	{
		/*
		 * heap_prepare_freeze_tuple indicated that at least one XID/MXID from
		 * before FreezeLimit/MultiXactCutoff is present.  Must freeze to
		 * advance relfrozenxid/relminmxid.
		 */
		do_freeze = true;
	}
	else
	{
		/*
		 * Opportunistically freeze the page if we are generating an FPI
		 * anyway and if doing so means that we can set the page all-frozen
		 * afterwards (might not happen until VACUUM's final heap pass).
		 *
		 * XXX: Previously, we knew if pruning emitted an FPI by checking
		 * pgWalUsage.wal_fpi before and after pruning.  Once the freeze and
		 * prune records were combined, this heuristic couldn't be used
		 * anymore.  The opportunistic freeze heuristic must be improved;
		 * however, for now, try to approximate the old logic.
		 */
		if (prstate->set_all_frozen && prstate->nfrozen > 0)
		{
			Assert(prstate->set_all_visible);

			/*
			 * Freezing would make the page all-frozen.  Have already emitted
			 * an FPI or will do so anyway?
			 */
			if (RelationNeedsWAL(prstate->relation))
			{
				if (did_tuple_hint_fpi)
					do_freeze = true;
				else if (do_prune)
				{
					if (XLogCheckBufferNeedsBackup(prstate->buffer))
						do_freeze = true;
				}
				else if (do_hint_prune)
				{
					if (XLogHintBitIsNeeded() &&
						XLogCheckBufferNeedsBackup(prstate->buffer))
						do_freeze = true;
				}
			}
		}
	}

	if (do_freeze)
	{
		/*
		 * Validate the tuples we will be freezing before entering the
		 * critical section.
		 */
		heap_pre_freeze_checks(prstate->buffer, prstate->frozen, prstate->nfrozen);
		Assert(TransactionIdPrecedes(prstate->pagefrz.FreezePageConflictXid,
									 prstate->cutoffs->OldestXmin));
	}
	else if (prstate->nfrozen > 0)
	{
		/*
		 * The page contained some tuples that were not already frozen, and we
		 * chose not to freeze them now.  The page won't be all-frozen then.
		 */
		Assert(!prstate->pagefrz.freeze_required);

		prstate->set_all_frozen = false;
		prstate->nfrozen = 0;	/* avoid miscounts in instrumentation */
	}
	else
	{
		/*
		 * We have no freeze plans to execute.  The page might already be
		 * all-frozen (perhaps only following pruning), though.  Such pages
		 * can be marked all-frozen in the VM by our caller, even though none
		 * of its tuples were newly frozen here.
		 */
	}

	return do_freeze;
}
812 :
813 : /*
814 : * Emit a warning about and fix visibility map corruption on the given page.
815 : *
816 : * The caller specifies the type of corruption it has already detected via
817 : * corruption_type, so that we can emit the appropriate warning. All cases
818 : * result in the VM bits being cleared; corruption types where PD_ALL_VISIBLE
819 : * is incorrectly set also clear PD_ALL_VISIBLE.
820 : *
821 : * Must be called while holding an exclusive lock on the heap buffer. Dead
822 : * items and not all-visible tuples must have been discovered under that same
823 : * lock. Although we do not hold a lock on the VM buffer, it is pinned, and
824 : * the heap buffer is exclusively locked, ensuring that no other backend can
825 : * update the VM bits corresponding to this heap page.
826 : *
827 : * This function makes changes to the VM and, potentially, the heap page, but
828 : * it does not need to be done in a critical section.
829 : */
 830                 : static void
 831               0 : heap_page_fix_vm_corruption(PruneState *prstate, OffsetNumber offnum,
 832                 :                             VMCorruptionType corruption_type)
 833                 : {
 834               0 :     const char *relname = RelationGetRelationName(prstate->relation);
 835               0 :     bool        do_clear_vm = false;
 836               0 :     bool        do_clear_heap = false;
 837                 :
 838                 :     Assert(BufferIsLockedByMeInMode(prstate->buffer, BUFFER_LOCK_EXCLUSIVE));
 839                 :
 840               0 :     switch (corruption_type)
 841                 :     {
 842               0 :         case VM_CORRUPT_LPDEAD:
 843               0 :             ereport(WARNING,
 844                 :                     (errcode(ERRCODE_DATA_CORRUPTED),
 845                 :                      errmsg("dead line pointer found on page marked all-visible"),
 846                 :                      errcontext("relation \"%s\", page %u, tuple %u",
 847                 :                                 relname, prstate->block, offnum)));
 848               0 :             do_clear_vm = true;    /* VM claims all-visible over a dead item */
 849               0 :             do_clear_heap = true;  /* PD_ALL_VISIBLE is wrongly set too */
 850               0 :             break;
 851                 :
 852               0 :         case VM_CORRUPT_TUPLE_VISIBILITY:
 853                 :
 854                 :             /*
 855                 :              * A HEAPTUPLE_LIVE tuple on an all-visible page can appear to not
 856                 :              * be visible to everyone when
 857                 :              * GetOldestNonRemovableTransactionId() returns a conservative
 858                 :              * value that's older than the real safe xmin. That is not
 859                 :              * corruption -- the PD_ALL_VISIBLE flag is still correct.
 860                 :              *
 861                 :              * However, dead tuple versions, in-progress inserts, and
 862                 :              * in-progress deletes should never appear on a page marked
 863                 :              * all-visible. That indicates real corruption. PD_ALL_VISIBLE
 864                 :              * should have been cleared by the DML operation that deleted or
 865                 :              * inserted the tuple.
 866                 :              */
 867               0 :             ereport(WARNING,
 868                 :                     (errcode(ERRCODE_DATA_CORRUPTED),
 869                 :                      errmsg("tuple not visible to all transactions found on page marked all-visible"),
 870                 :                      errcontext("relation \"%s\", page %u, tuple %u",
 871                 :                                 relname, prstate->block, offnum)));
 872               0 :             do_clear_vm = true;
 873               0 :             do_clear_heap = true;
 874               0 :             break;
 875                 :
 876               0 :         case VM_CORRUPT_MISSING_PAGE_HINT:
 877                 :
 878                 :             /*
 879                 :              * As of PostgreSQL 9.2, the visibility map bit should never be
 880                 :              * set if the page-level bit is clear. However, for vacuum, it's
 881                 :              * possible that the bit got cleared after
 882                 :              * heap_vac_scan_next_block() was called, so we must recheck now
 883                 :              * that we have the buffer lock before concluding that the VM is
 884                 :              * corrupt.
 885                 :              */
 886                 :             Assert(!PageIsAllVisible(prstate->page));
 887                 :             Assert(prstate->old_vmbits & VISIBILITYMAP_VALID_BITS);
 888               0 :             ereport(WARNING,
 889                 :                     (errcode(ERRCODE_DATA_CORRUPTED),
 890                 :                      errmsg("page is not marked all-visible but visibility map bit is set"),
 891                 :                      errcontext("relation \"%s\", page %u",
 892                 :                                 relname, prstate->block)));
 893               0 :             do_clear_vm = true;   /* page hint already clear; only VM needs fixing */
 894               0 :             break;
 895                 :     }
 896                 :
 897                 :     /* every corruption_type sets at least one of the two flags above */
 898                 :     Assert(do_clear_heap || do_clear_vm);
 899                 :
 900                 :     /* Avoid marking the buffer dirty if PD_ALL_VISIBLE is already clear */
 901               0 :     if (do_clear_heap)
 902                 :     {
 903                 :         Assert(PageIsAllVisible(prstate->page));
 904               0 :         PageClearAllVisible(prstate->page);
 905               0 :         MarkBufferDirtyHint(prstate->buffer, true);
 906                 :     }
 907                 :
 908               0 :     if (do_clear_vm)
 909                 :     {
 910               0 :         visibilitymap_clear(prstate->relation, prstate->block,
 911                 :                             prstate->vmbuffer,
 912                 :                             VISIBILITYMAP_VALID_BITS);
 913               0 :         prstate->old_vmbits = 0;   /* keep cached VM state in sync with the clear */
 914                 :     }
 915               0 : }
915 :
916 : /*
917 : * Decide whether to set the visibility map bits (all-visible and all-frozen)
918 : * for the current page using information from the PruneState and VM.
919 : *
920 : * This function does not actually set the VM bits or page-level visibility
921 : * hint, PD_ALL_VISIBLE.
922 : *
923 : * Returns true if one or both VM bits should be set and false otherwise.
924 : */
 925                 : static bool
 926          308838 : heap_page_will_set_vm(PruneState *prstate, PruneReason reason)
 927                 : {
 928                 :     /*
 929                 :      * Though on-access pruning maintains prstate->set_all_visible, we don't
 930                 :      * set the VM on-access for now.
 931                 :      */
 932          308838 :     if (reason == PRUNE_ON_ACCESS)
 933           44969 :         return false;
 934                 :
 935          263869 :     if (!prstate->set_all_visible)
 936          218379 :         return false;
 937                 :
 938           45490 :     prstate->new_vmbits = VISIBILITYMAP_ALL_VISIBLE;
 939                 :
 940           45490 :     if (prstate->set_all_frozen)
 941           30427 :         prstate->new_vmbits |= VISIBILITYMAP_ALL_FROZEN;
 942                 :
 943           45490 :     if (prstate->new_vmbits == prstate->old_vmbits)
 944                 :     {
 945            1127 :         prstate->new_vmbits = 0;   /* desired bits already set; nothing to do */
 946            1127 :         return false;
 947                 :     }
 948                 :
 949           44363 :     return true;
 950                 : }
951 :
952 : /*
953 : * If the page is already all-frozen, or already all-visible and freezing
954 : * won't be attempted, there is no remaining work and we can use the fast path
955 : * to avoid the expensive overhead of heap_page_prune_and_freeze().
956 : *
957 : * This can happen when the page has a stale prune hint, or if VACUUM is
958 : * scanning an already all-frozen page due to SKIP_PAGES_THRESHOLD.
959 : *
960 : * The caller must already have examined the visibility map and saved the
961 : * status of the page's VM bits in prstate->old_vmbits. Caller must hold a
962 : * content lock on the heap page since it will examine line pointers.
963 : *
964 : * Before calling prune_freeze_fast_path(), the caller should first
965 : * check for and fix any discrepancy between the page-level visibility hint
966 : * and the visibility map. Otherwise, the fast path will always prevent us
967 : * from getting them in sync. Note that if there are tuples on the page that
968 : * are not visible to all but the VM is incorrectly marked
969 : * all-visible/all-frozen, we will not get the chance to fix that corruption
970 : * when using the fast path.
971 : */
 972                 : static void
 973          183870 : prune_freeze_fast_path(PruneState *prstate, PruneFreezeResult *presult)
 974                 : {
 975          183870 :     OffsetNumber maxoff = PageGetMaxOffsetNumber(prstate->page);
 976          183870 :     Page        page = prstate->page;
 977                 :
 978                 :     Assert((prstate->old_vmbits & VISIBILITYMAP_ALL_FROZEN) ||
 979                 :            ((prstate->old_vmbits & VISIBILITYMAP_ALL_VISIBLE) &&
 980                 :             !prstate->attempt_freeze));
 981                 :
 982                 :     /* We'll fill in presult for the caller */
 983          183870 :     memset(presult, 0, sizeof(PruneFreezeResult));
 984                 :
 985                 :     /* Clear any stale prune hint */
 986          183870 :     if (TransactionIdIsValid(PageGetPruneXid(page)))
 987                 :     {
 988               0 :         PageClearPrunable(page);
 989               0 :         MarkBufferDirtyHint(prstate->buffer, true);
 990                 :     }
 991                 :
 992          183870 :     if (PageIsEmpty(page))
 993               0 :         return;                 /* empty page: presult was already zeroed above */
 994                 :
 995                 :     /*
 996                 :      * Since the page is all-visible, a count of the normal ItemIds on the
 997                 :      * page should be sufficient for vacuum's live tuple count.
 998                 :      */
 999          183870 :     for (OffsetNumber off = FirstOffsetNumber;
1000        10661875 :          off <= maxoff;
1001        10478005 :          off = OffsetNumberNext(off))
1002                 :     {
1003        10478005 :         ItemId      lp = PageGetItemId(page, off);
1004                 :
1005        10478005 :         if (!ItemIdIsUsed(lp))
1006          172449 :             continue;
1007                 :
1008        10305556 :         presult->hastup = true;   /* any used item means the page has tuples */
1009                 :
1010        10305556 :         if (ItemIdIsNormal(lp))
1011        10123490 :             prstate->live_tuples++;
1012                 :     }
1013                 :
1014        183870 :     presult->live_tuples = prstate->live_tuples;
1015                 : }
1016 :
1017 : /*
1018 : * Prune and repair fragmentation and potentially freeze tuples on the
1019 : * specified page. If the page's visibility status has changed, update it in
1020 : * the VM.
1021 : *
1022 : * Caller must have pin and buffer cleanup lock on the page. Note that we
1023 : * don't update the FSM information for page on caller's behalf. Caller might
1024 : * also need to account for a reduction in the length of the line pointer
1025 : * array following array truncation by us.
1026 : *
1027 : * params contains the input parameters used to control freezing and pruning
1028 : * behavior. See the definition of PruneFreezeParams for more on what each
1029 : * parameter does.
1030 : *
1031 : * If the HEAP_PAGE_PRUNE_FREEZE option is set in params, we will freeze
1032 : * tuples if it's required in order to advance relfrozenxid / relminmxid, or
1033 : * if it's considered advantageous for overall system performance to do so
1034 : * now. The 'params.cutoffs', 'presult', 'new_relfrozen_xid' and
1035 : * 'new_relmin_mxid' arguments are required when freezing.
1036 : *
1037 : * A vmbuffer corresponding to the heap page is also passed and if the page is
1038 : * found to be all-visible/all-frozen, we will set it in the VM.
1039 : *
1040 : * presult contains output parameters needed by callers, such as the number of
1041 : * tuples removed and the offsets of dead items on the page after pruning.
1042 : * heap_page_prune_and_freeze() is responsible for initializing it. Required
1043 : * by all callers.
1044 : *
1045 : * off_loc is the offset location required by the caller to use in error
1046 : * callback.
1047 : *
1048 : * new_relfrozen_xid and new_relmin_mxid must be provided by the caller if the
1049 : * HEAP_PAGE_PRUNE_FREEZE option is set in params. On entry, they contain the
1050 : * oldest XID and multi-XID seen on the relation so far. They will be updated
1051 : * with the oldest values present on the page after pruning. After processing
1052 : * the whole relation, VACUUM can use these values as the new
1053 : * relfrozenxid/relminmxid for the relation.
1054 : */
1055                 : void
1056          492708 : heap_page_prune_and_freeze(PruneFreezeParams *params,
1057                 :                            PruneFreezeResult *presult,
1058                 :                            OffsetNumber *off_loc,
1059                 :                            TransactionId *new_relfrozen_xid,
1060                 :                            MultiXactId *new_relmin_mxid)
1061                 : {
1062                 :     PruneState  prstate;
1063                 :     bool        do_freeze;
1064                 :     bool        do_prune;
1065                 :     bool        do_hint_prune;
1066                 :     bool        do_set_vm;
1067                 :     bool        did_tuple_hint_fpi;
1068          492708 :     int64       fpi_before = pgWalUsage.wal_fpi;   /* to detect FPIs from hint-bit setting below */
1069                 :     TransactionId conflict_xid;
1070                 :
1071                 :     /* Initialize prstate */
1072          492708 :     prune_freeze_setup(params,
1073                 :                        new_relfrozen_xid, new_relmin_mxid,
1074                 :                        presult, &prstate);
1075                 :
1076                 :     /*
1077                 :      * If the VM is set but PD_ALL_VISIBLE is clear, fix that corruption
1078                 :      * before pruning and freezing so that the page and VM start out in a
1079                 :      * consistent state.
1080                 :      */
1081          492708 :     if ((prstate.old_vmbits & VISIBILITYMAP_VALID_BITS) &&
1082          185023 :         !PageIsAllVisible(prstate.page))
1083               0 :         heap_page_fix_vm_corruption(&prstate, InvalidOffsetNumber,
1084                 :                                     VM_CORRUPT_MISSING_PAGE_HINT);
1085                 :
1086                 :     /*
1087                 :      * If the page is already all-frozen, or already all-visible when freezing
1088                 :      * is not being attempted, take the fast path, skipping pruning and
1089                 :      * freezing code entirely. This must be done after fixing any discrepancy
1090                 :      * between the page-level visibility hint and the VM, since that may have
1091                 :      * cleared old_vmbits.
1092                 :      */
1093          492708 :     if ((params->options & HEAP_PAGE_PRUNE_ALLOW_FAST_PATH) != 0 &&
1094          491416 :         ((prstate.old_vmbits & VISIBILITYMAP_ALL_FROZEN) ||
1095          307546 :          ((prstate.old_vmbits & VISIBILITYMAP_ALL_VISIBLE) &&
1096             695 :           !prstate.attempt_freeze)))
1097                 :     {
1098          183870 :         prune_freeze_fast_path(&prstate, presult);
1099          183870 :         return;
1100                 :     }
1101                 :
1102                 :     /*
1103                 :      * Examine all line pointers and tuple visibility information to determine
1104                 :      * which line pointers should change state and which tuples may be frozen.
1105                 :      * Prepare queue of state changes to later be executed in a critical
1106                 :      * section.
1107                 :      */
1108          308838 :     prune_freeze_plan(&prstate, off_loc);
1109                 :
1110                 :     /*
1111                 :      * After processing all the live tuples on the page, if the newest xmin
1112                 :      * amongst them may be considered running by any snapshot, the page cannot
1113                 :      * be all-visible. This should be done before determining whether or not
1114                 :      * to opportunistically freeze.
1115                 :      */
1116          308838 :     if (prstate.set_all_visible &&
1117          156743 :         TransactionIdIsNormal(prstate.newest_live_xid) &&
1118           63242 :         GlobalVisTestXidConsideredRunning(prstate.vistest,
1119                 :                                           prstate.newest_live_xid,
1120                 :                                           true))
1121            3606 :         prstate.set_all_visible = prstate.set_all_frozen = false;
1122                 :
1123                 :     /*
1124                 :      * If checksums are enabled, calling heap_prune_satisfies_vacuum() while
1125                 :      * checking tuple visibility information in prune_freeze_plan() may have
1126                 :      * caused an FPI to be emitted.
1127                 :      */
1128          308838 :     did_tuple_hint_fpi = fpi_before != pgWalUsage.wal_fpi;
1129                 :
1130          909360 :     do_prune = prstate.nredirected > 0 ||
1131          560091 :         prstate.ndead > 0 ||
1132          251253 :         prstate.nunused > 0;
1133                 :
1134                 :     /*
1135                 :      * Even if we don't prune anything, if we found a new value for the
1136                 :      * pd_prune_xid field or the page was marked full, we will update the hint
1137                 :      * bit.
1138                 :      */
1139          559746 :     do_hint_prune = PageGetPruneXid(prstate.page) != prstate.new_prune_xid ||
1140          250908 :         PageIsFull(prstate.page);
1141                 :
1142                 :     /*
1143                 :      * Decide if we want to go ahead with freezing according to the freeze
1144                 :      * plans we prepared, or not.
1145                 :      */
1146          308838 :     do_freeze = heap_page_will_freeze(did_tuple_hint_fpi,
1147                 :                                       do_prune,
1148                 :                                       do_hint_prune,
1149                 :                                       &prstate);
1150                 :
1151                 :     /*
1152                 :      * While scanning the line pointers, we did not clear
1153                 :      * set_all_visible/set_all_frozen when encountering LP_DEAD items because
1154                 :      * we wanted the decision whether or not to freeze the page to be
1155                 :      * unaffected by the short-term presence of LP_DEAD items. These LP_DEAD
1156                 :      * items are effectively assumed to be LP_UNUSED items in the making. It
1157                 :      * doesn't matter which vacuum heap pass (initial pass or final pass) ends
1158                 :      * up setting the page all-frozen, as long as the ongoing VACUUM does it.
1159                 :      *
1160                 :      * Now that we finished determining whether or not to freeze the page,
1161                 :      * update set_all_visible and set_all_frozen so that they reflect the true
1162                 :      * state of the page for setting PD_ALL_VISIBLE and VM bits.
1163                 :      */
1164          308838 :     if (prstate.lpdead_items > 0)
1165           56333 :         prstate.set_all_visible = prstate.set_all_frozen = false;
1166                 :
1167                 :     Assert(!prstate.set_all_frozen || prstate.set_all_visible);
1168                 :     Assert(!prstate.set_all_visible || (prstate.lpdead_items == 0));
1169                 :
1170          308838 :     do_set_vm = heap_page_will_set_vm(&prstate, params->reason);
1171                 :
1172                 :     /*
1173                 :      * new_vmbits should be 0 regardless of whether or not the page is
1174                 :      * all-visible if we do not intend to set the VM.
1175                 :      */
1176                 :     Assert(do_set_vm || prstate.new_vmbits == 0);
1177                 :
1178                 :     /*
1179                 :      * The snapshot conflict horizon for the whole record is the most
1180                 :      * conservative (newest) horizon required by any change in the record.
1181                 :      */
1182          308838 :     conflict_xid = InvalidTransactionId;
1183          308838 :     if (do_set_vm)
1184           44363 :         conflict_xid = prstate.newest_live_xid;
1185          308838 :     if (do_freeze && TransactionIdFollows(prstate.pagefrz.FreezePageConflictXid, conflict_xid))
1186            4237 :         conflict_xid = prstate.pagefrz.FreezePageConflictXid;
1187          308838 :     if (do_prune && TransactionIdFollows(prstate.latest_xid_removed, conflict_xid))
1188           51611 :         conflict_xid = prstate.latest_xid_removed;
1189                 :
1190                 :     /* Lock vmbuffer before entering a critical section */
1191          308838 :     if (do_set_vm)
1192           44363 :         LockBuffer(prstate.vmbuffer, BUFFER_LOCK_EXCLUSIVE);
1193                 :
1194                 :     /* Any error while applying the changes is critical */
1195          308838 :     START_CRIT_SECTION();
1196                 :
1197          308838 :     if (do_hint_prune)
1198                 :     {
1199                 :         /*
1200                 :          * Update the page's pd_prune_xid field to either zero, or the lowest
1201                 :          * XID of any soon-prunable tuple.
1202                 :          */
1203           57994 :         ((PageHeader) prstate.page)->pd_prune_xid = prstate.new_prune_xid;
1204                 :
1205                 :         /*
1206                 :          * Also clear the "page is full" flag, since there's no point in
1207                 :          * repeating the prune/defrag process until something else happens to
1208                 :          * the page.
1209                 :          */
1210           57994 :         PageClearFull(prstate.page);
1211                 :
1212                 :         /*
1213                 :          * If that's all we had to do to the page, this is a non-WAL-logged
1214                 :          * hint. If we are going to freeze or prune the page or set
1215                 :          * PD_ALL_VISIBLE, we will mark the buffer dirty below.
1216                 :          *
1217                 :          * Setting PD_ALL_VISIBLE is fully WAL-logged because it is forbidden
1218                 :          * for the VM to be set and PD_ALL_VISIBLE to be clear.
1219                 :          */
1220           57994 :         if (!do_freeze && !do_prune && !do_set_vm)
1221             203 :             MarkBufferDirtyHint(prstate.buffer, true);
1222                 :     }
1223                 :
1224          308838 :     if (do_prune || do_freeze || do_set_vm)
1225                 :     {
1226                 :         /* Apply the planned item changes and repair page fragmentation. */
1227          104355 :         if (do_prune)
1228                 :         {
1229           58023 :             heap_page_prune_execute(prstate.buffer, false,
1230                 :                                     prstate.redirected, prstate.nredirected,
1231                 :                                     prstate.nowdead, prstate.ndead,
1232                 :                                     prstate.nowunused, prstate.nunused);
1233                 :         }
1234                 :
1235          104355 :         if (do_freeze)
1236           24098 :             heap_freeze_prepared_tuples(prstate.buffer, prstate.frozen, prstate.nfrozen);
1237                 :
1238                 :         /* Set the visibility map and page visibility hint */
1239          104355 :         if (do_set_vm)
1240                 :         {
1241                 :             /*
1242                 :              * While it is valid for PD_ALL_VISIBLE to be set when the
1243                 :              * corresponding VM bit is clear, we strongly prefer to keep them
1244                 :              * in sync.
1245                 :              *
1246                 :              * The heap buffer must be marked dirty before adding it to the
1247                 :              * WAL chain when setting the VM. We don't worry about
1248                 :              * unnecessarily dirtying the heap buffer if PD_ALL_VISIBLE is
1249                 :              * already set, though. It is extremely rare to have a clean heap
1250                 :              * buffer with PD_ALL_VISIBLE already set and the VM bits clear,
1251                 :              * so there is no point in optimizing it.
1252                 :              */
1253           44363 :             PageSetAllVisible(prstate.page);
1254           44363 :             PageClearPrunable(prstate.page);
1255           44363 :             visibilitymap_set(prstate.block, prstate.vmbuffer, prstate.new_vmbits,
1256           44363 :                               prstate.relation->rd_locator);
1257                 :         }
1258                 :
1259          104355 :         MarkBufferDirty(prstate.buffer);
1260                 :
1261                 :         /*
1262                 :          * Emit a WAL XLOG_HEAP2_PRUNE* record showing what we did
1263                 :          */
1264          104355 :         if (RelationNeedsWAL(prstate.relation))
1265                 :         {
1266          144472 :             log_heap_prune_and_freeze(prstate.relation, prstate.buffer,
1267                 :                                       do_set_vm ? prstate.vmbuffer : InvalidBuffer,
1268           42794 :                                       do_set_vm ? prstate.new_vmbits : 0,
1269                 :                                       conflict_xid,
1270                 :                                       true,  /* cleanup lock */
1271                 :                                       params->reason,
1272                 :                                       prstate.frozen, prstate.nfrozen,
1273                 :                                       prstate.redirected, prstate.nredirected,
1274                 :                                       prstate.nowdead, prstate.ndead,
1275                 :                                       prstate.nowunused, prstate.nunused);
1276                 :         }
1277                 :     }
1278                 :
1279          308838 :     END_CRIT_SECTION();
1280                 :
1281          308838 :     if (do_set_vm)
1282           44363 :         LockBuffer(prstate.vmbuffer, BUFFER_LOCK_UNLOCK);
1283                 :
1284                 :     /*
1285                 :      * During its second pass over the heap, VACUUM calls
1286                 :      * heap_page_would_be_all_visible() to determine whether a page is
1287                 :      * all-visible and all-frozen. The logic here is similar. After completing
1288                 :      * pruning and freezing, use an assertion to verify that our results
1289                 :      * remain consistent with heap_page_would_be_all_visible(). It's also a
1290                 :      * valuable cross-check of the page state after pruning and freezing.
1291                 :      */
1292                 : #ifdef USE_ASSERT_CHECKING
1293                 :     if (prstate.set_all_visible)
1294                 :     {
1295                 :         TransactionId debug_cutoff;
1296                 :         bool        debug_all_frozen;
1297                 :
1298                 :         Assert(prstate.lpdead_items == 0);
1299                 :
1300                 :         Assert(heap_page_is_all_visible(prstate.relation, prstate.buffer,
1301                 :                                         prstate.vistest,
1302                 :                                         &debug_all_frozen,
1303                 :                                         &debug_cutoff, off_loc));
1304                 :
1305                 :         Assert(!TransactionIdIsValid(debug_cutoff) ||
1306                 :                debug_cutoff == prstate.newest_live_xid);
1307                 :
1308                 :         /*
1309                 :          * It's possible the page is composed entirely of frozen tuples but is
1310                 :          * not set all-frozen in the VM and did not pass
1311                 :          * HEAP_PAGE_PRUNE_FREEZE. In this case, it's possible
1312                 :          * heap_page_is_all_visible() finds the page completely frozen, even
1313                 :          * though prstate.set_all_frozen is false.
1314                 :          */
1315                 :         Assert(!prstate.set_all_frozen || debug_all_frozen);
1316                 :     }
1317                 : #endif
1318                 :
1319                 :     /* Copy information back for caller */
1320          308838 :     presult->ndeleted = prstate.ndeleted;
1321          308838 :     presult->nnewlpdead = prstate.ndead;
1322          308838 :     presult->nfrozen = prstate.nfrozen;
1323          308838 :     presult->live_tuples = prstate.live_tuples;
1324          308838 :     presult->recently_dead_tuples = prstate.recently_dead_tuples;
1325          308838 :     presult->hastup = prstate.hastup;
1326                 :
1327          308838 :     presult->lpdead_items = prstate.lpdead_items;
1328                 :     /* the presult->deadoffsets array was already filled in */
1329                 :
1330          308838 :     presult->newly_all_visible = false;
1331          308838 :     presult->newly_all_frozen = false;
1332          308838 :     presult->newly_all_visible_frozen = false;
1333          308838 :     if (do_set_vm)
1334                 :     {
1335           44363 :         if ((prstate.old_vmbits & VISIBILITYMAP_ALL_VISIBLE) == 0)
1336                 :         {
1337           44337 :             presult->newly_all_visible = true;
1338           44337 :             if (prstate.set_all_frozen)
1339           29944 :                 presult->newly_all_visible_frozen = true;
1340                 :         }
1341              26 :         else if ((prstate.old_vmbits & VISIBILITYMAP_ALL_FROZEN) == 0 &&
1342              26 :                  prstate.set_all_frozen)
1343              26 :             presult->newly_all_frozen = true;
1344                 :     }
1345                 :
1346          308838 :     if (prstate.attempt_freeze)
1347                 :     {
1348          263869 :         if (presult->nfrozen > 0)
1349                 :         {
1350           24098 :             *new_relfrozen_xid = prstate.pagefrz.FreezePageRelfrozenXid;
1351           24098 :             *new_relmin_mxid = prstate.pagefrz.FreezePageRelminMxid;
1352                 :         }
1353                 :         else
1354                 :         {
1355          239771 :             *new_relfrozen_xid = prstate.pagefrz.NoFreezePageRelfrozenXid;
1356          239771 :             *new_relmin_mxid = prstate.pagefrz.NoFreezePageRelminMxid;
1357                 :         }
1358                 :     }
1359                 : }
1360 :
1361 :
1362 : /*
1363 : * Perform visibility checks for heap pruning.
1364 : */
1365                 : static HTSV_Result
1366        14430108 : heap_prune_satisfies_vacuum(PruneState *prstate, HeapTuple tup)
1367                 : {
1368                 :     HTSV_Result res;
1369                 :     TransactionId dead_after;   /* consulted only for RECENTLY_DEAD results */
1370                 :
1371        14430108 :     res = HeapTupleSatisfiesVacuumHorizon(tup, prstate->buffer, &dead_after);
1372                 :
1373        14430108 :     if (res != HEAPTUPLE_RECENTLY_DEAD)
1374        12364299 :         return res;
1375                 :
1376                 :     /*
1377                 :      * For VACUUM, we must be sure to prune tuples with xmax older than
1378                 :      * OldestXmin -- a visibility cutoff determined at the beginning of
1379                 :      * vacuuming the relation. OldestXmin is used for freezing determination
1380                 :      * and we cannot freeze dead tuples' xmaxes.
1381                 :      */
1382         2065809 :     if (prstate->cutoffs &&
1383         1122151 :         TransactionIdIsValid(prstate->cutoffs->OldestXmin) &&
1384         1122151 :         NormalTransactionIdPrecedes(dead_after, prstate->cutoffs->OldestXmin))
1385          799950 :         return HEAPTUPLE_DEAD;
1386                 :
1387                 :     /*
1388                 :      * Determine whether or not the tuple is considered dead when compared
1389                 :      * with the provided GlobalVisState. On-access pruning does not provide
1390                 :      * VacuumCutoffs. And for vacuum, even if the tuple's xmax is not older
1391                 :      * than OldestXmin, GlobalVisTestIsRemovableXid() could find the row dead
1392                 :      * if the GlobalVisState has been updated since the beginning of vacuuming
1393                 :      * the relation.
1394                 :      */
1395         1265859 :     if (GlobalVisTestIsRemovableXid(prstate->vistest, dead_after, true))
1396          904655 :         return HEAPTUPLE_DEAD;
1397                 :
1398          361204 :     return res;   /* still HEAPTUPLE_RECENTLY_DEAD */
1399                 : }
1400 :
1401 :
1402 : /*
1403 : * Pruning calculates tuple visibility once and saves the results in an array
1404 : * of int8. See PruneState.htsv for details. This helper function is meant
1405 : * to guard against examining visibility status array members which have not
1406 : * yet been computed.
1407 : */
1408                 : static inline HTSV_Result
1409        14414274 : htsv_get_valid_status(int status)
1410                 : {
1411                 :     Assert(status >= HEAPTUPLE_DEAD &&
1412                 :            status <= HEAPTUPLE_DELETE_IN_PROGRESS);
1413        14414274 :     return (HTSV_Result) status;   /* range-checked by the Assert above */
1414                 : }
1415 :
1416 : /*
1417 : * Prune specified line pointer or a HOT chain originating at line pointer.
1418 : *
1419 : * Tuple visibility information is provided in prstate->htsv.
1420 : *
1421 : * If the item is an index-referenced tuple (i.e. not a heap-only tuple),
1422 : * the HOT chain is pruned by removing all DEAD tuples at the start of the HOT
1423 : * chain. We also prune any RECENTLY_DEAD tuples preceding a DEAD tuple.
1424 : * This is OK because a RECENTLY_DEAD tuple preceding a DEAD tuple is really
1425 : * DEAD, our visibility test is just too coarse to detect it.
1426 : *
1427 : * Pruning must never leave behind a DEAD tuple that still has tuple storage.
1428 : * VACUUM isn't prepared to deal with that case.
1429 : *
1430 : * The root line pointer is redirected to the tuple immediately after the
1431 : * latest DEAD tuple. If all tuples in the chain are DEAD, the root line
1432 : * pointer is marked LP_DEAD. (This includes the case of a DEAD simple
1433 : * tuple, which we treat as a chain of length 1.)
1434 : *
1435 : * We don't actually change the page here. We just add entries to the arrays in
1436 : * prstate showing the changes to be made. Items to be redirected are added
1437 : * to the redirected[] array (two entries per redirection); items to be set to
1438 : * LP_DEAD state are added to nowdead[]; and items to be set to LP_UNUSED
1439 : * state are added to nowunused[]. We perform bookkeeping of live tuples,
1440 : * visibility etc. based on what the page will look like after the changes
1441 : * applied. All that bookkeeping is performed in the heap_prune_record_*()
1442 : * subroutines. The division of labor is that heap_prune_chain() decides the
1443 : * fate of each tuple, ie. whether it's going to be removed, redirected or
1444 : * left unchanged, and the heap_prune_record_*() subroutines update PruneState
1445 : * based on that outcome.
1446 : */
1447                 : static void
1448        14291184 : heap_prune_chain(OffsetNumber maxoff, OffsetNumber rootoffnum,
1449                 :                  PruneState *prstate)
1450                 : {
1451        14291184 :     TransactionId priorXmax = InvalidTransactionId;    /* xmax of previous chain member, for xmin cross-check */
1452                 :     ItemId      rootlp;
1453                 :     OffsetNumber offnum;
1454                 :     OffsetNumber chainitems[MaxHeapTuplesPerPage];
1455        14291184 :     Page        page = prstate->page;
1456                 :
1457                 :     /*
1458                 :      * After traversing the HOT chain, ndeadchain is the index in chainitems
1459                 :      * of the first live successor after the last dead item.
1460                 :      */
1461        14291184 :     int         ndeadchain = 0,
1462        14291184 :                 nchain = 0;
1463                 :
1464        14291184 :     rootlp = PageGetItemId(page, rootoffnum);
1465                 :
1466                 :     /* Start from the root tuple */
1467        14291184 :     offnum = rootoffnum;
1468                 :
1469                 :     /* while not end of the chain */
1470                 :     for (;;)
1471          296117 :     {
1472                 :         HeapTupleHeader htup;
1473                 :         ItemId      lp;
1474                 :
1475                 :         /* Sanity check (pure paranoia) */
1476        14587301 :         if (offnum < FirstOffsetNumber)
1477               0 :             break;
1478                 :
1479                 :         /*
1480                 :          * An offset past the end of page's line pointer array is possible
1481                 :          * when the array was truncated (original item must have been unused)
1482                 :          */
1483        14587301 :         if (offnum > maxoff)
1484               0 :             break;
1485                 :
1486                 :         /* If item is already processed, stop --- it must not be same chain */
1487        14587301 :         if (prstate->processed[offnum])
1488               0 :             break;
1489                 :
1490        14587301 :         lp = PageGetItemId(page, offnum);
1491                 :
1492                 :         /*
1493                 :          * Unused item obviously isn't part of the chain. Likewise, a dead
1494                 :          * line pointer can't be part of the chain. Both of those cases were
1495                 :          * already marked as processed.
1496                 :          */
1497                 :         Assert(ItemIdIsUsed(lp));
1498                 :         Assert(!ItemIdIsDead(lp));
1499                 :
1500                 :         /*
1501                 :          * If we are looking at the redirected root line pointer, jump to the
1502                 :          * first normal tuple in the chain. If we find a redirect somewhere
1503                 :          * else, stop --- it must not be same chain.
1504                 :          */
1505        14587301 :         if (ItemIdIsRedirected(lp))
1506                 :         {
1507          173027 :             if (nchain > 0)
1508               0 :                 break;          /* not at start of chain */
1509          173027 :             chainitems[nchain++] = offnum;
1510          173027 :             offnum = ItemIdGetRedirect(rootlp);
1511          173027 :             continue;
1512                 :         }
1513                 :
1514                 :         Assert(ItemIdIsNormal(lp));
1515                 :
1516        14414274 :         htup = (HeapTupleHeader) PageGetItem(page, lp);
1517                 :
1518                 :         /*
1519                 :          * Check the tuple XMIN against prior XMAX, if any
1520                 :          */
1521        14537364 :         if (TransactionIdIsValid(priorXmax) &&
1522          123090 :             !TransactionIdEquals(HeapTupleHeaderGetXmin(htup), priorXmax))
1523               0 :             break;
1524                 :
1525                 :         /*
1526                 :          * OK, this tuple is indeed a member of the chain.
1527                 :          */
1528        14414274 :         chainitems[nchain++] = offnum;
1529                 :
1530        14414274 :         switch (htsv_get_valid_status(prstate->htsv[offnum]))
1531                 :         {
1532         1762798 :             case HEAPTUPLE_DEAD:
1533                 :
1534                 :                 /* Remember the last DEAD tuple seen */
1535         1762798 :                 ndeadchain = nchain;
1536         1762798 :                 HeapTupleHeaderAdvanceConflictHorizon(htup,
1537                 :                                                       &prstate->latest_xid_removed);
1538                 :                 /* Advance to next chain member */
1539         1762798 :                 break;
1540                 :
1541          361204 :             case HEAPTUPLE_RECENTLY_DEAD:
1542                 :
1543                 :                 /*
1544                 :                  * We don't need to advance the conflict horizon for
1545                 :                  * RECENTLY_DEAD tuples, even if we are removing them. This
1546                 :                  * is because we only remove RECENTLY_DEAD tuples if they
1547                 :                  * precede a DEAD tuple, and the DEAD tuple must have been
1548                 :                  * inserted by a newer transaction than the RECENTLY_DEAD
1549                 :                  * tuple by virtue of being later in the chain. We will have
1550                 :                  * advanced the conflict horizon for the DEAD tuple.
1551                 :                  */
1552                 :
1553                 :                 /*
1554                 :                  * Advance past RECENTLY_DEAD tuples just in case there's a
1555                 :                  * DEAD one after them. We have to make sure that we don't
1556                 :                  * miss any DEAD tuples, since DEAD tuples that still have
1557                 :                  * tuple storage after pruning will confuse VACUUM.
1558                 :                  */
1559          361204 :                 break;
1560                 :
1561        12290272 :             case HEAPTUPLE_DELETE_IN_PROGRESS:
1562                 :             case HEAPTUPLE_LIVE:
1563                 :             case HEAPTUPLE_INSERT_IN_PROGRESS:
1564        12290272 :                 goto process_chain;
1565                 :
1566               0 :             default:
1567               0 :                 elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
1568                 :                 goto process_chain;
1569                 :         }
1570                 :
1571                 :         /*
1572                 :          * If the tuple is not HOT-updated, then we are at the end of this
1573                 :          * HOT-update chain.
1574                 :          */
1575         2124002 :         if (!HeapTupleHeaderIsHotUpdated(htup))
1576         2000912 :             goto process_chain;
1577                 :
1578                 :         /* HOT implies it can't have moved to different partition */
1579                 :         Assert(!HeapTupleHeaderIndicatesMovedPartitions(htup));
1580                 :
1581                 :         /*
1582                 :          * Advance to next chain member.
1583                 :          */
1584                 :         Assert(ItemPointerGetBlockNumber(&htup->t_ctid) == prstate->block);
1585          123090 :         offnum = ItemPointerGetOffsetNumber(&htup->t_ctid);
1586          123090 :         priorXmax = HeapTupleHeaderGetUpdateXid(htup);
1587                 :     }
1588                 :
1589               0 :     if (ItemIdIsRedirected(rootlp) && nchain < 2)
1590                 :     {
1591                 :         /*
1592                 :          * We found a redirect item that doesn't point to a valid follow-on
1593                 :          * item. This can happen if the loop in heap_page_prune_and_freeze()
1594                 :          * caused us to visit the dead successor of a redirect item before
1595                 :          * visiting the redirect item. We can clean up by setting the
1596                 :          * redirect item to LP_DEAD state or LP_UNUSED if the caller
1597                 :          * indicated.
1598                 :          */
1599               0 :         heap_prune_record_dead_or_unused(prstate, rootoffnum, false);
1600               0 :         return;
1601                 :     }
1602                 :
1603               0 : process_chain:
1604                 :
1605        14291184 :     if (ndeadchain == 0)
1606                 :     {
1607                 :         /*
1608                 :          * No DEAD tuple was found, so the chain is entirely composed of
1609                 :          * normal, unchanged tuples. Leave it alone.
1610                 :          */
1611        12573714 :         int         i = 0;
1612                 :
1613        12573714 :         if (ItemIdIsRedirected(rootlp))
1614                 :         {
1615          152842 :             heap_prune_record_unchanged_lp_redirect(prstate, rootoffnum);
1616          152842 :             i++;
1617                 :         }
1618        25152416 :         for (; i < nchain; i++)
1619        12578702 :             heap_prune_record_unchanged_lp_normal(prstate, chainitems[i]);
1620                 :     }
1621         1717470 :     else if (ndeadchain == nchain)
1622                 :     {
1623                 :         /*
1624                 :          * The entire chain is dead. Mark the root line pointer LP_DEAD, and
1625                 :          * fully remove the other tuples in the chain.
1626                 :          */
1627         1647310 :         heap_prune_record_dead_or_unused(prstate, rootoffnum, ItemIdIsNormal(rootlp));
1628         1691383 :         for (int i = 1; i < nchain; i++)
1629           44073 :             heap_prune_record_unused(prstate, chainitems[i], true);
1630                 :     }
1631                 :     else
1632                 :     {
1633                 :         /*
1634                 :          * We found a DEAD tuple in the chain. Redirect the root line pointer
1635                 :          * to the first non-DEAD tuple, and mark as unused each intermediate
1636                 :          * item that we are able to remove from the chain.
1637                 :          */
1638           70160 :         heap_prune_record_redirect(prstate, rootoffnum, chainitems[ndeadchain],
1639           70160 :                                    ItemIdIsNormal(rootlp));
1640           91600 :         for (int i = 1; i < ndeadchain; i++)
1641           21440 :             heap_prune_record_unused(prstate, chainitems[i], true);
1642                 :
1643                 :         /* the rest of tuples in the chain are normal, unchanged tuples */
1644          142934 :         for (int i = ndeadchain; i < nchain; i++)
1645           72774 :             heap_prune_record_unchanged_lp_normal(prstate, chainitems[i]);
1646                 :     }
1647                 : }
1648 :
1649 : /* Record lowest soon-prunable XID */
1650 : static void
1651 5237642 : heap_prune_record_prunable(PruneState *prstate, TransactionId xid,
1652 : OffsetNumber offnum)
1653 : {
1654 : /*
1655 : * This should exactly match the PageSetPrunable macro. We can't store
1656 : * directly into the page header yet, so we update working state.
1657 : */
1658 : Assert(TransactionIdIsNormal(xid));
1659 10270526 : if (!TransactionIdIsValid(prstate->new_prune_xid) ||
1660 5032884 : TransactionIdPrecedes(xid, prstate->new_prune_xid))
1661 206103 : prstate->new_prune_xid = xid;
1662 :
1663 : /*
1664 : * It's incorrect for a page to be marked all-visible if it contains
1665 : * prunable items.
1666 : */
1667 5237642 : if (PageIsAllVisible(prstate->page))
1668 0 : heap_page_fix_vm_corruption(prstate, offnum,
1669 : VM_CORRUPT_TUPLE_VISIBILITY);
1670 5237642 : }
1671 :
1672 : /* Record line pointer to be redirected */
1673 : static void
1674 70160 : heap_prune_record_redirect(PruneState *prstate,
1675 : OffsetNumber offnum, OffsetNumber rdoffnum,
1676 : bool was_normal)
1677 : {
1678 : Assert(!prstate->processed[offnum]);
1679 70160 : prstate->processed[offnum] = true;
1680 :
1681 : /*
1682 : * Do not mark the redirect target here. It needs to be counted
1683 : * separately as an unchanged tuple.
1684 : */
1685 :
1686 : Assert(prstate->nredirected < MaxHeapTuplesPerPage);
1687 70160 : prstate->redirected[prstate->nredirected * 2] = offnum;
1688 70160 : prstate->redirected[prstate->nredirected * 2 + 1] = rdoffnum;
1689 :
1690 70160 : prstate->nredirected++;
1691 :
1692 : /*
1693 : * If the root entry had been a normal tuple, we are deleting it, so count
1694 : * it in the result. But changing a redirect (even to DEAD state) doesn't
1695 : * count.
1696 : */
1697 70160 : if (was_normal)
1698 61948 : prstate->ndeleted++;
1699 :
1700 70160 : prstate->hastup = true;
1701 70160 : }
1702 :
1703 : /* Record line pointer to be marked dead */
1704 : static void
1705 1613413 : heap_prune_record_dead(PruneState *prstate, OffsetNumber offnum,
1706 : bool was_normal)
1707 : {
1708 : Assert(!prstate->processed[offnum]);
1709 1613413 : prstate->processed[offnum] = true;
1710 :
1711 : Assert(prstate->ndead < MaxHeapTuplesPerPage);
1712 1613413 : prstate->nowdead[prstate->ndead] = offnum;
1713 1613413 : prstate->ndead++;
1714 :
1715 : /*
1716 : * Deliberately delay unsetting set_all_visible and set_all_frozen until
1717 : * later during pruning. Removable dead tuples shouldn't preclude freezing
1718 : * the page.
1719 : */
1720 :
1721 : /* Record the dead offset for vacuum */
1722 1613413 : prstate->deadoffsets[prstate->lpdead_items++] = offnum;
1723 :
1724 : /*
1725 : * If the root entry had been a normal tuple, we are deleting it, so count
1726 : * it in the result. But changing a redirect (even to DEAD state) doesn't
1727 : * count.
1728 : */
1729 1613413 : if (was_normal)
1730 1601440 : prstate->ndeleted++;
1731 1613413 : }
1732 :
1733 : /*
1734 : * Depending on whether or not the caller set mark_unused_now to true, record that a
1735 : * line pointer should be marked LP_DEAD or LP_UNUSED. There are other cases in
1736 : * which we will mark line pointers LP_UNUSED, but we will not mark line
1737 : * pointers LP_DEAD if mark_unused_now is true.
1738 : */
1739 : static void
1740 1647310 : heap_prune_record_dead_or_unused(PruneState *prstate, OffsetNumber offnum,
1741 : bool was_normal)
1742 : {
1743 : /*
1744 : * If the caller set mark_unused_now to true, we can remove dead tuples
1745 : * during pruning instead of marking their line pointers dead. Set this
1746 : * tuple's line pointer LP_UNUSED. We hint that this option is less
1747 : * likely.
1748 : */
1749 1647310 : if (unlikely(prstate->mark_unused_now))
1750 33897 : heap_prune_record_unused(prstate, offnum, was_normal);
1751 : else
1752 1613413 : heap_prune_record_dead(prstate, offnum, was_normal);
1753 :
1754 : /*
1755 : * It's incorrect for the page to be set all-visible if it contains dead
1756 : * items. Fix that on the heap page and check the VM for corruption as
1757 : * well. Do that here rather than in heap_prune_record_dead() so we also
1758 : * cover tuples that are directly marked LP_UNUSED via mark_unused_now.
1759 : */
1760 1647310 : if (PageIsAllVisible(prstate->page))
1761 0 : heap_page_fix_vm_corruption(prstate, offnum, VM_CORRUPT_LPDEAD);
1762 1647310 : }
1763 :
1764 : /* Record line pointer to be marked unused */
1765 : static void
1766 104480 : heap_prune_record_unused(PruneState *prstate, OffsetNumber offnum, bool was_normal)
1767 : {
1768 : Assert(!prstate->processed[offnum]);
1769 104480 : prstate->processed[offnum] = true;
1770 :
1771 : Assert(prstate->nunused < MaxHeapTuplesPerPage);
1772 104480 : prstate->nowunused[prstate->nunused] = offnum;
1773 104480 : prstate->nunused++;
1774 :
1775 : /*
1776 : * If the root entry had been a normal tuple, we are deleting it, so count
1777 : * it in the result. But changing a redirect (even to DEAD state) doesn't
1778 : * count.
1779 : */
1780 104480 : if (was_normal)
1781 102489 : prstate->ndeleted++;
1782 104480 : }
1783 :
1784 : /*
1785 : * Record an unused line pointer that is left unchanged.
1786 : */
1787 : static void
1788 94618 : heap_prune_record_unchanged_lp_unused(PruneState *prstate, OffsetNumber offnum)
1789 : {
1790 : Assert(!prstate->processed[offnum]);
1791 94618 : prstate->processed[offnum] = true;
1792 94618 : }
1793 :
/*
 * Record line pointer that is left unchanged. We consider freezing it, and
 * update bookkeeping of tuple counts and page visibility.
 *
 * Specifically, this updates live/recently-dead tuple counts, the page's
 * prune-hint XID, the newest live xmin seen, the all-visible/all-frozen
 * tracking flags, and (when attempt_freeze is set) may save a freeze plan
 * for the tuple in prstate->frozen[].
 */
static void
heap_prune_record_unchanged_lp_normal(PruneState *prstate, OffsetNumber offnum)
{
	HeapTupleHeader htup;
	TransactionId xmin;
	Page		page = prstate->page;

	Assert(!prstate->processed[offnum]);
	prstate->processed[offnum] = true;

	prstate->hastup = true;		/* the page is not empty */

	/*
	 * The criteria for counting a tuple as live in this block need to match
	 * what analyze.c's acquire_sample_rows() does, otherwise VACUUM and
	 * ANALYZE may produce wildly different reltuples values, e.g. when there
	 * are many recently-dead tuples.
	 *
	 * The logic here is a bit simpler than acquire_sample_rows(), as VACUUM
	 * can't run inside a transaction block, which makes some cases impossible
	 * (e.g. in-progress insert from the same transaction).
	 *
	 * HEAPTUPLE_DEAD are handled by the other heap_prune_record_*()
	 * subroutines.  They don't count dead items like acquire_sample_rows()
	 * does, because we assume that all dead items will become LP_UNUSED
	 * before VACUUM finishes.  This difference is only superficial.  VACUUM
	 * effectively agrees with ANALYZE about DEAD items, in the end.  VACUUM
	 * won't remember LP_DEAD items, but only because they're not supposed to
	 * be left behind when it is done.  (Cases where we bypass index vacuuming
	 * will violate this optimistic assumption, but the overall impact of that
	 * should be negligible.)
	 */
	htup = (HeapTupleHeader) PageGetItem(page, PageGetItemId(page, offnum));

	/* Dispatch on the visibility status computed earlier into htsv[] */
	switch (prstate->htsv[offnum])
	{
		case HEAPTUPLE_LIVE:

			/*
			 * Count it as live.  Not only is this natural, but it's also what
			 * acquire_sample_rows() does.
			 */
			prstate->live_tuples++;

			/*
			 * Is the tuple definitely visible to all transactions?
			 *
			 * NB: Like with per-tuple hint bits, we can't set the
			 * PD_ALL_VISIBLE flag if the inserter committed asynchronously.
			 * See SetHintBits for more info.  Check that the tuple is hinted
			 * xmin-committed because of that.
			 */
			if (!HeapTupleHeaderXminCommitted(htup))
			{
				prstate->set_all_visible = false;
				prstate->set_all_frozen = false;
				break;
			}

			/*
			 * The inserter definitely committed.  But we don't know if it is
			 * old enough that everyone sees it as committed.  Later, after
			 * processing all the tuples on the page, we'll check if there is
			 * any snapshot that still considers the newest xid on the page to
			 * be running.  If so, we don't consider the page all-visible.
			 */
			xmin = HeapTupleHeaderGetXmin(htup);

			/* Track newest xmin on page. */
			if (TransactionIdFollows(xmin, prstate->newest_live_xid) &&
				TransactionIdIsNormal(xmin))
				prstate->newest_live_xid = xmin;

			break;

		case HEAPTUPLE_RECENTLY_DEAD:
			prstate->recently_dead_tuples++;
			prstate->set_all_visible = false;
			prstate->set_all_frozen = false;

			/*
			 * This tuple will soon become DEAD.  Update the hint field so
			 * that the page is reconsidered for pruning in future.
			 */
			heap_prune_record_prunable(prstate,
									   HeapTupleHeaderGetUpdateXid(htup),
									   offnum);
			break;

		case HEAPTUPLE_INSERT_IN_PROGRESS:

			/*
			 * We do not count these rows as live, because we expect the
			 * inserting transaction to update the counters at commit, and we
			 * assume that will happen only after we report our results.  This
			 * assumption is a bit shaky, but it is what acquire_sample_rows()
			 * does, so be consistent.
			 */
			prstate->set_all_visible = false;
			prstate->set_all_frozen = false;

			/* The page should not be marked all-visible */
			if (PageIsAllVisible(page))
				heap_page_fix_vm_corruption(prstate, offnum,
											VM_CORRUPT_TUPLE_VISIBILITY);

			/*
			 * If we wanted to optimize for aborts, we might consider marking
			 * the page prunable when we see INSERT_IN_PROGRESS.  But we
			 * don't.  See related decisions about when to mark the page
			 * prunable in heapam.c.
			 */
			break;

		case HEAPTUPLE_DELETE_IN_PROGRESS:

			/*
			 * This an expected case during concurrent vacuum.  Count such
			 * rows as live.  As above, we assume the deleting transaction
			 * will commit and update the counters after we report.
			 */
			prstate->live_tuples++;
			prstate->set_all_visible = false;
			prstate->set_all_frozen = false;

			/*
			 * This tuple may soon become DEAD.  Update the hint field so that
			 * the page is reconsidered for pruning in future.
			 */
			heap_prune_record_prunable(prstate,
									   HeapTupleHeaderGetUpdateXid(htup),
									   offnum);
			break;

		default:

			/*
			 * DEAD tuples should've been passed to heap_prune_record_dead()
			 * or heap_prune_record_unused() instead.
			 */
			elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result %d",
				 prstate->htsv[offnum]);
			break;
	}

	/* Consider freezing any normal tuples which will not be removed */
	if (prstate->attempt_freeze)
	{
		bool		totally_frozen;

		if ((heap_prepare_freeze_tuple(htup,
									   prstate->cutoffs,
									   &prstate->pagefrz,
									   &prstate->frozen[prstate->nfrozen],
									   &totally_frozen)))
		{
			/* Save prepared freeze plan for later */
			prstate->frozen[prstate->nfrozen++].offset = offnum;
		}

		/*
		 * If any tuple isn't either totally frozen already or eligible to
		 * become totally frozen (according to its freeze plan), then the page
		 * definitely cannot be set all-frozen in the visibility map later on.
		 */
		if (!totally_frozen)
			prstate->set_all_frozen = false;
	}
}
1967 :
1968 :
1969 : /*
1970 : * Record line pointer that was already LP_DEAD and is left unchanged.
1971 : */
1972 : static void
1973 1292960 : heap_prune_record_unchanged_lp_dead(PruneState *prstate, OffsetNumber offnum)
1974 : {
1975 : Assert(!prstate->processed[offnum]);
1976 1292960 : prstate->processed[offnum] = true;
1977 :
1978 : /*
1979 : * Deliberately don't set hastup for LP_DEAD items. We make the soft
1980 : * assumption that any LP_DEAD items encountered here will become
1981 : * LP_UNUSED later on, before count_nondeletable_pages is reached. If we
1982 : * don't make this assumption then rel truncation will only happen every
1983 : * other VACUUM, at most. Besides, VACUUM must treat
1984 : * hastup/nonempty_pages as provisional no matter how LP_DEAD items are
1985 : * handled (handled here, or handled later on).
1986 : *
1987 : * Similarly, don't unset set_all_visible and set_all_frozen until later,
1988 : * at the end of heap_page_prune_and_freeze(). This will allow us to
1989 : * attempt to freeze the page after pruning. As long as we unset it
1990 : * before updating the visibility map, this will be correct.
1991 : */
1992 :
1993 : /* Record the dead offset for vacuum */
1994 1292960 : prstate->deadoffsets[prstate->lpdead_items++] = offnum;
1995 :
1996 : /*
1997 : * It's incorrect for a page to be marked all-visible if it contains dead
1998 : * items.
1999 : */
2000 1292960 : if (PageIsAllVisible(prstate->page))
2001 0 : heap_page_fix_vm_corruption(prstate, offnum, VM_CORRUPT_LPDEAD);
2002 1292960 : }
2003 :
2004 : /*
2005 : * Record LP_REDIRECT that is left unchanged.
2006 : */
2007 : static void
2008 152842 : heap_prune_record_unchanged_lp_redirect(PruneState *prstate, OffsetNumber offnum)
2009 : {
2010 : /*
2011 : * A redirect line pointer doesn't count as a live tuple.
2012 : *
2013 : * If we leave a redirect line pointer in place, there will be another
2014 : * tuple on the page that it points to. We will do the bookkeeping for
2015 : * that separately. So we have nothing to do here, except remember that
2016 : * we processed this item.
2017 : */
2018 : Assert(!prstate->processed[offnum]);
2019 152842 : prstate->processed[offnum] = true;
2020 152842 : }
2021 :
/*
 * Perform the actual page changes needed by heap_page_prune_and_freeze().
 *
 * buffer:		heap buffer whose page is modified in place
 * redirected:	nredirected pairs of offsets (from, to); 'from' is set
 *				LP_REDIRECT pointing at 'to'
 * nowdead:		ndead offsets to set LP_DEAD
 * nowunused:	nunused offsets to set LP_UNUSED
 *
 * If 'lp_truncate_only' is set, we are merely marking LP_DEAD line pointers
 * as unused, not redirecting or removing anything else. The
 * PageRepairFragmentation() call is skipped in that case.
 *
 * If 'lp_truncate_only' is not set, the caller must hold a cleanup lock on
 * the buffer. If it is set, an ordinary exclusive lock suffices.
 */
void
heap_page_prune_execute(Buffer buffer, bool lp_truncate_only,
						OffsetNumber *redirected, int nredirected,
						OffsetNumber *nowdead, int ndead,
						OffsetNumber *nowunused, int nunused)
{
	Page		page = BufferGetPage(buffer);
	OffsetNumber *offnum;
	HeapTupleHeader htup PG_USED_FOR_ASSERTS_ONLY;

	/* Shouldn't be called unless there's something to do */
	Assert(nredirected > 0 || ndead > 0 || nunused > 0);

	/* If 'lp_truncate_only', we can only remove already-dead line pointers */
	Assert(!lp_truncate_only || (nredirected == 0 && ndead == 0));

	/* Update all redirected line pointers */
	offnum = redirected;
	for (int i = 0; i < nredirected; i++)
	{
		/* Pairs are laid out as (from, to), so consume two offsets at once */
		OffsetNumber fromoff = *offnum++;
		OffsetNumber tooff = *offnum++;
		ItemId		fromlp = PageGetItemId(page, fromoff);
		ItemId		tolp PG_USED_FOR_ASSERTS_ONLY;

#ifdef USE_ASSERT_CHECKING

		/*
		 * Any existing item that we set as an LP_REDIRECT (any 'from' item)
		 * must be the first item from a HOT chain. If the item has tuple
		 * storage then it can't be a heap-only tuple. Otherwise we are just
		 * maintaining an existing LP_REDIRECT from an existing HOT chain that
		 * has been pruned at least once before now.
		 */
		if (!ItemIdIsRedirected(fromlp))
		{
			Assert(ItemIdHasStorage(fromlp) && ItemIdIsNormal(fromlp));

			htup = (HeapTupleHeader) PageGetItem(page, fromlp);
			Assert(!HeapTupleHeaderIsHeapOnly(htup));
		}
		else
		{
			/* We shouldn't need to redundantly set the redirect */
			Assert(ItemIdGetRedirect(fromlp) != tooff);
		}

		/*
		 * The item that we're about to set as an LP_REDIRECT (the 'from'
		 * item) will point to an existing item (the 'to' item) that is
		 * already a heap-only tuple. There can be at most one LP_REDIRECT
		 * item per HOT chain.
		 *
		 * We need to keep around an LP_REDIRECT item (after original
		 * non-heap-only root tuple gets pruned away) so that it's always
		 * possible for VACUUM to easily figure out what TID to delete from
		 * indexes when an entire HOT chain becomes dead. A heap-only tuple
		 * can never become LP_DEAD; an LP_REDIRECT item or a regular heap
		 * tuple can.
		 *
		 * This check may miss problems, e.g. the target of a redirect could
		 * be marked as unused subsequently. The page_verify_redirects() check
		 * below will catch such problems.
		 */
		tolp = PageGetItemId(page, tooff);
		Assert(ItemIdHasStorage(tolp) && ItemIdIsNormal(tolp));
		htup = (HeapTupleHeader) PageGetItem(page, tolp);
		Assert(HeapTupleHeaderIsHeapOnly(htup));
#endif

		ItemIdSetRedirect(fromlp, tooff);
	}

	/* Update all now-dead line pointers */
	offnum = nowdead;
	for (int i = 0; i < ndead; i++)
	{
		OffsetNumber off = *offnum++;
		ItemId		lp = PageGetItemId(page, off);

#ifdef USE_ASSERT_CHECKING

		/*
		 * An LP_DEAD line pointer must be left behind when the original item
		 * (which is dead to everybody) could still be referenced by a TID in
		 * an index. This should never be necessary with any individual
		 * heap-only tuple item, though. (It's not clear how much of a problem
		 * that would be, but there is no reason to allow it.)
		 */
		if (ItemIdHasStorage(lp))
		{
			Assert(ItemIdIsNormal(lp));
			htup = (HeapTupleHeader) PageGetItem(page, lp);
			Assert(!HeapTupleHeaderIsHeapOnly(htup));
		}
		else
		{
			/* Whole HOT chain becomes dead */
			Assert(ItemIdIsRedirected(lp));
		}
#endif

		ItemIdSetDead(lp);
	}

	/* Update all now-unused line pointers */
	offnum = nowunused;
	for (int i = 0; i < nunused; i++)
	{
		OffsetNumber off = *offnum++;
		ItemId		lp = PageGetItemId(page, off);

#ifdef USE_ASSERT_CHECKING

		if (lp_truncate_only)
		{
			/* Setting LP_DEAD to LP_UNUSED in vacuum's second pass */
			Assert(ItemIdIsDead(lp) && !ItemIdHasStorage(lp));
		}
		else
		{
			/*
			 * When heap_page_prune_and_freeze() was called, mark_unused_now
			 * may have been passed as true, which allows would-be LP_DEAD
			 * items to be made LP_UNUSED instead. This is only possible if
			 * the relation has no indexes. If there are any dead items, then
			 * mark_unused_now was not true and every item being marked
			 * LP_UNUSED must refer to a heap-only tuple.
			 */
			if (ndead > 0)
			{
				Assert(ItemIdHasStorage(lp) && ItemIdIsNormal(lp));
				htup = (HeapTupleHeader) PageGetItem(page, lp);
				Assert(HeapTupleHeaderIsHeapOnly(htup));
			}
			else
				Assert(ItemIdIsUsed(lp));
		}

#endif

		ItemIdSetUnused(lp);
	}

	if (lp_truncate_only)
		PageTruncateLinePointerArray(page);
	else
	{
		/*
		 * Finally, repair any fragmentation, and update the page's hint bit
		 * about whether it has free pointers.
		 */
		PageRepairFragmentation(page);

		/*
		 * Now that the page has been modified, assert that redirect items
		 * still point to valid targets.
		 */
		page_verify_redirects(page);
	}
}
2193 :
2194 :
2195 : /*
2196 : * If built with assertions, verify that all LP_REDIRECT items point to a
2197 : * valid item.
2198 : *
2199 : * One way that bugs related to HOT pruning show is redirect items pointing to
2200 : * removed tuples. It's not trivial to reliably check that marking an item
2201 : * unused will not orphan a redirect item during heap_prune_chain() /
2202 : * heap_page_prune_execute(), so we additionally check the whole page after
2203 : * pruning. Without this check such bugs would typically only cause asserts
2204 : * later, potentially well after the corruption has been introduced.
2205 : *
2206 : * Also check comments in heap_page_prune_execute()'s redirection loop.
2207 : */
2208 : static void
2209 66153 : page_verify_redirects(Page page)
2210 : {
2211 : #ifdef USE_ASSERT_CHECKING
2212 : OffsetNumber offnum;
2213 : OffsetNumber maxoff;
2214 :
2215 : maxoff = PageGetMaxOffsetNumber(page);
2216 : for (offnum = FirstOffsetNumber;
2217 : offnum <= maxoff;
2218 : offnum = OffsetNumberNext(offnum))
2219 : {
2220 : ItemId itemid = PageGetItemId(page, offnum);
2221 : OffsetNumber targoff;
2222 : ItemId targitem;
2223 : HeapTupleHeader htup;
2224 :
2225 : if (!ItemIdIsRedirected(itemid))
2226 : continue;
2227 :
2228 : targoff = ItemIdGetRedirect(itemid);
2229 : targitem = PageGetItemId(page, targoff);
2230 :
2231 : Assert(ItemIdIsUsed(targitem));
2232 : Assert(ItemIdIsNormal(targitem));
2233 : Assert(ItemIdHasStorage(targitem));
2234 : htup = (HeapTupleHeader) PageGetItem(page, targitem);
2235 : Assert(HeapTupleHeaderIsHeapOnly(htup));
2236 : }
2237 : #endif
2238 66153 : }
2239 :
2240 :
2241 : /*
2242 : * For all items in this page, find their respective root line pointers.
2243 : * If item k is part of a HOT-chain with root at item j, then we set
2244 : * root_offsets[k - 1] = j.
2245 : *
2246 : * The passed-in root_offsets array must have MaxHeapTuplesPerPage entries.
2247 : * Unused entries are filled with InvalidOffsetNumber (zero).
2248 : *
2249 : * The function must be called with at least share lock on the buffer, to
2250 : * prevent concurrent prune operations.
2251 : *
2252 : * Note: The information collected here is valid only as long as the caller
2253 : * holds a pin on the buffer. Once pin is released, a tuple might be pruned
2254 : * and reused by a completely unrelated tuple.
2255 : */
2256 : void
2257 133140 : heap_get_root_tuples(Page page, OffsetNumber *root_offsets)
2258 : {
2259 : OffsetNumber offnum,
2260 : maxoff;
2261 :
2262 133140 : MemSet(root_offsets, InvalidOffsetNumber,
2263 : MaxHeapTuplesPerPage * sizeof(OffsetNumber));
2264 :
2265 133140 : maxoff = PageGetMaxOffsetNumber(page);
2266 10782711 : for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum))
2267 : {
2268 10649571 : ItemId lp = PageGetItemId(page, offnum);
2269 : HeapTupleHeader htup;
2270 : OffsetNumber nextoffnum;
2271 : TransactionId priorXmax;
2272 :
2273 : /* skip unused and dead items */
2274 10649571 : if (!ItemIdIsUsed(lp) || ItemIdIsDead(lp))
2275 9467 : continue;
2276 :
2277 10640104 : if (ItemIdIsNormal(lp))
2278 : {
2279 10636286 : htup = (HeapTupleHeader) PageGetItem(page, lp);
2280 :
2281 : /*
2282 : * Check if this tuple is part of a HOT-chain rooted at some other
2283 : * tuple. If so, skip it for now; we'll process it when we find
2284 : * its root.
2285 : */
2286 10636286 : if (HeapTupleHeaderIsHeapOnly(htup))
2287 4158 : continue;
2288 :
2289 : /*
2290 : * This is either a plain tuple or the root of a HOT-chain.
2291 : * Remember it in the mapping.
2292 : */
2293 10632128 : root_offsets[offnum - 1] = offnum;
2294 :
2295 : /* If it's not the start of a HOT-chain, we're done with it */
2296 10632128 : if (!HeapTupleHeaderIsHotUpdated(htup))
2297 10631864 : continue;
2298 :
2299 : /* Set up to scan the HOT-chain */
2300 264 : nextoffnum = ItemPointerGetOffsetNumber(&htup->t_ctid);
2301 264 : priorXmax = HeapTupleHeaderGetUpdateXid(htup);
2302 : }
2303 : else
2304 : {
2305 : /* Must be a redirect item. We do not set its root_offsets entry */
2306 : Assert(ItemIdIsRedirected(lp));
2307 : /* Set up to scan the HOT-chain */
2308 3818 : nextoffnum = ItemIdGetRedirect(lp);
2309 3818 : priorXmax = InvalidTransactionId;
2310 : }
2311 :
2312 : /*
2313 : * Now follow the HOT-chain and collect other tuples in the chain.
2314 : *
2315 : * Note: Even though this is a nested loop, the complexity of the
2316 : * function is O(N) because a tuple in the page should be visited not
2317 : * more than twice, once in the outer loop and once in HOT-chain
2318 : * chases.
2319 : */
2320 : for (;;)
2321 : {
2322 : /* Sanity check (pure paranoia) */
2323 4154 : if (offnum < FirstOffsetNumber)
2324 0 : break;
2325 :
2326 : /*
2327 : * An offset past the end of page's line pointer array is possible
2328 : * when the array was truncated
2329 : */
2330 4154 : if (offnum > maxoff)
2331 0 : break;
2332 :
2333 4154 : lp = PageGetItemId(page, nextoffnum);
2334 :
2335 : /* Check for broken chains */
2336 4154 : if (!ItemIdIsNormal(lp))
2337 0 : break;
2338 :
2339 4154 : htup = (HeapTupleHeader) PageGetItem(page, lp);
2340 :
2341 4490 : if (TransactionIdIsValid(priorXmax) &&
2342 336 : !TransactionIdEquals(priorXmax, HeapTupleHeaderGetXmin(htup)))
2343 0 : break;
2344 :
2345 : /* Remember the root line pointer for this item */
2346 4154 : root_offsets[nextoffnum - 1] = offnum;
2347 :
2348 : /* Advance to next chain member, if any */
2349 4154 : if (!HeapTupleHeaderIsHotUpdated(htup))
2350 4082 : break;
2351 :
2352 : /* HOT implies it can't have moved to different partition */
2353 : Assert(!HeapTupleHeaderIndicatesMovedPartitions(htup));
2354 :
2355 72 : nextoffnum = ItemPointerGetOffsetNumber(&htup->t_ctid);
2356 72 : priorXmax = HeapTupleHeaderGetUpdateXid(htup);
2357 : }
2358 : }
2359 133140 : }
2360 :
2361 :
2362 : /*
2363 : * Compare fields that describe actions required to freeze tuple with caller's
2364 : * open plan. If everything matches then the frz tuple plan is equivalent to
2365 : * caller's plan.
2366 : */
2367 : static inline bool
2368 1064066 : heap_log_freeze_eq(xlhp_freeze_plan *plan, HeapTupleFreeze *frz)
2369 : {
2370 1064066 : if (plan->xmax == frz->xmax &&
2371 1062770 : plan->t_infomask2 == frz->t_infomask2 &&
2372 1062026 : plan->t_infomask == frz->t_infomask &&
2373 1059235 : plan->frzflags == frz->frzflags)
2374 1059235 : return true;
2375 :
2376 : /* Caller must call heap_log_freeze_new_plan again for frz */
2377 4831 : return false;
2378 : }
2379 :
2380 : /*
2381 : * Comparator used to deduplicate the freeze plans used in WAL records.
2382 : */
2383 : static int
2384 1461280 : heap_log_freeze_cmp(const void *arg1, const void *arg2)
2385 : {
2386 1461280 : const HeapTupleFreeze *frz1 = arg1;
2387 1461280 : const HeapTupleFreeze *frz2 = arg2;
2388 :
2389 1461280 : if (frz1->xmax < frz2->xmax)
2390 13152 : return -1;
2391 1448128 : else if (frz1->xmax > frz2->xmax)
2392 14762 : return 1;
2393 :
2394 1433366 : if (frz1->t_infomask2 < frz2->t_infomask2)
2395 5238 : return -1;
2396 1428128 : else if (frz1->t_infomask2 > frz2->t_infomask2)
2397 4369 : return 1;
2398 :
2399 1423759 : if (frz1->t_infomask < frz2->t_infomask)
2400 13194 : return -1;
2401 1410565 : else if (frz1->t_infomask > frz2->t_infomask)
2402 15790 : return 1;
2403 :
2404 1394775 : if (frz1->frzflags < frz2->frzflags)
2405 0 : return -1;
2406 1394775 : else if (frz1->frzflags > frz2->frzflags)
2407 0 : return 1;
2408 :
2409 : /*
2410 : * heap_log_freeze_eq would consider these tuple-wise plans to be equal.
2411 : * (So the tuples will share a single canonical freeze plan.)
2412 : *
2413 : * We tiebreak on page offset number to keep each freeze plan's page
2414 : * offset number array individually sorted. (Unnecessary, but be tidy.)
2415 : */
2416 1394775 : if (frz1->offset < frz2->offset)
2417 1183915 : return -1;
2418 210860 : else if (frz1->offset > frz2->offset)
2419 210860 : return 1;
2420 :
2421 : Assert(false);
2422 0 : return 0;
2423 : }
2424 :
2425 : /*
2426 : * Start new plan initialized using tuple-level actions. At least one tuple
2427 : * will have steps required to freeze described by caller's plan during REDO.
2428 : */
2429 : static inline void
2430 28926 : heap_log_freeze_new_plan(xlhp_freeze_plan *plan, HeapTupleFreeze *frz)
2431 : {
2432 28926 : plan->xmax = frz->xmax;
2433 28926 : plan->t_infomask2 = frz->t_infomask2;
2434 28926 : plan->t_infomask = frz->t_infomask;
2435 28926 : plan->frzflags = frz->frzflags;
2436 28926 : plan->ntuples = 1; /* for now */
2437 28926 : }
2438 :
2439 : /*
2440 : * Deduplicate tuple-based freeze plans so that each distinct set of
2441 : * processing steps is only stored once in the WAL record.
2442 : * Called during original execution of freezing (for logged relations).
2443 : *
2444 : * Return value is number of plans set in *plans_out for caller. Also writes
2445 : * an array of offset numbers into *offsets_out output argument for caller
2446 : * (actually there is one array per freeze plan, but that's not of immediate
2447 : * concern to our caller).
2448 : */
2449 : static int
2450 24095 : heap_log_freeze_plan(HeapTupleFreeze *tuples, int ntuples,
2451 : xlhp_freeze_plan *plans_out,
2452 : OffsetNumber *offsets_out)
2453 : {
2454 24095 : int nplans = 0;
2455 :
2456 : /* Sort tuple-based freeze plans in the order required to deduplicate */
2457 24095 : qsort(tuples, ntuples, sizeof(HeapTupleFreeze), heap_log_freeze_cmp);
2458 :
     :  /*
     :   * Walk the sorted array once.  plans_out is advanced in place, so it
     :   * always points at the most recently started ("open") canonical plan;
     :   * equal neighbors (per heap_log_freeze_eq) are folded into it.
     :   */
2459 1112256 : for (int i = 0; i < ntuples; i++)
2460 : {
2461 1088161 : HeapTupleFreeze *frz = tuples + i;
2462 :
2463 1088161 : if (i == 0)
2464 : {
2465 : /* New canonical freeze plan starting with first tup */
2466 24095 : heap_log_freeze_new_plan(plans_out, frz);
2467 24095 : nplans++;
2468 : }
2469 1064066 : else if (heap_log_freeze_eq(plans_out, frz))
2470 : {
2471 : /* tup matches open canonical plan -- include tup in it */
     :  /* sort order guarantees ascending offsets within a plan */
2472 : Assert(offsets_out[i - 1] < frz->offset);
2473 1059235 : plans_out->ntuples++;
2474 : }
2475 : else
2476 : {
2477 : /* Tup doesn't match current plan -- done with it now */
2478 4831 : plans_out++;
2479 :
2480 : /* New canonical freeze plan starting with this tup */
2481 4831 : heap_log_freeze_new_plan(plans_out, frz);
2482 4831 : nplans++;
2483 : }
2484 :
2485 : /*
2486 : * Save page offset number in dedicated buffer in passing.
2487 : *
2488 : * REDO routine relies on the record's offset numbers array grouping
2489 : * offset numbers by freeze plan. The sort order within each grouping
2490 : * is ascending offset number order, just to keep things tidy.
2491 : */
2492 1088161 : offsets_out[i] = frz->offset;
2493 : }
2494 :
     :  /* ntuples > 0 is an implicit precondition here (caller checks) */
2495 : Assert(nplans > 0 && nplans <= ntuples);
2496 :
2497 24095 : return nplans;
2498 : }
2499 :
2500 : /*
2501 : * Write an XLOG_HEAP2_PRUNE* WAL record
2502 : *
2503 : * This is used for several different page maintenance operations:
2504 : *
2505 : * - Page pruning, in VACUUM's 1st pass or on access: Some items are
2506 : * redirected, some marked dead, and some removed altogether.
2507 : *
2508 : * - Freezing: Items are marked as 'frozen'.
2509 : *
2510 : * - Vacuum, 2nd pass: Items that are already LP_DEAD are marked as unused.
2511 : *
2512 : * They have enough commonalities that we use a single WAL record for them
2513 : * all.
2514 : *
2515 : * If replaying the record requires a cleanup lock, pass cleanup_lock = true.
2516 : * Replaying 'redirected' or 'dead' items always requires a cleanup lock, but
2517 : * replaying 'unused' items depends on whether they were all previously marked
2518 : * as dead.
2519 : *
2520 : * If the VM is being updated, vmflags will contain the bits to set. In this
2521 : * case, vmbuffer should already have been updated and marked dirty and should
2522 : * still be pinned and locked.
2523 : *
2524 : * Note: This function scribbles on the 'frozen' array.
2525 : *
2526 : * Note: This is called in a critical section, so careful what you do here.
2527 : */
2528 : void
2529 115604 : log_heap_prune_and_freeze(Relation relation, Buffer buffer,
2530 : Buffer vmbuffer, uint8 vmflags,
2531 : TransactionId conflict_xid,
2532 : bool cleanup_lock,
2533 : PruneReason reason,
2534 : HeapTupleFreeze *frozen, int nfrozen,
2535 : OffsetNumber *redirected, int nredirected,
2536 : OffsetNumber *dead, int ndead,
2537 : OffsetNumber *unused, int nunused)
2538 : {
2539 : xl_heap_prune xlrec;
2540 : XLogRecPtr recptr;
2541 : uint8 info;
2542 : uint8 regbuf_flags_heap;
2543 :
2544 115604 : Page heap_page = BufferGetPage(buffer);
2545 :
2546 : /* The following local variables hold data registered in the WAL record: */
2547 : xlhp_freeze_plan plans[MaxHeapTuplesPerPage];
2548 : xlhp_freeze_plans freeze_plans;
2549 : xlhp_prune_items redirect_items;
2550 : xlhp_prune_items dead_items;
2551 : xlhp_prune_items unused_items;
2552 : OffsetNumber frz_offsets[MaxHeapTuplesPerPage];
2553 115604 : bool do_prune = nredirected > 0 || ndead > 0 || nunused > 0;
2554 115604 : bool do_set_vm = vmflags & VISIBILITYMAP_VALID_BITS;
2555 115604 : bool heap_fpi_allowed = true;
2556 :
     :  /* caller must not pass bits outside the defined VM flag set */
2557 : Assert((vmflags & VISIBILITYMAP_VALID_BITS) == vmflags);
2558 :
2559 115604 : xlrec.flags = 0;
2560 115604 : regbuf_flags_heap = REGBUF_STANDARD;
2561 :
2562 : /*
2563 : * We can avoid an FPI of the heap page if the only modification we are
2564 : * making to it is to set PD_ALL_VISIBLE and checksums/wal_log_hints are
2565 : * disabled.
2566 : *
2567 : * However, if the page has never been WAL-logged (LSN is invalid), we
2568 : * must force an FPI regardless. This can happen when another backend
2569 : * extends the heap, initializes the page, and then fails before WAL-
2570 : * logging it. Since heap extension is not WAL-logged, recovery might try
2571 : * to replay our record and find that the page isn't initialized, which
2572 : * would cause a PANIC.
2573 : */
2574 115604 : if (!XLogRecPtrIsValid(PageGetLSN(heap_page)))
2575 0 : regbuf_flags_heap |= REGBUF_FORCE_IMAGE;
2576 115604 : else if (!do_prune && nfrozen == 0 && (!do_set_vm || !XLogHintBitIsNeeded()))
2577 : {
2578 1128 : regbuf_flags_heap |= REGBUF_NO_IMAGE;
2579 1128 : heap_fpi_allowed = false;
2580 : }
2581 :
2582 : /*
2583 : * Prepare data for the buffer. The arrays are not actually in the
2584 : * buffer, but we pretend that they are. When XLogInsert stores a full
2585 : * page image, the arrays can be omitted.
2586 : */
2587 115604 : XLogBeginInsert();
2588 115604 : XLogRegisterBuffer(0, buffer, regbuf_flags_heap);
2589 :
     :  /* block 1 is the VM page; per header comment it is already dirty */
2590 115604 : if (do_set_vm)
2591 56647 : XLogRegisterBuffer(1, vmbuffer, 0);
2592 :
2593 115604 : if (nfrozen > 0)
2594 : {
2595 : int nplans;
2596 :
2597 24095 : xlrec.flags |= XLHP_HAS_FREEZE_PLANS;
2598 :
2599 : /*
2600 : * Prepare deduplicated representation for use in the WAL record. This
2601 : * destructively sorts frozen tuples array in-place.
2602 : */
2603 24095 : nplans = heap_log_freeze_plan(frozen, nfrozen, plans, frz_offsets);
2604 :
2605 24095 : freeze_plans.nplans = nplans;
2606 24095 : XLogRegisterBufData(0, &freeze_plans,
2607 : offsetof(xlhp_freeze_plans, plans));
2608 24095 : XLogRegisterBufData(0, plans,
2609 : sizeof(xlhp_freeze_plan) * nplans);
2610 : }
2611 115604 : if (nredirected > 0)
2612 : {
2613 17146 : xlrec.flags |= XLHP_HAS_REDIRECTIONS;
2614 :
2615 17146 : redirect_items.ntargets = nredirected;
2616 17146 : XLogRegisterBufData(0, &redirect_items,
2617 : offsetof(xlhp_prune_items, data));
     :  /* each redirection is a (from, to) offset pair, hence [2] */
2618 17146 : XLogRegisterBufData(0, redirected,
2619 : sizeof(OffsetNumber[2]) * nredirected);
2620 : }
2621 115604 : if (ndead > 0)
2622 : {
2623 44524 : xlrec.flags |= XLHP_HAS_DEAD_ITEMS;
2624 :
2625 44524 : dead_items.ntargets = ndead;
2626 44524 : XLogRegisterBufData(0, &dead_items,
2627 : offsetof(xlhp_prune_items, data));
2628 44524 : XLogRegisterBufData(0, dead,
2629 : sizeof(OffsetNumber) * ndead);
2630 : }
2631 115604 : if (nunused > 0)
2632 : {
2633 27638 : xlrec.flags |= XLHP_HAS_NOW_UNUSED_ITEMS;
2634 :
2635 27638 : unused_items.ntargets = nunused;
2636 27638 : XLogRegisterBufData(0, &unused_items,
2637 : offsetof(xlhp_prune_items, data));
2638 27638 : XLogRegisterBufData(0, unused,
2639 : sizeof(OffsetNumber) * nunused);
2640 : }
     :  /*
     :   * The freeze-plan offsets array goes last in the registered buffer
     :   * data, after all plan/item arrays; its entries are grouped by plan
     :   * (see heap_log_freeze_plan).
     :   */
2641 115604 : if (nfrozen > 0)
2642 24095 : XLogRegisterBufData(0, frz_offsets,
2643 : sizeof(OffsetNumber) * nfrozen);
2644 :
2645 : /*
2646 : * Prepare the main xl_heap_prune record. We already set the XLHP_HAS_*
2647 : * flag above.
2648 : */
2649 115604 : if (vmflags & VISIBILITYMAP_ALL_VISIBLE)
2650 : {
2651 56647 : xlrec.flags |= XLHP_VM_ALL_VISIBLE;
2652 56647 : if (vmflags & VISIBILITYMAP_ALL_FROZEN)
2653 40393 : xlrec.flags |= XLHP_VM_ALL_FROZEN;
2654 : }
2655 115604 : if (RelationIsAccessibleInLogicalDecoding(relation))
2656 599 : xlrec.flags |= XLHP_IS_CATALOG_REL;
2657 115604 : if (TransactionIdIsValid(conflict_xid))
2658 89983 : xlrec.flags |= XLHP_HAS_CONFLICT_HORIZON;
2659 115604 : if (cleanup_lock)
2660 101678 : xlrec.flags |= XLHP_CLEANUP_LOCK;
2661 : else
2662 : {
2663 : Assert(nredirected == 0 && ndead == 0);
2664 : /* also, any items in 'unused' must've been LP_DEAD previously */
2665 : }
2666 115604 : XLogRegisterData(&xlrec, SizeOfHeapPrune);
     :  /* conflict horizon rides along after the fixed-size main record */
2667 115604 : if (TransactionIdIsValid(conflict_xid))
2668 89983 : XLogRegisterData(&conflict_xid, sizeof(TransactionId));
2669 :
     :  /* map the prune reason to the corresponding WAL record type */
2670 115604 : switch (reason)
2671 : {
2672 44718 : case PRUNE_ON_ACCESS:
2673 44718 : info = XLOG_HEAP2_PRUNE_ON_ACCESS;
2674 44718 : break;
2675 56960 : case PRUNE_VACUUM_SCAN:
2676 56960 : info = XLOG_HEAP2_PRUNE_VACUUM_SCAN;
2677 56960 : break;
2678 13926 : case PRUNE_VACUUM_CLEANUP:
2679 13926 : info = XLOG_HEAP2_PRUNE_VACUUM_CLEANUP;
2680 13926 : break;
2681 0 : default:
2682 0 : elog(ERROR, "unrecognized prune reason: %d", (int) reason);
2683 : break;
2684 : }
2685 115604 : recptr = XLogInsert(RM_HEAP2_ID, info);
2686 :
2687 115604 : if (do_set_vm)
2688 : {
2689 : Assert(BufferIsDirty(vmbuffer));
2690 56647 : PageSetLSN(BufferGetPage(vmbuffer), recptr);
2691 : }
2692 :
2693 : /*
2694 : * If we explicitly skip an FPI, we must not stamp the heap page with this
2695 : * record's LSN. Recovery skips records <= the stamped LSN, so this could
2696 : * lead to skipping an earlier FPI needed to repair a torn page.
2697 : */
2698 115604 : if (heap_fpi_allowed)
2699 : {
2700 : Assert(BufferIsDirty(buffer));
2701 114476 : PageSetLSN(heap_page, recptr);
2702 : }
2703 115604 : }
|