Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * pruneheap.c
4 : * heap page pruning and HOT-chain management code
5 : *
6 : * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : *
10 : * IDENTIFICATION
11 : * src/backend/access/heap/pruneheap.c
12 : *
13 : *-------------------------------------------------------------------------
14 : */
15 : #include "postgres.h"
16 :
17 : #include "access/heapam.h"
18 : #include "access/heapam_xlog.h"
19 : #include "access/htup_details.h"
20 : #include "access/multixact.h"
21 : #include "access/transam.h"
22 : #include "access/visibilitymap.h"
23 : #include "access/xlog.h"
24 : #include "access/xloginsert.h"
25 : #include "commands/vacuum.h"
26 : #include "executor/instrument.h"
27 : #include "miscadmin.h"
28 : #include "pgstat.h"
29 : #include "storage/bufmgr.h"
30 : #include "utils/rel.h"
31 : #include "utils/snapmgr.h"
32 :
33 : /* Working data for heap_page_prune_and_freeze() and subroutines */
34 : typedef struct
35 : {
36 : /*-------------------------------------------------------
37 : * Arguments passed to heap_page_prune_and_freeze()
38 : *-------------------------------------------------------
39 : */
40 :
41 : /* tuple visibility test, initialized for the relation */
42 : GlobalVisState *vistest;
43 : /* whether or not dead items can be set LP_UNUSED during pruning */
44 : bool mark_unused_now;
45 : /* whether to attempt freezing tuples */
46 : bool attempt_freeze;
47 : /* whether to attempt setting the VM */
48 : bool attempt_set_vm;
 : /* freeze cutoffs; NULL unless the caller attempts freezing (see the
 : * Assert in prune_freeze_setup()) */
49 : struct VacuumCutoffs *cutoffs;
50 : Relation relation;
51 :
52 : /*
53 : * Keep the buffer, block, and page handy so that helpers needing to
54 : * access them don't need to make repeated calls to BufferGetBlockNumber()
55 : * and BufferGetPage().
56 : */
57 : BlockNumber block;
58 : Buffer buffer;
59 : Page page;
60 :
61 : /*-------------------------------------------------------
62 : * Fields describing what to do to the page
63 : *-------------------------------------------------------
64 : */
65 : TransactionId new_prune_xid; /* new prune hint value */
66 : TransactionId latest_xid_removed;
67 : int nredirected; /* numbers of entries in arrays below */
68 : int ndead;
69 : int nunused;
70 : int nfrozen;
71 : /* arrays that accumulate indexes of items to be changed */
 : /* redirected needs twice the slots: each redirect is recorded as an
 : * (item, destination) offset pair — see heap_prune_record_redirect() */
72 : OffsetNumber redirected[MaxHeapTuplesPerPage * 2];
73 : OffsetNumber nowdead[MaxHeapTuplesPerPage];
74 : OffsetNumber nowunused[MaxHeapTuplesPerPage];
75 : HeapTupleFreeze frozen[MaxHeapTuplesPerPage];
76 :
77 : /*
78 : * set_all_visible and set_all_frozen indicate if the all-visible and
79 : * all-frozen bits in the visibility map can be set for this page after
80 : * pruning. They are only tracked when the caller requests VM updates
81 : * (attempt_set_vm); otherwise they remain false throughout.
82 : *
83 : * NOTE: set_all_visible and set_all_frozen initially don't include
84 : * LP_DEAD items. That's convenient for heap_page_prune_and_freeze() to
85 : * use them to decide whether to opportunistically freeze the page or not.
86 : * The set_all_visible and set_all_frozen values ultimately used to set
87 : * the VM are adjusted to include LP_DEAD items after we determine whether
88 : * or not to opportunistically freeze.
89 : */
90 : bool set_all_visible;
91 : bool set_all_frozen;
92 :
93 : /*-------------------------------------------------------
94 : * Working state for HOT chain processing
95 : *-------------------------------------------------------
96 : */
97 :
98 : /*
99 : * 'root_items' contains offsets of all LP_REDIRECT line pointers and
100 : * normal non-HOT tuples. They can be stand-alone items or the first item
101 : * in a HOT chain. 'heaponly_items' contains heap-only tuples which can
102 : * only be removed as part of a HOT chain.
103 : */
104 : int nroot_items;
105 : OffsetNumber root_items[MaxHeapTuplesPerPage];
106 : int nheaponly_items;
107 : OffsetNumber heaponly_items[MaxHeapTuplesPerPage];
108 :
109 : /*
110 : * processed[offnum] is true if item at offnum has been processed.
111 : *
112 : * This needs to be MaxHeapTuplesPerPage + 1 long as FirstOffsetNumber is
113 : * 1. Otherwise every access would need to subtract 1.
114 : */
115 : bool processed[MaxHeapTuplesPerPage + 1];
116 :
117 : /*
118 : * Tuple visibility is only computed once for each tuple, for correctness
119 : * and efficiency reasons; see comment in heap_page_prune_and_freeze() for
120 : * details. This is of type int8[], instead of HTSV_Result[], so we can
121 : * use -1 to indicate no visibility has been computed, e.g. for LP_DEAD
122 : * items.
123 : *
124 : * This needs to be MaxHeapTuplesPerPage + 1 long as FirstOffsetNumber is
125 : * 1. Otherwise every access would need to subtract 1.
126 : */
127 : int8 htsv[MaxHeapTuplesPerPage + 1];
128 :
129 : /*-------------------------------------------------------
130 : * Working state for freezing
131 : *-------------------------------------------------------
132 : */
133 : HeapPageFreeze pagefrz;
134 :
135 : /*-------------------------------------------------------
136 : * Working state for visibility map processing
137 : *-------------------------------------------------------
138 : */
139 :
140 : /*
141 : * Caller must provide a pinned vmbuffer corresponding to the heap block
142 : * passed to heap_page_prune_and_freeze(). We will fix any corruption
143 : * found in the VM and set the VM if the page is all-visible/all-frozen.
144 : */
145 : Buffer vmbuffer;
146 :
147 : /*
148 : * The state of the VM bits at the beginning of pruning and the state they
149 : * will be in at the end.
150 : */
151 : uint8 old_vmbits;
152 : uint8 new_vmbits;
153 :
154 : /* The newest xmin of live tuples on the page */
155 : TransactionId newest_live_xid;
156 :
157 : /*-------------------------------------------------------
158 : * Information about what was done
159 : *
160 : * These fields are not used by pruning itself for the most part, but are
161 : * used to collect information about what was pruned and what state the
162 : * page is in after pruning, for the benefit of the caller. They are
163 : * copied to the caller's PruneFreezeResult at the end.
164 : *-------------------------------------------------------
165 : */
166 :
167 : int ndeleted; /* Number of tuples deleted from the page */
168 :
169 : /* Number of live and recently dead tuples, after pruning */
170 : int live_tuples;
171 : int recently_dead_tuples;
172 :
173 : /* Whether or not the page makes rel truncation unsafe */
174 : bool hastup;
175 :
176 : /*
177 : * LP_DEAD items on the page after pruning. Includes existing LP_DEAD
178 : * items
179 : */
180 : int lpdead_items; /* number of items in the array */
181 : OffsetNumber *deadoffsets; /* points directly to presult->deadoffsets */
182 : } PruneState;
183 :
184 : /*
185 : * Type of visibility map corruption detected on a heap page and its
186 : * associated VM page. Passed to heap_page_fix_vm_corruption() so the caller
187 : * can specify what it found rather than having the function rederive the
188 : * corruption from page state.
189 : */
190 : typedef enum VMCorruptionType
191 : {
 : /* All cases below are repaired by heap_page_fix_vm_corruption(), which
 : * clears the VM bits (and PD_ALL_VISIBLE where it is incorrectly set). */
192 : /* VM bits are set but the heap page-level PD_ALL_VISIBLE flag is not */
193 : VM_CORRUPT_MISSING_PAGE_HINT,
194 : /* LP_DEAD line pointers found on a page marked all-visible */
195 : VM_CORRUPT_LPDEAD,
196 : /* Tuple not visible to all transactions on a page marked all-visible */
197 : VM_CORRUPT_TUPLE_VISIBILITY,
198 : } VMCorruptionType;
199 :
200 : /* Local functions */
201 : static void prune_freeze_setup(PruneFreezeParams *params,
202 : TransactionId *new_relfrozen_xid,
203 : MultiXactId *new_relmin_mxid,
204 : PruneFreezeResult *presult,
205 : PruneState *prstate);
206 : static void heap_page_fix_vm_corruption(PruneState *prstate,
207 : OffsetNumber offnum,
208 : VMCorruptionType ctype);
209 : static void prune_freeze_fast_path(PruneState *prstate,
210 : PruneFreezeResult *presult);
211 : static void prune_freeze_plan(PruneState *prstate,
212 : OffsetNumber *off_loc);
213 : static HTSV_Result heap_prune_satisfies_vacuum(PruneState *prstate,
214 : HeapTuple tup);
215 : static inline HTSV_Result htsv_get_valid_status(int status);
216 : static void heap_prune_chain(OffsetNumber maxoff,
217 : OffsetNumber rootoffnum, PruneState *prstate);
218 : static void heap_prune_record_prunable(PruneState *prstate, TransactionId xid,
219 : OffsetNumber offnum);
220 : static void heap_prune_record_redirect(PruneState *prstate,
221 : OffsetNumber offnum, OffsetNumber rdoffnum,
222 : bool was_normal);
223 : static void heap_prune_record_dead(PruneState *prstate, OffsetNumber offnum,
224 : bool was_normal);
225 : static void heap_prune_record_dead_or_unused(PruneState *prstate, OffsetNumber offnum,
226 : bool was_normal);
227 : static void heap_prune_record_unused(PruneState *prstate, OffsetNumber offnum, bool was_normal);
228 :
229 : static void heap_prune_record_unchanged_lp_unused(PruneState *prstate, OffsetNumber offnum);
230 : static void heap_prune_record_unchanged_lp_normal(PruneState *prstate, OffsetNumber offnum);
231 : static void heap_prune_record_unchanged_lp_dead(PruneState *prstate, OffsetNumber offnum);
232 : static void heap_prune_record_unchanged_lp_redirect(PruneState *prstate, OffsetNumber offnum);
233 :
234 : static void page_verify_redirects(Page page);
235 :
236 : static bool heap_page_will_freeze(bool did_tuple_hint_fpi, bool do_prune, bool do_hint_prune,
237 : PruneState *prstate);
238 : static bool heap_page_will_set_vm(PruneState *prstate, PruneReason reason,
239 : bool do_prune, bool do_freeze);
240 :
241 :
242 : /*
243 : * Optionally prune and repair fragmentation in the specified page.
244 : *
245 : * This is an opportunistic function. It will perform housekeeping
246 : * only if the page heuristically looks like a candidate for pruning and we
247 : * can acquire buffer cleanup lock without blocking.
248 : *
249 : * Note: this is called quite often. It's important that it fall out quickly
250 : * if there's not any use in pruning.
251 : *
252 : * Caller must have pin on the buffer, and must *not* have a lock on it.
253 : *
254 : * This function may pin *vmbuffer. It's passed by reference so the caller can
255 : * reuse the pin across calls, avoiding repeated pin/unpin cycles. If we find
256 : * VM corruption during pruning, we will fix it. Caller is responsible for
257 : * unpinning *vmbuffer.
258 : *
259 : * rel_read_only is true if we determined at plan time that the query does not
260 : * modify the relation. It is counterproductive to set the VM if the query
261 : * will immediately clear it.
262 : *
263 : * As noted in ScanRelIsReadOnly(), INSERT ... SELECT from the same table will
264 : * report the scan relation as read-only. This is usually harmless in
265 : * practice. It is useful to set scanned pages all-visible that won't be
266 : * inserted into. Pages it does insert to will rarely meet the criteria for
267 : * pruning, and those that do are likely to contain in-progress inserts which
268 : * make the page not fully all-visible.
269 : */
270 : void
271 21107923 : heap_page_prune_opt(Relation relation, Buffer buffer, Buffer *vmbuffer,
272 : bool rel_read_only)
273 : {
274 21107923 : Page page = BufferGetPage(buffer);
275 : TransactionId prune_xid;
276 : GlobalVisState *vistest;
277 : Size minfree;
278 :
279 : /*
280 : * We can't write WAL in recovery mode, so there's no point trying to
281 : * clean the page. The primary will likely issue a cleaning WAL record
282 : * soon anyway, so this is no particular loss.
283 : */
284 21107923 : if (RecoveryInProgress())
285 241635 : return;
286 :
287 : /*
288 : * First check whether there's any chance there's something to prune,
289 : * determining the appropriate horizon is a waste if there's no prune_xid
290 : * (i.e. no updates/deletes left potentially dead tuples around and no
291 : * inserts inserted new tuples that may be visible to all).
292 : */
293 20866288 : prune_xid = PageGetPruneXid(page);
294 20866288 : if (!TransactionIdIsValid(prune_xid))
295 9425118 : return;
296 :
297 : /*
298 : * Check whether prune_xid indicates that there may be dead rows that can
299 : * be cleaned up.
300 : */
301 11441170 : vistest = GlobalVisTestFor(relation);
302 :
303 11441170 : if (!GlobalVisTestIsRemovableXid(vistest, prune_xid, true))
304 8984204 : return;
305 :
306 : /*
307 : * We prune when a previous UPDATE failed to find enough space on the page
308 : * for a new tuple version, or when free space falls below the relation's
309 : * fill-factor target (but not less than 10%).
310 : *
311 : * Checking free space here is questionable since we aren't holding any
312 : * lock on the buffer; in the worst case we could get a bogus answer. It's
313 : * unlikely to be *seriously* wrong, though, since reading either pd_lower
314 : * or pd_upper is probably atomic. Avoiding taking a lock seems more
315 : * important than sometimes getting a wrong answer in what is after all
316 : * just a heuristic estimate.
317 : */
318 2456966 : minfree = RelationGetTargetPageFreeSpace(relation,
319 : HEAP_DEFAULT_FILLFACTOR);
320 2456966 : minfree = Max(minfree, BLCKSZ / 10);
321 :
322 2456966 : if (PageIsFull(page) || PageGetHeapFreeSpace(page) < minfree)
323 : {
324 : /* OK, try to get exclusive buffer lock */
325 107597 : if (!ConditionalLockBufferForCleanup(buffer))
326 1996 : return;
327 :
328 : /*
329 : * Now that we have buffer lock, get accurate information about the
330 : * page's free space, and recheck the heuristic about whether to
331 : * prune.
332 : */
333 105601 : if (PageIsFull(page) || PageGetHeapFreeSpace(page) < minfree)
334 : {
335 : OffsetNumber dummy_off_loc;
336 : PruneFreezeResult presult;
337 : PruneFreezeParams params;
338 :
339 105601 : visibilitymap_pin(relation, BufferGetBlockNumber(buffer),
340 : vmbuffer);
341 :
342 105601 : params.relation = relation;
343 105601 : params.buffer = buffer;
344 105601 : params.vmbuffer = *vmbuffer;
345 105601 : params.reason = PRUNE_ON_ACCESS;
346 105601 : params.vistest = vistest;
347 105601 : params.cutoffs = NULL;
348 :
349 : /*
350 : * We don't pass the HEAP_PAGE_PRUNE_MARK_UNUSED_NOW option
351 : * regardless of whether or not the relation has indexes, since we
352 : * cannot safely determine that during on-access pruning with the
353 : * current implementation.
354 : */
355 105601 : params.options = HEAP_PAGE_PRUNE_ALLOW_FAST_PATH;
 : /* Only try to set the VM when the query won't modify the relation;
 : * otherwise the bit would be cleared again immediately (see the
 : * rel_read_only notes in the function header comment). */
356 105601 : if (rel_read_only)
357 37506 : params.options |= HEAP_PAGE_PRUNE_SET_VM;
358 :
359 105601 : heap_page_prune_and_freeze(&params, &presult, &dummy_off_loc,
360 : NULL, NULL);
361 :
362 : /*
363 : * Report the number of tuples reclaimed to pgstats. This is
364 : * presult.ndeleted minus the number of newly-LP_DEAD-set items.
365 : *
366 : * We derive the number of dead tuples like this to avoid totally
367 : * forgetting about items that were set to LP_DEAD, since they
368 : * still need to be cleaned up by VACUUM. We only want to count
369 : * heap-only tuples that just became LP_UNUSED in our report,
370 : * which don't.
371 : *
372 : * VACUUM doesn't have to compensate in the same way when it
373 : * tracks ndeleted, since it will set the same LP_DEAD items to
374 : * LP_UNUSED separately.
375 : */
376 105601 : if (presult.ndeleted > presult.nnewlpdead)
377 20684 : pgstat_update_heap_dead_tuples(relation,
378 20684 : presult.ndeleted - presult.nnewlpdead);
379 : }
380 :
381 : /* And release buffer lock */
382 105601 : LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
383 :
384 : /*
385 : * We avoid reuse of any free space created on the page by unrelated
386 : * UPDATEs/INSERTs by opting to not update the FSM at this point. The
387 : * free space should be reused by UPDATEs to *this* page.
388 : */
389 : }
390 : }
391 :
392 : /*
393 : * Helper for heap_page_prune_and_freeze() to initialize the PruneState using
394 : * the provided parameters.
395 : *
396 : * params, new_relfrozen_xid, new_relmin_mxid, and presult are input
397 : * parameters and are not modified by this function. Only prstate is modified.
398 : */
399 : static void
400 507791 : prune_freeze_setup(PruneFreezeParams *params,
401 : TransactionId *new_relfrozen_xid,
402 : MultiXactId *new_relmin_mxid,
403 : PruneFreezeResult *presult,
404 : PruneState *prstate)
405 : {
406 : /* Copy parameters to prstate */
407 507791 : prstate->vistest = params->vistest;
408 507791 : prstate->mark_unused_now =
409 507791 : (params->options & HEAP_PAGE_PRUNE_MARK_UNUSED_NOW) != 0;
410 :
411 : /* cutoffs must be provided if we will attempt freezing */
412 : Assert(!(params->options & HEAP_PAGE_PRUNE_FREEZE) || params->cutoffs);
413 507791 : prstate->attempt_freeze = (params->options & HEAP_PAGE_PRUNE_FREEZE) != 0;
414 507791 : prstate->attempt_set_vm = (params->options & HEAP_PAGE_PRUNE_SET_VM) != 0;
415 507791 : prstate->cutoffs = params->cutoffs;
416 507791 : prstate->relation = params->relation;
417 507791 : prstate->block = BufferGetBlockNumber(params->buffer);
418 507791 : prstate->buffer = params->buffer;
419 507791 : prstate->page = BufferGetPage(params->buffer);
420 :
421 : Assert(BufferIsValid(params->vmbuffer));
422 507791 : prstate->vmbuffer = params->vmbuffer;
423 507791 : prstate->new_vmbits = 0;
 : /* Record the VM bits as of the start of pruning; old_vmbits vs.
 : * new_vmbits captures the before/after VM state (see PruneState). */
424 507791 : prstate->old_vmbits = visibilitymap_get_status(prstate->relation,
425 : prstate->block,
426 : &prstate->vmbuffer);
427 :
428 : /*
429 : * Our strategy is to scan the page and make lists of items to change,
430 : * then apply the changes within a critical section. This keeps as much
431 : * logic as possible out of the critical section, and also ensures that
432 : * WAL replay will work the same as the normal case.
433 : *
434 : * First, initialize the new pd_prune_xid value to zero (indicating no
435 : * prunable tuples). If we find any tuples which may soon become
436 : * prunable, we will save the lowest relevant XID in new_prune_xid. Also
437 : * initialize the rest of our working state.
438 : */
439 507791 : prstate->new_prune_xid = InvalidTransactionId;
440 507791 : prstate->latest_xid_removed = InvalidTransactionId;
441 507791 : prstate->nredirected = prstate->ndead = prstate->nunused = 0;
442 507791 : prstate->nfrozen = 0;
443 507791 : prstate->nroot_items = 0;
444 507791 : prstate->nheaponly_items = 0;
445 :
446 : /* initialize page freezing working state */
447 507791 : prstate->pagefrz.freeze_required = false;
448 507791 : prstate->pagefrz.FreezePageConflictXid = InvalidTransactionId;
449 507791 : if (prstate->attempt_freeze)
450 : {
451 : Assert(new_relfrozen_xid && new_relmin_mxid);
452 402190 : prstate->pagefrz.FreezePageRelfrozenXid = *new_relfrozen_xid;
453 402190 : prstate->pagefrz.NoFreezePageRelfrozenXid = *new_relfrozen_xid;
454 402190 : prstate->pagefrz.FreezePageRelminMxid = *new_relmin_mxid;
455 402190 : prstate->pagefrz.NoFreezePageRelminMxid = *new_relmin_mxid;
456 : }
457 : else
458 : {
459 : Assert(!new_relfrozen_xid && !new_relmin_mxid);
460 105601 : prstate->pagefrz.FreezePageRelminMxid = InvalidMultiXactId;
461 105601 : prstate->pagefrz.NoFreezePageRelminMxid = InvalidMultiXactId;
462 105601 : prstate->pagefrz.FreezePageRelfrozenXid = InvalidTransactionId;
463 105601 : prstate->pagefrz.NoFreezePageRelfrozenXid = InvalidTransactionId;
464 : }
465 :
466 507791 : prstate->ndeleted = 0;
467 507791 : prstate->live_tuples = 0;
468 507791 : prstate->recently_dead_tuples = 0;
469 507791 : prstate->hastup = false;
470 507791 : prstate->lpdead_items = 0;
471 :
472 : /*
473 : * deadoffsets are filled in during pruning but are only used to populate
474 : * PruneFreezeResult->deadoffsets. To avoid needing two copies of the
475 : * array, just save a pointer to the result offsets array in the
476 : * PruneState.
477 : */
478 507791 : prstate->deadoffsets = presult->deadoffsets;
479 :
480 : /*
481 : * We track whether the page will be all-visible/all-frozen at the end of
482 : * pruning and freezing. While examining tuple visibility, we'll set
483 : * set_all_visible to false if there are tuples on the page not visible to
484 : * all running and future transactions. If setting the VM is enabled for
485 : * this scan, we will do so if the page ends up being all-visible.
486 : *
487 : * We also keep track of the newest live XID, which is used to calculate
488 : * the snapshot conflict horizon for a WAL record setting the VM.
489 : */
490 507791 : prstate->set_all_visible = prstate->attempt_set_vm;
491 507791 : prstate->newest_live_xid = InvalidTransactionId;
492 :
493 : /*
494 : * Currently, only VACUUM performs freezing, but other callers may in the
495 : * future. We must initialize set_all_frozen based on whether or not the
496 : * caller passed HEAP_PAGE_PRUNE_FREEZE, because if they did not, we won't
497 : * call heap_prepare_freeze_tuple() for each tuple, and set_all_frozen
498 : * will never be cleared for tuples that need freezing. This would lead to
499 : * incorrectly setting the visibility map all-frozen for this page. We
500 : * can't set the page all-frozen in the VM if the caller didn't pass
501 : * HEAP_PAGE_PRUNE_SET_VM.
502 : *
503 : * When freezing is not required (no XIDs/MXIDs older than the freeze
504 : * cutoff), we may still choose to "opportunistically" freeze if doing so
505 : * would make the page all-frozen.
506 : *
507 : * We will not be able to freeze the whole page at the end of vacuum if
508 : * there are tuples present that are not visible to everyone or if there
509 : * are dead tuples which will not be removable. However, dead tuples that
510 : * will be removed by the end of vacuum should not prevent this
511 : * opportunistic freezing.
512 : *
513 : * Therefore, we do not clear set_all_visible and set_all_frozen when we
514 : * encounter LP_DEAD items. Instead, we correct them after deciding
515 : * whether to freeze, but before updating the VM, to avoid setting the VM
516 : * bits incorrectly.
517 : */
518 507791 : prstate->set_all_frozen = prstate->attempt_freeze && prstate->attempt_set_vm;
519 507791 : }
520 :
521 : /*
522 : * Helper for heap_page_prune_and_freeze(). Iterates over every tuple on the
523 : * page, examines its visibility information, and determines the appropriate
524 : * action for each tuple. All tuples are processed and classified during this
525 : * phase, but no modifications are made to the page until the later execution
526 : * stage.
527 : *
528 : * *off_loc is used for error callback and cleared before returning.
529 : */
530 : static void
531 336771 : prune_freeze_plan(PruneState *prstate, OffsetNumber *off_loc)
532 : {
533 336771 : Page page = prstate->page;
534 336771 : BlockNumber blockno = prstate->block;
535 336771 : OffsetNumber maxoff = PageGetMaxOffsetNumber(prstate->page);
536 : OffsetNumber offnum;
537 : HeapTupleData tup;
538 :
539 336771 : tup.t_tableOid = RelationGetRelid(prstate->relation);
540 :
541 : /*
542 : * Determine HTSV for all tuples, and queue them up for processing as HOT
543 : * chain roots or as heap-only items.
544 : *
545 : * Determining HTSV only once for each tuple is required for correctness,
546 : * to deal with cases where running HTSV twice could result in different
547 : * results. For example, RECENTLY_DEAD can turn to DEAD if another
548 : * checked item causes GlobalVisTestIsRemovableFullXid() to update the
549 : * horizon, or INSERT_IN_PROGRESS can change to DEAD if the inserting
550 : * transaction aborts.
551 : *
552 : * It's also good for performance. Most commonly tuples within a page are
553 : * stored at decreasing offsets (while the items are stored at increasing
554 : * offsets). When processing all tuples on a page this leads to reading
555 : * memory at decreasing offsets within a page, with a variable stride.
556 : * That's hard for CPU prefetchers to deal with. Processing the items in
557 : * reverse order (and thus the tuples in increasing order) increases
558 : * prefetching efficiency significantly / decreases the number of cache
559 : * misses.
560 : */
561 336771 : for (offnum = maxoff;
562 20616561 : offnum >= FirstOffsetNumber;
563 20279790 : offnum = OffsetNumberPrev(offnum))
564 : {
565 20279790 : ItemId itemid = PageGetItemId(page, offnum);
566 : HeapTupleHeader htup;
567 :
568 : /*
569 : * Set the offset number so that we can display it along with any
570 : * error that occurred while processing this tuple.
571 : */
572 20279790 : *off_loc = offnum;
573 :
574 20279790 : prstate->processed[offnum] = false;
575 20279790 : prstate->htsv[offnum] = -1;
576 :
577 : /* Nothing to do if slot doesn't contain a tuple */
578 20279790 : if (!ItemIdIsUsed(itemid))
579 : {
580 135740 : heap_prune_record_unchanged_lp_unused(prstate, offnum);
581 135740 : continue;
582 : }
583 :
584 20144050 : if (ItemIdIsDead(itemid))
585 : {
586 : /*
587 : * If the caller set mark_unused_now true, we can set dead line
588 : * pointers LP_UNUSED now.
589 : */
590 1555675 : if (unlikely(prstate->mark_unused_now))
591 2067 : heap_prune_record_unused(prstate, offnum, false);
592 : else
593 1553608 : heap_prune_record_unchanged_lp_dead(prstate, offnum);
594 1555675 : continue;
595 : }
596 :
597 18588375 : if (ItemIdIsRedirected(itemid))
598 : {
599 : /* This is the start of a HOT chain */
600 198029 : prstate->root_items[prstate->nroot_items++] = offnum;
601 198029 : continue;
602 : }
603 :
604 : Assert(ItemIdIsNormal(itemid));
605 :
606 : /*
607 : * Get the tuple's visibility status and queue it up for processing.
608 : */
609 18390346 : htup = (HeapTupleHeader) PageGetItem(page, itemid);
610 18390346 : tup.t_data = htup;
611 18390346 : tup.t_len = ItemIdGetLength(itemid);
612 18390346 : ItemPointerSet(&tup.t_self, blockno, offnum);
613 :
614 18390346 : prstate->htsv[offnum] = heap_prune_satisfies_vacuum(prstate, &tup);
615 :
616 18390346 : if (!HeapTupleHeaderIsHeapOnly(htup))
617 18050731 : prstate->root_items[prstate->nroot_items++] = offnum;
618 : else
619 339615 : prstate->heaponly_items[prstate->nheaponly_items++] = offnum;
620 : }
621 :
622 : /*
623 : * Process HOT chains.
624 : *
625 : * We added the items to the array starting from 'maxoff', so by
626 : * processing the array in reverse order, we process the items in
627 : * ascending offset number order. The order doesn't matter for
628 : * correctness, but some quick micro-benchmarking suggests that this is
629 : * faster. (Earlier PostgreSQL versions, which scanned all the items on
630 : * the page instead of using the root_items array, also did it in
631 : * ascending offset number order.)
632 : */
633 18585531 : for (int i = prstate->nroot_items - 1; i >= 0; i--)
634 : {
635 18248760 : offnum = prstate->root_items[i];
636 :
637 : /* Ignore items already processed as part of an earlier chain */
638 18248760 : if (prstate->processed[offnum])
639 0 : continue;
640 :
641 : /* see preceding loop */
642 18248760 : *off_loc = offnum;
643 :
644 : /* Process this item or chain of items */
645 18248760 : heap_prune_chain(maxoff, offnum, prstate);
646 : }
647 :
648 : /*
649 : * Process any heap-only tuples that were not already processed as part of
650 : * a HOT chain.
651 : */
652 676386 : for (int i = prstate->nheaponly_items - 1; i >= 0; i--)
653 : {
654 339615 : offnum = prstate->heaponly_items[i];
655 :
 : /* Skip tuples already handled while walking a HOT chain above */
656 339615 : if (prstate->processed[offnum])
657 322655 : continue;
658 :
659 : /* see preceding loop */
660 16960 : *off_loc = offnum;
661 :
662 : /*
663 : * If the tuple is DEAD and doesn't chain to anything else, mark it
664 : * unused. (If it does chain, we can only remove it as part of
665 : * pruning its chain.)
666 : *
667 : * We need this primarily to handle aborted HOT updates, that is,
668 : * XMIN_INVALID heap-only tuples. Those might not be linked to by any
669 : * chain, since the parent tuple might be re-updated before any
670 : * pruning occurs. So we have to be able to reap them separately from
671 : * chain-pruning. (Note that HeapTupleHeaderIsHotUpdated will never
672 : * return true for an XMIN_INVALID tuple, so this code will work even
673 : * when there were sequential updates within the aborted transaction.)
674 : */
675 16960 : if (prstate->htsv[offnum] == HEAPTUPLE_DEAD)
676 : {
677 3244 : ItemId itemid = PageGetItemId(page, offnum);
678 3244 : HeapTupleHeader htup = (HeapTupleHeader) PageGetItem(page, itemid);
679 :
680 3244 : if (likely(!HeapTupleHeaderIsHotUpdated(htup)))
681 : {
682 3244 : HeapTupleHeaderAdvanceConflictHorizon(htup,
683 : &prstate->latest_xid_removed);
684 3244 : heap_prune_record_unused(prstate, offnum, true);
685 : }
686 : else
687 : {
688 : /*
689 : * This tuple should've been processed and removed as part of
690 : * a HOT chain, so something's wrong. To preserve evidence,
691 : * we don't dare to remove it. We cannot leave behind a DEAD
692 : * tuple either, because that will cause VACUUM to error out.
693 : * Throwing an error with a distinct error message seems like
694 : * the least bad option.
695 : */
696 0 : elog(ERROR, "dead heap-only tuple (%u, %d) is not linked to from any HOT chain",
697 : blockno, offnum);
698 : }
699 : }
700 : else
701 13716 : heap_prune_record_unchanged_lp_normal(prstate, offnum);
702 : }
703 :
704 : /* We should now have processed every tuple exactly once */
705 : #ifdef USE_ASSERT_CHECKING
706 : for (offnum = FirstOffsetNumber;
707 : offnum <= maxoff;
708 : offnum = OffsetNumberNext(offnum))
709 : {
710 : *off_loc = offnum;
711 :
712 : Assert(prstate->processed[offnum]);
713 : }
714 : #endif
715 :
716 : /* Clear the offset information once we have processed the given page. */
717 336771 : *off_loc = InvalidOffsetNumber;
718 336771 : }
719 :
720 : /*
721 : * Decide whether to proceed with freezing according to the freeze plans
722 : * prepared for the current heap buffer. If freezing is chosen, this function
723 : * performs several pre-freeze checks.
724 : *
725 : * The values of do_prune, do_hint_prune, and did_tuple_hint_fpi must be
726 : * determined before calling this function.
727 : *
728 : * prstate is both an input and output parameter.
729 : *
730 : * Returns true if we should apply the freeze plans and freeze tuples on the
731 : * page, and false otherwise.
732 : */
733 : static bool
734 336771 : heap_page_will_freeze(bool did_tuple_hint_fpi,
735 : bool do_prune,
736 : bool do_hint_prune,
737 : PruneState *prstate)
738 : {
739 336771 : bool do_freeze = false;
740 :
741 : /*
742 : * If the caller specified we should not attempt to freeze any tuples,
743 : * validate that everything is in the right state and return.
744 : */
745 336771 : if (!prstate->attempt_freeze)
746 : {
747 : Assert(!prstate->set_all_frozen && prstate->nfrozen == 0);
748 105503 : return false;
749 : }
750 :
751 231268 : if (prstate->pagefrz.freeze_required)
752 : {
753 : /*
754 : * heap_prepare_freeze_tuple indicated that at least one XID/MXID from
755 : * before FreezeLimit/MultiXactCutoff is present. Must freeze to
756 : * advance relfrozenxid/relminmxid.
757 : */
758 22143 : do_freeze = true;
759 : }
760 : else
761 : {
762 : /*
763 : * Opportunistically freeze the page if we are generating an FPI
764 : * anyway and if doing so means that we can set the page all-frozen
765 : * afterwards (might not happen until VACUUM's final heap pass).
766 : *
767 : * XXX: Previously, we knew if pruning emitted an FPI by checking
768 : * pgWalUsage.wal_fpi before and after pruning. Once the freeze and
769 : * prune records were combined, this heuristic couldn't be used
770 : * anymore. The opportunistic freeze heuristic must be improved;
771 : * however, for now, try to approximate the old logic.
772 : */
773 209125 : if (prstate->set_all_frozen && prstate->nfrozen > 0)
774 : {
775 : Assert(prstate->set_all_visible);
776 :
777 : /*
778 : * Freezing would make the page all-frozen. Have we already
779 : * emitted an FPI, or will we do so anyway?
780 : */
781 19111 : if (RelationNeedsWAL(prstate->relation))
782 : {
783 17453 : if (did_tuple_hint_fpi)
784 1195 : do_freeze = true;
785 16258 : else if (do_prune)
786 : {
787 1885 : if (XLogCheckBufferNeedsBackup(prstate->buffer))
788 668 : do_freeze = true;
789 : }
790 14373 : else if (do_hint_prune)
791 : {
792 16146 : if (XLogHintBitIsNeeded() &&
793 8073 : XLogCheckBufferNeedsBackup(prstate->buffer))
794 1349 : do_freeze = true;
795 : }
796 : }
797 : }
798 : }
799 :
800 231268 : if (do_freeze)
801 : {
802 : /*
803 : * Validate the tuples we will be freezing before entering the
804 : * critical section.
805 : */
806 25355 : heap_pre_freeze_checks(prstate->buffer, prstate->frozen, prstate->nfrozen);
807 : Assert(TransactionIdPrecedes(prstate->pagefrz.FreezePageConflictXid,
808 : prstate->cutoffs->OldestXmin));
809 : }
810 205913 : else if (prstate->nfrozen > 0)
811 : {
812 : /*
813 : * The page contained some tuples that were not already frozen, and we
814 : * chose not to freeze them now. The page won't be all-frozen then.
815 : */
816 : Assert(!prstate->pagefrz.freeze_required);
817 :
818 16469 : prstate->set_all_frozen = false;
819 16469 : prstate->nfrozen = 0; /* avoid miscounts in instrumentation */
820 : }
821 : else
822 : {
823 : /*
824 : * We have no freeze plans to execute. The page might already be
825 : * all-frozen (perhaps only following pruning), though. Such pages
826 : * can be marked all-frozen in the VM by our caller, even though none
827 : * of its tuples were newly frozen here.
828 : */
829 : }
830 :
831 231268 : return do_freeze;
832 : }
833 :
834 : /*
835 : * Emit a warning about and fix visibility map corruption on the given page.
836 : *
837 : * The caller specifies the type of corruption it has already detected via
838 : * corruption_type, so that we can emit the appropriate warning. All cases
839 : * result in the VM bits being cleared; corruption types where PD_ALL_VISIBLE
840 : * is incorrectly set also clear PD_ALL_VISIBLE.
841 : *
842 : * Must be called while holding an exclusive lock on the heap buffer. Dead
843 : * items and not all-visible tuples must have been discovered under that same
844 : * lock. Although we do not hold a lock on the VM buffer, it is pinned, and
845 : * the heap buffer is exclusively locked, ensuring that no other backend can
846 : * update the VM bits corresponding to this heap page.
847 : *
848 : * This function makes changes to the VM and, potentially, the heap page, but
849 : * it does not need to be done in a critical section.
850 : */
851 : static void
852 0 : heap_page_fix_vm_corruption(PruneState *prstate, OffsetNumber offnum,
853 : VMCorruptionType corruption_type)
854 : {
855 0 : const char *relname = RelationGetRelationName(prstate->relation);
856 0 : bool do_clear_vm = false;
857 0 : bool do_clear_heap = false;
858 :
859 : Assert(BufferIsLockedByMeInMode(prstate->buffer, BUFFER_LOCK_EXCLUSIVE));
860 :
    : /* Every case warns, then records which of the VM bits and the page-level
    :  * PD_ALL_VISIBLE hint must be cleared; the actual clearing happens below. */
861 0 : switch (corruption_type)
862 : {
863 0 : case VM_CORRUPT_LPDEAD:
864 0 : ereport(WARNING,
865 : (errcode(ERRCODE_DATA_CORRUPTED),
866 : errmsg("dead line pointer found on page marked all-visible"),
867 : errcontext("relation \"%s\", page %u, tuple %u",
868 : relname, prstate->block, offnum)));
869 0 : do_clear_vm = true;
870 0 : do_clear_heap = true;
871 0 : break;
872 :
873 0 : case VM_CORRUPT_TUPLE_VISIBILITY:
874 :
875 : /*
876 : * A HEAPTUPLE_LIVE tuple on an all-visible page can appear to not
877 : * be visible to everyone when
878 : * GetOldestNonRemovableTransactionId() returns a conservative
879 : * value that's older than the real safe xmin. That is not
880 : * corruption -- the PD_ALL_VISIBLE flag is still correct.
881 : *
882 : * However, dead tuple versions, in-progress inserts, and
883 : * in-progress deletes should never appear on a page marked
884 : * all-visible. That indicates real corruption. PD_ALL_VISIBLE
885 : * should have been cleared by the DML operation that deleted or
886 : * inserted the tuple.
887 : */
888 0 : ereport(WARNING,
889 : (errcode(ERRCODE_DATA_CORRUPTED),
890 : errmsg("tuple not visible to all transactions found on page marked all-visible"),
891 : errcontext("relation \"%s\", page %u, tuple %u",
892 : relname, prstate->block, offnum)));
893 0 : do_clear_vm = true;
894 0 : do_clear_heap = true;
895 0 : break;
896 :
897 0 : case VM_CORRUPT_MISSING_PAGE_HINT:
898 :
899 : /*
900 : * As of PostgreSQL 9.2, the visibility map bit should never be
901 : * set if the page-level bit is clear. However, for vacuum, it's
902 : * possible that the bit got cleared after
903 : * heap_vac_scan_next_block() was called, so we must recheck now
904 : * that we have the buffer lock before concluding that the VM is
905 : * corrupt.
906 : */
907 : Assert(!PageIsAllVisible(prstate->page));
908 : Assert(prstate->old_vmbits & VISIBILITYMAP_VALID_BITS);
909 0 : ereport(WARNING,
910 : (errcode(ERRCODE_DATA_CORRUPTED),
911 : errmsg("page is not marked all-visible but visibility map bit is set"),
912 : errcontext("relation \"%s\", page %u",
913 : relname, prstate->block)));
914 0 : do_clear_vm = true;
915 0 : break;
916 : }
917 :
918 : Assert(do_clear_heap || do_clear_vm);
919 :
920 : /* Avoid marking the buffer dirty if PD_ALL_VISIBLE is already clear */
921 0 : if (do_clear_heap)
922 : {
923 : Assert(PageIsAllVisible(prstate->page));
924 0 : PageClearAllVisible(prstate->page);
925 0 : MarkBufferDirtyHint(prstate->buffer, true);
926 : }
927 :
928 0 : if (do_clear_vm)
929 : {
930 0 : visibilitymap_clear(prstate->relation, prstate->block,
931 : prstate->vmbuffer,
932 : VISIBILITYMAP_VALID_BITS);
    : /* Keep our cached copy of the page's VM bits in sync with the clear. */
933 0 : prstate->old_vmbits = 0;
934 : }
935 0 : }
936 :
937 : /*
938 : * Decide whether to set the visibility map bits (all-visible and all-frozen)
939 : * for the current page using information from the PruneState and VM.
940 : *
941 : * This function does not actually set the VM bits or page-level visibility
942 : * hint, PD_ALL_VISIBLE.
943 : *
944 : * This should be called only after do_freeze has been decided (and do_prune
945 : * has been set), as these factor into our heuristic-based decision.
946 : *
947 : * Returns true if one or both VM bits should be set and false otherwise.
948 : */
949 : static bool
950 336771 : heap_page_will_set_vm(PruneState *prstate, PruneReason reason,
951 : bool do_prune, bool do_freeze)
952 : {
953 336771 : if (!prstate->attempt_set_vm)
954 68095 : return false;
955 :
956 336771 : if (!prstate->set_all_visible)
957 196115 : return false;
958 :
959 : /*
960 : * If this is an on-access call and we're not actually pruning, avoid
961 : * setting the visibility map if it would newly dirty the heap page or, if
962 : * the page is already dirty, if doing so would require including a
963 : * full-page image (FPI) of the heap page in the WAL.
964 : */
965 72561 : if (reason == PRUNE_ON_ACCESS && !do_prune && !do_freeze &&
966 28844 : (!BufferIsDirty(prstate->buffer) || XLogCheckBufferNeedsBackup(prstate->buffer)))
967 : {
968 15578 : prstate->set_all_visible = prstate->set_all_frozen = false;
969 15578 : return false;
970 : }
971 :
972 56983 : prstate->new_vmbits = VISIBILITYMAP_ALL_VISIBLE;
973 :
974 56983 : if (prstate->set_all_frozen)
975 31646 : prstate->new_vmbits |= VISIBILITYMAP_ALL_FROZEN;
976 :
    : /* If the VM already shows exactly the bits we would set, there is
    :  * nothing to change; report no VM update. */
977 56983 : if (prstate->new_vmbits == prstate->old_vmbits)
978 : {
979 1716 : prstate->new_vmbits = 0;
980 1716 : return false;
981 : }
982 :
983 55267 : return true;
984 : }
985 :
986 : /*
987 : * If the page is already all-frozen, or already all-visible and freezing
988 : * won't be attempted, there is no remaining work and we can use the fast path
989 : * to avoid the expensive overhead of heap_page_prune_and_freeze().
990 : *
991 : * This can happen when the page has a stale prune hint, or if VACUUM is
992 : * scanning an already all-frozen page due to SKIP_PAGES_THRESHOLD.
993 : *
994 : * The caller must already have examined the visibility map and saved the
995 : * status of the page's VM bits in prstate->old_vmbits. Caller must hold a
996 : * content lock on the heap page since it will examine line pointers.
997 : *
998 : * Before calling prune_freeze_fast_path(), the caller should first
999 : * check for and fix any discrepancy between the page-level visibility hint
1000 : * and the visibility map. Otherwise, the fast path will always prevent us
1001 : * from getting them in sync. Note that if there are tuples on the page that
1002 : * are not visible to all but the VM is incorrectly marked
1003 : * all-visible/all-frozen, we will not get the chance to fix that corruption
1004 : * when using the fast path.
1005 : */
1006 : static void
1007 171020 : prune_freeze_fast_path(PruneState *prstate, PruneFreezeResult *presult)
1008 : {
1009 171020 : OffsetNumber maxoff = PageGetMaxOffsetNumber(prstate->page);
1010 171020 : Page page = prstate->page;
1011 :
1012 : Assert((prstate->old_vmbits & VISIBILITYMAP_ALL_FROZEN) ||
1013 : ((prstate->old_vmbits & VISIBILITYMAP_ALL_VISIBLE) &&
1014 : !prstate->attempt_freeze));
1015 :
1016 : /* We'll fill in presult for the caller */
1017 171020 : memset(presult, 0, sizeof(PruneFreezeResult));
1018 :
1019 : /* Clear any stale prune hint */
1020 171020 : if (TransactionIdIsValid(PageGetPruneXid(page)))
1021 : {
1022 98 : PageClearPrunable(page);
1023 98 : MarkBufferDirtyHint(prstate->buffer, true);
1024 : }
1025 :
    : /* Nothing to count on an empty page; presult was zeroed above. */
1026 171020 : if (PageIsEmpty(page))
1027 0 : return;
1028 :
1029 : /*
1030 : * Since the page is all-visible, a count of the normal ItemIds on the
1031 : * page should be sufficient for vacuum's live tuple count.
1032 : */
1033 171020 : for (OffsetNumber off = FirstOffsetNumber;
1034 9929724 : off <= maxoff;
1035 9758704 : off = OffsetNumberNext(off))
1036 : {
1037 9758704 : ItemId lp = PageGetItemId(page, off);
1038 :
1039 9758704 : if (!ItemIdIsUsed(lp))
1040 158431 : continue;
1041 :
1042 9600273 : presult->hastup = true;
1043 :
1044 9600273 : if (ItemIdIsNormal(lp))
1045 9430598 : prstate->live_tuples++;
1046 : }
1047 :
1048 171020 : presult->live_tuples = prstate->live_tuples;
1049 : }
1050 :
1051 : /*
1052 : * Prune and repair fragmentation and potentially freeze tuples on the
1053 : * specified page. If the page's visibility status has changed, update it in
1054 : * the VM.
1055 : *
1056 : * Caller must have pin and buffer cleanup lock on the page. Note that we
1057 : * don't update the FSM information for page on caller's behalf. Caller might
1058 : * also need to account for a reduction in the length of the line pointer
1059 : * array following array truncation by us.
1060 : *
1061 : * params contains the input parameters used to control freezing and pruning
1062 : * behavior. See the definition of PruneFreezeParams for more on what each
1063 : * parameter does.
1064 : *
1065 : * If the HEAP_PAGE_PRUNE_FREEZE option is set in params, we will freeze
1066 : * tuples if it's required in order to advance relfrozenxid / relminmxid, or
1067 : * if it's considered advantageous for overall system performance to do so
1068 : * now. The 'params.cutoffs', 'presult', 'new_relfrozen_xid' and
1069 : * 'new_relmin_mxid' arguments are required when freezing.
1070 : *
1071 : * A vmbuffer corresponding to the heap page is also passed and if the page is
1072 : * found to be all-visible/all-frozen, we will set it in the VM.
1073 : *
1074 : * presult contains output parameters needed by callers, such as the number of
1075 : * tuples removed and the offsets of dead items on the page after pruning.
1076 : * heap_page_prune_and_freeze() is responsible for initializing it. Required
1077 : * by all callers.
1078 : *
1079 : * off_loc is the offset location required by the caller to use in error
1080 : * callback.
1081 : *
1082 : * new_relfrozen_xid and new_relmin_mxid must be provided by the caller if the
1083 : * HEAP_PAGE_PRUNE_FREEZE option is set in params. On entry, they contain the
1084 : * oldest XID and multi-XID seen on the relation so far. They will be updated
1085 : * with the oldest values present on the page after pruning. After processing
1086 : * the whole relation, VACUUM can use these values as the new
1087 : * relfrozenxid/relminmxid for the relation.
1088 : */
1089 : void
1090 507791 : heap_page_prune_and_freeze(PruneFreezeParams *params,
1091 : PruneFreezeResult *presult,
1092 : OffsetNumber *off_loc,
1093 : TransactionId *new_relfrozen_xid,
1094 : MultiXactId *new_relmin_mxid)
1095 : {
1096 : PruneState prstate;
1097 : bool do_freeze;
1098 : bool do_prune;
1099 : bool do_hint_prune;
1100 : bool do_set_vm;
1101 : bool did_tuple_hint_fpi;
1102 507791 : int64 fpi_before = pgWalUsage.wal_fpi;
1103 : TransactionId conflict_xid;
1104 :
1105 : /* Initialize prstate */
1106 507791 : prune_freeze_setup(params,
1107 : new_relfrozen_xid, new_relmin_mxid,
1108 : presult, &prstate);
1109 :
1110 : /*
1111 : * If the VM is set but PD_ALL_VISIBLE is clear, fix that corruption
1112 : * before pruning and freezing so that the page and VM start out in a
1113 : * consistent state.
1114 : */
1115 507791 : if ((prstate.old_vmbits & VISIBILITYMAP_VALID_BITS) &&
1116 175923 : !PageIsAllVisible(prstate.page))
1117 0 : heap_page_fix_vm_corruption(&prstate, InvalidOffsetNumber,
1118 : VM_CORRUPT_MISSING_PAGE_HINT);
1119 :
1120 : /*
1121 : * If the page is already all-frozen, or already all-visible when freezing
1122 : * is not being attempted, take the fast path, skipping pruning and
1123 : * freezing code entirely. This must be done after fixing any discrepancy
1124 : * between the page-level visibility hint and the VM, since that may have
1125 : * cleared old_vmbits.
1126 : */
1127 507791 : if ((params->options & HEAP_PAGE_PRUNE_ALLOW_FAST_PATH) != 0 &&
1128 506498 : ((prstate.old_vmbits & VISIBILITYMAP_ALL_FROZEN) ||
1129 335478 : ((prstate.old_vmbits & VISIBILITYMAP_ALL_VISIBLE) &&
1130 4444 : !prstate.attempt_freeze)))
1131 : {
1132 171020 : prune_freeze_fast_path(&prstate, presult);
1133 171020 : return;
1134 : }
1135 :
1136 : /*
1137 : * Examine all line pointers and tuple visibility information to determine
1138 : * which line pointers should change state and which tuples may be frozen.
1139 : * Prepare queue of state changes to later be executed in a critical
1140 : * section.
1141 : */
1142 336771 : prune_freeze_plan(&prstate, off_loc);
1143 :
1144 : /*
1145 : * After processing all the live tuples on the page, if the newest xmin
1146 : * amongst them may be considered running by any snapshot, the page cannot
1147 : * be all-visible. This should be done before determining whether or not
1148 : * to opportunistically freeze.
1149 : */
1150 336771 : if (prstate.set_all_visible &&
1151 177230 : TransactionIdIsNormal(prstate.newest_live_xid) &&
1152 75619 : GlobalVisTestXidConsideredRunning(prstate.vistest,
1153 : prstate.newest_live_xid,
1154 : true))
1155 3349 : prstate.set_all_visible = prstate.set_all_frozen = false;
1156 :
1157 : /*
1158 : * If checksums are enabled, calling heap_prune_satisfies_vacuum() while
1159 : * checking tuple visibility information in prune_freeze_plan() may have
1160 : * caused an FPI to be emitted.
1161 : */
1162 336771 : did_tuple_hint_fpi = fpi_before != pgWalUsage.wal_fpi;
1163 :
    : /* Any queued line-pointer state change means we must actually prune. */
1164 992859 : do_prune = prstate.nredirected > 0 ||
1165 614020 : prstate.ndead > 0 ||
1166 277249 : prstate.nunused > 0;
1167 :
1168 : /*
1169 : * Even if we don't prune anything, if we found a new value for the
1170 : * pd_prune_xid field or the page was marked full, we will update the hint
1171 : * bit.
1172 : */
1173 529225 : do_hint_prune = PageGetPruneXid(prstate.page) != prstate.new_prune_xid ||
1174 192454 : PageIsFull(prstate.page);
1175 :
1176 : /*
1177 : * Decide if we want to go ahead with freezing according to the freeze
1178 : * plans we prepared, or not.
1179 : */
1180 336771 : do_freeze = heap_page_will_freeze(did_tuple_hint_fpi,
1181 : do_prune,
1182 : do_hint_prune,
1183 : &prstate);
1184 :
1185 : /*
1186 : * While scanning the line pointers, we did not clear
1187 : * set_all_visible/set_all_frozen when encountering LP_DEAD items because
1188 : * we wanted the decision whether or not to freeze the page to be
1189 : * unaffected by the short-term presence of LP_DEAD items. These LP_DEAD
1190 : * items are effectively assumed to be LP_UNUSED items in the making. It
1191 : * doesn't matter which vacuum heap pass (initial pass or final pass) ends
1192 : * up setting the page all-frozen, as long as the ongoing VACUUM does it.
1193 : *
1194 : * Now that we finished determining whether or not to freeze the page,
1195 : * update set_all_visible and set_all_frozen so that they reflect the true
1196 : * state of the page for setting PD_ALL_VISIBLE and VM bits.
1197 : */
1198 336771 : if (prstate.lpdead_items > 0)
1199 61661 : prstate.set_all_visible = prstate.set_all_frozen = false;
1200 :
1201 : Assert(!prstate.set_all_frozen || prstate.set_all_visible);
1202 : Assert(!prstate.set_all_visible || prstate.attempt_set_vm);
1203 : Assert(!prstate.set_all_visible || (prstate.lpdead_items == 0));
1204 :
1205 336771 : do_set_vm = heap_page_will_set_vm(&prstate, params->reason, do_prune, do_freeze);
1206 :
1207 : /*
1208 : * new_vmbits should be 0 regardless of whether or not the page is
1209 : * all-visible if we do not intend to set the VM.
1210 : */
1211 : Assert(do_set_vm || prstate.new_vmbits == 0);
1212 :
1213 : /*
1214 : * The snapshot conflict horizon for the whole record is the most
1215 : * conservative (newest) horizon required by any change in the record.
1216 : */
1217 336771 : conflict_xid = InvalidTransactionId;
1218 336771 : if (do_set_vm)
1219 55267 : conflict_xid = prstate.newest_live_xid;
1220 336771 : if (do_freeze && TransactionIdFollows(prstate.pagefrz.FreezePageConflictXid, conflict_xid))
1221 4269 : conflict_xid = prstate.pagefrz.FreezePageConflictXid;
1222 336771 : if (do_prune && TransactionIdFollows(prstate.latest_xid_removed, conflict_xid))
1223 52704 : conflict_xid = prstate.latest_xid_removed;
1224 :
1225 : /* Lock vmbuffer before entering a critical section */
1226 336771 : if (do_set_vm)
1227 55267 : LockBuffer(prstate.vmbuffer, BUFFER_LOCK_EXCLUSIVE);
1228 :
1229 : /* Any error while applying the changes is critical */
1230 336771 : START_CRIT_SECTION();
1231 :
1232 336771 : if (do_hint_prune)
1233 : {
1234 : /*
1235 : * Update the page's pd_prune_xid field to either zero, or the lowest
1236 : * XID of any soon-prunable tuple.
1237 : */
1238 144383 : ((PageHeader) prstate.page)->pd_prune_xid = prstate.new_prune_xid;
1239 :
1240 : /*
1241 : * Also clear the "page is full" flag, since there's no point in
1242 : * repeating the prune/defrag process until something else happens to
1243 : * the page.
1244 : */
1245 144383 : PageClearFull(prstate.page);
1246 :
1247 : /*
1248 : * If that's all we had to do to the page, this is a non-WAL-logged
1249 : * hint. If we are going to freeze or prune the page or set
1250 : * PD_ALL_VISIBLE, we will mark the buffer dirty below.
1251 : *
1252 : * Setting PD_ALL_VISIBLE is fully WAL-logged because it is forbidden
1253 : * for the VM to be set and PD_ALL_VISIBLE to be clear.
1254 : */
1255 144383 : if (!do_freeze && !do_prune && !do_set_vm)
1256 49972 : MarkBufferDirtyHint(prstate.buffer, true);
1257 : }
1258 :
1259 336771 : if (do_prune || do_freeze || do_set_vm)
1260 : {
1261 : /* Apply the planned item changes and repair page fragmentation. */
1262 117096 : if (do_prune)
1263 : {
1264 59959 : heap_page_prune_execute(prstate.buffer, false,
1265 : prstate.redirected, prstate.nredirected,
1266 : prstate.nowdead, prstate.ndead,
1267 : prstate.nowunused, prstate.nunused);
1268 : }
1269 :
1270 117096 : if (do_freeze)
1271 25355 : heap_freeze_prepared_tuples(prstate.buffer, prstate.frozen, prstate.nfrozen);
1272 :
1273 : /* Set the visibility map and page visibility hint */
1274 117096 : if (do_set_vm)
1275 : {
1276 : /*
1277 : * While it is valid for PD_ALL_VISIBLE to be set when the
1278 : * corresponding VM bit is clear, we strongly prefer to keep them
1279 : * in sync.
1280 : *
1281 : * The heap buffer must be marked dirty before adding it to the
1282 : * WAL chain when setting the VM. We don't worry about
1283 : * unnecessarily dirtying the heap buffer if PD_ALL_VISIBLE is
1284 : * already set, though. It is extremely rare to have a clean heap
1285 : * buffer with PD_ALL_VISIBLE already set and the VM bits clear,
1286 : * so there is no point in optimizing it.
1287 : */
1288 55267 : PageSetAllVisible(prstate.page);
1289 55267 : PageClearPrunable(prstate.page);
1290 55267 : visibilitymap_set(prstate.block, prstate.vmbuffer, prstate.new_vmbits,
1291 55267 : prstate.relation->rd_locator);
1292 : }
1293 :
1294 117096 : MarkBufferDirty(prstate.buffer);
1295 :
1296 : /*
1297 : * Emit a WAL XLOG_HEAP2_PRUNE* record showing what we did
1298 : */
1299 117096 : if (RelationNeedsWAL(prstate.relation))
1300 : {
1301 168073 : log_heap_prune_and_freeze(prstate.relation, prstate.buffer,
1302 : do_set_vm ? prstate.vmbuffer : InvalidBuffer,
1303 53676 : do_set_vm ? prstate.new_vmbits : 0,
1304 : conflict_xid,
1305 : true, /* cleanup lock */
1306 : params->reason,
1307 : prstate.frozen, prstate.nfrozen,
1308 : prstate.redirected, prstate.nredirected,
1309 : prstate.nowdead, prstate.ndead,
1310 : prstate.nowunused, prstate.nunused);
1311 : }
1312 : }
1313 :
1314 336771 : END_CRIT_SECTION();
1315 :
1316 336771 : if (do_set_vm)
1317 55267 : LockBuffer(prstate.vmbuffer, BUFFER_LOCK_UNLOCK);
1318 :
1319 : /*
1320 : * During its second pass over the heap, VACUUM calls
1321 : * heap_page_would_be_all_visible() to determine whether a page is
1322 : * all-visible and all-frozen. The logic here is similar. After completing
1323 : * pruning and freezing, use an assertion to verify that our results
1324 : * remain consistent with heap_page_would_be_all_visible(). It's also a
1325 : * valuable cross-check of the page state after pruning and freezing.
1326 : */
1327 : #ifdef USE_ASSERT_CHECKING
1328 : if (prstate.set_all_visible)
1329 : {
1330 : TransactionId debug_cutoff;
1331 : bool debug_all_frozen;
1332 :
1333 : Assert(prstate.lpdead_items == 0);
1334 :
1335 : Assert(heap_page_is_all_visible(prstate.relation, prstate.buffer,
1336 : prstate.vistest,
1337 : &debug_all_frozen,
1338 : &debug_cutoff, off_loc));
1339 :
1340 : Assert(!TransactionIdIsValid(debug_cutoff) ||
1341 : debug_cutoff == prstate.newest_live_xid);
1342 :
1343 : /*
1344 : * It's possible the page is composed entirely of frozen tuples but is
1345 : * not set all-frozen in the VM and did not pass
1346 : * HEAP_PAGE_PRUNE_FREEZE. In this case, it's possible
1347 : * heap_page_is_all_visible() finds the page completely frozen, even
1348 : * though prstate.set_all_frozen is false.
1349 : */
1350 : Assert(!prstate.set_all_frozen || debug_all_frozen);
1351 : }
1352 : #endif
1353 :
1354 : /* Copy information back for caller */
1355 336771 : presult->ndeleted = prstate.ndeleted;
1356 336771 : presult->nnewlpdead = prstate.ndead;
1357 336771 : presult->nfrozen = prstate.nfrozen;
1358 336771 : presult->live_tuples = prstate.live_tuples;
1359 336771 : presult->recently_dead_tuples = prstate.recently_dead_tuples;
1360 336771 : presult->hastup = prstate.hastup;
1361 :
1362 336771 : presult->lpdead_items = prstate.lpdead_items;
1363 : /* the presult->deadoffsets array was already filled in */
1364 :
1365 336771 : presult->newly_all_visible = false;
1366 336771 : presult->newly_all_frozen = false;
1367 336771 : presult->newly_all_visible_frozen = false;
1368 336771 : if (do_set_vm)
1369 : {
1370 55267 : if ((prstate.old_vmbits & VISIBILITYMAP_ALL_VISIBLE) == 0)
1371 : {
1372 52080 : presult->newly_all_visible = true;
1373 52080 : if (prstate.set_all_frozen)
1374 28001 : presult->newly_all_visible_frozen = true;
1375 : }
1376 3187 : else if ((prstate.old_vmbits & VISIBILITYMAP_ALL_FROZEN) == 0 &&
1377 3187 : prstate.set_all_frozen)
1378 3187 : presult->newly_all_frozen = true;
1379 : }
1380 :
1381 336771 : if (prstate.attempt_freeze)
1382 : {
1383 231268 : if (presult->nfrozen > 0)
1384 : {
1385 25355 : *new_relfrozen_xid = prstate.pagefrz.FreezePageRelfrozenXid;
1386 25355 : *new_relmin_mxid = prstate.pagefrz.FreezePageRelminMxid;
1387 : }
1388 : else
1389 : {
1390 205913 : *new_relfrozen_xid = prstate.pagefrz.NoFreezePageRelfrozenXid;
1391 205913 : *new_relmin_mxid = prstate.pagefrz.NoFreezePageRelminMxid;
1392 : }
1393 : }
1394 : }
1395 :
1396 :
1397 : /*
1398 : * Perform visibility checks for heap pruning.
1399 : */
1400 : static HTSV_Result
1401 18390346 : heap_prune_satisfies_vacuum(PruneState *prstate, HeapTuple tup)
1402 : {
1403 : HTSV_Result res;
1404 : TransactionId dead_after;
1405 :
1406 18390346 : res = HeapTupleSatisfiesVacuumHorizon(tup, prstate->buffer, &dead_after);
1407 :
    : /* Only a RECENTLY_DEAD result needs the extra horizon checks below;
    :  * every other verdict can be returned as-is. */
1408 18390346 : if (res != HEAPTUPLE_RECENTLY_DEAD)
1409 16270951 : return res;
1410 :
1411 : /*
1412 : * For VACUUM, we must be sure to prune tuples with xmax older than
1413 : * OldestXmin -- a visibility cutoff determined at the beginning of
1414 : * vacuuming the relation. OldestXmin is used for freezing determination
1415 : * and we cannot freeze dead tuples' xmaxes.
1416 : */
1417 2119395 : if (prstate->cutoffs &&
1418 1129891 : TransactionIdIsValid(prstate->cutoffs->OldestXmin) &&
1419 1129891 : NormalTransactionIdPrecedes(dead_after, prstate->cutoffs->OldestXmin))
1420 804542 : return HEAPTUPLE_DEAD;
1421 :
1422 : /*
1423 : * Determine whether or not the tuple is considered dead when compared
1424 : * with the provided GlobalVisState. On-access pruning does not provide
1425 : * VacuumCutoffs. And for vacuum, even if the tuple's xmax is not older
1426 : * than OldestXmin, GlobalVisTestIsRemovableXid() could find the row dead
1427 : * if the GlobalVisState has been updated since the beginning of vacuuming
1428 : * the relation.
1429 : */
1430 1314853 : if (GlobalVisTestIsRemovableXid(prstate->vistest, dead_after, true))
1431 937203 : return HEAPTUPLE_DEAD;
1432 :
1433 377650 : return res;
1434 : }
1435 :
1436 :
1437 : /*
1438 : * Pruning calculates tuple visibility once and saves the results in an array
1439 : * of int8. See PruneState.htsv for details. This helper function is meant
1440 : * to guard against examining visibility status array members which have not
1441 : * yet been computed.
1442 : */
1443 : static inline HTSV_Result
1444 18373386 : htsv_get_valid_status(int status)
1445 : {
1446 : Assert(status >= HEAPTUPLE_DEAD &&
1447 : status <= HEAPTUPLE_DELETE_IN_PROGRESS);
    : /* The assertion above guarantees the value is a valid HTSV_Result,
    :  * so the cast cannot manufacture an out-of-range enum value. */
1448 18373386 : return (HTSV_Result) status;
1449 : }
1450 :
1451 : /*
1452 : * Prune specified line pointer or a HOT chain originating at line pointer.
1453 : *
1454 : * Tuple visibility information is provided in prstate->htsv.
1455 : *
1456 : * If the item is an index-referenced tuple (i.e. not a heap-only tuple),
1457 : * the HOT chain is pruned by removing all DEAD tuples at the start of the HOT
1458 : * chain. We also prune any RECENTLY_DEAD tuples preceding a DEAD tuple.
1459 : * This is OK because a RECENTLY_DEAD tuple preceding a DEAD tuple is really
1460 : * DEAD, our visibility test is just too coarse to detect it.
1461 : *
1462 : * Pruning must never leave behind a DEAD tuple that still has tuple storage.
1463 : * VACUUM isn't prepared to deal with that case.
1464 : *
1465 : * The root line pointer is redirected to the tuple immediately after the
1466 : * latest DEAD tuple. If all tuples in the chain are DEAD, the root line
1467 : * pointer is marked LP_DEAD. (This includes the case of a DEAD simple
1468 : * tuple, which we treat as a chain of length 1.)
1469 : *
1470 : * We don't actually change the page here. We just add entries to the arrays in
1471 : * prstate showing the changes to be made. Items to be redirected are added
1472 : * to the redirected[] array (two entries per redirection); items to be set to
1473 : * LP_DEAD state are added to nowdead[]; and items to be set to LP_UNUSED
1474 : * state are added to nowunused[]. We perform bookkeeping of live tuples,
1475 : * visibility etc. based on what the page will look like after the changes
1476 : * applied. All that bookkeeping is performed in the heap_prune_record_*()
1477 : * subroutines. The division of labor is that heap_prune_chain() decides the
1478 : * fate of each tuple, ie. whether it's going to be removed, redirected or
1479 : * left unchanged, and the heap_prune_record_*() subroutines update PruneState
1480 : * based on that outcome.
1481 : */
1482 : static void
1483 18248760 : heap_prune_chain(OffsetNumber maxoff, OffsetNumber rootoffnum,
1484 : PruneState *prstate)
1485 : {
1486 18248760 : TransactionId priorXmax = InvalidTransactionId;
1487 : ItemId rootlp;
1488 : OffsetNumber offnum;
1489 : OffsetNumber chainitems[MaxHeapTuplesPerPage];
1490 18248760 : Page page = prstate->page;
1491 :
1492 : /*
1493 : * After traversing the HOT chain, ndeadchain is the index in chainitems
1494 : * of the first live successor after the last dead item.
1495 : */
1496 18248760 : int ndeadchain = 0,
1497 18248760 : nchain = 0;
1498 :
1499 18248760 : rootlp = PageGetItemId(page, rootoffnum);
1500 :
1501 : /* Start from the root tuple */
1502 18248760 : offnum = rootoffnum;
1503 :
1504 : /* while not end of the chain */
1505 : for (;;)
1506 322655 : {
1507 : HeapTupleHeader htup;
1508 : ItemId lp;
1509 :
1510 : /* Sanity check (pure paranoia) */
1511 18571415 : if (offnum < FirstOffsetNumber)
1512 0 : break;
1513 :
1514 : /*
1515 : * An offset past the end of page's line pointer array is possible
1516 : * when the array was truncated (original item must have been unused)
1517 : */
1518 18571415 : if (offnum > maxoff)
1519 0 : break;
1520 :
1521 : /* If item is already processed, stop --- it must not be same chain */
1522 18571415 : if (prstate->processed[offnum])
1523 0 : break;
1524 :
1525 18571415 : lp = PageGetItemId(page, offnum);
1526 :
1527 : /*
1528 : * Unused item obviously isn't part of the chain. Likewise, a dead
1529 : * line pointer can't be part of the chain. Both of those cases were
1530 : * already marked as processed.
1531 : */
1532 : Assert(ItemIdIsUsed(lp));
1533 : Assert(!ItemIdIsDead(lp));
1534 :
1535 : /*
1536 : * If we are looking at the redirected root line pointer, jump to the
1537 : * first normal tuple in the chain. If we find a redirect somewhere
1538 : * else, stop --- it must not be same chain.
1539 : */
1540 18571415 : if (ItemIdIsRedirected(lp))
1541 : {
1542 198029 : if (nchain > 0)
1543 0 : break; /* not at start of chain */
1544 198029 : chainitems[nchain++] = offnum;
1545 198029 : offnum = ItemIdGetRedirect(rootlp);
1546 198029 : continue;
1547 : }
1548 :
1549 : Assert(ItemIdIsNormal(lp));
1550 :
1551 18373386 : htup = (HeapTupleHeader) PageGetItem(page, lp);
1552 :
1553 : /*
1554 : * Check the tuple XMIN against prior XMAX, if any
1555 : */
1556 18498012 : if (TransactionIdIsValid(priorXmax) &&
1557 124626 : !TransactionIdEquals(HeapTupleHeaderGetXmin(htup), priorXmax))
1558 0 : break;
1559 :
1560 : /*
1561 : * OK, this tuple is indeed a member of the chain.
1562 : */
1563 18373386 : chainitems[nchain++] = offnum;
1564 :
1565 18373386 : switch (htsv_get_valid_status(prstate->htsv[offnum]))
1566 : {
1567 1813001 : case HEAPTUPLE_DEAD:
1568 :
1569 : /* Remember the last DEAD tuple seen */
1570 1813001 : ndeadchain = nchain;
1571 1813001 : HeapTupleHeaderAdvanceConflictHorizon(htup,
1572 : &prstate->latest_xid_removed);
1573 : /* Advance to next chain member */
1574 1813001 : break;
1575 :
1576 377650 : case HEAPTUPLE_RECENTLY_DEAD:
1577 :
1578 : /*
1579 : * We don't need to advance the conflict horizon for
1580 : * RECENTLY_DEAD tuples, even if we are removing them. This
1581 : * is because we only remove RECENTLY_DEAD tuples if they
1582 : * precede a DEAD tuple, and the DEAD tuple must have been
1583 : * inserted by a newer transaction than the RECENTLY_DEAD
1584 : * tuple by virtue of being later in the chain. We will have
1585 : * advanced the conflict horizon for the DEAD tuple.
1586 : */
1587 :
1588 : /*
1589 : * Advance past RECENTLY_DEAD tuples just in case there's a
1590 : * DEAD one after them. We have to make sure that we don't
1591 : * miss any DEAD tuples, since DEAD tuples that still have
1592 : * tuple storage after pruning will confuse VACUUM.
1593 : */
1594 377650 : break;
1595 :
1596 16182735 : case HEAPTUPLE_DELETE_IN_PROGRESS:
1597 : case HEAPTUPLE_LIVE:
1598 : case HEAPTUPLE_INSERT_IN_PROGRESS:
1599 16182735 : goto process_chain;
1600 :
1601 0 : default:
1602 0 : elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
1603 : goto process_chain;
1604 : }
1605 :
1606 : /*
1607 : * If the tuple is not HOT-updated, then we are at the end of this
1608 : * HOT-update chain.
1609 : */
1610 2190651 : if (!HeapTupleHeaderIsHotUpdated(htup))
1611 2066025 : goto process_chain;
1612 :
1613 : /* HOT implies it can't have moved to different partition */
1614 : Assert(!HeapTupleHeaderIndicatesMovedPartitions(htup));
1615 :
1616 : /*
1617 : * Advance to next chain member.
1618 : */
1619 : Assert(ItemPointerGetBlockNumber(&htup->t_ctid) == prstate->block);
1620 124626 : offnum = ItemPointerGetOffsetNumber(&htup->t_ctid);
1621 124626 : priorXmax = HeapTupleHeaderGetUpdateXid(htup);
1622 : }
1623 :
1624 0 : if (ItemIdIsRedirected(rootlp) && nchain < 2)
1625 : {
1626 : /*
1627 : * We found a redirect item that doesn't point to a valid follow-on
1628 : * item. This can happen if the loop in heap_page_prune_and_freeze()
1629 : * caused us to visit the dead successor of a redirect item before
1630 : * visiting the redirect item. We can clean up by setting the
1631 : * redirect item to LP_DEAD state or LP_UNUSED if the caller
1632 : * indicated.
1633 : */
1634 0 : heap_prune_record_dead_or_unused(prstate, rootoffnum, false);
1635 0 : return;
1636 : }
1637 :
1638 0 : process_chain:
1639 :
1640 18248760 : if (ndeadchain == 0)
1641 : {
1642 : /*
1643 : * No DEAD tuple was found, so the chain is entirely composed of
1644 : * normal, unchanged tuples. Leave it alone.
1645 : */
1646 16482526 : int i = 0;
1647 :
1648 16482526 : if (ItemIdIsRedirected(rootlp))
1649 : {
1650 177457 : heap_prune_record_unchanged_lp_redirect(prstate, rootoffnum);
1651 177457 : i++;
1652 : }
1653 32970629 : for (; i < nchain; i++)
1654 16488103 : heap_prune_record_unchanged_lp_normal(prstate, chainitems[i]);
1655 : }
1656 1766234 : else if (ndeadchain == nchain)
1657 : {
1658 : /*
1659 : * The entire chain is dead. Mark the root line pointer LP_DEAD, and
1660 : * fully remove the other tuples in the chain.
1661 : */
1662 1695345 : heap_prune_record_dead_or_unused(prstate, rootoffnum, ItemIdIsNormal(rootlp));
1663 1740981 : for (int i = 1; i < nchain; i++)
1664 45636 : heap_prune_record_unused(prstate, chainitems[i], true);
1665 : }
1666 : else
1667 : {
1668 : /*
1669 : * We found a DEAD tuple in the chain. Redirect the root line pointer
1670 : * to the first non-DEAD tuple, and mark as unused each intermediate
1671 : * item that we are able to remove from the chain.
1672 : */
1673 70889 : heap_prune_record_redirect(prstate, rootoffnum, chainitems[ndeadchain],
1674 70889 : ItemIdIsNormal(rootlp));
1675 92592 : for (int i = 1; i < ndeadchain; i++)
1676 21703 : heap_prune_record_unused(prstate, chainitems[i], true);
1677 :
1678 : /* the rest of tuples in the chain are normal, unchanged tuples */
1679 143171 : for (int i = ndeadchain; i < nchain; i++)
1680 72282 : heap_prune_record_unchanged_lp_normal(prstate, chainitems[i]);
1681 : }
1682 : }
1683 :
1684 : /* Record lowest soon-prunable XID */
1685 : static void
1686 4594615 : heap_prune_record_prunable(PruneState *prstate, TransactionId xid,
1687 : OffsetNumber offnum)
1688 : {
1689 : /*
1690 : * This should exactly match the PageSetPrunable macro. We can't store
1691 : * directly into the page header yet, so we update working state.
1692 : */
1693 : Assert(TransactionIdIsNormal(xid));
1694 8997679 : if (!TransactionIdIsValid(prstate->new_prune_xid) ||
1695 4403064 : TransactionIdPrecedes(xid, prstate->new_prune_xid))
1696 193132 : prstate->new_prune_xid = xid;
1697 :
1698 : /*
1699 : * It's incorrect for a page to be marked all-visible if it contains
1700 : * prunable items.
1701 : */
1702 4594615 : if (PageIsAllVisible(prstate->page))
1703 0 : heap_page_fix_vm_corruption(prstate, offnum,
1704 : VM_CORRUPT_TUPLE_VISIBILITY);
1705 4594615 : }
1706 :
1707 : /* Record line pointer to be redirected */
1708 : static void
1709 70889 : heap_prune_record_redirect(PruneState *prstate,
1710 : OffsetNumber offnum, OffsetNumber rdoffnum,
1711 : bool was_normal)
1712 : {
1713 : Assert(!prstate->processed[offnum]);
1714 70889 : prstate->processed[offnum] = true;
1715 :
1716 : /*
1717 : * Do not mark the redirect target here. It needs to be counted
1718 : * separately as an unchanged tuple.
1719 : */
1720 :
1721 : Assert(prstate->nredirected < MaxHeapTuplesPerPage);
1722 70889 : prstate->redirected[prstate->nredirected * 2] = offnum;
1723 70889 : prstate->redirected[prstate->nredirected * 2 + 1] = rdoffnum;
1724 :
1725 70889 : prstate->nredirected++;
1726 :
1727 : /*
1728 : * If the root entry had been a normal tuple, we are deleting it, so count
1729 : * it in the result. But changing a redirect (even to DEAD state) doesn't
1730 : * count.
1731 : */
1732 70889 : if (was_normal)
1733 62546 : prstate->ndeleted++;
1734 :
1735 70889 : prstate->hastup = true;
1736 70889 : }
1737 :
1738 : /* Record line pointer to be marked dead */
1739 : static void
1740 1661511 : heap_prune_record_dead(PruneState *prstate, OffsetNumber offnum,
1741 : bool was_normal)
1742 : {
1743 : Assert(!prstate->processed[offnum]);
1744 1661511 : prstate->processed[offnum] = true;
1745 :
1746 : Assert(prstate->ndead < MaxHeapTuplesPerPage);
1747 1661511 : prstate->nowdead[prstate->ndead] = offnum;
1748 1661511 : prstate->ndead++;
1749 :
1750 : /*
1751 : * Deliberately delay unsetting set_all_visible and set_all_frozen until
1752 : * later during pruning. Removable dead tuples shouldn't preclude freezing
1753 : * the page.
1754 : */
1755 :
1756 : /* Record the dead offset for vacuum */
1757 1661511 : prstate->deadoffsets[prstate->lpdead_items++] = offnum;
1758 :
1759 : /*
1760 : * If the root entry had been a normal tuple, we are deleting it, so count
1761 : * it in the result. But changing a redirect (even to DEAD state) doesn't
1762 : * count.
1763 : */
1764 1661511 : if (was_normal)
1765 1649282 : prstate->ndeleted++;
1766 1661511 : }
1767 :
1768 : /*
1769 : * Depending on whether or not the caller set mark_unused_now to true, record that a
1770 : * line pointer should be marked LP_DEAD or LP_UNUSED. There are other cases in
1771 : * which we will mark line pointers LP_UNUSED, but we will not mark line
1772 : * pointers LP_DEAD if mark_unused_now is true.
1773 : */
1774 : static void
1775 1695345 : heap_prune_record_dead_or_unused(PruneState *prstate, OffsetNumber offnum,
1776 : bool was_normal)
1777 : {
1778 : /*
1779 : * If the caller set mark_unused_now to true, we can remove dead tuples
1780 : * during pruning instead of marking their line pointers dead. Set this
1781 : * tuple's line pointer LP_UNUSED. We hint that this option is less
1782 : * likely.
1783 : */
1784 1695345 : if (unlikely(prstate->mark_unused_now))
1785 33834 : heap_prune_record_unused(prstate, offnum, was_normal);
1786 : else
1787 1661511 : heap_prune_record_dead(prstate, offnum, was_normal);
1788 :
1789 : /*
1790 : * It's incorrect for the page to be set all-visible if it contains dead
1791 : * items. Fix that on the heap page and check the VM for corruption as
1792 : * well. Do that here rather than in heap_prune_record_dead() so we also
1793 : * cover tuples that are directly marked LP_UNUSED via mark_unused_now.
1794 : */
1795 1695345 : if (PageIsAllVisible(prstate->page))
1796 0 : heap_page_fix_vm_corruption(prstate, offnum, VM_CORRUPT_LPDEAD);
1797 1695345 : }
1798 :
1799 : /* Record line pointer to be marked unused */
1800 : static void
1801 106484 : heap_prune_record_unused(PruneState *prstate, OffsetNumber offnum, bool was_normal)
1802 : {
1803 : Assert(!prstate->processed[offnum]);
1804 106484 : prstate->processed[offnum] = true;
1805 :
1806 : Assert(prstate->nunused < MaxHeapTuplesPerPage);
1807 106484 : prstate->nowunused[prstate->nunused] = offnum;
1808 106484 : prstate->nunused++;
1809 :
1810 : /*
1811 : * If the root entry had been a normal tuple, we are deleting it, so count
1812 : * it in the result. But changing a redirect (even to DEAD state) doesn't
1813 : * count.
1814 : */
1815 106484 : if (was_normal)
1816 104417 : prstate->ndeleted++;
1817 106484 : }
1818 :
1819 : /*
1820 : * Record an unused line pointer that is left unchanged.
1821 : */
1822 : static void
1823 135740 : heap_prune_record_unchanged_lp_unused(PruneState *prstate, OffsetNumber offnum)
1824 : {
1825 : Assert(!prstate->processed[offnum]);
1826 135740 : prstate->processed[offnum] = true;
1827 135740 : }
1828 :
/*
 * Record line pointer that is left unchanged. We consider freezing it, and
 * update bookkeeping of tuple counts and page visibility.
 *
 * Only called for LP_NORMAL items whose tuple is being kept; DEAD tuples are
 * routed to heap_prune_record_dead()/heap_prune_record_unused() instead (see
 * the default: branch below, which errors out on them).
 */
static void
heap_prune_record_unchanged_lp_normal(PruneState *prstate, OffsetNumber offnum)
{
	HeapTupleHeader htup;
	TransactionId xmin;
	Page		page = prstate->page;

	Assert(!prstate->processed[offnum]);
	prstate->processed[offnum] = true;

	prstate->hastup = true;		/* the page is not empty */

	/*
	 * The criteria for counting a tuple as live in this block need to match
	 * what analyze.c's acquire_sample_rows() does, otherwise VACUUM and
	 * ANALYZE may produce wildly different reltuples values, e.g. when there
	 * are many recently-dead tuples.
	 *
	 * The logic here is a bit simpler than acquire_sample_rows(), as VACUUM
	 * can't run inside a transaction block, which makes some cases impossible
	 * (e.g. in-progress insert from the same transaction).
	 *
	 * HEAPTUPLE_DEAD are handled by the other heap_prune_record_*()
	 * subroutines. They don't count dead items like acquire_sample_rows()
	 * does, because we assume that all dead items will become LP_UNUSED
	 * before VACUUM finishes. This difference is only superficial. VACUUM
	 * effectively agrees with ANALYZE about DEAD items, in the end. VACUUM
	 * won't remember LP_DEAD items, but only because they're not supposed to
	 * be left behind when it is done. (Cases where we bypass index vacuuming
	 * will violate this optimistic assumption, but the overall impact of that
	 * should be negligible.)
	 */
	htup = (HeapTupleHeader) PageGetItem(page, PageGetItemId(page, offnum));

	switch (prstate->htsv[offnum])
	{
		case HEAPTUPLE_LIVE:

			/*
			 * Count it as live. Not only is this natural, but it's also what
			 * acquire_sample_rows() does.
			 */
			prstate->live_tuples++;

			/*
			 * Is the tuple definitely visible to all transactions?
			 *
			 * NB: Like with per-tuple hint bits, we can't set the
			 * PD_ALL_VISIBLE flag if the inserter committed asynchronously.
			 * See SetHintBits for more info. Check that the tuple is hinted
			 * xmin-committed because of that.
			 */
			if (!HeapTupleHeaderXminCommitted(htup))
			{
				/* Not provably all-visible: give up on both VM bits */
				prstate->set_all_visible = false;
				prstate->set_all_frozen = false;
				break;
			}

			/*
			 * The inserter definitely committed. But we don't know if it is
			 * old enough that everyone sees it as committed. Later, after
			 * processing all the tuples on the page, we'll check if there is
			 * any snapshot that still considers the newest xid on the page to
			 * be running. If so, we don't consider the page all-visible.
			 */
			xmin = HeapTupleHeaderGetXmin(htup);

			/* Track newest xmin on page. */
			if (TransactionIdFollows(xmin, prstate->newest_live_xid) &&
				TransactionIdIsNormal(xmin))
				prstate->newest_live_xid = xmin;

			break;

		case HEAPTUPLE_RECENTLY_DEAD:
			prstate->recently_dead_tuples++;
			prstate->set_all_visible = false;
			prstate->set_all_frozen = false;

			/*
			 * This tuple will soon become DEAD.  Update the hint field so
			 * that the page is reconsidered for pruning in future.
			 */
			heap_prune_record_prunable(prstate,
									   HeapTupleHeaderGetUpdateXid(htup),
									   offnum);
			break;

		case HEAPTUPLE_INSERT_IN_PROGRESS:

			/*
			 * We do not count these rows as live, because we expect the
			 * inserting transaction to update the counters at commit, and we
			 * assume that will happen only after we report our results. This
			 * assumption is a bit shaky, but it is what acquire_sample_rows()
			 * does, so be consistent.
			 */
			prstate->set_all_visible = false;
			prstate->set_all_frozen = false;

			/*
			 * Though there is nothing "prunable" on the page, we maintain
			 * pd_prune_xid for inserts so that we have the opportunity to
			 * mark them all-visible during the next round of pruning.
			 */
			heap_prune_record_prunable(prstate,
									   HeapTupleHeaderGetXmin(htup),
									   offnum);
			break;

		case HEAPTUPLE_DELETE_IN_PROGRESS:

			/*
			 * This an expected case during concurrent vacuum. Count such
			 * rows as live.  As above, we assume the deleting transaction
			 * will commit and update the counters after we report.
			 */
			prstate->live_tuples++;
			prstate->set_all_visible = false;
			prstate->set_all_frozen = false;

			/*
			 * This tuple may soon become DEAD.  Update the hint field so that
			 * the page is reconsidered for pruning in future.
			 */
			heap_prune_record_prunable(prstate,
									   HeapTupleHeaderGetUpdateXid(htup),
									   offnum);
			break;

		default:

			/*
			 * DEAD tuples should've been passed to heap_prune_record_dead()
			 * or heap_prune_record_unused() instead.
			 */
			elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result %d",
				 prstate->htsv[offnum]);
			break;
	}

	/* Consider freezing any normal tuples which will not be removed */
	if (prstate->attempt_freeze)
	{
		bool		totally_frozen;

		if ((heap_prepare_freeze_tuple(htup,
									   prstate->cutoffs,
									   &prstate->pagefrz,
									   &prstate->frozen[prstate->nfrozen],
									   &totally_frozen)))
		{
			/* Save prepared freeze plan for later */
			prstate->frozen[prstate->nfrozen++].offset = offnum;
		}

		/*
		 * If any tuple isn't either totally frozen already or eligible to
		 * become totally frozen (according to its freeze plan), then the page
		 * definitely cannot be set all-frozen in the visibility map later on.
		 */
		if (!totally_frozen)
			prstate->set_all_frozen = false;
	}
}
1999 :
2000 :
2001 : /*
2002 : * Record line pointer that was already LP_DEAD and is left unchanged.
2003 : */
2004 : static void
2005 1553608 : heap_prune_record_unchanged_lp_dead(PruneState *prstate, OffsetNumber offnum)
2006 : {
2007 : Assert(!prstate->processed[offnum]);
2008 1553608 : prstate->processed[offnum] = true;
2009 :
2010 : /*
2011 : * Deliberately don't set hastup for LP_DEAD items. We make the soft
2012 : * assumption that any LP_DEAD items encountered here will become
2013 : * LP_UNUSED later on, before count_nondeletable_pages is reached. If we
2014 : * don't make this assumption then rel truncation will only happen every
2015 : * other VACUUM, at most. Besides, VACUUM must treat
2016 : * hastup/nonempty_pages as provisional no matter how LP_DEAD items are
2017 : * handled (handled here, or handled later on).
2018 : *
2019 : * Similarly, don't unset set_all_visible and set_all_frozen until later,
2020 : * at the end of heap_page_prune_and_freeze(). This will allow us to
2021 : * attempt to freeze the page after pruning. As long as we unset it
2022 : * before updating the visibility map, this will be correct.
2023 : */
2024 :
2025 : /* Record the dead offset for vacuum */
2026 1553608 : prstate->deadoffsets[prstate->lpdead_items++] = offnum;
2027 :
2028 : /*
2029 : * It's incorrect for a page to be marked all-visible if it contains dead
2030 : * items.
2031 : */
2032 1553608 : if (PageIsAllVisible(prstate->page))
2033 0 : heap_page_fix_vm_corruption(prstate, offnum, VM_CORRUPT_LPDEAD);
2034 1553608 : }
2035 :
2036 : /*
2037 : * Record LP_REDIRECT that is left unchanged.
2038 : */
2039 : static void
2040 177457 : heap_prune_record_unchanged_lp_redirect(PruneState *prstate, OffsetNumber offnum)
2041 : {
2042 : /*
2043 : * A redirect line pointer doesn't count as a live tuple.
2044 : *
2045 : * If we leave a redirect line pointer in place, there will be another
2046 : * tuple on the page that it points to. We will do the bookkeeping for
2047 : * that separately. So we have nothing to do here, except remember that
2048 : * we processed this item.
2049 : */
2050 : Assert(!prstate->processed[offnum]);
2051 177457 : prstate->processed[offnum] = true;
2052 177457 : }
2053 :
/*
 * Perform the actual page changes needed by heap_page_prune_and_freeze().
 *
 * The changes are applied in three passes over the caller-provided arrays:
 * first all redirects, then all LP_DEAD items, then all LP_UNUSED items;
 * afterwards the page is defragmented (unless 'lp_truncate_only').
 *
 * If 'lp_truncate_only' is set, we are merely marking LP_DEAD line pointers
 * as unused, not redirecting or removing anything else. The
 * PageRepairFragmentation() call is skipped in that case.
 *
 * If 'lp_truncate_only' is not set, the caller must hold a cleanup lock on
 * the buffer. If it is set, an ordinary exclusive lock suffices.
 */
void
heap_page_prune_execute(Buffer buffer, bool lp_truncate_only,
						OffsetNumber *redirected, int nredirected,
						OffsetNumber *nowdead, int ndead,
						OffsetNumber *nowunused, int nunused)
{
	Page		page = BufferGetPage(buffer);
	OffsetNumber *offnum;
	HeapTupleHeader htup PG_USED_FOR_ASSERTS_ONLY;

	/* Shouldn't be called unless there's something to do */
	Assert(nredirected > 0 || ndead > 0 || nunused > 0);

	/* If 'lp_truncate_only', we can only remove already-dead line pointers */
	Assert(!lp_truncate_only || (nredirected == 0 && ndead == 0));

	/* Update all redirected line pointers */
	offnum = redirected;
	for (int i = 0; i < nredirected; i++)
	{
		/* 'redirected' holds (from, to) offset pairs, hence two reads */
		OffsetNumber fromoff = *offnum++;
		OffsetNumber tooff = *offnum++;
		ItemId		fromlp = PageGetItemId(page, fromoff);
		ItemId		tolp PG_USED_FOR_ASSERTS_ONLY;

#ifdef USE_ASSERT_CHECKING

		/*
		 * Any existing item that we set as an LP_REDIRECT (any 'from' item)
		 * must be the first item from a HOT chain.  If the item has tuple
		 * storage then it can't be a heap-only tuple.  Otherwise we are just
		 * maintaining an existing LP_REDIRECT from an existing HOT chain that
		 * has been pruned at least once before now.
		 */
		if (!ItemIdIsRedirected(fromlp))
		{
			Assert(ItemIdHasStorage(fromlp) && ItemIdIsNormal(fromlp));

			htup = (HeapTupleHeader) PageGetItem(page, fromlp);
			Assert(!HeapTupleHeaderIsHeapOnly(htup));
		}
		else
		{
			/* We shouldn't need to redundantly set the redirect */
			Assert(ItemIdGetRedirect(fromlp) != tooff);
		}

		/*
		 * The item that we're about to set as an LP_REDIRECT (the 'from'
		 * item) will point to an existing item (the 'to' item) that is
		 * already a heap-only tuple.  There can be at most one LP_REDIRECT
		 * item per HOT chain.
		 *
		 * We need to keep around an LP_REDIRECT item (after original
		 * non-heap-only root tuple gets pruned away) so that it's always
		 * possible for VACUUM to easily figure out what TID to delete from
		 * indexes when an entire HOT chain becomes dead.  A heap-only tuple
		 * can never become LP_DEAD; an LP_REDIRECT item or a regular heap
		 * tuple can.
		 *
		 * This check may miss problems, e.g. the target of a redirect could
		 * be marked as unused subsequently. The page_verify_redirects() check
		 * below will catch such problems.
		 */
		tolp = PageGetItemId(page, tooff);
		Assert(ItemIdHasStorage(tolp) && ItemIdIsNormal(tolp));
		htup = (HeapTupleHeader) PageGetItem(page, tolp);
		Assert(HeapTupleHeaderIsHeapOnly(htup));
#endif

		ItemIdSetRedirect(fromlp, tooff);
	}

	/* Update all now-dead line pointers */
	offnum = nowdead;
	for (int i = 0; i < ndead; i++)
	{
		OffsetNumber off = *offnum++;
		ItemId		lp = PageGetItemId(page, off);

#ifdef USE_ASSERT_CHECKING

		/*
		 * An LP_DEAD line pointer must be left behind when the original item
		 * (which is dead to everybody) could still be referenced by a TID in
		 * an index.  This should never be necessary with any individual
		 * heap-only tuple item, though. (It's not clear how much of a problem
		 * that would be, but there is no reason to allow it.)
		 */
		if (ItemIdHasStorage(lp))
		{
			Assert(ItemIdIsNormal(lp));
			htup = (HeapTupleHeader) PageGetItem(page, lp);
			Assert(!HeapTupleHeaderIsHeapOnly(htup));
		}
		else
		{
			/* Whole HOT chain becomes dead */
			Assert(ItemIdIsRedirected(lp));
		}
#endif

		ItemIdSetDead(lp);
	}

	/* Update all now-unused line pointers */
	offnum = nowunused;
	for (int i = 0; i < nunused; i++)
	{
		OffsetNumber off = *offnum++;
		ItemId		lp = PageGetItemId(page, off);

#ifdef USE_ASSERT_CHECKING

		if (lp_truncate_only)
		{
			/* Setting LP_DEAD to LP_UNUSED in vacuum's second pass */
			Assert(ItemIdIsDead(lp) && !ItemIdHasStorage(lp));
		}
		else
		{
			/*
			 * When heap_page_prune_and_freeze() was called, mark_unused_now
			 * may have been passed as true, which allows would-be LP_DEAD
			 * items to be made LP_UNUSED instead.  This is only possible if
			 * the relation has no indexes.  If there are any dead items, then
			 * mark_unused_now was not true and every item being marked
			 * LP_UNUSED must refer to a heap-only tuple.
			 */
			if (ndead > 0)
			{
				Assert(ItemIdHasStorage(lp) && ItemIdIsNormal(lp));
				htup = (HeapTupleHeader) PageGetItem(page, lp);
				Assert(HeapTupleHeaderIsHeapOnly(htup));
			}
			else
				Assert(ItemIdIsUsed(lp));
		}

#endif

		ItemIdSetUnused(lp);
	}

	if (lp_truncate_only)
		PageTruncateLinePointerArray(page);
	else
	{
		/*
		 * Finally, repair any fragmentation, and update the page's hint bit
		 * about whether it has free pointers.
		 */
		PageRepairFragmentation(page);

		/*
		 * Now that the page has been modified, assert that redirect items
		 * still point to valid targets.
		 */
		page_verify_redirects(page);
	}
}
2225 :
2226 :
2227 : /*
2228 : * If built with assertions, verify that all LP_REDIRECT items point to a
2229 : * valid item.
2230 : *
2231 : * One way that bugs related to HOT pruning show is redirect items pointing to
2232 : * removed tuples. It's not trivial to reliably check that marking an item
2233 : * unused will not orphan a redirect item during heap_prune_chain() /
2234 : * heap_page_prune_execute(), so we additionally check the whole page after
2235 : * pruning. Without this check such bugs would typically only cause asserts
2236 : * later, potentially well after the corruption has been introduced.
2237 : *
2238 : * Also check comments in heap_page_prune_execute()'s redirection loop.
2239 : */
2240 : static void
2241 68076 : page_verify_redirects(Page page)
2242 : {
2243 : #ifdef USE_ASSERT_CHECKING
2244 : OffsetNumber offnum;
2245 : OffsetNumber maxoff;
2246 :
2247 : maxoff = PageGetMaxOffsetNumber(page);
2248 : for (offnum = FirstOffsetNumber;
2249 : offnum <= maxoff;
2250 : offnum = OffsetNumberNext(offnum))
2251 : {
2252 : ItemId itemid = PageGetItemId(page, offnum);
2253 : OffsetNumber targoff;
2254 : ItemId targitem;
2255 : HeapTupleHeader htup;
2256 :
2257 : if (!ItemIdIsRedirected(itemid))
2258 : continue;
2259 :
2260 : targoff = ItemIdGetRedirect(itemid);
2261 : targitem = PageGetItemId(page, targoff);
2262 :
2263 : Assert(ItemIdIsUsed(targitem));
2264 : Assert(ItemIdIsNormal(targitem));
2265 : Assert(ItemIdHasStorage(targitem));
2266 : htup = (HeapTupleHeader) PageGetItem(page, targitem);
2267 : Assert(HeapTupleHeaderIsHeapOnly(htup));
2268 : }
2269 : #endif
2270 68076 : }
2271 :
2272 :
2273 : /*
2274 : * For all items in this page, find their respective root line pointers.
2275 : * If item k is part of a HOT-chain with root at item j, then we set
2276 : * root_offsets[k - 1] = j.
2277 : *
2278 : * The passed-in root_offsets array must have MaxHeapTuplesPerPage entries.
2279 : * Unused entries are filled with InvalidOffsetNumber (zero).
2280 : *
2281 : * The function must be called with at least share lock on the buffer, to
2282 : * prevent concurrent prune operations.
2283 : *
2284 : * Note: The information collected here is valid only as long as the caller
2285 : * holds a pin on the buffer. Once pin is released, a tuple might be pruned
2286 : * and reused by a completely unrelated tuple.
2287 : */
2288 : void
2289 134232 : heap_get_root_tuples(Page page, OffsetNumber *root_offsets)
2290 : {
2291 : OffsetNumber offnum,
2292 : maxoff;
2293 :
2294 134232 : MemSet(root_offsets, InvalidOffsetNumber,
2295 : MaxHeapTuplesPerPage * sizeof(OffsetNumber));
2296 :
2297 134232 : maxoff = PageGetMaxOffsetNumber(page);
2298 10806332 : for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum))
2299 : {
2300 10672100 : ItemId lp = PageGetItemId(page, offnum);
2301 : HeapTupleHeader htup;
2302 : OffsetNumber nextoffnum;
2303 : TransactionId priorXmax;
2304 :
2305 : /* skip unused and dead items */
2306 10672100 : if (!ItemIdIsUsed(lp) || ItemIdIsDead(lp))
2307 9258 : continue;
2308 :
2309 10662842 : if (ItemIdIsNormal(lp))
2310 : {
2311 10659026 : htup = (HeapTupleHeader) PageGetItem(page, lp);
2312 :
2313 : /*
2314 : * Check if this tuple is part of a HOT-chain rooted at some other
2315 : * tuple. If so, skip it for now; we'll process it when we find
2316 : * its root.
2317 : */
2318 10659026 : if (HeapTupleHeaderIsHeapOnly(htup))
2319 4156 : continue;
2320 :
2321 : /*
2322 : * This is either a plain tuple or the root of a HOT-chain.
2323 : * Remember it in the mapping.
2324 : */
2325 10654870 : root_offsets[offnum - 1] = offnum;
2326 :
2327 : /* If it's not the start of a HOT-chain, we're done with it */
2328 10654870 : if (!HeapTupleHeaderIsHotUpdated(htup))
2329 10654606 : continue;
2330 :
2331 : /* Set up to scan the HOT-chain */
2332 264 : nextoffnum = ItemPointerGetOffsetNumber(&htup->t_ctid);
2333 264 : priorXmax = HeapTupleHeaderGetUpdateXid(htup);
2334 : }
2335 : else
2336 : {
2337 : /* Must be a redirect item. We do not set its root_offsets entry */
2338 : Assert(ItemIdIsRedirected(lp));
2339 : /* Set up to scan the HOT-chain */
2340 3816 : nextoffnum = ItemIdGetRedirect(lp);
2341 3816 : priorXmax = InvalidTransactionId;
2342 : }
2343 :
2344 : /*
2345 : * Now follow the HOT-chain and collect other tuples in the chain.
2346 : *
2347 : * Note: Even though this is a nested loop, the complexity of the
2348 : * function is O(N) because a tuple in the page should be visited not
2349 : * more than twice, once in the outer loop and once in HOT-chain
2350 : * chases.
2351 : */
2352 : for (;;)
2353 : {
2354 : /* Sanity check (pure paranoia) */
2355 4152 : if (offnum < FirstOffsetNumber)
2356 0 : break;
2357 :
2358 : /*
2359 : * An offset past the end of page's line pointer array is possible
2360 : * when the array was truncated
2361 : */
2362 4152 : if (offnum > maxoff)
2363 0 : break;
2364 :
2365 4152 : lp = PageGetItemId(page, nextoffnum);
2366 :
2367 : /* Check for broken chains */
2368 4152 : if (!ItemIdIsNormal(lp))
2369 0 : break;
2370 :
2371 4152 : htup = (HeapTupleHeader) PageGetItem(page, lp);
2372 :
2373 4488 : if (TransactionIdIsValid(priorXmax) &&
2374 336 : !TransactionIdEquals(priorXmax, HeapTupleHeaderGetXmin(htup)))
2375 0 : break;
2376 :
2377 : /* Remember the root line pointer for this item */
2378 4152 : root_offsets[nextoffnum - 1] = offnum;
2379 :
2380 : /* Advance to next chain member, if any */
2381 4152 : if (!HeapTupleHeaderIsHotUpdated(htup))
2382 4080 : break;
2383 :
2384 : /* HOT implies it can't have moved to different partition */
2385 : Assert(!HeapTupleHeaderIndicatesMovedPartitions(htup));
2386 :
2387 72 : nextoffnum = ItemPointerGetOffsetNumber(&htup->t_ctid);
2388 72 : priorXmax = HeapTupleHeaderGetUpdateXid(htup);
2389 : }
2390 : }
2391 134232 : }
2392 :
2393 :
2394 : /*
2395 : * Compare fields that describe actions required to freeze tuple with caller's
2396 : * open plan. If everything matches then the frz tuple plan is equivalent to
2397 : * caller's plan.
2398 : */
2399 : static inline bool
2400 1190337 : heap_log_freeze_eq(xlhp_freeze_plan *plan, HeapTupleFreeze *frz)
2401 : {
2402 1190337 : if (plan->xmax == frz->xmax &&
2403 1189042 : plan->t_infomask2 == frz->t_infomask2 &&
2404 1188287 : plan->t_infomask == frz->t_infomask &&
2405 1185287 : plan->frzflags == frz->frzflags)
2406 1185287 : return true;
2407 :
2408 : /* Caller must call heap_log_freeze_new_plan again for frz */
2409 5050 : return false;
2410 : }
2411 :
2412 : /*
2413 : * Comparator used to deduplicate the freeze plans used in WAL records.
2414 : */
2415 : static int
2416 1604477 : heap_log_freeze_cmp(const void *arg1, const void *arg2)
2417 : {
2418 1604477 : const HeapTupleFreeze *frz1 = arg1;
2419 1604477 : const HeapTupleFreeze *frz2 = arg2;
2420 :
2421 1604477 : if (frz1->xmax < frz2->xmax)
2422 13169 : return -1;
2423 1591308 : else if (frz1->xmax > frz2->xmax)
2424 14212 : return 1;
2425 :
2426 1577096 : if (frz1->t_infomask2 < frz2->t_infomask2)
2427 5159 : return -1;
2428 1571937 : else if (frz1->t_infomask2 > frz2->t_infomask2)
2429 4584 : return 1;
2430 :
2431 1567353 : if (frz1->t_infomask < frz2->t_infomask)
2432 13456 : return -1;
2433 1553897 : else if (frz1->t_infomask > frz2->t_infomask)
2434 17118 : return 1;
2435 :
2436 1536779 : if (frz1->frzflags < frz2->frzflags)
2437 0 : return -1;
2438 1536779 : else if (frz1->frzflags > frz2->frzflags)
2439 0 : return 1;
2440 :
2441 : /*
2442 : * heap_log_freeze_eq would consider these tuple-wise plans to be equal.
2443 : * (So the tuples will share a single canonical freeze plan.)
2444 : *
2445 : * We tiebreak on page offset number to keep each freeze plan's page
2446 : * offset number array individually sorted. (Unnecessary, but be tidy.)
2447 : */
2448 1536779 : if (frz1->offset < frz2->offset)
2449 1319958 : return -1;
2450 216821 : else if (frz1->offset > frz2->offset)
2451 216821 : return 1;
2452 :
2453 : Assert(false);
2454 0 : return 0;
2455 : }
2456 :
2457 : /*
2458 : * Start new plan initialized using tuple-level actions. At least one tuple
2459 : * will have steps required to freeze described by caller's plan during REDO.
2460 : */
2461 : static inline void
2462 30402 : heap_log_freeze_new_plan(xlhp_freeze_plan *plan, HeapTupleFreeze *frz)
2463 : {
2464 30402 : plan->xmax = frz->xmax;
2465 30402 : plan->t_infomask2 = frz->t_infomask2;
2466 30402 : plan->t_infomask = frz->t_infomask;
2467 30402 : plan->frzflags = frz->frzflags;
2468 30402 : plan->ntuples = 1; /* for now */
2469 30402 : }
2470 :
2471 : /*
2472 : * Deduplicate tuple-based freeze plans so that each distinct set of
2473 : * processing steps is only stored once in the WAL record.
2474 : * Called during original execution of freezing (for logged relations).
2475 : *
2476 : * Return value is number of plans set in *plans_out for caller. Also writes
2477 : * an array of offset numbers into *offsets_out output argument for caller
2478 : * (actually there is one array per freeze plan, but that's not of immediate
2479 : * concern to our caller).
2480 : */
2481 : static int
2482 25352 : heap_log_freeze_plan(HeapTupleFreeze *tuples, int ntuples,
2483 : xlhp_freeze_plan *plans_out,
2484 : OffsetNumber *offsets_out)
2485 : {
2486 25352 : int nplans = 0;
2487 :
2488 : /* Sort tuple-based freeze plans in the order required to deduplicate */
2489 25352 : qsort(tuples, ntuples, sizeof(HeapTupleFreeze), heap_log_freeze_cmp);
2490 :
2491 1241041 : for (int i = 0; i < ntuples; i++)
2492 : {
2493 1215689 : HeapTupleFreeze *frz = tuples + i;
2494 :
2495 1215689 : if (i == 0)
2496 : {
2497 : /* New canonical freeze plan starting with first tup */
2498 25352 : heap_log_freeze_new_plan(plans_out, frz);
2499 25352 : nplans++;
2500 : }
2501 1190337 : else if (heap_log_freeze_eq(plans_out, frz))
2502 : {
2503 : /* tup matches open canonical plan -- include tup in it */
2504 : Assert(offsets_out[i - 1] < frz->offset);
2505 1185287 : plans_out->ntuples++;
2506 : }
2507 : else
2508 : {
2509 : /* Tup doesn't match current plan -- done with it now */
2510 5050 : plans_out++;
2511 :
2512 : /* New canonical freeze plan starting with this tup */
2513 5050 : heap_log_freeze_new_plan(plans_out, frz);
2514 5050 : nplans++;
2515 : }
2516 :
2517 : /*
2518 : * Save page offset number in dedicated buffer in passing.
2519 : *
2520 : * REDO routine relies on the record's offset numbers array grouping
2521 : * offset numbers by freeze plan. The sort order within each grouping
2522 : * is ascending offset number order, just to keep things tidy.
2523 : */
2524 1215689 : offsets_out[i] = frz->offset;
2525 : }
2526 :
2527 : Assert(nplans > 0 && nplans <= ntuples);
2528 :
2529 25352 : return nplans;
2530 : }
2531 :
2532 : /*
2533 : * Write an XLOG_HEAP2_PRUNE* WAL record
2534 : *
2535 : * This is used for several different page maintenance operations:
2536 : *
2537 : * - Page pruning, in VACUUM's 1st pass or on access: Some items are
2538 : * redirected, some marked dead, and some removed altogether.
2539 : *
2540 : * - Freezing: Items are marked as 'frozen'.
2541 : *
2542 : * - Vacuum, 2nd pass: Items that are already LP_DEAD are marked as unused.
2543 : *
2544 : * They have enough commonalities that we use a single WAL record for them
2545 : * all.
2546 : *
2547 : * If replaying the record requires a cleanup lock, pass cleanup_lock = true.
2548 : * Replaying 'redirected' or 'dead' items always requires a cleanup lock, but
2549 : * replaying 'unused' items depends on whether they were all previously marked
2550 : * as dead.
2551 : *
2552 : * If the VM is being updated, vmflags will contain the bits to set. In this
2553 : * case, vmbuffer should already have been updated and marked dirty and should
2554 : * still be pinned and locked.
2555 : *
2556 : * Note: This function scribbles on the 'frozen' array.
2557 : *
2558 : * Note: This is called in a critical section, so careful what you do here.
2559 : */
void
log_heap_prune_and_freeze(Relation relation, Buffer buffer,
						  Buffer vmbuffer, uint8 vmflags,
						  TransactionId conflict_xid,
						  bool cleanup_lock,
						  PruneReason reason,
						  HeapTupleFreeze *frozen, int nfrozen,
						  OffsetNumber *redirected, int nredirected,
						  OffsetNumber *dead, int ndead,
						  OffsetNumber *unused, int nunused)
{
	xl_heap_prune xlrec;
	XLogRecPtr	recptr;
	uint8		info;
	uint8		regbuf_flags_heap;

	Page		heap_page = BufferGetPage(buffer);

	/* The following local variables hold data registered in the WAL record: */
	xlhp_freeze_plan plans[MaxHeapTuplesPerPage];
	xlhp_freeze_plans freeze_plans;
	xlhp_prune_items redirect_items;
	xlhp_prune_items dead_items;
	xlhp_prune_items unused_items;
	OffsetNumber frz_offsets[MaxHeapTuplesPerPage];
	bool		do_prune = nredirected > 0 || ndead > 0 || nunused > 0;
	bool		do_set_vm = vmflags & VISIBILITYMAP_VALID_BITS;
	bool		heap_fpi_allowed = true;

	/* vmflags must contain only valid VM bits */
	Assert((vmflags & VISIBILITYMAP_VALID_BITS) == vmflags);

	xlrec.flags = 0;
	regbuf_flags_heap = REGBUF_STANDARD;

	/*
	 * We can avoid an FPI of the heap page if the only modification we are
	 * making to it is to set PD_ALL_VISIBLE and checksums/wal_log_hints are
	 * disabled.
	 *
	 * However, if the page has never been WAL-logged (LSN is invalid), we
	 * must force an FPI regardless.  This can happen when another backend
	 * extends the heap, initializes the page, and then fails before WAL-
	 * logging it.  Since heap extension is not WAL-logged, recovery might try
	 * to replay our record and find that the page isn't initialized, which
	 * would cause a PANIC.
	 */
	if (!XLogRecPtrIsValid(PageGetLSN(heap_page)))
		regbuf_flags_heap |= REGBUF_FORCE_IMAGE;
	else if (!do_prune && nfrozen == 0 && (!do_set_vm || !XLogHintBitIsNeeded()))
	{
		/* Only PD_ALL_VISIBLE changes, and hint bits need not be logged */
		regbuf_flags_heap |= REGBUF_NO_IMAGE;
		heap_fpi_allowed = false;
	}

	/*
	 * Prepare data for the buffer.  The arrays are not actually in the
	 * buffer, but we pretend that they are.  When XLogInsert stores a full
	 * page image, the arrays can be omitted.
	 */
	XLogBeginInsert();
	XLogRegisterBuffer(0, buffer, regbuf_flags_heap);

	/* Heap page is block 0 of the record; the VM page, if any, is block 1 */
	if (do_set_vm)
		XLogRegisterBuffer(1, vmbuffer, 0);

	if (nfrozen > 0)
	{
		int			nplans;

		xlrec.flags |= XLHP_HAS_FREEZE_PLANS;

		/*
		 * Prepare deduplicated representation for use in the WAL record. This
		 * destructively sorts frozen tuples array in-place.
		 */
		nplans = heap_log_freeze_plan(frozen, nfrozen, plans, frz_offsets);

		freeze_plans.nplans = nplans;
		XLogRegisterBufData(0, &freeze_plans,
							offsetof(xlhp_freeze_plans, plans));
		XLogRegisterBufData(0, plans,
							sizeof(xlhp_freeze_plan) * nplans);
	}
	if (nredirected > 0)
	{
		xlrec.flags |= XLHP_HAS_REDIRECTIONS;

		redirect_items.ntargets = nredirected;
		XLogRegisterBufData(0, &redirect_items,
							offsetof(xlhp_prune_items, data));
		/* each redirection is a (from, to) pair of offset numbers */
		XLogRegisterBufData(0, redirected,
							sizeof(OffsetNumber[2]) * nredirected);
	}
	if (ndead > 0)
	{
		xlrec.flags |= XLHP_HAS_DEAD_ITEMS;

		dead_items.ntargets = ndead;
		XLogRegisterBufData(0, &dead_items,
							offsetof(xlhp_prune_items, data));
		XLogRegisterBufData(0, dead,
							sizeof(OffsetNumber) * ndead);
	}
	if (nunused > 0)
	{
		xlrec.flags |= XLHP_HAS_NOW_UNUSED_ITEMS;

		unused_items.ntargets = nunused;
		XLogRegisterBufData(0, &unused_items,
							offsetof(xlhp_prune_items, data));
		XLogRegisterBufData(0, unused,
							sizeof(OffsetNumber) * nunused);
	}

	/*
	 * The per-plan offset number arrays are registered last; REDO expects
	 * them to follow the other sub-records in this order.
	 */
	if (nfrozen > 0)
		XLogRegisterBufData(0, frz_offsets,
							sizeof(OffsetNumber) * nfrozen);

	/*
	 * Prepare the main xl_heap_prune record.  We already set the XLHP_HAS_*
	 * flag above.
	 */
	if (vmflags & VISIBILITYMAP_ALL_VISIBLE)
	{
		xlrec.flags |= XLHP_VM_ALL_VISIBLE;
		if (vmflags & VISIBILITYMAP_ALL_FROZEN)
			xlrec.flags |= XLHP_VM_ALL_FROZEN;
	}
	if (RelationIsAccessibleInLogicalDecoding(relation))
		xlrec.flags |= XLHP_IS_CATALOG_REL;
	if (TransactionIdIsValid(conflict_xid))
		xlrec.flags |= XLHP_HAS_CONFLICT_HORIZON;
	if (cleanup_lock)
		xlrec.flags |= XLHP_CLEANUP_LOCK;
	else
	{
		Assert(nredirected == 0 && ndead == 0);
		/* also, any items in 'unused' must've been LP_DEAD previously */
	}
	XLogRegisterData(&xlrec, SizeOfHeapPrune);
	/* conflict horizon XID, when present, immediately follows xlrec */
	if (TransactionIdIsValid(conflict_xid))
		XLogRegisterData(&conflict_xid, sizeof(TransactionId));

	/* Map the prune reason onto the corresponding WAL record type */
	switch (reason)
	{
		case PRUNE_ON_ACCESS:
			info = XLOG_HEAP2_PRUNE_ON_ACCESS;
			break;
		case PRUNE_VACUUM_SCAN:
			info = XLOG_HEAP2_PRUNE_VACUUM_SCAN;
			break;
		case PRUNE_VACUUM_CLEANUP:
			info = XLOG_HEAP2_PRUNE_VACUUM_CLEANUP;
			break;
		default:
			elog(ERROR, "unrecognized prune reason: %d", (int) reason);
			break;
	}
	recptr = XLogInsert(RM_HEAP2_ID, info);

	/* Stamp the VM page with the record's LSN, per caller's contract */
	if (do_set_vm)
	{
		Assert(BufferIsDirty(vmbuffer));
		PageSetLSN(BufferGetPage(vmbuffer), recptr);
	}

	/*
	 * If we explicitly skip an FPI, we must not stamp the heap page with this
	 * record's LSN.  Recovery skips records <= the stamped LSN, so this could
	 * lead to skipping an earlier FPI needed to repair a torn page.
	 */
	if (heap_fpi_allowed)
	{
		Assert(BufferIsDirty(buffer));
		PageSetLSN(heap_page, recptr);
	}
}
|