Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * vacuumlazy.c
4 : * Concurrent ("lazy") vacuuming.
5 : *
6 : * The major space usage for vacuuming is storage for the dead tuple IDs that
7 : * are to be removed from indexes. We want to ensure we can vacuum even the
8 : * very largest relations with finite memory space usage. To do that, we set
9 : * upper bounds on the memory that can be used for keeping track of dead TIDs
10 : * at once.
11 : *
12 : * We are willing to use at most maintenance_work_mem (or perhaps
13 : * autovacuum_work_mem) memory space to keep track of dead TIDs. If the
14 : * TID store is full, we must call lazy_vacuum to vacuum indexes (and to vacuum
15 : * the pages that we've pruned). This frees up the memory space dedicated to
16 : * storing dead TIDs.
17 : *
18 : * In practice VACUUM will often complete its initial pass over the target
19 : * heap relation without ever running out of space to store TIDs. This means
20 : * that there only needs to be one call to lazy_vacuum, after the initial pass
21 : * completes.
22 : *
23 : * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
24 : * Portions Copyright (c) 1994, Regents of the University of California
25 : *
26 : *
27 : * IDENTIFICATION
28 : * src/backend/access/heap/vacuumlazy.c
29 : *
30 : *-------------------------------------------------------------------------
31 : */
32 : #include "postgres.h"
33 :
34 : #include <math.h>
35 :
36 : #include "access/genam.h"
37 : #include "access/heapam.h"
38 : #include "access/htup_details.h"
39 : #include "access/multixact.h"
40 : #include "access/tidstore.h"
41 : #include "access/transam.h"
42 : #include "access/visibilitymap.h"
43 : #include "access/xloginsert.h"
44 : #include "catalog/storage.h"
45 : #include "commands/dbcommands.h"
46 : #include "commands/progress.h"
47 : #include "commands/vacuum.h"
48 : #include "common/int.h"
49 : #include "executor/instrument.h"
50 : #include "miscadmin.h"
51 : #include "pgstat.h"
52 : #include "portability/instr_time.h"
53 : #include "postmaster/autovacuum.h"
54 : #include "storage/bufmgr.h"
55 : #include "storage/freespace.h"
56 : #include "storage/lmgr.h"
57 : #include "utils/lsyscache.h"
58 : #include "utils/pg_rusage.h"
59 : #include "utils/timestamp.h"
60 :
61 :
62 : /*
63 : * Space/time tradeoff parameters: do these need to be user-tunable?
64 : *
65 : * To consider truncating the relation, we want there to be at least
66 : * REL_TRUNCATE_MINIMUM or (relsize / REL_TRUNCATE_FRACTION) (whichever
67 : * is less) potentially-freeable pages.
68 : */
69 : #define REL_TRUNCATE_MINIMUM 1000
70 : #define REL_TRUNCATE_FRACTION 16
71 :
72 : /*
73 : * Timing parameters for truncate locking heuristics.
74 : *
75 : * These were not exposed as user tunable GUC values because it didn't seem
76 : * that the potential for improvement was great enough to merit the cost of
77 : * supporting them.
78 : */
79 : #define VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL 20 /* ms */
80 : #define VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL 50 /* ms */
81 : #define VACUUM_TRUNCATE_LOCK_TIMEOUT 5000 /* ms */
82 :
83 : /*
84 : * Threshold that controls whether we bypass index vacuuming and heap
85 : * vacuuming as an optimization
86 : */
87 : #define BYPASS_THRESHOLD_PAGES 0.02 /* i.e. 2% of rel_pages */
88 :
89 : /*
90 : * Perform a failsafe check each time we scan another 4GB of pages.
91 : * (Note that this is deliberately kept to a power-of-two, usually 2^19.)
92 : */
93 : #define FAILSAFE_EVERY_PAGES \
94 : ((BlockNumber) (((uint64) 4 * 1024 * 1024 * 1024) / BLCKSZ))
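/*
 * Editor's note (illustrative, not part of the original source): with the
 * default BLCKSZ of 8192, FAILSAFE_EVERY_PAGES works out to
 * (4 * 1024 * 1024 * 1024) / 8192 = 524288 = 2^19 blocks, i.e. the failsafe
 * is rechecked after roughly every 4GB of scanned heap.
 */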
95 :
96 : /*
97 : * When a table has no indexes, vacuum the FSM after every 8GB, approximately
98 : * (it won't be exact because we only vacuum FSM after processing a heap page
99 : * that has some removable tuples). When there are indexes, this is ignored,
100 : * and we vacuum FSM after each index/heap cleaning pass.
101 : */
102 : #define VACUUM_FSM_EVERY_PAGES \
103 : ((BlockNumber) (((uint64) 8 * 1024 * 1024 * 1024) / BLCKSZ))
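/*
 * Editor's note (illustrative, not part of the original source): by the same
 * arithmetic, with BLCKSZ = 8192 this is (8 * 1024 * 1024 * 1024) / 8192 =
 * 1048576 = 2^20 blocks, so an index-less table has its FSM vacuumed about
 * once per 8GB of heap processed.
 */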
104 :
105 : /*
106 : * Before we consider skipping a page that's marked as clean in
107 : * the visibility map, we must've seen at least this many clean pages.
108 : */
109 : #define SKIP_PAGES_THRESHOLD ((BlockNumber) 32)
110 :
111 : /*
112 : * Size of the prefetch window for lazy vacuum backwards truncation scan.
113 : * Needs to be a power of 2.
114 : */
115 : #define PREFETCH_SIZE ((BlockNumber) 32)
116 :
117 : /*
118 : * Macro to check if we are in a parallel vacuum. If true, we are in
119 : * parallel mode and the DSM segment is initialized.
120 : */
121 : #define ParallelVacuumIsActive(vacrel) ((vacrel)->pvs != NULL)
122 :
123 : /* Phases of vacuum during which we report error context. */
124 : typedef enum
125 : {
126 : VACUUM_ERRCB_PHASE_UNKNOWN,
127 : VACUUM_ERRCB_PHASE_SCAN_HEAP,
128 : VACUUM_ERRCB_PHASE_VACUUM_INDEX,
129 : VACUUM_ERRCB_PHASE_VACUUM_HEAP,
130 : VACUUM_ERRCB_PHASE_INDEX_CLEANUP,
131 : VACUUM_ERRCB_PHASE_TRUNCATE,
132 : } VacErrPhase;
133 :
134 : typedef struct LVRelState
135 : {
136 : /* Target heap relation and its indexes */
137 : Relation rel;
138 : Relation *indrels;
139 : int nindexes;
140 :
141 : /* Buffer access strategy and parallel vacuum state */
142 : BufferAccessStrategy bstrategy;
143 : ParallelVacuumState *pvs;
144 :
145 : /* Aggressive VACUUM? (must set relfrozenxid >= FreezeLimit) */
146 : bool aggressive;
147 : /* Use visibility map to skip? (disabled by DISABLE_PAGE_SKIPPING) */
148 : bool skipwithvm;
149 : /* Consider index vacuuming bypass optimization? */
150 : bool consider_bypass_optimization;
151 :
152 : /* Doing index vacuuming, index cleanup, rel truncation? */
153 : bool do_index_vacuuming;
154 : bool do_index_cleanup;
155 : bool do_rel_truncate;
156 :
157 : /* VACUUM operation's cutoffs for freezing and pruning */
158 : struct VacuumCutoffs cutoffs;
159 : GlobalVisState *vistest;
160 : /* Tracks oldest extant XID/MXID for setting relfrozenxid/relminmxid */
161 : TransactionId NewRelfrozenXid;
162 : MultiXactId NewRelminMxid;
163 : bool skippedallvis;
164 :
165 : /* Error reporting state */
166 : char *dbname;
167 : char *relnamespace;
168 : char *relname;
169 : char *indname; /* Current index name */
170 : BlockNumber blkno; /* used only for heap operations */
171 : OffsetNumber offnum; /* used only for heap operations */
172 : VacErrPhase phase;
173 : bool verbose; /* VACUUM VERBOSE? */
174 :
175 : /*
176 : * dead_items stores TIDs whose index tuples are deleted by index
177 : * vacuuming. Each TID points to an LP_DEAD line pointer from a heap page
178 : * that has been processed by lazy_scan_prune. Also needed by
179 : * lazy_vacuum_heap_rel, which marks the same LP_DEAD line pointers as
180 : * LP_UNUSED during the second heap pass.
181 : *
182 : * Both dead_items and dead_items_info are allocated in shared memory in
183 : * parallel vacuum cases.
184 : */
185 : TidStore *dead_items; /* TIDs whose index tuples we'll delete */
186 : VacDeadItemsInfo *dead_items_info;
187 :
188 : BlockNumber rel_pages; /* total number of pages */
189 : BlockNumber scanned_pages; /* # pages examined (not skipped via VM) */
190 : BlockNumber removed_pages; /* # pages removed by relation truncation */
191 : BlockNumber frozen_pages; /* # pages with newly frozen tuples */
192 : BlockNumber lpdead_item_pages; /* # pages with LP_DEAD items */
193 : BlockNumber missed_dead_pages; /* # pages with missed dead tuples */
194 : BlockNumber nonempty_pages; /* actually, last nonempty page + 1 */
195 :
196 : /* Statistics output by us, for table */
197 : double new_rel_tuples; /* new estimated total # of tuples */
198 : double new_live_tuples; /* new estimated total # of live tuples */
199 : /* Statistics output by index AMs */
200 : IndexBulkDeleteResult **indstats;
201 :
202 : /* Instrumentation counters */
203 : int num_index_scans;
204 : /* Counters that follow are only for scanned_pages */
205 : int64 tuples_deleted; /* # deleted from table */
206 : int64 tuples_frozen; /* # newly frozen */
207 : int64 lpdead_items; /* # deleted from indexes */
208 : int64 live_tuples; /* # live tuples remaining */
209 : int64 recently_dead_tuples; /* # dead, but not yet removable */
210 : int64 missed_dead_tuples; /* # removable, but not removed */
211 :
212 : /* State maintained by heap_vac_scan_next_block() */
213 : BlockNumber current_block; /* last block returned */
214 : BlockNumber next_unskippable_block; /* next unskippable block */
215 : bool next_unskippable_allvis; /* its visibility status */
216 : Buffer next_unskippable_vmbuffer; /* buffer containing its VM bit */
217 : } LVRelState;
218 :
219 : /* Struct for saving and restoring vacuum error information. */
220 : typedef struct LVSavedErrInfo
221 : {
222 : BlockNumber blkno;
223 : OffsetNumber offnum;
224 : VacErrPhase phase;
225 : } LVSavedErrInfo;
226 :
227 :
228 : /* non-export function prototypes */
229 : static void lazy_scan_heap(LVRelState *vacrel);
230 : static bool heap_vac_scan_next_block(LVRelState *vacrel, BlockNumber *blkno,
231 : bool *all_visible_according_to_vm);
232 : static void find_next_unskippable_block(LVRelState *vacrel, bool *skipsallvis);
233 : static bool lazy_scan_new_or_empty(LVRelState *vacrel, Buffer buf,
234 : BlockNumber blkno, Page page,
235 : bool sharelock, Buffer vmbuffer);
236 : static void lazy_scan_prune(LVRelState *vacrel, Buffer buf,
237 : BlockNumber blkno, Page page,
238 : Buffer vmbuffer, bool all_visible_according_to_vm,
239 : bool *has_lpdead_items);
240 : static bool lazy_scan_noprune(LVRelState *vacrel, Buffer buf,
241 : BlockNumber blkno, Page page,
242 : bool *has_lpdead_items);
243 : static void lazy_vacuum(LVRelState *vacrel);
244 : static bool lazy_vacuum_all_indexes(LVRelState *vacrel);
245 : static void lazy_vacuum_heap_rel(LVRelState *vacrel);
246 : static void lazy_vacuum_heap_page(LVRelState *vacrel, BlockNumber blkno,
247 : Buffer buffer, OffsetNumber *deadoffsets,
248 : int num_offsets, Buffer vmbuffer);
249 : static bool lazy_check_wraparound_failsafe(LVRelState *vacrel);
250 : static void lazy_cleanup_all_indexes(LVRelState *vacrel);
251 : static IndexBulkDeleteResult *lazy_vacuum_one_index(Relation indrel,
252 : IndexBulkDeleteResult *istat,
253 : double reltuples,
254 : LVRelState *vacrel);
255 : static IndexBulkDeleteResult *lazy_cleanup_one_index(Relation indrel,
256 : IndexBulkDeleteResult *istat,
257 : double reltuples,
258 : bool estimated_count,
259 : LVRelState *vacrel);
260 : static bool should_attempt_truncation(LVRelState *vacrel);
261 : static void lazy_truncate_heap(LVRelState *vacrel);
262 : static BlockNumber count_nondeletable_pages(LVRelState *vacrel,
263 : bool *lock_waiter_detected);
264 : static void dead_items_alloc(LVRelState *vacrel, int nworkers);
265 : static void dead_items_add(LVRelState *vacrel, BlockNumber blkno, OffsetNumber *offsets,
266 : int num_offsets);
267 : static void dead_items_reset(LVRelState *vacrel);
268 : static void dead_items_cleanup(LVRelState *vacrel);
269 : static bool heap_page_is_all_visible(LVRelState *vacrel, Buffer buf,
270 : TransactionId *visibility_cutoff_xid, bool *all_frozen);
271 : static void update_relstats_all_indexes(LVRelState *vacrel);
272 : static void vacuum_error_callback(void *arg);
273 : static void update_vacuum_error_info(LVRelState *vacrel,
274 : LVSavedErrInfo *saved_vacrel,
275 : int phase, BlockNumber blkno,
276 : OffsetNumber offnum);
277 : static void restore_vacuum_error_info(LVRelState *vacrel,
278 : const LVSavedErrInfo *saved_vacrel);
279 :
280 :
281 : /*
282 : * heap_vacuum_rel() -- perform VACUUM for one heap relation
283 : *
284 : * This routine sets things up for and then calls lazy_scan_heap, where
285 : * almost all work actually takes place. Finalizes everything after the call
286 : * returns by managing relation truncation and updating rel's pg_class
287 : * entry. (Also updates pg_class entries for any indexes that need it.)
288 : *
289 : * At entry, we have already established a transaction and opened
290 : * and locked the relation.
291 : */
292 : void
293 97130 : heap_vacuum_rel(Relation rel, VacuumParams *params,
294 : BufferAccessStrategy bstrategy)
295 : {
296 : LVRelState *vacrel;
297 : bool verbose,
298 : instrument,
299 : skipwithvm,
300 : frozenxid_updated,
301 : minmulti_updated;
302 : BlockNumber orig_rel_pages,
303 : new_rel_pages,
304 : new_rel_allvisible;
305 : PGRUsage ru0;
306 97130 : TimestampTz starttime = 0;
307 97130 : PgStat_Counter startreadtime = 0,
308 97130 : startwritetime = 0;
309 97130 : WalUsage startwalusage = pgWalUsage;
310 97130 : BufferUsage startbufferusage = pgBufferUsage;
311 : ErrorContextCallback errcallback;
312 97130 : char **indnames = NULL;
313 :
314 97130 : verbose = (params->options & VACOPT_VERBOSE) != 0;
315 173518 : instrument = (verbose || (AmAutoVacuumWorkerProcess() &&
316 76388 : params->log_min_duration >= 0));
317 97130 : if (instrument)
318 : {
319 76408 : pg_rusage_init(&ru0);
320 76408 : starttime = GetCurrentTimestamp();
321 76408 : if (track_io_timing)
322 : {
323 0 : startreadtime = pgStatBlockReadTime;
324 0 : startwritetime = pgStatBlockWriteTime;
325 : }
326 : }
327 :
328 97130 : pgstat_progress_start_command(PROGRESS_COMMAND_VACUUM,
329 : RelationGetRelid(rel));
330 :
331 : /*
332 : * Set up error traceback support for ereport() first. The idea is to set
333 : * up an error context callback to display additional information on any
334 : * error during a vacuum. During different phases of vacuum, we update
335 : * the state so that the error context callback always displays current
336 : * information.
337 : *
338 : * Copy the names of heap rel into local memory for error reporting
339 : * purposes, too. It isn't always safe to assume that we can get the name
340 : * of each rel. It's convenient for code in lazy_scan_heap to always use
341 : * these temp copies.
342 : */
343 97130 : vacrel = (LVRelState *) palloc0(sizeof(LVRelState));
344 97130 : vacrel->dbname = get_database_name(MyDatabaseId);
345 97130 : vacrel->relnamespace = get_namespace_name(RelationGetNamespace(rel));
346 97130 : vacrel->relname = pstrdup(RelationGetRelationName(rel));
347 97130 : vacrel->indname = NULL;
348 97130 : vacrel->phase = VACUUM_ERRCB_PHASE_UNKNOWN;
349 97130 : vacrel->verbose = verbose;
350 97130 : errcallback.callback = vacuum_error_callback;
351 97130 : errcallback.arg = vacrel;
352 97130 : errcallback.previous = error_context_stack;
353 97130 : error_context_stack = &errcallback;
354 :
355 : /* Set up high level stuff about rel and its indexes */
356 97130 : vacrel->rel = rel;
357 97130 : vac_open_indexes(vacrel->rel, RowExclusiveLock, &vacrel->nindexes,
358 : &vacrel->indrels);
359 97130 : vacrel->bstrategy = bstrategy;
360 97130 : if (instrument && vacrel->nindexes > 0)
361 : {
362 : /* Copy index names used by instrumentation (not error reporting) */
363 73210 : indnames = palloc(sizeof(char *) * vacrel->nindexes);
364 188138 : for (int i = 0; i < vacrel->nindexes; i++)
365 114928 : indnames[i] = pstrdup(RelationGetRelationName(vacrel->indrels[i]));
366 : }
367 :
368 : /*
369 : * The index_cleanup param either disables index vacuuming and cleanup or
370 : * forces it to go ahead when we would otherwise apply the index bypass
371 : * optimization. The default is 'auto', which leaves the final decision
372 : * up to lazy_vacuum().
373 : *
374 : * The truncate param allows the user to avoid attempting relation truncation,
375 : * though it can't force truncation to happen.
376 : */
377 : Assert(params->index_cleanup != VACOPTVALUE_UNSPECIFIED);
378 : Assert(params->truncate != VACOPTVALUE_UNSPECIFIED &&
379 : params->truncate != VACOPTVALUE_AUTO);
380 :
381 : /*
382 : * While VacuumFailsafeActive is reset to false before calling this, we
383 : * still need to reset it here due to recursive calls.
384 : */
385 97130 : VacuumFailsafeActive = false;
386 97130 : vacrel->consider_bypass_optimization = true;
387 97130 : vacrel->do_index_vacuuming = true;
388 97130 : vacrel->do_index_cleanup = true;
389 97130 : vacrel->do_rel_truncate = (params->truncate != VACOPTVALUE_DISABLED);
390 97130 : if (params->index_cleanup == VACOPTVALUE_DISABLED)
391 : {
392 : /* Force disable index vacuuming up-front */
393 264 : vacrel->do_index_vacuuming = false;
394 264 : vacrel->do_index_cleanup = false;
395 : }
396 96866 : else if (params->index_cleanup == VACOPTVALUE_ENABLED)
397 : {
398 : /* Force index vacuuming. Note that failsafe can still bypass. */
399 32 : vacrel->consider_bypass_optimization = false;
400 : }
401 : else
402 : {
403 : /* Default/auto, make all decisions dynamically */
404 : Assert(params->index_cleanup == VACOPTVALUE_AUTO);
405 : }
406 :
407 : /* Initialize page counters explicitly (be tidy) */
408 97130 : vacrel->scanned_pages = 0;
409 97130 : vacrel->removed_pages = 0;
410 97130 : vacrel->frozen_pages = 0;
411 97130 : vacrel->lpdead_item_pages = 0;
412 97130 : vacrel->missed_dead_pages = 0;
413 97130 : vacrel->nonempty_pages = 0;
414 : /* dead_items_alloc allocates vacrel->dead_items later on */
415 :
416 : /* Allocate/initialize output statistics state */
417 97130 : vacrel->new_rel_tuples = 0;
418 97130 : vacrel->new_live_tuples = 0;
419 97130 : vacrel->indstats = (IndexBulkDeleteResult **)
420 97130 : palloc0(vacrel->nindexes * sizeof(IndexBulkDeleteResult *));
421 :
422 : /* Initialize remaining counters (be tidy) */
423 97130 : vacrel->num_index_scans = 0;
424 97130 : vacrel->tuples_deleted = 0;
425 97130 : vacrel->tuples_frozen = 0;
426 97130 : vacrel->lpdead_items = 0;
427 97130 : vacrel->live_tuples = 0;
428 97130 : vacrel->recently_dead_tuples = 0;
429 97130 : vacrel->missed_dead_tuples = 0;
430 :
431 : /*
432 : * Get cutoffs that determine which deleted tuples are considered DEAD,
433 : * not just RECENTLY_DEAD, and which XIDs/MXIDs to freeze. Then determine
434 : * the extent of the blocks that we'll scan in lazy_scan_heap. It has to
435 : * happen in this order to ensure that the OldestXmin cutoff field works
436 : * as an upper bound on the XIDs stored in the pages we'll actually scan
437 : * (NewRelfrozenXid tracking must never be allowed to miss unfrozen XIDs).
438 : *
439 : * Next acquire vistest, a related cutoff that's used in pruning. We use
440 : * vistest in combination with OldestXmin to ensure that
441 : * heap_page_prune_and_freeze() always removes any deleted tuple whose
442 : * xmax is < OldestXmin. lazy_scan_prune must never become confused about
443 : * whether a tuple should be frozen or removed. (In the future we might
444 : * want to teach lazy_scan_prune to recompute vistest from time to time,
445 : * to increase the number of dead tuples it can prune away.)
446 : */
447 97130 : vacrel->aggressive = vacuum_get_cutoffs(rel, params, &vacrel->cutoffs);
448 97130 : vacrel->rel_pages = orig_rel_pages = RelationGetNumberOfBlocks(rel);
449 97130 : vacrel->vistest = GlobalVisTestFor(rel);
450 : /* Initialize state used to track oldest extant XID/MXID */
451 97130 : vacrel->NewRelfrozenXid = vacrel->cutoffs.OldestXmin;
452 97130 : vacrel->NewRelminMxid = vacrel->cutoffs.OldestMxact;
453 97130 : vacrel->skippedallvis = false;
454 97130 : skipwithvm = true;
455 97130 : if (params->options & VACOPT_DISABLE_PAGE_SKIPPING)
456 : {
457 : /*
458 : * Force aggressive mode, and disable skipping blocks using the
459 : * visibility map (even those set all-frozen)
460 : */
461 298 : vacrel->aggressive = true;
462 298 : skipwithvm = false;
463 : }
464 :
465 97130 : vacrel->skipwithvm = skipwithvm;
466 :
467 97130 : if (verbose)
468 : {
469 20 : if (vacrel->aggressive)
470 0 : ereport(INFO,
471 : (errmsg("aggressively vacuuming \"%s.%s.%s\"",
472 : vacrel->dbname, vacrel->relnamespace,
473 : vacrel->relname)));
474 : else
475 20 : ereport(INFO,
476 : (errmsg("vacuuming \"%s.%s.%s\"",
477 : vacrel->dbname, vacrel->relnamespace,
478 : vacrel->relname)));
479 : }
480 :
481 : /*
482 : * Allocate dead_items memory using dead_items_alloc. This handles
483 : * parallel VACUUM initialization as part of allocating shared memory
484 : * space used for dead_items. (But do a failsafe precheck first, to
485 : * ensure that parallel VACUUM won't be attempted at all when relfrozenxid
486 : * is already dangerously old.)
487 : */
488 97130 : lazy_check_wraparound_failsafe(vacrel);
489 97130 : dead_items_alloc(vacrel, params->nworkers);
490 :
491 : /*
492 : * Call lazy_scan_heap to perform all required heap pruning, index
493 : * vacuuming, and heap vacuuming (plus related processing)
494 : */
495 97130 : lazy_scan_heap(vacrel);
496 :
497 : /*
498 : * Free resources managed by dead_items_alloc. This ends parallel mode in
499 : * passing when necessary.
500 : */
501 97130 : dead_items_cleanup(vacrel);
502 : Assert(!IsInParallelMode());
503 :
504 : /*
505 : * Update pg_class entries for each of rel's indexes where appropriate.
506 : *
507 : * Unlike the later update to rel's pg_class entry, this is not critical.
508 : * Maintains relpages/reltuples statistics used by the planner only.
509 : */
510 97130 : if (vacrel->do_index_cleanup)
511 81978 : update_relstats_all_indexes(vacrel);
512 :
513 : /* Done with rel's indexes */
514 97130 : vac_close_indexes(vacrel->nindexes, vacrel->indrels, NoLock);
515 :
516 : /* Optionally truncate rel */
517 97130 : if (should_attempt_truncation(vacrel))
518 268 : lazy_truncate_heap(vacrel);
519 :
520 : /* Pop the error context stack */
521 97130 : error_context_stack = errcallback.previous;
522 :
523 : /* Report that we are now doing final cleanup */
524 97130 : pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
525 : PROGRESS_VACUUM_PHASE_FINAL_CLEANUP);
526 :
527 : /*
528 : * Prepare to update rel's pg_class entry.
529 : *
530 : * Aggressive VACUUMs must always be able to advance relfrozenxid to a
531 : * value >= FreezeLimit, and relminmxid to a value >= MultiXactCutoff.
532 : * Non-aggressive VACUUMs may advance them by any amount, or not at all.
533 : */
534 : Assert(vacrel->NewRelfrozenXid == vacrel->cutoffs.OldestXmin ||
535 : TransactionIdPrecedesOrEquals(vacrel->aggressive ? vacrel->cutoffs.FreezeLimit :
536 : vacrel->cutoffs.relfrozenxid,
537 : vacrel->NewRelfrozenXid));
538 : Assert(vacrel->NewRelminMxid == vacrel->cutoffs.OldestMxact ||
539 : MultiXactIdPrecedesOrEquals(vacrel->aggressive ? vacrel->cutoffs.MultiXactCutoff :
540 : vacrel->cutoffs.relminmxid,
541 : vacrel->NewRelminMxid));
542 97130 : if (vacrel->skippedallvis)
543 : {
544 : /*
545 : * Must keep original relfrozenxid in a non-aggressive VACUUM that
546 : * chose to skip an all-visible page range. The state that tracks new
547 : * values will have missed unfrozen XIDs from the pages we skipped.
548 : */
549 : Assert(!vacrel->aggressive);
550 50 : vacrel->NewRelfrozenXid = InvalidTransactionId;
551 50 : vacrel->NewRelminMxid = InvalidMultiXactId;
552 : }
553 :
554 : /*
555 : * For safety, clamp relallvisible to be not more than what we're setting
556 : * pg_class.relpages to
557 : */
558 97130 : new_rel_pages = vacrel->rel_pages; /* After possible rel truncation */
559 97130 : visibilitymap_count(rel, &new_rel_allvisible, NULL);
560 97130 : if (new_rel_allvisible > new_rel_pages)
561 0 : new_rel_allvisible = new_rel_pages;
562 :
563 : /*
564 : * Now actually update rel's pg_class entry.
565 : *
566 : * In principle new_live_tuples could be -1 indicating that we (still)
567 : * don't know the tuple count. In practice that can't happen, since we
568 : * scan every page that isn't skipped using the visibility map.
569 : */
570 97130 : vac_update_relstats(rel, new_rel_pages, vacrel->new_live_tuples,
571 97130 : new_rel_allvisible, vacrel->nindexes > 0,
572 : vacrel->NewRelfrozenXid, vacrel->NewRelminMxid,
573 : &frozenxid_updated, &minmulti_updated, false);
574 :
575 : /*
576 : * Report results to the cumulative stats system, too.
577 : *
578 : * Deliberately avoid telling the stats system about LP_DEAD items that
579 : * remain in the table due to VACUUM bypassing index and heap vacuuming.
580 : * ANALYZE will consider the remaining LP_DEAD items to be dead "tuples".
581 : * It seems like a good idea to err on the side of not vacuuming again too
582 : * soon in cases where the failsafe prevented significant amounts of heap
583 : * vacuuming.
584 : */
585 58426 : pgstat_report_vacuum(RelationGetRelid(rel),
586 97130 : rel->rd_rel->relisshared,
587 38704 : Max(vacrel->new_live_tuples, 0),
588 97130 : vacrel->recently_dead_tuples +
589 97130 : vacrel->missed_dead_tuples);
590 97130 : pgstat_progress_end_command();
591 :
592 97130 : if (instrument)
593 : {
594 76408 : TimestampTz endtime = GetCurrentTimestamp();
595 :
596 76444 : if (verbose || params->log_min_duration == 0 ||
597 36 : TimestampDifferenceExceeds(starttime, endtime,
598 : params->log_min_duration))
599 : {
600 : long secs_dur;
601 : int usecs_dur;
602 : WalUsage walusage;
603 : BufferUsage bufferusage;
604 : StringInfoData buf;
605 : char *msgfmt;
606 : int32 diff;
607 76372 : double read_rate = 0,
608 76372 : write_rate = 0;
609 : int64 total_blks_hit;
610 : int64 total_blks_read;
611 : int64 total_blks_dirtied;
612 :
613 76372 : TimestampDifference(starttime, endtime, &secs_dur, &usecs_dur);
614 76372 : memset(&walusage, 0, sizeof(WalUsage));
615 76372 : WalUsageAccumDiff(&walusage, &pgWalUsage, &startwalusage);
616 76372 : memset(&bufferusage, 0, sizeof(BufferUsage));
617 76372 : BufferUsageAccumDiff(&bufferusage, &pgBufferUsage, &startbufferusage);
618 :
619 76372 : total_blks_hit = bufferusage.shared_blks_hit +
620 76372 : bufferusage.local_blks_hit;
621 76372 : total_blks_read = bufferusage.shared_blks_read +
622 76372 : bufferusage.local_blks_read;
623 76372 : total_blks_dirtied = bufferusage.shared_blks_dirtied +
624 76372 : bufferusage.local_blks_dirtied;
625 :
626 76372 : initStringInfo(&buf);
627 76372 : if (verbose)
628 : {
629 : /*
630 : * Aggressiveness already reported earlier, in dedicated
631 : * VACUUM VERBOSE ereport
632 : */
633 : Assert(!params->is_wraparound);
634 20 : msgfmt = _("finished vacuuming \"%s.%s.%s\": index scans: %d\n");
635 : }
636 76352 : else if (params->is_wraparound)
637 : {
638 : /*
639 : * While it's possible for a VACUUM to be both is_wraparound
640 : * and !aggressive, that's just a corner-case -- is_wraparound
641 : * implies aggressive. Produce distinct output for the corner
642 : * case all the same, just in case.
643 : */
644 76306 : if (vacrel->aggressive)
645 76306 : msgfmt = _("automatic aggressive vacuum to prevent wraparound of table \"%s.%s.%s\": index scans: %d\n");
646 : else
647 0 : msgfmt = _("automatic vacuum to prevent wraparound of table \"%s.%s.%s\": index scans: %d\n");
648 : }
649 : else
650 : {
651 46 : if (vacrel->aggressive)
652 12 : msgfmt = _("automatic aggressive vacuum of table \"%s.%s.%s\": index scans: %d\n");
653 : else
654 34 : msgfmt = _("automatic vacuum of table \"%s.%s.%s\": index scans: %d\n");
655 : }
656 76372 : appendStringInfo(&buf, msgfmt,
657 : vacrel->dbname,
658 : vacrel->relnamespace,
659 : vacrel->relname,
660 : vacrel->num_index_scans);
661 106466 : appendStringInfo(&buf, _("pages: %u removed, %u remain, %u scanned (%.2f%% of total)\n"),
662 : vacrel->removed_pages,
663 : new_rel_pages,
664 : vacrel->scanned_pages,
665 : orig_rel_pages == 0 ? 100.0 :
666 30094 : 100.0 * vacrel->scanned_pages / orig_rel_pages);
667 76372 : appendStringInfo(&buf,
668 76372 : _("tuples: %lld removed, %lld remain, %lld are dead but not yet removable\n"),
669 76372 : (long long) vacrel->tuples_deleted,
670 76372 : (long long) vacrel->new_rel_tuples,
671 76372 : (long long) vacrel->recently_dead_tuples);
672 76372 : if (vacrel->missed_dead_tuples > 0)
673 0 : appendStringInfo(&buf,
674 0 : _("tuples missed: %lld dead from %u pages not removed due to cleanup lock contention\n"),
675 0 : (long long) vacrel->missed_dead_tuples,
676 : vacrel->missed_dead_pages);
677 76372 : diff = (int32) (ReadNextTransactionId() -
678 76372 : vacrel->cutoffs.OldestXmin);
679 76372 : appendStringInfo(&buf,
680 76372 : _("removable cutoff: %u, which was %d XIDs old when operation ended\n"),
681 : vacrel->cutoffs.OldestXmin, diff);
682 76372 : if (frozenxid_updated)
683 : {
684 32756 : diff = (int32) (vacrel->NewRelfrozenXid -
685 32756 : vacrel->cutoffs.relfrozenxid);
686 32756 : appendStringInfo(&buf,
687 32756 : _("new relfrozenxid: %u, which is %d XIDs ahead of previous value\n"),
688 : vacrel->NewRelfrozenXid, diff);
689 : }
690 76372 : if (minmulti_updated)
691 : {
692 26 : diff = (int32) (vacrel->NewRelminMxid -
693 26 : vacrel->cutoffs.relminmxid);
694 26 : appendStringInfo(&buf,
695 26 : _("new relminmxid: %u, which is %d MXIDs ahead of previous value\n"),
696 : vacrel->NewRelminMxid, diff);
697 : }
698 76372 : appendStringInfo(&buf, _("frozen: %u pages from table (%.2f%% of total) had %lld tuples frozen\n"),
699 : vacrel->frozen_pages,
700 : orig_rel_pages == 0 ? 100.0 :
701 30094 : 100.0 * vacrel->frozen_pages / orig_rel_pages,
702 76372 : (long long) vacrel->tuples_frozen);
703 76372 : if (vacrel->do_index_vacuuming)
704 : {
705 61704 : if (vacrel->nindexes == 0 || vacrel->num_index_scans == 0)
706 61656 : appendStringInfoString(&buf, _("index scan not needed: "));
707 : else
708 48 : appendStringInfoString(&buf, _("index scan needed: "));
709 :
710 61704 : msgfmt = _("%u pages from table (%.2f%% of total) had %lld dead item identifiers removed\n");
711 : }
712 : else
713 : {
714 14668 : if (!VacuumFailsafeActive)
715 0 : appendStringInfoString(&buf, _("index scan bypassed: "));
716 : else
717 14668 : appendStringInfoString(&buf, _("index scan bypassed by failsafe: "));
718 :
719 14668 : msgfmt = _("%u pages from table (%.2f%% of total) have %lld dead item identifiers\n");
720 : }
721 76372 : appendStringInfo(&buf, msgfmt,
722 : vacrel->lpdead_item_pages,
723 : orig_rel_pages == 0 ? 100.0 :
724 30094 : 100.0 * vacrel->lpdead_item_pages / orig_rel_pages,
725 76372 : (long long) vacrel->lpdead_items);
726 191214 : for (int i = 0; i < vacrel->nindexes; i++)
727 : {
728 114842 : IndexBulkDeleteResult *istat = vacrel->indstats[i];
729 :
730 114842 : if (!istat)
731 114750 : continue;
732 :
733 92 : appendStringInfo(&buf,
734 92 : _("index \"%s\": pages: %u in total, %u newly deleted, %u currently deleted, %u reusable\n"),
735 92 : indnames[i],
736 : istat->num_pages,
737 : istat->pages_newly_deleted,
738 : istat->pages_deleted,
739 : istat->pages_free);
740 : }
741 76372 : if (track_io_timing)
742 : {
743 0 : double read_ms = (double) (pgStatBlockReadTime - startreadtime) / 1000;
744 0 : double write_ms = (double) (pgStatBlockWriteTime - startwritetime) / 1000;
745 :
746 0 : appendStringInfo(&buf, _("I/O timings: read: %.3f ms, write: %.3f ms\n"),
747 : read_ms, write_ms);
748 : }
749 76372 : if (secs_dur > 0 || usecs_dur > 0)
750 : {
751 76372 : read_rate = (double) BLCKSZ * total_blks_read /
752 76372 : (1024 * 1024) / (secs_dur + usecs_dur / 1000000.0);
753 76372 : write_rate = (double) BLCKSZ * total_blks_dirtied /
754 76372 : (1024 * 1024) / (secs_dur + usecs_dur / 1000000.0);
755 : }
756 76372 : appendStringInfo(&buf, _("avg read rate: %.3f MB/s, avg write rate: %.3f MB/s\n"),
757 : read_rate, write_rate);
758 76372 : appendStringInfo(&buf,
759 76372 : _("buffer usage: %lld hits, %lld reads, %lld dirtied\n"),
760 : (long long) total_blks_hit,
761 : (long long) total_blks_read,
762 : (long long) total_blks_dirtied);
763 76372 : appendStringInfo(&buf,
764 76372 : _("WAL usage: %lld records, %lld full page images, %llu bytes\n"),
765 76372 : (long long) walusage.wal_records,
766 76372 : (long long) walusage.wal_fpi,
767 76372 : (unsigned long long) walusage.wal_bytes);
768 76372 : appendStringInfo(&buf, _("system usage: %s"), pg_rusage_show(&ru0));
769 :
770 76372 : ereport(verbose ? INFO : LOG,
771 : (errmsg_internal("%s", buf.data)));
772 76372 : pfree(buf.data);
773 : }
774 : }
775 :
776 : /* Cleanup index statistics and index names */
777 241400 : for (int i = 0; i < vacrel->nindexes; i++)
778 : {
779 144270 : if (vacrel->indstats[i])
780 2192 : pfree(vacrel->indstats[i]);
781 :
782 144270 : if (instrument)
783 114928 : pfree(indnames[i]);
784 : }
785 97130 : }
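/*
 * Editor's note (illustrative, not part of the original source): the
 * read_rate/write_rate values logged above are plain MB/s figures. For
 * example, a VACUUM that read 10000 blocks of 8192 bytes in 2 seconds would
 * report 8192 * 10000 bytes / (1024 * 1024) / 2 s, which is about 39.06 MB/s,
 * as its average read rate.
 */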
786 :
787 : /*
788 : * lazy_scan_heap() -- workhorse function for VACUUM
789 : *
790 : * This routine prunes each page in the heap, and considers the need to
791 : * freeze remaining tuples with storage (not including pages that can be
792 : * skipped using the visibility map). Also performs related maintenance
793 : * of the FSM and visibility map. These steps all take place during an
794 : * initial pass over the target heap relation.
795 : *
796 : * Also invokes lazy_vacuum_all_indexes to vacuum indexes, which largely
797 : * consists of deleting index tuples that point to LP_DEAD items left in
798 : * heap pages following pruning. The earlier initial pass over the heap will
799 : * have collected the TIDs whose index tuples need to be removed.
800 : *
801 : * Finally, invokes lazy_vacuum_heap_rel to vacuum heap pages, which
802 : * largely consists of marking LP_DEAD items (from vacrel->dead_items)
803 : * as LP_UNUSED. This has to happen in a second, final pass over the
804 : * heap, to preserve a basic invariant that all index AMs rely on: no
805 : * extant index tuple can ever be allowed to contain a TID that points to
806 : * an LP_UNUSED line pointer in the heap. We must disallow premature
807 : * recycling of line pointers to avoid index scans that get confused
808 : * about which TID points to which tuple immediately after recycling.
809 : * (Actually, this isn't a concern when target heap relation happens to
810 : * have no indexes, which allows us to safely apply the one-pass strategy
811 : * as an optimization).
812 : *
813 : * In practice we often have enough space to fit all TIDs, and so won't
814 : * need to call lazy_vacuum more than once, after our initial pass over
815 : * the heap has totally finished. Otherwise things are slightly more
816 : * complicated: our "initial pass" over the heap applies only to those
817 : * pages that were pruned before we needed to call lazy_vacuum, and our
818 : * "final pass" over the heap only vacuums these same heap pages.
819 : * However, we process indexes in full every time lazy_vacuum is called,
820 : * which makes index processing very inefficient when memory is in short
821 : * supply.
822 : */
823 : static void
824 97130 : lazy_scan_heap(LVRelState *vacrel)
825 : {
826 97130 : BlockNumber rel_pages = vacrel->rel_pages,
827 : blkno,
828 97130 : next_fsm_block_to_vacuum = 0;
829 : bool all_visible_according_to_vm;
830 :
831 97130 : TidStore *dead_items = vacrel->dead_items;
832 97130 : VacDeadItemsInfo *dead_items_info = vacrel->dead_items_info;
833 97130 : Buffer vmbuffer = InvalidBuffer;
834 97130 : const int initprog_index[] = {
835 : PROGRESS_VACUUM_PHASE,
836 : PROGRESS_VACUUM_TOTAL_HEAP_BLKS,
837 : PROGRESS_VACUUM_MAX_DEAD_TUPLE_BYTES
838 : };
839 : int64 initprog_val[3];
840 :
841 : /* Report that we're scanning the heap, advertising total # of blocks */
842 97130 : initprog_val[0] = PROGRESS_VACUUM_PHASE_SCAN_HEAP;
843 97130 : initprog_val[1] = rel_pages;
844 97130 : initprog_val[2] = dead_items_info->max_bytes;
845 97130 : pgstat_progress_update_multi_param(3, initprog_index, initprog_val);
846 :
847 : /* Initialize for the first heap_vac_scan_next_block() call */
848 97130 : vacrel->current_block = InvalidBlockNumber;
849 97130 : vacrel->next_unskippable_block = InvalidBlockNumber;
850 97130 : vacrel->next_unskippable_allvis = false;
851 97130 : vacrel->next_unskippable_vmbuffer = InvalidBuffer;
852 :
853 507676 : while (heap_vac_scan_next_block(vacrel, &blkno, &all_visible_according_to_vm))
854 : {
855 : Buffer buf;
856 : Page page;
857 : bool has_lpdead_items;
858 410546 : bool got_cleanup_lock = false;
859 :
860 410546 : vacrel->scanned_pages++;
861 :
862 : /* Report as block scanned, update error traceback information */
863 410546 : pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_SCANNED, blkno);
864 410546 : update_vacuum_error_info(vacrel, NULL, VACUUM_ERRCB_PHASE_SCAN_HEAP,
865 : blkno, InvalidOffsetNumber);
866 :
867 410546 : vacuum_delay_point();
868 :
869 : /*
870 : * Regularly check if wraparound failsafe should trigger.
871 : *
872 : * There is a similar check inside lazy_vacuum_all_indexes(), but
873 : * relfrozenxid might start to look dangerously old before we reach
874 : * that point. This check also provides failsafe coverage for the
875 : * one-pass strategy, and the two-pass strategy with the index_cleanup
876 : * param set to 'off'.
877 : */
878 410546 : if (vacrel->scanned_pages % FAILSAFE_EVERY_PAGES == 0)
879 0 : lazy_check_wraparound_failsafe(vacrel);
880 :
881 : /*
882 : * Consider whether we definitely have enough space to process the TIDs on
883 : * this page already. If we are close to overrunning the available space for
884 : * dead_items TIDs, pause and do a cycle of vacuuming before we tackle
885 : * this page.
886 : */
887 410546 : if (TidStoreMemoryUsage(dead_items) > dead_items_info->max_bytes)
888 : {
889 : /*
890 : * Before beginning index vacuuming, we release any pin we may
891 : * hold on the visibility map page. This isn't necessary for
892 : * correctness, but we do it anyway to avoid holding the pin
893 : * across a lengthy, unrelated operation.
894 : */
895 0 : if (BufferIsValid(vmbuffer))
896 : {
897 0 : ReleaseBuffer(vmbuffer);
898 0 : vmbuffer = InvalidBuffer;
899 : }
900 :
901 : /* Perform a round of index and heap vacuuming */
902 0 : vacrel->consider_bypass_optimization = false;
903 0 : lazy_vacuum(vacrel);
904 :
905 : /*
906 : * Vacuum the Free Space Map to make newly-freed space visible on
907 : * upper-level FSM pages. Note we have not yet processed blkno.
908 : */
909 0 : FreeSpaceMapVacuumRange(vacrel->rel, next_fsm_block_to_vacuum,
910 : blkno);
911 0 : next_fsm_block_to_vacuum = blkno;
912 :
913 : /* Report that we are once again scanning the heap */
914 0 : pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
915 : PROGRESS_VACUUM_PHASE_SCAN_HEAP);
916 : }
917 :
918 : /*
919 : * Pin the visibility map page in case we need to mark the page
920 : * all-visible. In most cases this will be very cheap, because we'll
921 : * already have the correct page pinned anyway.
922 : */
923 410546 : visibilitymap_pin(vacrel->rel, blkno, &vmbuffer);
924 :
925 410546 : buf = ReadBufferExtended(vacrel->rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
926 : vacrel->bstrategy);
927 410546 : page = BufferGetPage(buf);
928 :
929 : /*
930 : * We need a buffer cleanup lock to prune HOT chains and defragment
931 : * the page in lazy_scan_prune. But when it's not possible to acquire
932 : * a cleanup lock right away, we may be able to settle for reduced
933 : * processing using lazy_scan_noprune.
934 : */
935 410546 : got_cleanup_lock = ConditionalLockBufferForCleanup(buf);
936 :
937 410546 : if (!got_cleanup_lock)
938 10 : LockBuffer(buf, BUFFER_LOCK_SHARE);
939 :
940 : /* Check for new or empty pages before lazy_scan_[no]prune call */
941 410546 : if (lazy_scan_new_or_empty(vacrel, buf, blkno, page, !got_cleanup_lock,
942 410546 : vmbuffer))
943 : {
944 : /* Processed as new/empty page (lock and pin released) */
945 1212 : continue;
946 : }
947 :
948 : /*
949 : * If we didn't get the cleanup lock, we can still collect LP_DEAD
950 : * items in the dead_items area for later vacuuming, count live and
951 : * recently dead tuples for vacuum logging, and determine if this
952 : * block could later be truncated. If we encounter any xid/mxids that
953 : * require advancing the relfrozenxid/relminmxid, we'll have to wait
954 : * for a cleanup lock and call lazy_scan_prune().
955 : */
956 409334 : if (!got_cleanup_lock &&
957 10 : !lazy_scan_noprune(vacrel, buf, blkno, page, &has_lpdead_items))
958 : {
959 : /*
960 : * lazy_scan_noprune could not do all required processing. Wait
961 : * for a cleanup lock, and call lazy_scan_prune in the usual way.
962 : */
963 : Assert(vacrel->aggressive);
964 0 : LockBuffer(buf, BUFFER_LOCK_UNLOCK);
965 0 : LockBufferForCleanup(buf);
966 0 : got_cleanup_lock = true;
967 : }
968 :
969 : /*
970 : * If we have a cleanup lock, we must now prune, freeze, and count
971 : * tuples. We may have acquired the cleanup lock originally, or we may
972 : * have gone back and acquired it after lazy_scan_noprune() returned
973 : * false. Either way, the page hasn't been processed yet.
974 : *
975 : * Like lazy_scan_noprune(), lazy_scan_prune() will count
976 : * recently_dead_tuples and live tuples for vacuum logging, determine
977 : * if the block can later be truncated, and accumulate the details of
978 : * remaining LP_DEAD line pointers on the page into dead_items. These
979 : * dead items include those pruned by lazy_scan_prune() as well as
980 : * line pointers previously marked LP_DEAD.
981 : */
982 409334 : if (got_cleanup_lock)
983 409324 : lazy_scan_prune(vacrel, buf, blkno, page,
984 : vmbuffer, all_visible_according_to_vm,
985 : &has_lpdead_items);
986 :
987 : /*
988 : * Now drop the buffer lock and, potentially, update the FSM.
989 : *
990 : * Our goal is to update the freespace map the last time we touch the
991 : * page. If we'll process a block in the second pass, we may free up
992 : * additional space on the page, so it is better to update the FSM
993 : * after the second pass. If the relation has no indexes, or if index
994 : * vacuuming is disabled, there will be no second heap pass; if this
995 : * particular page has no dead items, the second heap pass will not
996 : * touch this page. So, in those cases, update the FSM now.
997 : *
998 : * Note: In corner cases, it's possible to miss updating the FSM
999 : * entirely. If index vacuuming is currently enabled, we'll skip the
1000 : * FSM update now. But if failsafe mode is later activated, or there
1001 : * are so few dead tuples that index vacuuming is bypassed, there will
1002 : * also be no opportunity to update the FSM later, because we'll never
1003 : * revisit this page. Since updating the FSM is desirable but not
1004 : * absolutely required, that's OK.
1005 : */
1006 409334 : if (vacrel->nindexes == 0
1007 390582 : || !vacrel->do_index_vacuuming
1008 313020 : || !has_lpdead_items)
1009 387300 : {
1010 387300 : Size freespace = PageGetHeapFreeSpace(page);
1011 :
1012 387300 : UnlockReleaseBuffer(buf);
1013 387300 : RecordPageWithFreeSpace(vacrel->rel, blkno, freespace);
1014 :
1015 : /*
1016 : * Periodically perform FSM vacuuming to make newly-freed space
1017 : * visible on upper FSM pages. (With indexes, this instead happens
1018 : * after each index/heap vacuuming pass.) There will only be newly-freed
1019 : * space if we held the cleanup lock and lazy_scan_prune() was called.
1020 : */
1021 387300 : if (got_cleanup_lock && vacrel->nindexes == 0 && has_lpdead_items &&
1022 0 : blkno - next_fsm_block_to_vacuum >= VACUUM_FSM_EVERY_PAGES)
1023 : {
1024 0 : FreeSpaceMapVacuumRange(vacrel->rel, next_fsm_block_to_vacuum,
1025 : blkno);
1026 0 : next_fsm_block_to_vacuum = blkno;
1027 : }
1028 : }
1029 : else
1030 22034 : UnlockReleaseBuffer(buf);
1031 : }
1032 :
1033 97130 : vacrel->blkno = InvalidBlockNumber;
1034 97130 : if (BufferIsValid(vmbuffer))
1035 38832 : ReleaseBuffer(vmbuffer);
1036 :
1037 : /* report that everything is now scanned */
1038 97130 : pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_SCANNED, blkno);
1039 :
1040 : /* now we can compute the new value for pg_class.reltuples */
1041 194260 : vacrel->new_live_tuples = vac_estimate_reltuples(vacrel->rel, rel_pages,
1042 : vacrel->scanned_pages,
1043 97130 : vacrel->live_tuples);
1044 :
1045 : /*
1046 : * Also compute the total number of surviving heap entries. In the
1047 : * (unlikely) scenario that new_live_tuples is -1, take it as zero.
1048 : */
1049 97130 : vacrel->new_rel_tuples =
1050 97130 : Max(vacrel->new_live_tuples, 0) + vacrel->recently_dead_tuples +
1051 97130 : vacrel->missed_dead_tuples;
1052 :
1053 : /*
1054 : * Do index vacuuming (call each index's ambulkdelete routine), then do
1055 : * related heap vacuuming
1056 : */
1057 97130 : if (dead_items_info->num_items > 0)
1058 1012 : lazy_vacuum(vacrel);
1059 :
1060 : /*
1061 : * Vacuum the remainder of the Free Space Map. We must do this whether or
1062 : * not there were indexes, and whether or not we bypassed index vacuuming.
1063 : */
1064 97130 : if (blkno > next_fsm_block_to_vacuum)
1065 38832 : FreeSpaceMapVacuumRange(vacrel->rel, next_fsm_block_to_vacuum, blkno);
1066 :
1067 : /* report all blocks vacuumed */
1068 97130 : pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_VACUUMED, blkno);
1069 :
1070 : /* Do final index cleanup (call each index's amvacuumcleanup routine) */
1071 97130 : if (vacrel->nindexes > 0 && vacrel->do_index_cleanup)
1072 77642 : lazy_cleanup_all_indexes(vacrel);
1073 97130 : }
1074 :
1075 : /*
1076 : * heap_vac_scan_next_block() -- get next block for vacuum to process
1077 : *
1078 : * lazy_scan_heap() calls here every time it needs to get the next block to
1079 : * prune and vacuum. The function uses the visibility map, vacuum options,
1080 : * and various thresholds to skip blocks which do not need to be processed and
1081 : * sets blkno to the next block to process.
1082 : *
1083 : * The block number and visibility status of the next block to process are set
1084 : * in *blkno and *all_visible_according_to_vm. The return value is false if
1085 : * there are no further blocks to process.
1086 : *
1087 : * vacrel is an in/out parameter here. Vacuum options and information about
1088 : * the relation are read. vacrel->skippedallvis is set if we skip a block
1089 : * that's all-visible but not all-frozen, to ensure that we don't update
1090 : * relfrozenxid in that case. vacrel also holds information about the next
1091 : * unskippable block, as bookkeeping for this function.
1092 : */
1093 : static bool
1094 507676 : heap_vac_scan_next_block(LVRelState *vacrel, BlockNumber *blkno,
1095 : bool *all_visible_according_to_vm)
1096 : {
1097 : BlockNumber next_block;
1098 :
1099 : /* relies on InvalidBlockNumber + 1 overflowing to 0 on first call */
1100 507676 : next_block = vacrel->current_block + 1;
1101 :
1102 : /* Have we reached the end of the relation? */
1103 507676 : if (next_block >= vacrel->rel_pages)
1104 : {
1105 97130 : if (BufferIsValid(vacrel->next_unskippable_vmbuffer))
1106 : {
1107 36492 : ReleaseBuffer(vacrel->next_unskippable_vmbuffer);
1108 36492 : vacrel->next_unskippable_vmbuffer = InvalidBuffer;
1109 : }
1110 97130 : *blkno = vacrel->rel_pages;
1111 97130 : return false;
1112 : }
1113 :
1114 : /*
1115 : * We must be in one of the three following states:
1116 : */
1117 410546 : if (next_block > vacrel->next_unskippable_block ||
1118 145642 : vacrel->next_unskippable_block == InvalidBlockNumber)
1119 : {
1120 : /*
1121 : * 1. We have just processed an unskippable block (or we're at the
1122 : * beginning of the scan). Find the next unskippable block using the
1123 : * visibility map.
1124 : */
1125 : bool skipsallvis;
1126 :
1127 303736 : find_next_unskippable_block(vacrel, &skipsallvis);
1128 :
1129 : /*
1130 : * We now know the next block that we must process. It can be the
1131 : * next block after the one we just processed, or something further
1132 : * ahead. If it's further ahead, we can jump to it, but we choose to
1133 : * do so only if we can skip at least SKIP_PAGES_THRESHOLD consecutive
1134 : * pages. Since we're reading sequentially, the OS should be doing
1135 : * readahead for us, so there's no gain in skipping a page now and
1136 : * then. Skipping such a range might even discourage sequential
1137 : * detection.
1138 : *
1139 : * This test also enables more frequent relfrozenxid advancement
1140 : * during non-aggressive VACUUMs. If the range has any all-visible
1141 : * pages then skipping makes updating relfrozenxid unsafe, which is a
1142 : * real downside.
1143 : */
1144 303736 : if (vacrel->next_unskippable_block - next_block >= SKIP_PAGES_THRESHOLD)
1145 : {
1146 3276 : next_block = vacrel->next_unskippable_block;
1147 3276 : if (skipsallvis)
1148 50 : vacrel->skippedallvis = true;
1149 : }
1150 : }
1151 :
1152 : /* Now we must be in one of the two remaining states: */
1153 410546 : if (next_block < vacrel->next_unskippable_block)
1154 : {
1155 : /*
1156 : * 2. We are processing a range of blocks that we could have skipped
1157 : * but chose not to. We know that they are all-visible in the VM,
1158 : * otherwise they would've been unskippable.
1159 : */
1160 106810 : *blkno = vacrel->current_block = next_block;
1161 106810 : *all_visible_according_to_vm = true;
1162 106810 : return true;
1163 : }
1164 : else
1165 : {
1166 : /*
1167 : * 3. We reached the next unskippable block. Process it. On next
1168 : * iteration, we will be back in state 1.
1169 : */
1170 : Assert(next_block == vacrel->next_unskippable_block);
1171 :
1172 303736 : *blkno = vacrel->current_block = next_block;
1173 303736 : *all_visible_according_to_vm = vacrel->next_unskippable_allvis;
1174 303736 : return true;
1175 : }
1176 : }
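/*
 * Editor's illustration (hypothetical sketch, not part of vacuumlazy.c): the
 * SKIP_PAGES_THRESHOLD decision above, restated as a self-contained toy that
 * uses a plain array of per-block all-visible flags instead of the real
 * visibility map, and that ignores the aggressive/all-frozen distinctions.
 * All names below are invented for the example. The last block is always
 * treated as unskippable, and an all-visible run is only jumped over when it
 * is at least the threshold length; otherwise the scan just keeps reading
 * sequentially.
 */
#include <stdbool.h>
#include <stdint.h>

#define TOY_SKIP_PAGES_THRESHOLD 32

/* Assumes nblocks > 0 and next_block < nblocks. */
static uint32_t
toy_next_block_to_scan(const bool *all_visible, uint32_t nblocks,
					   uint32_t next_block)
{
	uint32_t	next_unskippable = next_block;

	/* Walk forward over all-visible blocks; the last block is never skipped */
	while (next_unskippable < nblocks - 1 && all_visible[next_unskippable])
		next_unskippable++;

	/*
	 * Only jump when the skippable run is long enough to be worth giving up
	 * the benefit of the OS's sequential readahead
	 */
	if (next_unskippable - next_block >= TOY_SKIP_PAGES_THRESHOLD)
		return next_unskippable;

	/* Otherwise process the very next block */
	return next_block;
}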
1177 :
1178 : /*
1179 : * Find the next unskippable block in a vacuum scan using the visibility map.
1180 : * The next unskippable block and its visibility information are updated in
1181 : * vacrel.
1182 : *
1183 : * Note: our opinion of which blocks can be skipped can go stale immediately.
1184 : * It's okay if caller "misses" a page whose all-visible or all-frozen marking
1185 : * was concurrently cleared, though. All that matters is that caller scan all
1186 : * pages whose tuples might contain XIDs < OldestXmin, or MXIDs < OldestMxact.
1187 : * (Actually, non-aggressive VACUUMs can choose to skip all-visible pages with
1188 : * older XIDs/MXIDs. The *skippedallvis flag will be set here when the choice
1189 : * to skip such a range is actually made, making everything safe.)
1190 : */
1191 : static void
1192 303736 : find_next_unskippable_block(LVRelState *vacrel, bool *skipsallvis)
1193 : {
1194 303736 : BlockNumber rel_pages = vacrel->rel_pages;
1195 303736 : BlockNumber next_unskippable_block = vacrel->next_unskippable_block + 1;
1196 303736 : Buffer next_unskippable_vmbuffer = vacrel->next_unskippable_vmbuffer;
1197 : bool next_unskippable_allvis;
1198 :
1199 303736 : *skipsallvis = false;
1200 :
1201 : for (;;)
1202 336836 : {
1203 640572 : uint8 mapbits = visibilitymap_get_status(vacrel->rel,
1204 : next_unskippable_block,
1205 : &next_unskippable_vmbuffer);
1206 :
1207 640572 : next_unskippable_allvis = (mapbits & VISIBILITYMAP_ALL_VISIBLE) != 0;
1208 :
1209 : /*
1210 : * A block is unskippable if it is not all visible according to the
1211 : * visibility map.
1212 : */
1213 640572 : if (!next_unskippable_allvis)
1214 : {
1215 : Assert((mapbits & VISIBILITYMAP_ALL_FROZEN) == 0);
1216 270648 : break;
1217 : }
1218 :
1219 : /*
1220 : * Caller must scan the last page to determine whether it has tuples
1221 : * (caller must have the opportunity to set vacrel->nonempty_pages).
1222 : * This rule avoids having lazy_truncate_heap() take access-exclusive
1223 : * lock on rel to attempt a truncation that fails anyway, just because
1224 : * there are tuples on the last page (it is likely that there will be
1225 : * tuples on other nearby pages as well, but those can be skipped).
1226 : *
1227 : * Implement this by always treating the last block as unsafe to skip.
1228 : */
1229 369924 : if (next_unskippable_block == rel_pages - 1)
1230 32348 : break;
1231 :
1232 : /* DISABLE_PAGE_SKIPPING makes all skipping unsafe */
1233 337576 : if (!vacrel->skipwithvm)
1234 740 : break;
1235 :
1236 : /*
1237 : * Aggressive VACUUM caller can't skip pages just because they are
1238 : * all-visible. They may still skip all-frozen pages, which can't
1239 : * contain XIDs < OldestXmin (XIDs that aren't already frozen by now).
1240 : */
1241 336836 : if ((mapbits & VISIBILITYMAP_ALL_FROZEN) == 0)
1242 : {
1243 4930 : if (vacrel->aggressive)
1244 0 : break;
1245 :
1246 : /*
1247 : * All-visible block is safe to skip in non-aggressive case. But
1248 : * remember that the final range contains such a block for later.
1249 : */
1250 4930 : *skipsallvis = true;
1251 : }
1252 :
1253 336836 : next_unskippable_block++;
1254 : }
1255 :
1256 : /* write the local variables back to vacrel */
1257 303736 : vacrel->next_unskippable_block = next_unskippable_block;
1258 303736 : vacrel->next_unskippable_allvis = next_unskippable_allvis;
1259 303736 : vacrel->next_unskippable_vmbuffer = next_unskippable_vmbuffer;
1260 303736 : }
1261 :
1262 : /*
1263 : * lazy_scan_new_or_empty() -- lazy_scan_heap() new/empty page handling.
1264 : *
1265 : * Must call here to handle both new and empty pages before calling
1266 : * lazy_scan_prune or lazy_scan_noprune, since they're not prepared to deal
1267 : * with new or empty pages.
1268 : *
1269 : * It's necessary to consider new pages as a special case, since the rules for
1270 : * maintaining the visibility map and FSM with empty pages are a little
1271 : * different (though new pages can be truncated away during rel truncation).
1272 : *
1273 : * Empty pages are not really a special case -- they're just heap pages that
1274 : * have no allocated tuples (including even LP_UNUSED items). You might
1275 : * wonder why we need to handle them here all the same. It's only necessary
1276 : * because of a corner-case involving a hard crash during heap relation
1277 : * extension. If we ever make relation-extension crash safe, then it should
1278 : * no longer be necessary to deal with empty pages here (or new pages, for
1279 : * that matter).
1280 : *
1281 : * Caller must hold at least a shared lock. We might need to escalate the
1282 : * lock in that case, so the type of lock the caller holds needs to be
1283 : * specified using the 'sharelock' argument.
1284 : *
1285 : * Returns false in common case where caller should go on to call
1286 : * lazy_scan_prune (or lazy_scan_noprune). Otherwise returns true, indicating
1287 : * that lazy_scan_heap is done processing the page, releasing lock on caller's
1288 : * behalf.
1289 : */
1290 : static bool
1291 410546 : lazy_scan_new_or_empty(LVRelState *vacrel, Buffer buf, BlockNumber blkno,
1292 : Page page, bool sharelock, Buffer vmbuffer)
1293 : {
1294 : Size freespace;
1295 :
1296 410546 : if (PageIsNew(page))
1297 : {
1298 : /*
1299 : * All-zeroes pages can be left over either when a backend extends the
1300 : * relation by a single page but crashes before the newly initialized
1301 : * page has been written out, or when the relation is bulk-extended
1302 : * (which creates a number of empty pages at the tail end of the
1303 : * relation and enters them into the FSM).
1304 : *
1305 : * Note we do not enter the page into the visibilitymap. That has the
1306 : * downside that we repeatedly visit this page in subsequent vacuums,
1307 : * but otherwise we'll never discover the space on a promoted standby.
1308 : * The harm of repeated checking ought to normally not be too bad. The
1309 : * space usually should be used at some point, otherwise there
1310 : * wouldn't be any regular vacuums.
1311 : *
1312 : * Make sure these pages are in the FSM, to ensure they can be reused.
1313 : * Do that by testing if there's any space recorded for the page. If
1314 : * not, enter it. We do so after releasing the lock on the heap page;
1315 : * the FSM is approximate, after all.
1316 : */
1317 1170 : UnlockReleaseBuffer(buf);
1318 :
1319 1170 : if (GetRecordedFreeSpace(vacrel->rel, blkno) == 0)
1320 : {
1321 858 : freespace = BLCKSZ - SizeOfPageHeaderData;
1322 :
1323 858 : RecordPageWithFreeSpace(vacrel->rel, blkno, freespace);
1324 : }
1325 :
1326 1170 : return true;
1327 : }
1328 :
1329 409376 : if (PageIsEmpty(page))
1330 : {
1331 : /*
1332 : * It seems likely that caller will always be able to get a cleanup
1333 : * lock on an empty page. But don't take any chances -- escalate to
1334 : * an exclusive lock (still don't need a cleanup lock, though).
1335 : */
1336 42 : if (sharelock)
1337 : {
1338 0 : LockBuffer(buf, BUFFER_LOCK_UNLOCK);
1339 0 : LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
1340 :
1341 0 : if (!PageIsEmpty(page))
1342 : {
1343 : /* page isn't new or empty -- keep lock and pin for now */
1344 0 : return false;
1345 : }
1346 : }
1347 : else
1348 : {
1349 : /* Already have a full cleanup lock (which is more than enough) */
1350 : }
1351 :
1352 : /*
1353 : * Unlike new pages, empty pages are always set all-visible and
1354 : * all-frozen.
1355 : */
1356 42 : if (!PageIsAllVisible(page))
1357 : {
1358 0 : START_CRIT_SECTION();
1359 :
1360 : /* mark buffer dirty before writing a WAL record */
1361 0 : MarkBufferDirty(buf);
1362 :
1363 : /*
1364 : * It's possible that another backend has extended the heap,
1365 : * initialized the page, and then failed to WAL-log the page due
1366 : * to an ERROR. Since heap extension is not WAL-logged, recovery
1367 : * might try to replay our record setting the page all-visible and
1368 : * find that the page isn't initialized, which will cause a PANIC.
1369 : * To prevent that, check whether the page has been previously
1370 : * WAL-logged, and if not, do that now.
1371 : */
1372 0 : if (RelationNeedsWAL(vacrel->rel) &&
1373 0 : PageGetLSN(page) == InvalidXLogRecPtr)
1374 0 : log_newpage_buffer(buf, true);
1375 :
1376 0 : PageSetAllVisible(page);
1377 0 : visibilitymap_set(vacrel->rel, blkno, buf, InvalidXLogRecPtr,
1378 : vmbuffer, InvalidTransactionId,
1379 : VISIBILITYMAP_ALL_VISIBLE | VISIBILITYMAP_ALL_FROZEN);
1380 0 : END_CRIT_SECTION();
1381 : }
1382 :
1383 42 : freespace = PageGetHeapFreeSpace(page);
1384 42 : UnlockReleaseBuffer(buf);
1385 42 : RecordPageWithFreeSpace(vacrel->rel, blkno, freespace);
1386 42 : return true;
1387 : }
1388 :
1389 : /* page isn't new or empty -- keep lock and pin */
1390 409334 : return false;
1391 : }
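
For concreteness, the free space recorded above for an all-zeroes page works out to BLCKSZ - SizeOfPageHeaderData. A minimal standalone sketch (not part of vacuumlazy.c), assuming the default 8 kB block size and its 24-byte page header:

#include <stddef.h>
#include <stdio.h>

/* Assumed values: the default BLCKSZ and the 24-byte page header of 8 kB pages */
#define BLCKSZ 8192
#define SizeOfPageHeaderData 24

int
main(void)
{
	/* Free space recorded in the FSM for a brand-new (all-zeroes) page */
	size_t		freespace = BLCKSZ - SizeOfPageHeaderData;

	printf("recorded free space: %zu bytes\n", freespace);	/* 8168 */
	return 0;
}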
1392 :
1393 : /* qsort comparator for sorting OffsetNumbers */
1394 : static int
1395 5565892 : cmpOffsetNumbers(const void *a, const void *b)
1396 : {
1397 5565892 : return pg_cmp_u16(*(const OffsetNumber *) a, *(const OffsetNumber *) b);
1398 : }
1399 :
1400 : /*
1401 : * lazy_scan_prune() -- lazy_scan_heap() pruning and freezing.
1402 : *
1403 : * Caller must hold pin and buffer cleanup lock on the buffer.
1404 : *
1405 : * vmbuffer is the buffer containing the VM block with visibility information
1406 : * for the heap block, blkno. all_visible_according_to_vm is the saved
1407 : * visibility status of the heap block looked up earlier by the caller. We
1408 : * won't rely entirely on this status, as it may be out of date.
1409 : *
1410 : * *has_lpdead_items is set to true or false depending on whether, upon return
1411 : * from this function, any LP_DEAD items are still present on the page.
1412 : */
1413 : static void
1414 409324 : lazy_scan_prune(LVRelState *vacrel,
1415 : Buffer buf,
1416 : BlockNumber blkno,
1417 : Page page,
1418 : Buffer vmbuffer,
1419 : bool all_visible_according_to_vm,
1420 : bool *has_lpdead_items)
1421 : {
1422 409324 : Relation rel = vacrel->rel;
1423 : PruneFreezeResult presult;
1424 409324 : int prune_options = 0;
1425 :
1426 : Assert(BufferGetBlockNumber(buf) == blkno);
1427 :
1428 : /*
1429 : * Prune all HOT-update chains and potentially freeze tuples on this page.
1430 : *
1431 : * If the relation has no indexes, we can immediately mark would-be dead
1432 : * items LP_UNUSED.
1433 : *
1434 : * The number of tuples removed from the page is returned in
1435 : * presult.ndeleted. It should not be confused with presult.lpdead_items;
1436 : * the final value of presult.lpdead_items can be thought of as the number
1437 : * of tuples that were deleted from indexes.
1438 : *
1439 : * We will update the VM after collecting LP_DEAD items and freezing
1440 : * tuples. Pruning will have determined whether or not the page is
1441 : * all-visible.
1442 : */
1443 409324 : prune_options = HEAP_PAGE_PRUNE_FREEZE;
1444 409324 : if (vacrel->nindexes == 0)
1445 18752 : prune_options |= HEAP_PAGE_PRUNE_MARK_UNUSED_NOW;
1446 :
1447 409324 : heap_page_prune_and_freeze(rel, buf, vacrel->vistest, prune_options,
1448 : &vacrel->cutoffs, &presult, PRUNE_VACUUM_SCAN,
1449 : &vacrel->offnum,
1450 : &vacrel->NewRelfrozenXid, &vacrel->NewRelminMxid);
1451 :
1452 : Assert(MultiXactIdIsValid(vacrel->NewRelminMxid));
1453 : Assert(TransactionIdIsValid(vacrel->NewRelfrozenXid));
1454 :
1455 409324 : if (presult.nfrozen > 0)
1456 : {
1457 : /*
1458 : * We don't increment the frozen_pages instrumentation counter when
1459 : * nfrozen == 0, since it only counts pages with newly frozen tuples
1460 : * (don't confuse that with pages newly set all-frozen in VM).
1461 : */
1462 32318 : vacrel->frozen_pages++;
1463 : }
1464 :
1465 : /*
1466 : * VACUUM will call heap_page_is_all_visible() during the second pass over
1467 : * the heap to determine all_visible and all_frozen for the page -- this
1468 : * is a specialized version of the logic from this function. Now that
1469 : * we've finished pruning and freezing, make sure that we're in total
1470 : * agreement with heap_page_is_all_visible() using an assertion.
1471 : */
1472 : #ifdef USE_ASSERT_CHECKING
1473 : /* Note that all_frozen value does not matter when !all_visible */
1474 : if (presult.all_visible)
1475 : {
1476 : TransactionId debug_cutoff;
1477 : bool debug_all_frozen;
1478 :
1479 : Assert(presult.lpdead_items == 0);
1480 :
1481 : if (!heap_page_is_all_visible(vacrel, buf,
1482 : &debug_cutoff, &debug_all_frozen))
1483 : Assert(false);
1484 :
1485 : Assert(presult.all_frozen == debug_all_frozen);
1486 :
1487 : Assert(!TransactionIdIsValid(debug_cutoff) ||
1488 : debug_cutoff == presult.vm_conflict_horizon);
1489 : }
1490 : #endif
1491 :
1492 : /*
1493 : * Now save details of the LP_DEAD items from the page in vacrel
1494 : */
1495 409324 : if (presult.lpdead_items > 0)
1496 : {
1497 26732 : vacrel->lpdead_item_pages++;
1498 :
1499 : /*
1500 : * deadoffsets are collected incrementally in
1501 : * heap_page_prune_and_freeze() as each dead line pointer is recorded,
1502 : * in an indeterminate order, but dead_items_add requires them to be
1503 : * sorted.
1504 : */
1505 26732 : qsort(presult.deadoffsets, presult.lpdead_items, sizeof(OffsetNumber),
1506 : cmpOffsetNumbers);
1507 :
1508 26732 : dead_items_add(vacrel, blkno, presult.deadoffsets, presult.lpdead_items);
1509 : }
1510 :
1511 : /* Finally, add page-local counts to whole-VACUUM counts */
1512 409324 : vacrel->tuples_deleted += presult.ndeleted;
1513 409324 : vacrel->tuples_frozen += presult.nfrozen;
1514 409324 : vacrel->lpdead_items += presult.lpdead_items;
1515 409324 : vacrel->live_tuples += presult.live_tuples;
1516 409324 : vacrel->recently_dead_tuples += presult.recently_dead_tuples;
1517 :
1518 : /* Can't truncate this page */
1519 409324 : if (presult.hastup)
1520 395558 : vacrel->nonempty_pages = blkno + 1;
1521 :
1522 : /* Did we find LP_DEAD items? */
1523 409324 : *has_lpdead_items = (presult.lpdead_items > 0);
1524 :
1525 : Assert(!presult.all_visible || !(*has_lpdead_items));
1526 :
1527 : /*
1528 : * Handle setting visibility map bit based on information from the VM (as
1529 : * of last heap_vac_scan_next_block() call), and from all_visible and
1530 : * all_frozen variables
1531 : */
1532 409324 : if (!all_visible_according_to_vm && presult.all_visible)
1533 53386 : {
1534 53386 : uint8 flags = VISIBILITYMAP_ALL_VISIBLE;
1535 :
1536 53386 : if (presult.all_frozen)
1537 : {
1538 : Assert(!TransactionIdIsValid(presult.vm_conflict_horizon));
1539 40104 : flags |= VISIBILITYMAP_ALL_FROZEN;
1540 : }
1541 :
1542 : /*
1543 : * It should never be the case that the visibility map page is set
1544 : * while the page-level bit is clear, but the reverse is allowed (if
1545 : * checksums are not enabled). Regardless, set both bits so that we
1546 : * get back in sync.
1547 : *
1548 : * NB: If the heap page is all-visible but the VM bit is not set, we
1549 : * don't need to dirty the heap page. However, if checksums are
1550 : * enabled, we do need to make sure that the heap page is dirtied
1551 : * before passing it to visibilitymap_set(), because it may be logged.
1552 : * Given that this situation should only happen in rare cases after a
1553 : * crash, it is not worth optimizing.
1554 : */
1555 53386 : PageSetAllVisible(page);
1556 53386 : MarkBufferDirty(buf);
1557 53386 : visibilitymap_set(vacrel->rel, blkno, buf, InvalidXLogRecPtr,
1558 : vmbuffer, presult.vm_conflict_horizon,
1559 : flags);
1560 : }
1561 :
1562 : /*
1563 : * As of PostgreSQL 9.2, the visibility map bit should never be set if the
1564 : * page-level bit is clear. However, it's possible that the bit got
1565 : * cleared after heap_vac_scan_next_block() was called, so we must recheck
1566 : * with buffer lock before concluding that the VM is corrupt.
1567 : */
1568 355938 : else if (all_visible_according_to_vm && !PageIsAllVisible(page) &&
1569 0 : visibilitymap_get_status(vacrel->rel, blkno, &vmbuffer) != 0)
1570 : {
1571 0 : elog(WARNING, "page is not marked all-visible but visibility map bit is set in relation \"%s\" page %u",
1572 : vacrel->relname, blkno);
1573 0 : visibilitymap_clear(vacrel->rel, blkno, vmbuffer,
1574 : VISIBILITYMAP_VALID_BITS);
1575 : }
1576 :
1577 : /*
1578 : * It's possible for the value returned by
1579 : * GetOldestNonRemovableTransactionId() to move backwards, so it's not
1580 : * wrong for us to see tuples that appear to not be visible to everyone
1581 : * yet, while PD_ALL_VISIBLE is already set. The real safe xmin value
1582 : * never moves backwards, but GetOldestNonRemovableTransactionId() is
1583 : * conservative and sometimes returns a value that's unnecessarily small,
1584 : * so if we see that contradiction it just means that the tuples that we
1585 : * think are not visible to everyone yet actually are, and the
1586 : * PD_ALL_VISIBLE flag is correct.
1587 : *
1588 : * There should never be LP_DEAD items on a page with PD_ALL_VISIBLE set,
1589 : * however.
1590 : */
1591 355938 : else if (presult.lpdead_items > 0 && PageIsAllVisible(page))
1592 : {
1593 0 : elog(WARNING, "page containing LP_DEAD items is marked as all-visible in relation \"%s\" page %u",
1594 : vacrel->relname, blkno);
1595 0 : PageClearAllVisible(page);
1596 0 : MarkBufferDirty(buf);
1597 0 : visibilitymap_clear(vacrel->rel, blkno, vmbuffer,
1598 : VISIBILITYMAP_VALID_BITS);
1599 : }
1600 :
1601 : /*
1602 : * If the all-visible page is all-frozen but not marked as such yet, mark
1603 : * it as all-frozen. Note that all_frozen is only valid if all_visible is
1604 : * true, so we must check both all_visible and all_frozen.
1605 : */
1606 355938 : else if (all_visible_according_to_vm && presult.all_visible &&
1607 139854 : presult.all_frozen && !VM_ALL_FROZEN(vacrel->rel, blkno, &vmbuffer))
1608 : {
1609 : /*
1610 : * Avoid relying on all_visible_according_to_vm as a proxy for the
1611 : * page-level PD_ALL_VISIBLE bit being set, since it might have become
1612 : * stale -- even when all_visible is set
1613 : */
1614 22 : if (!PageIsAllVisible(page))
1615 : {
1616 0 : PageSetAllVisible(page);
1617 0 : MarkBufferDirty(buf);
1618 : }
1619 :
1620 : /*
1621 : * Set the page all-frozen (and all-visible) in the VM.
1622 : *
1623 : * We can pass InvalidTransactionId as our cutoff_xid, since a
1624 : * snapshotConflictHorizon sufficient to make everything safe for REDO
1625 : * was logged when the page's tuples were frozen.
1626 : */
1627 : Assert(!TransactionIdIsValid(presult.vm_conflict_horizon));
1628 22 : visibilitymap_set(vacrel->rel, blkno, buf, InvalidXLogRecPtr,
1629 : vmbuffer, InvalidTransactionId,
1630 : VISIBILITYMAP_ALL_VISIBLE |
1631 : VISIBILITYMAP_ALL_FROZEN);
1632 : }
1633 409324 : }
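
The visibility-map updates above only ever set ALL_FROZEN together with ALL_VISIBLE, since all_frozen is meaningless on its own. A small standalone sketch of that flag selection, with flag values assumed to mirror visibilitymap.h:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Flag values assumed to mirror visibilitymap.h (two bits per heap page) */
#define VISIBILITYMAP_ALL_VISIBLE	0x01
#define VISIBILITYMAP_ALL_FROZEN	0x02

/* Sketch of the flag selection made once pruning/freezing has settled
 * all_visible and all_frozen for a page */
static uint8_t
vm_flags_for_page(bool all_visible, bool all_frozen)
{
	uint8_t		flags = 0;

	if (!all_visible)
		return 0;				/* all_frozen only matters with all_visible */

	flags = VISIBILITYMAP_ALL_VISIBLE;
	if (all_frozen)
		flags |= VISIBILITYMAP_ALL_FROZEN;
	return flags;
}

int
main(void)
{
	printf("0x%02x\n", (unsigned) vm_flags_for_page(true, false));	/* 0x01 */
	printf("0x%02x\n", (unsigned) vm_flags_for_page(true, true));	/* 0x03 */
	printf("0x%02x\n", (unsigned) vm_flags_for_page(false, true));	/* 0x00 */
	return 0;
}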
1634 :
1635 : /*
1636 : * lazy_scan_noprune() -- lazy_scan_prune() without pruning or freezing
1637 : *
1638 : * Caller need only hold a pin and share lock on the buffer, unlike
1639 : * lazy_scan_prune, which requires a full cleanup lock. While pruning isn't
1640 : * performed here, it's quite possible that an earlier opportunistic pruning
1641 : * operation left LP_DEAD items behind. We'll at least collect any such items
1642 : * in dead_items for removal from indexes.
1643 : *
1644 : * For aggressive VACUUM callers, we may return false to indicate that a full
1645 : * cleanup lock is required for processing by lazy_scan_prune. This is only
1646 : * necessary when the aggressive VACUUM needs to freeze some tuple XIDs from
1647 : * one or more tuples on the page. We always return true for non-aggressive
1648 : * callers.
1649 : *
1650 : * If this function returns true, *has_lpdead_items gets set to true or false
1651 : * depending on whether, upon return from this function, any LP_DEAD items are
1652 : * present on the page. If this function returns false, *has_lpdead_items
1653 : * is not updated.
1654 : */
1655 : static bool
1656 10 : lazy_scan_noprune(LVRelState *vacrel,
1657 : Buffer buf,
1658 : BlockNumber blkno,
1659 : Page page,
1660 : bool *has_lpdead_items)
1661 : {
1662 : OffsetNumber offnum,
1663 : maxoff;
1664 : int lpdead_items,
1665 : live_tuples,
1666 : recently_dead_tuples,
1667 : missed_dead_tuples;
1668 : bool hastup;
1669 : HeapTupleHeader tupleheader;
1670 10 : TransactionId NoFreezePageRelfrozenXid = vacrel->NewRelfrozenXid;
1671 10 : MultiXactId NoFreezePageRelminMxid = vacrel->NewRelminMxid;
1672 : OffsetNumber deadoffsets[MaxHeapTuplesPerPage];
1673 :
1674 : Assert(BufferGetBlockNumber(buf) == blkno);
1675 :
1676 10 : hastup = false; /* for now */
1677 :
1678 10 : lpdead_items = 0;
1679 10 : live_tuples = 0;
1680 10 : recently_dead_tuples = 0;
1681 10 : missed_dead_tuples = 0;
1682 :
1683 10 : maxoff = PageGetMaxOffsetNumber(page);
1684 274 : for (offnum = FirstOffsetNumber;
1685 : offnum <= maxoff;
1686 264 : offnum = OffsetNumberNext(offnum))
1687 : {
1688 : ItemId itemid;
1689 : HeapTupleData tuple;
1690 :
1691 264 : vacrel->offnum = offnum;
1692 264 : itemid = PageGetItemId(page, offnum);
1693 :
1694 264 : if (!ItemIdIsUsed(itemid))
1695 38 : continue;
1696 :
1697 228 : if (ItemIdIsRedirected(itemid))
1698 : {
1699 2 : hastup = true;
1700 2 : continue;
1701 : }
1702 :
1703 226 : if (ItemIdIsDead(itemid))
1704 : {
1705 : /*
1706 : * Deliberately don't set hastup=true here. See same point in
1707 : * lazy_scan_prune for an explanation.
1708 : */
1709 0 : deadoffsets[lpdead_items++] = offnum;
1710 0 : continue;
1711 : }
1712 :
1713 226 : hastup = true; /* page prevents rel truncation */
1714 226 : tupleheader = (HeapTupleHeader) PageGetItem(page, itemid);
1715 226 : if (heap_tuple_should_freeze(tupleheader, &vacrel->cutoffs,
1716 : &NoFreezePageRelfrozenXid,
1717 : &NoFreezePageRelminMxid))
1718 : {
1719 : /* Tuple with XID < FreezeLimit (or MXID < MultiXactCutoff) */
1720 128 : if (vacrel->aggressive)
1721 : {
1722 : /*
1723 : * Aggressive VACUUMs must always be able to advance rel's
1724 : * relfrozenxid to a value >= FreezeLimit (and be able to
1725 : * advance rel's relminmxid to a value >= MultiXactCutoff).
1726 : * The ongoing aggressive VACUUM won't be able to do that
1727 : * unless it can freeze an XID (or MXID) from this tuple now.
1728 : *
1729 : * The only safe option is to have caller perform processing
1730 : * of this page using lazy_scan_prune. Caller might have to
1731 : * wait a while for a cleanup lock, but it can't be helped.
1732 : */
1733 0 : vacrel->offnum = InvalidOffsetNumber;
1734 0 : return false;
1735 : }
1736 :
1737 : /*
1738 : * Non-aggressive VACUUMs are under no obligation to advance
1739 : * relfrozenxid (even by one XID). We can be much laxer here.
1740 : *
1741 : * Currently we always just accept an older final relfrozenxid
1742 : * and/or relminmxid value. We never make caller wait or work a
1743 : * little harder, even when it likely makes sense to do so.
1744 : */
1745 : }
1746 :
1747 226 : ItemPointerSet(&(tuple.t_self), blkno, offnum);
1748 226 : tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
1749 226 : tuple.t_len = ItemIdGetLength(itemid);
1750 226 : tuple.t_tableOid = RelationGetRelid(vacrel->rel);
1751 :
1752 226 : switch (HeapTupleSatisfiesVacuum(&tuple, vacrel->cutoffs.OldestXmin,
1753 : buf))
1754 : {
1755 220 : case HEAPTUPLE_DELETE_IN_PROGRESS:
1756 : case HEAPTUPLE_LIVE:
1757 :
1758 : /*
1759 : * Count both cases as live, just like lazy_scan_prune
1760 : */
1761 220 : live_tuples++;
1762 :
1763 220 : break;
1764 2 : case HEAPTUPLE_DEAD:
1765 :
1766 : /*
1767 : * There is some useful work for pruning to do that won't be
1768 : * done due to failure to get a cleanup lock.
1769 : */
1770 2 : missed_dead_tuples++;
1771 2 : break;
1772 4 : case HEAPTUPLE_RECENTLY_DEAD:
1773 :
1774 : /*
1775 : * Count in recently_dead_tuples, just like lazy_scan_prune
1776 : */
1777 4 : recently_dead_tuples++;
1778 4 : break;
1779 0 : case HEAPTUPLE_INSERT_IN_PROGRESS:
1780 :
1781 : /*
1782 : * Do not count these rows as live, just like lazy_scan_prune
1783 : */
1784 0 : break;
1785 0 : default:
1786 0 : elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
1787 : break;
1788 : }
1789 : }
1790 :
1791 10 : vacrel->offnum = InvalidOffsetNumber;
1792 :
1793 : /*
1794 : * By here we know for sure that caller can put off freezing and pruning
1795 : * this particular page until the next VACUUM. Remember its details now.
1796 : * (lazy_scan_prune expects a clean slate, so we have to do this last.)
1797 : */
1798 10 : vacrel->NewRelfrozenXid = NoFreezePageRelfrozenXid;
1799 10 : vacrel->NewRelminMxid = NoFreezePageRelminMxid;
1800 :
1801 : /* Save any LP_DEAD items found on the page in dead_items */
1802 10 : if (vacrel->nindexes == 0)
1803 : {
1804 : /* Using one-pass strategy (since table has no indexes) */
1805 0 : if (lpdead_items > 0)
1806 : {
1807 : /*
1808 : * Perfunctory handling for the corner case where a single pass
1809 : * strategy VACUUM cannot get a cleanup lock, and it turns out
1810 : * that there are one or more LP_DEAD items: just count the LP_DEAD
1811 : * items as missed_dead_tuples instead. (This is a bit dishonest,
1812 : * but it beats having to maintain specialized heap vacuuming code
1813 : * forever, for vanishingly little benefit.)
1814 : */
1815 0 : hastup = true;
1816 0 : missed_dead_tuples += lpdead_items;
1817 : }
1818 : }
1819 10 : else if (lpdead_items > 0)
1820 : {
1821 : /*
1822 : * Page has LP_DEAD items, and so any references/TIDs that remain in
1823 : * indexes will be deleted during index vacuuming (and then marked
1824 : * LP_UNUSED in the heap)
1825 : */
1826 0 : vacrel->lpdead_item_pages++;
1827 :
1828 0 : dead_items_add(vacrel, blkno, deadoffsets, lpdead_items);
1829 :
1830 0 : vacrel->lpdead_items += lpdead_items;
1831 : }
1832 :
1833 : /*
1834 : * Finally, add relevant page-local counts to whole-VACUUM counts
1835 : */
1836 10 : vacrel->live_tuples += live_tuples;
1837 10 : vacrel->recently_dead_tuples += recently_dead_tuples;
1838 10 : vacrel->missed_dead_tuples += missed_dead_tuples;
1839 10 : if (missed_dead_tuples > 0)
1840 2 : vacrel->missed_dead_pages++;
1841 :
1842 : /* Can't truncate this page */
1843 10 : if (hastup)
1844 10 : vacrel->nonempty_pages = blkno + 1;
1845 :
1846 : /* Did we find LP_DEAD items? */
1847 10 : *has_lpdead_items = (lpdead_items > 0);
1848 :
1849 : /* Caller won't need to call lazy_scan_prune with same page */
1850 10 : return true;
1851 : }
1852 :
1853 : /*
1854 : * Main entry point for index vacuuming and heap vacuuming.
1855 : *
1856 : * Removes items collected in dead_items from table's indexes, then marks the
1857 : * same items LP_UNUSED in the heap. See the comments above lazy_scan_heap
1858 : * for full details.
1859 : *
1860 : * Also empties dead_items, freeing up space for later TIDs.
1861 : *
1862 : * We may choose to bypass index vacuuming at this point, though only when the
1863 : * ongoing VACUUM operation will definitely only have one index scan/round of
1864 : * index vacuuming.
1865 : */
1866 : static void
1867 1012 : lazy_vacuum(LVRelState *vacrel)
1868 : {
1869 : bool bypass;
1870 :
1871 : /* Should not end up here with no indexes */
1872 : Assert(vacrel->nindexes > 0);
1873 : Assert(vacrel->lpdead_item_pages > 0);
1874 :
1875 1012 : if (!vacrel->do_index_vacuuming)
1876 : {
1877 : Assert(!vacrel->do_index_cleanup);
1878 30 : dead_items_reset(vacrel);
1879 30 : return;
1880 : }
1881 :
1882 : /*
1883 : * Consider bypassing index vacuuming (and heap vacuuming) entirely.
1884 : *
1885 : * We currently only do this in cases where the number of LP_DEAD items
1886 : * for the entire VACUUM operation is close to zero. This avoids sharp
1887 : * discontinuities in the duration and overhead of successive VACUUM
1888 : * operations that run against the same table with a fixed workload.
1889 : * Ideally, successive VACUUM operations will behave as if there are
1890 : * exactly zero LP_DEAD items in cases where there are close to zero.
1891 : *
1892 : * This is likely to be helpful with a table that is continually affected
1893 : * by UPDATEs that can mostly apply the HOT optimization, but occasionally
1894 : * have small aberrations that lead to just a few heap pages retaining
1895 : * only one or two LP_DEAD items. This is pretty common; even when the
1896 : * DBA goes out of their way to make UPDATEs use HOT, it is practically
1897 : * impossible to predict whether HOT will be applied in 100% of cases.
1898 : * It's far easier to ensure that 99%+ of all UPDATEs against a table use
1899 : * HOT through careful tuning.
1900 : */
1901 982 : bypass = false;
1902 982 : if (vacrel->consider_bypass_optimization && vacrel->rel_pages > 0)
1903 : {
1904 : BlockNumber threshold;
1905 :
1906 : Assert(vacrel->num_index_scans == 0);
1907 : Assert(vacrel->lpdead_items == vacrel->dead_items_info->num_items);
1908 : Assert(vacrel->do_index_vacuuming);
1909 : Assert(vacrel->do_index_cleanup);
1910 :
1911 : /*
1912 : * This crossover point at which we'll start to do index vacuuming is
1913 : * expressed as a percentage of the total number of heap pages in the
1914 : * table that are known to have at least one LP_DEAD item. This is
1915 : * much more important than the total number of LP_DEAD items, since
1916 : * it's a proxy for the number of heap pages whose visibility map bits
1917 : * cannot be set on account of bypassing index and heap vacuuming.
1918 : *
1919 : * We apply one further precautionary test: the space currently used
1920 : * to store the TIDs (TIDs that now all point to LP_DEAD items) must
1921 : * not exceed 32MB. This limits the risk that we will bypass index
1922 : * vacuuming again and again until eventually there is a VACUUM whose
1923 : * dead_items space is not CPU cache resident.
1924 : *
1925 : * We don't take any special steps to remember the LP_DEAD items (such
1926 : * as counting them in our final update to the stats system) when the
1927 : * optimization is applied. Though the accounting used in analyze.c's
1928 : * acquire_sample_rows() will recognize the same LP_DEAD items as dead
1929 : * rows in its own stats report, that's okay. The discrepancy should
1930 : * be negligible. If this optimization is ever expanded to cover more
1931 : * cases then this may need to be reconsidered.
1932 : */
1933 964 : threshold = (double) vacrel->rel_pages * BYPASS_THRESHOLD_PAGES;
1934 968 : bypass = (vacrel->lpdead_item_pages < threshold &&
1935 4 : (TidStoreMemoryUsage(vacrel->dead_items) < (32L * 1024L * 1024L)));
1936 : }
1937 :
1938 982 : if (bypass)
1939 : {
1940 : /*
1941 : * There are almost zero TIDs. Behave as if there were precisely
1942 : * zero: bypass index vacuuming, but do index cleanup.
1943 : *
1944 : * We expect that the ongoing VACUUM operation will finish very
1945 : * quickly, so there is no point in considering speeding up as a
1946 : * failsafe against wraparound failure. (Index cleanup is expected to
1947 : * finish very quickly in cases where there were no ambulkdelete()
1948 : * calls.)
1949 : */
1950 4 : vacrel->do_index_vacuuming = false;
1951 : }
1952 978 : else if (lazy_vacuum_all_indexes(vacrel))
1953 : {
1954 : /*
1955 : * We successfully completed a round of index vacuuming. Do related
1956 : * heap vacuuming now.
1957 : */
1958 978 : lazy_vacuum_heap_rel(vacrel);
1959 : }
1960 : else
1961 : {
1962 : /*
1963 : * Failsafe case.
1964 : *
1965 : * We attempted index vacuuming, but didn't finish a full round/full
1966 : * index scan. This happens when relfrozenxid or relminmxid is too
1967 : * far in the past.
1968 : *
1969 : * From this point on the VACUUM operation will do no further index
1970 : * vacuuming or heap vacuuming. This VACUUM operation won't end up
1971 : * back here again.
1972 : */
1973 : Assert(VacuumFailsafeActive);
1974 : }
1975 :
1976 : /*
1977 : * Forget the LP_DEAD items that we just vacuumed (or just decided to not
1978 : * vacuum)
1979 : */
1980 982 : dead_items_reset(vacrel);
1981 : }
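
A standalone sketch of the bypass crossover arithmetic described above, reusing BYPASS_THRESHOLD_PAGES from the top of this file and the 32MB cap mentioned in the comment; the helper name and plain integer types are illustrative, not from vacuumlazy.c:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Constant from earlier in this file */
#define BYPASS_THRESHOLD_PAGES	0.02	/* i.e. 2% of rel_pages */

/* Illustrative helper: dead_items_bytes stands in for
 * TidStoreMemoryUsage(vacrel->dead_items) */
static bool
would_bypass_index_vacuuming(uint32_t rel_pages,
							 uint32_t lpdead_item_pages,
							 uint64_t dead_items_bytes)
{
	uint32_t	threshold = (uint32_t) ((double) rel_pages * BYPASS_THRESHOLD_PAGES);

	return lpdead_item_pages < threshold &&
		dead_items_bytes < (uint64_t) 32 * 1024 * 1024;
}

int
main(void)
{
	/* 100,000-page table: the crossover is 2,000 pages with LP_DEAD items */
	printf("%d\n", would_bypass_index_vacuuming(100000, 1500, 4UL << 20));	/* 1 */
	printf("%d\n", would_bypass_index_vacuuming(100000, 2500, 4UL << 20));	/* 0 */
	/* Few pages, but dead_items already exceeds the 32MB cache-residency cap */
	printf("%d\n", would_bypass_index_vacuuming(100000, 1500, 64UL << 20));	/* 0 */
	return 0;
}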
1982 :
1983 : /*
1984 : * lazy_vacuum_all_indexes() -- Main entry for index vacuuming
1985 : *
1986 : * Returns true in the common case when all indexes were successfully
1987 : * vacuumed. Returns false in rare cases where we determined that the ongoing
1988 : * VACUUM operation is at risk of taking too long to finish, leading to
1989 : * wraparound failure.
1990 : */
1991 : static bool
1992 978 : lazy_vacuum_all_indexes(LVRelState *vacrel)
1993 : {
1994 978 : bool allindexes = true;
1995 978 : double old_live_tuples = vacrel->rel->rd_rel->reltuples;
1996 978 : const int progress_start_index[] = {
1997 : PROGRESS_VACUUM_PHASE,
1998 : PROGRESS_VACUUM_INDEXES_TOTAL
1999 : };
2000 978 : const int progress_end_index[] = {
2001 : PROGRESS_VACUUM_INDEXES_TOTAL,
2002 : PROGRESS_VACUUM_INDEXES_PROCESSED,
2003 : PROGRESS_VACUUM_NUM_INDEX_VACUUMS
2004 : };
2005 : int64 progress_start_val[2];
2006 : int64 progress_end_val[3];
2007 :
2008 : Assert(vacrel->nindexes > 0);
2009 : Assert(vacrel->do_index_vacuuming);
2010 : Assert(vacrel->do_index_cleanup);
2011 :
2012 : /* Precheck for XID wraparound emergencies */
2013 978 : if (lazy_check_wraparound_failsafe(vacrel))
2014 : {
2015 : /* Wraparound emergency -- don't even start an index scan */
2016 0 : return false;
2017 : }
2018 :
2019 : /*
2020 : * Report that we are now vacuuming indexes and the number of indexes to
2021 : * vacuum.
2022 : */
2023 978 : progress_start_val[0] = PROGRESS_VACUUM_PHASE_VACUUM_INDEX;
2024 978 : progress_start_val[1] = vacrel->nindexes;
2025 978 : pgstat_progress_update_multi_param(2, progress_start_index, progress_start_val);
2026 :
2027 978 : if (!ParallelVacuumIsActive(vacrel))
2028 : {
2029 2876 : for (int idx = 0; idx < vacrel->nindexes; idx++)
2030 : {
2031 1912 : Relation indrel = vacrel->indrels[idx];
2032 1912 : IndexBulkDeleteResult *istat = vacrel->indstats[idx];
2033 :
2034 1912 : vacrel->indstats[idx] = lazy_vacuum_one_index(indrel, istat,
2035 : old_live_tuples,
2036 : vacrel);
2037 :
2038 : /* Report the number of indexes vacuumed */
2039 1912 : pgstat_progress_update_param(PROGRESS_VACUUM_INDEXES_PROCESSED,
2040 1912 : idx + 1);
2041 :
2042 1912 : if (lazy_check_wraparound_failsafe(vacrel))
2043 : {
2044 : /* Wraparound emergency -- end current index scan */
2045 0 : allindexes = false;
2046 0 : break;
2047 : }
2048 : }
2049 : }
2050 : else
2051 : {
2052 : /* Outsource everything to parallel variant */
2053 14 : parallel_vacuum_bulkdel_all_indexes(vacrel->pvs, old_live_tuples,
2054 : vacrel->num_index_scans);
2055 :
2056 : /*
2057 : * Do a postcheck to consider applying wraparound failsafe now. Note
2058 : * that parallel VACUUM only gets the precheck and this postcheck.
2059 : */
2060 14 : if (lazy_check_wraparound_failsafe(vacrel))
2061 0 : allindexes = false;
2062 : }
2063 :
2064 : /*
2065 : * We delete all LP_DEAD items from the first heap pass in all indexes on
2066 : * each call here (except calls where we choose to do the failsafe). This
2067 : * makes the next call to lazy_vacuum_heap_rel() safe (except in the event
2068 : * of the failsafe triggering, which prevents the next call from taking
2069 : * place).
2070 : */
2071 : Assert(vacrel->num_index_scans > 0 ||
2072 : vacrel->dead_items_info->num_items == vacrel->lpdead_items);
2073 : Assert(allindexes || VacuumFailsafeActive);
2074 :
2075 : /*
2076 : * Increase and report the number of index scans. Also, we reset
2077 : * PROGRESS_VACUUM_INDEXES_TOTAL and PROGRESS_VACUUM_INDEXES_PROCESSED.
2078 : *
2079 : * We deliberately include the case where we started a round of bulk
2080 : * deletes that we weren't able to finish due to the failsafe triggering.
2081 : */
2082 978 : vacrel->num_index_scans++;
2083 978 : progress_end_val[0] = 0;
2084 978 : progress_end_val[1] = 0;
2085 978 : progress_end_val[2] = vacrel->num_index_scans;
2086 978 : pgstat_progress_update_multi_param(3, progress_end_index, progress_end_val);
2087 :
2088 978 : return allindexes;
2089 : }
2090 :
2091 : /*
2092 : * lazy_vacuum_heap_rel() -- second pass over the heap for two pass strategy
2093 : *
2094 : * This routine marks LP_DEAD items in vacrel->dead_items as LP_UNUSED. Pages
2095 : * that never had lazy_scan_prune record LP_DEAD items are not visited at all.
2096 : *
2097 : * We may also be able to truncate the line pointer array of the heap pages we
2098 : * visit. If there is a contiguous group of LP_UNUSED items at the end of the
2099 : * array, it can be reclaimed as free space. These LP_UNUSED items usually
2100 : * start out as LP_DEAD items recorded by lazy_scan_prune (we set items from
2101 : * each page to LP_UNUSED, and then consider if it's possible to truncate the
2102 : * page's line pointer array).
2103 : *
2104 : * Note: the reason for doing this as a second pass is that we cannot remove
2105 : * the tuples until we've removed their index entries, and we want to process
2106 : * index entry removal in batches as large as possible.
2107 : */
2108 : static void
2109 978 : lazy_vacuum_heap_rel(LVRelState *vacrel)
2110 : {
2111 978 : BlockNumber vacuumed_pages = 0;
2112 978 : Buffer vmbuffer = InvalidBuffer;
2113 : LVSavedErrInfo saved_err_info;
2114 : TidStoreIter *iter;
2115 : TidStoreIterResult *iter_result;
2116 :
2117 : Assert(vacrel->do_index_vacuuming);
2118 : Assert(vacrel->do_index_cleanup);
2119 : Assert(vacrel->num_index_scans > 0);
2120 :
2121 : /* Report that we are now vacuuming the heap */
2122 978 : pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
2123 : PROGRESS_VACUUM_PHASE_VACUUM_HEAP);
2124 :
2125 : /* Update error traceback information */
2126 978 : update_vacuum_error_info(vacrel, &saved_err_info,
2127 : VACUUM_ERRCB_PHASE_VACUUM_HEAP,
2128 : InvalidBlockNumber, InvalidOffsetNumber);
2129 :
2130 978 : iter = TidStoreBeginIterate(vacrel->dead_items);
2131 23008 : while ((iter_result = TidStoreIterateNext(iter)) != NULL)
2132 : {
2133 : BlockNumber blkno;
2134 : Buffer buf;
2135 : Page page;
2136 : Size freespace;
2137 : OffsetNumber offsets[MaxOffsetNumber];
2138 : int num_offsets;
2139 :
2140 22030 : vacuum_delay_point();
2141 :
2142 22030 : blkno = iter_result->blkno;
2143 22030 : vacrel->blkno = blkno;
2144 :
2145 22030 : num_offsets = TidStoreGetBlockOffsets(iter_result, offsets, lengthof(offsets));
2146 : Assert(num_offsets <= lengthof(offsets));
2147 :
2148 : /*
2149 : * Pin the visibility map page in case we need to mark the page
2150 : * all-visible. In most cases this will be very cheap, because we'll
2151 : * already have the correct page pinned anyway.
2152 : */
2153 22030 : visibilitymap_pin(vacrel->rel, blkno, &vmbuffer);
2154 :
2155 : /* We need a non-cleanup exclusive lock to mark dead_items unused */
2156 22030 : buf = ReadBufferExtended(vacrel->rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
2157 : vacrel->bstrategy);
2158 22030 : LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
2159 22030 : lazy_vacuum_heap_page(vacrel, blkno, buf, offsets,
2160 : num_offsets, vmbuffer);
2161 :
2162 : /* Now that we've vacuumed the page, record its available space */
2163 22030 : page = BufferGetPage(buf);
2164 22030 : freespace = PageGetHeapFreeSpace(page);
2165 :
2166 22030 : UnlockReleaseBuffer(buf);
2167 22030 : RecordPageWithFreeSpace(vacrel->rel, blkno, freespace);
2168 22030 : vacuumed_pages++;
2169 : }
2170 978 : TidStoreEndIterate(iter);
2171 :
2172 978 : vacrel->blkno = InvalidBlockNumber;
2173 978 : if (BufferIsValid(vmbuffer))
2174 978 : ReleaseBuffer(vmbuffer);
2175 :
2176 : /*
2177 : * We set all LP_DEAD items from the first heap pass to LP_UNUSED during
2178 : * the second heap pass. No more, no less.
2179 : */
2180 : Assert(vacrel->num_index_scans > 1 ||
2181 : (vacrel->dead_items_info->num_items == vacrel->lpdead_items &&
2182 : vacuumed_pages == vacrel->lpdead_item_pages));
2183 :
2184 978 : ereport(DEBUG2,
2185 : (errmsg("table \"%s\": removed %lld dead item identifiers in %u pages",
2186 : vacrel->relname, (long long) vacrel->dead_items_info->num_items,
2187 : vacuumed_pages)));
2188 :
2189 : /* Revert to the previous phase information for error traceback */
2190 978 : restore_vacuum_error_info(vacrel, &saved_err_info);
2191 978 : }
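
As a toy illustration of the second pass, the sketch below (not PostgreSQL code) walks a per-block list of dead offsets in block order; blocks with no recorded LP_DEAD items are simply never visited:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

typedef uint32_t BlockNumber;
typedef uint16_t OffsetNumber;

/* Toy stand-in for the dead_items TID store: per-block lists of LP_DEAD
 * offsets, iterated in block order */
struct dead_block
{
	BlockNumber blkno;
	int			num_offsets;
	OffsetNumber offsets[4];
};

int
main(void)
{
	struct dead_block dead_items[] = {
		{3, 2, {5, 9}},
		{17, 3, {1, 2, 8}},
	};

	for (size_t i = 0; i < sizeof(dead_items) / sizeof(dead_items[0]); i++)
	{
		/* One buffer read, exclusive lock, and WAL record per listed block;
		 * blocks without recorded LP_DEAD items are never visited at all */
		printf("block %u: set %d item(s) LP_UNUSED\n",
			   (unsigned) dead_items[i].blkno, dead_items[i].num_offsets);
	}
	return 0;
}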
2192 :
2193 : /*
2194 : * lazy_vacuum_heap_page() -- free page's LP_DEAD items listed in the
2195 : * vacrel->dead_items store.
2196 : *
2197 : * Caller must have an exclusive buffer lock on the buffer (though a full
2198 : * cleanup lock is also acceptable). vmbuffer must be valid and already have
2199 : * a pin on blkno's visibility map page.
2200 : */
2201 : static void
2202 22030 : lazy_vacuum_heap_page(LVRelState *vacrel, BlockNumber blkno, Buffer buffer,
2203 : OffsetNumber *deadoffsets, int num_offsets,
2204 : Buffer vmbuffer)
2205 : {
2206 22030 : Page page = BufferGetPage(buffer);
2207 : OffsetNumber unused[MaxHeapTuplesPerPage];
2208 22030 : int nunused = 0;
2209 : TransactionId visibility_cutoff_xid;
2210 : bool all_frozen;
2211 : LVSavedErrInfo saved_err_info;
2212 :
2213 : Assert(vacrel->do_index_vacuuming);
2214 :
2215 22030 : pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_VACUUMED, blkno);
2216 :
2217 : /* Update error traceback information */
2218 22030 : update_vacuum_error_info(vacrel, &saved_err_info,
2219 : VACUUM_ERRCB_PHASE_VACUUM_HEAP, blkno,
2220 : InvalidOffsetNumber);
2221 :
2222 22030 : START_CRIT_SECTION();
2223 :
2224 1473808 : for (int i = 0; i < num_offsets; i++)
2225 : {
2226 : ItemId itemid;
2227 1451778 : OffsetNumber toff = deadoffsets[i];
2228 :
2229 1451778 : itemid = PageGetItemId(page, toff);
2230 :
2231 : Assert(ItemIdIsDead(itemid) && !ItemIdHasStorage(itemid));
2232 1451778 : ItemIdSetUnused(itemid);
2233 1451778 : unused[nunused++] = toff;
2234 : }
2235 :
2236 : Assert(nunused > 0);
2237 :
2238 : /* Attempt to truncate line pointer array now */
2239 22030 : PageTruncateLinePointerArray(page);
2240 :
2241 : /*
2242 : * Mark buffer dirty before we write WAL.
2243 : */
2244 22030 : MarkBufferDirty(buffer);
2245 :
2246 : /* XLOG stuff */
2247 22030 : if (RelationNeedsWAL(vacrel->rel))
2248 : {
2249 20546 : log_heap_prune_and_freeze(vacrel->rel, buffer,
2250 : InvalidTransactionId,
2251 : false, /* no cleanup lock required */
2252 : PRUNE_VACUUM_CLEANUP,
2253 : NULL, 0, /* frozen */
2254 : NULL, 0, /* redirected */
2255 : NULL, 0, /* dead */
2256 : unused, nunused);
2257 : }
2258 :
2259 : /*
2260 : * End the critical section, so we can safely do visibility tests (which
2261 : * possibly need to perform I/O and allocate memory!). If we crash now, the
2262 : * page (including the corresponding vm bit) might not be marked all
2263 : * visible, but that's fine. A later vacuum will fix that.
2264 : */
2265 22030 : END_CRIT_SECTION();
2266 :
2267 : /*
2268 : * Now that we have removed the LP_DEAD items from the page, once again
2269 : * check if the page has become all-visible. The page is already marked
2270 : * dirty, exclusively locked, and, if needed, a full page image has been
2271 : * emitted.
2272 : */
2273 : Assert(!PageIsAllVisible(page));
2274 22030 : if (heap_page_is_all_visible(vacrel, buffer, &visibility_cutoff_xid,
2275 : &all_frozen))
2276 : {
2277 21960 : uint8 flags = VISIBILITYMAP_ALL_VISIBLE;
2278 :
2279 21960 : if (all_frozen)
2280 : {
2281 : Assert(!TransactionIdIsValid(visibility_cutoff_xid));
2282 16966 : flags |= VISIBILITYMAP_ALL_FROZEN;
2283 : }
2284 :
2285 21960 : PageSetAllVisible(page);
2286 21960 : visibilitymap_set(vacrel->rel, blkno, buffer, InvalidXLogRecPtr,
2287 : vmbuffer, visibility_cutoff_xid, flags);
2288 : }
2289 :
2290 : /* Revert to the previous phase information for error traceback */
2291 22030 : restore_vacuum_error_info(vacrel, &saved_err_info);
2292 22030 : }
2293 :
2294 : /*
2295 : * Trigger the failsafe to avoid wraparound failure when vacrel table has a
2296 : * relfrozenxid and/or relminmxid that is dangerously far in the past.
2297 : * Triggering the failsafe makes the ongoing VACUUM bypass any further index
2298 : * vacuuming and heap vacuuming. Truncating the heap is also bypassed.
2299 : *
2300 : * Any remaining work (work that VACUUM cannot just bypass) is typically sped
2301 : * up when the failsafe triggers. VACUUM stops applying any cost-based delay
2302 : * that it started out with.
2303 : *
2304 : * Returns true when failsafe has been triggered.
2305 : */
2306 : static bool
2307 100034 : lazy_check_wraparound_failsafe(LVRelState *vacrel)
2308 : {
2309 : /* Don't warn more than once per VACUUM */
2310 100034 : if (VacuumFailsafeActive)
2311 0 : return true;
2312 :
2313 100034 : if (unlikely(vacuum_xid_failsafe_check(&vacrel->cutoffs)))
2314 : {
2315 14888 : const int progress_index[] = {
2316 : PROGRESS_VACUUM_INDEXES_TOTAL,
2317 : PROGRESS_VACUUM_INDEXES_PROCESSED
2318 : };
2319 14888 : int64 progress_val[2] = {0, 0};
2320 :
2321 14888 : VacuumFailsafeActive = true;
2322 :
2323 : /*
2324 : * Abandon use of a buffer access strategy to allow use of all of
2325 : * shared buffers. We assume the caller who allocated the memory for
2326 : * the BufferAccessStrategy will free it.
2327 : */
2328 14888 : vacrel->bstrategy = NULL;
2329 :
2330 : /* Disable index vacuuming, index cleanup, and heap rel truncation */
2331 14888 : vacrel->do_index_vacuuming = false;
2332 14888 : vacrel->do_index_cleanup = false;
2333 14888 : vacrel->do_rel_truncate = false;
2334 :
2335 : /* Reset the progress counters */
2336 14888 : pgstat_progress_update_multi_param(2, progress_index, progress_val);
2337 :
2338 14888 : ereport(WARNING,
2339 : (errmsg("bypassing nonessential maintenance of table \"%s.%s.%s\" as a failsafe after %d index scans",
2340 : vacrel->dbname, vacrel->relnamespace, vacrel->relname,
2341 : vacrel->num_index_scans),
2342 : errdetail("The table's relfrozenxid or relminmxid is too far in the past."),
2343 : errhint("Consider increasing configuration parameter \"maintenance_work_mem\" or \"autovacuum_work_mem\".\n"
2344 : "You might also need to consider other ways for VACUUM to keep up with the allocation of transaction IDs.")));
2345 :
2346 : /* Stop applying cost limits from this point on */
2347 14888 : VacuumCostActive = false;
2348 14888 : VacuumCostBalance = 0;
2349 :
2350 14888 : return true;
2351 : }
2352 :
2353 85146 : return false;
2354 : }
2355 :
2356 : /*
2357 : * lazy_cleanup_all_indexes() -- cleanup all indexes of relation.
2358 : */
2359 : static void
2360 77642 : lazy_cleanup_all_indexes(LVRelState *vacrel)
2361 : {
2362 77642 : double reltuples = vacrel->new_rel_tuples;
2363 77642 : bool estimated_count = vacrel->scanned_pages < vacrel->rel_pages;
2364 77642 : const int progress_start_index[] = {
2365 : PROGRESS_VACUUM_PHASE,
2366 : PROGRESS_VACUUM_INDEXES_TOTAL
2367 : };
2368 77642 : const int progress_end_index[] = {
2369 : PROGRESS_VACUUM_INDEXES_TOTAL,
2370 : PROGRESS_VACUUM_INDEXES_PROCESSED
2371 : };
2372 : int64 progress_start_val[2];
2373 77642 : int64 progress_end_val[2] = {0, 0};
2374 :
2375 : Assert(vacrel->do_index_cleanup);
2376 : Assert(vacrel->nindexes > 0);
2377 :
2378 : /*
2379 : * Report that we are now cleaning up indexes and the number of indexes to
2380 : * cleanup.
2381 : */
2382 77642 : progress_start_val[0] = PROGRESS_VACUUM_PHASE_INDEX_CLEANUP;
2383 77642 : progress_start_val[1] = vacrel->nindexes;
2384 77642 : pgstat_progress_update_multi_param(2, progress_start_index, progress_start_val);
2385 :
2386 77642 : if (!ParallelVacuumIsActive(vacrel))
2387 : {
2388 199176 : for (int idx = 0; idx < vacrel->nindexes; idx++)
2389 : {
2390 121556 : Relation indrel = vacrel->indrels[idx];
2391 121556 : IndexBulkDeleteResult *istat = vacrel->indstats[idx];
2392 :
2393 243112 : vacrel->indstats[idx] =
2394 121556 : lazy_cleanup_one_index(indrel, istat, reltuples,
2395 : estimated_count, vacrel);
2396 :
2397 : /* Report the number of indexes cleaned up */
2398 121556 : pgstat_progress_update_param(PROGRESS_VACUUM_INDEXES_PROCESSED,
2399 121556 : idx + 1);
2400 : }
2401 : }
2402 : else
2403 : {
2404 : /* Outsource everything to parallel variant */
2405 22 : parallel_vacuum_cleanup_all_indexes(vacrel->pvs, reltuples,
2406 : vacrel->num_index_scans,
2407 : estimated_count);
2408 : }
2409 :
2410 : /* Reset the progress counters */
2411 77642 : pgstat_progress_update_multi_param(2, progress_end_index, progress_end_val);
2412 77642 : }
2413 :
2414 : /*
2415 : * lazy_vacuum_one_index() -- vacuum index relation.
2416 : *
2417 : * Delete all the index tuples containing a TID collected in
2418 : * vacrel->dead_items. Also update running statistics. Exact
2419 : * details depend on index AM's ambulkdelete routine.
2420 : *
2421 : * reltuples is the number of heap tuples to be passed to the
2422 : * bulkdelete callback. It's always assumed to be estimated.
2423 : * See indexam.sgml for more info.
2424 : *
2425 : * Returns bulk delete stats derived from input stats
2426 : */
2427 : static IndexBulkDeleteResult *
2428 1912 : lazy_vacuum_one_index(Relation indrel, IndexBulkDeleteResult *istat,
2429 : double reltuples, LVRelState *vacrel)
2430 : {
2431 : IndexVacuumInfo ivinfo;
2432 : LVSavedErrInfo saved_err_info;
2433 :
2434 1912 : ivinfo.index = indrel;
2435 1912 : ivinfo.heaprel = vacrel->rel;
2436 1912 : ivinfo.analyze_only = false;
2437 1912 : ivinfo.report_progress = false;
2438 1912 : ivinfo.estimated_count = true;
2439 1912 : ivinfo.message_level = DEBUG2;
2440 1912 : ivinfo.num_heap_tuples = reltuples;
2441 1912 : ivinfo.strategy = vacrel->bstrategy;
2442 :
2443 : /*
2444 : * Update error traceback information.
2445 : *
2446 : * The index name is saved during this phase and restored immediately
2447 : * after this phase. See vacuum_error_callback.
2448 : */
2449 : Assert(vacrel->indname == NULL);
2450 1912 : vacrel->indname = pstrdup(RelationGetRelationName(indrel));
2451 1912 : update_vacuum_error_info(vacrel, &saved_err_info,
2452 : VACUUM_ERRCB_PHASE_VACUUM_INDEX,
2453 : InvalidBlockNumber, InvalidOffsetNumber);
2454 :
2455 : /* Do bulk deletion */
2456 1912 : istat = vac_bulkdel_one_index(&ivinfo, istat, (void *) vacrel->dead_items,
2457 : vacrel->dead_items_info);
2458 :
2459 : /* Revert to the previous phase information for error traceback */
2460 1912 : restore_vacuum_error_info(vacrel, &saved_err_info);
2461 1912 : pfree(vacrel->indname);
2462 1912 : vacrel->indname = NULL;
2463 :
2464 1912 : return istat;
2465 : }
2466 :
2467 : /*
2468 : * lazy_cleanup_one_index() -- do post-vacuum cleanup for index relation.
2469 : *
2470 : * Calls index AM's amvacuumcleanup routine. reltuples is the number
2471 : * of heap tuples and estimated_count is true if reltuples is an
2472 : * estimated value. See indexam.sgml for more info.
2473 : *
2474 : * Returns bulk delete stats derived from input stats
2475 : */
2476 : static IndexBulkDeleteResult *
2477 121556 : lazy_cleanup_one_index(Relation indrel, IndexBulkDeleteResult *istat,
2478 : double reltuples, bool estimated_count,
2479 : LVRelState *vacrel)
2480 : {
2481 : IndexVacuumInfo ivinfo;
2482 : LVSavedErrInfo saved_err_info;
2483 :
2484 121556 : ivinfo.index = indrel;
2485 121556 : ivinfo.heaprel = vacrel->rel;
2486 121556 : ivinfo.analyze_only = false;
2487 121556 : ivinfo.report_progress = false;
2488 121556 : ivinfo.estimated_count = estimated_count;
2489 121556 : ivinfo.message_level = DEBUG2;
2490 :
2491 121556 : ivinfo.num_heap_tuples = reltuples;
2492 121556 : ivinfo.strategy = vacrel->bstrategy;
2493 :
2494 : /*
2495 : * Update error traceback information.
2496 : *
2497 : * The index name is saved during this phase and restored immediately
2498 : * after this phase. See vacuum_error_callback.
2499 : */
2500 : Assert(vacrel->indname == NULL);
2501 121556 : vacrel->indname = pstrdup(RelationGetRelationName(indrel));
2502 121556 : update_vacuum_error_info(vacrel, &saved_err_info,
2503 : VACUUM_ERRCB_PHASE_INDEX_CLEANUP,
2504 : InvalidBlockNumber, InvalidOffsetNumber);
2505 :
2506 121556 : istat = vac_cleanup_one_index(&ivinfo, istat);
2507 :
2508 : /* Revert to the previous phase information for error traceback */
2509 121556 : restore_vacuum_error_info(vacrel, &saved_err_info);
2510 121556 : pfree(vacrel->indname);
2511 121556 : vacrel->indname = NULL;
2512 :
2513 121556 : return istat;
2514 : }
2515 :
2516 : /*
2517 : * should_attempt_truncation - should we attempt to truncate the heap?
2518 : *
2519 : * Don't even think about it unless we have a shot at releasing a goodly
2520 : * number of pages. Otherwise, the time taken isn't worth it, mainly because
2521 : * an AccessExclusive lock must be replayed on any hot standby, where it can
2522 : * be particularly disruptive.
2523 : *
2524 : * Also don't attempt it if wraparound failsafe is in effect. The entire
2525 : * system might be refusing to allocate new XIDs at this point. The system
2526 : * definitely won't return to normal unless and until VACUUM actually advances
2527 : * the oldest relfrozenxid -- which hasn't happened for target rel just yet.
2528 : * If lazy_truncate_heap attempted to acquire an AccessExclusiveLock to
2529 : * truncate the table under these circumstances, an XID exhaustion error might
2530 : * make it impossible for VACUUM to fix the underlying XID exhaustion problem.
2531 : * There is very little chance of truncation working out when the failsafe is
2532 : * in effect in any case. lazy_scan_prune makes the optimistic assumption
2533 : * that any LP_DEAD items it encounters will always be LP_UNUSED by the time
2534 : * we're called.
2535 : */
2536 : static bool
2537 97130 : should_attempt_truncation(LVRelState *vacrel)
2538 : {
2539 : BlockNumber possibly_freeable;
2540 :
2541 97130 : if (!vacrel->do_rel_truncate || VacuumFailsafeActive)
2542 15128 : return false;
2543 :
2544 82002 : possibly_freeable = vacrel->rel_pages - vacrel->nonempty_pages;
2545 82002 : if (possibly_freeable > 0 &&
2546 278 : (possibly_freeable >= REL_TRUNCATE_MINIMUM ||
2547 278 : possibly_freeable >= vacrel->rel_pages / REL_TRUNCATE_FRACTION))
2548 268 : return true;
2549 :
2550 81734 : return false;
2551 : }
2552 :
2553 : /*
2554 : * lazy_truncate_heap - try to truncate off any empty pages at the end
2555 : */
2556 : static void
2557 268 : lazy_truncate_heap(LVRelState *vacrel)
2558 : {
2559 268 : BlockNumber orig_rel_pages = vacrel->rel_pages;
2560 : BlockNumber new_rel_pages;
2561 : bool lock_waiter_detected;
2562 : int lock_retry;
2563 :
2564 : /* Report that we are now truncating */
2565 268 : pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
2566 : PROGRESS_VACUUM_PHASE_TRUNCATE);
2567 :
2568 : /* Update error traceback information one last time */
2569 268 : update_vacuum_error_info(vacrel, NULL, VACUUM_ERRCB_PHASE_TRUNCATE,
2570 : vacrel->nonempty_pages, InvalidOffsetNumber);
2571 :
2572 : /*
2573 : * Loop until no more truncating can be done.
2574 : */
2575 : do
2576 : {
2577 : /*
2578 : * We need full exclusive lock on the relation in order to do
2579 : * truncation. If we can't get it, give up rather than waiting --- we
2580 : * don't want to block other backends, and we don't want to deadlock
2581 : * (which is quite possible considering we already hold a lower-grade
2582 : * lock).
2583 : */
2584 268 : lock_waiter_detected = false;
2585 268 : lock_retry = 0;
2586 : while (true)
2587 : {
2588 672 : if (ConditionalLockRelation(vacrel->rel, AccessExclusiveLock))
2589 264 : break;
2590 :
2591 : /*
2592 : * Check for interrupts while trying to (re-)acquire the exclusive
2593 : * lock.
2594 : */
2595 408 : CHECK_FOR_INTERRUPTS();
2596 :
2597 408 : if (++lock_retry > (VACUUM_TRUNCATE_LOCK_TIMEOUT /
2598 : VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL))
2599 : {
2600 : /*
2601 : * We failed to establish the lock in the specified number of
2602 : * retries. This means we give up truncating.
2603 : */
2604 4 : ereport(vacrel->verbose ? INFO : DEBUG2,
2605 : (errmsg("\"%s\": stopping truncate due to conflicting lock request",
2606 : vacrel->relname)));
2607 6 : return;
2608 : }
2609 :
2610 404 : (void) WaitLatch(MyLatch,
2611 : WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
2612 : VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL,
2613 : WAIT_EVENT_VACUUM_TRUNCATE);
2614 404 : ResetLatch(MyLatch);
2615 : }
2616 :
2617 : /*
2618 : * Now that we have exclusive lock, look to see if the rel has grown
2619 : * whilst we were vacuuming with non-exclusive lock. If so, give up;
2620 : * the newly added pages presumably contain non-deletable tuples.
2621 : */
2622 264 : new_rel_pages = RelationGetNumberOfBlocks(vacrel->rel);
2623 264 : if (new_rel_pages != orig_rel_pages)
2624 : {
2625 : /*
2626 : * Note: we intentionally don't update vacrel->rel_pages with the
2627 : * new rel size here. If we did, it would amount to assuming that
2628 : * the new pages are empty, which is unlikely. Leaving the numbers
2629 : * alone amounts to assuming that the new pages have the same
2630 : * tuple density as existing ones, which is less unlikely.
2631 : */
2632 0 : UnlockRelation(vacrel->rel, AccessExclusiveLock);
2633 0 : return;
2634 : }
2635 :
2636 : /*
2637 : * Scan backwards from the end to verify that the end pages actually
2638 : * contain no tuples. This is *necessary*, not optional, because
2639 : * other backends could have added tuples to these pages whilst we
2640 : * were vacuuming.
2641 : */
2642 264 : new_rel_pages = count_nondeletable_pages(vacrel, &lock_waiter_detected);
2643 264 : vacrel->blkno = new_rel_pages;
2644 :
2645 264 : if (new_rel_pages >= orig_rel_pages)
2646 : {
2647 : /* can't do anything after all */
2648 2 : UnlockRelation(vacrel->rel, AccessExclusiveLock);
2649 2 : return;
2650 : }
2651 :
2652 : /*
2653 : * Okay to truncate.
2654 : */
2655 262 : RelationTruncate(vacrel->rel, new_rel_pages);
2656 :
2657 : /*
2658 : * We can release the exclusive lock as soon as we have truncated.
2659 : * Other backends can't safely access the relation until they have
2660 : * processed the smgr invalidation that smgrtruncate sent out ... but
2661 : * that should happen as part of standard invalidation processing once
2662 : * they acquire lock on the relation.
2663 : */
2664 262 : UnlockRelation(vacrel->rel, AccessExclusiveLock);
2665 :
2666 : /*
2667 : * Update statistics. Here, it *is* correct to adjust rel_pages
2668 : * without also touching reltuples, since the tuple count wasn't
2669 : * changed by the truncation.
2670 : */
2671 262 : vacrel->removed_pages += orig_rel_pages - new_rel_pages;
2672 262 : vacrel->rel_pages = new_rel_pages;
2673 :
2674 262 : ereport(vacrel->verbose ? INFO : DEBUG2,
2675 : (errmsg("table \"%s\": truncated %u to %u pages",
2676 : vacrel->relname,
2677 : orig_rel_pages, new_rel_pages)));
2678 262 : orig_rel_pages = new_rel_pages;
2679 262 : } while (new_rel_pages > vacrel->nonempty_pages && lock_waiter_detected);
2680 : }
2681 :
2682 : /*
2683 : * Rescan end pages to verify that they are (still) empty of tuples.
2684 : *
2685 : * Returns number of nondeletable pages (last nonempty page + 1).
2686 : */
2687 : static BlockNumber
2688 264 : count_nondeletable_pages(LVRelState *vacrel, bool *lock_waiter_detected)
2689 : {
2690 : BlockNumber blkno;
2691 : BlockNumber prefetchedUntil;
2692 : instr_time starttime;
2693 :
2694 : /* Initialize the starttime if we check for conflicting lock requests */
2695 264 : INSTR_TIME_SET_CURRENT(starttime);
2696 :
2697 : /*
2698 : * Start checking blocks at what we believe relation end to be and move
2699 : * backwards. (Strange coding of loop control is needed because blkno is
2700 : * unsigned.) To make the scan faster, we prefetch a few blocks at a time
2701 : * in forward direction, so that OS-level readahead can kick in.
2702 : */
2703 264 : blkno = vacrel->rel_pages;
2704 : StaticAssertStmt((PREFETCH_SIZE & (PREFETCH_SIZE - 1)) == 0,
2705 : "prefetch size must be power of 2");
2706 264 : prefetchedUntil = InvalidBlockNumber;
2707 4110 : while (blkno > vacrel->nonempty_pages)
2708 : {
2709 : Buffer buf;
2710 : Page page;
2711 : OffsetNumber offnum,
2712 : maxoff;
2713 : bool hastup;
2714 :
2715 : /*
2716 : * Check if another process requests a lock on our relation. We are
2717 : * holding an AccessExclusiveLock here, so they will be waiting. We
2718 : * only do this once per VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL, and we
2719 : * only check if that interval has elapsed once every 32 blocks to
2720 : * keep the number of system calls and actual shared lock table
2721 : * lookups to a minimum.
2722 : */
2723 3856 : if ((blkno % 32) == 0)
2724 : {
2725 : instr_time currenttime;
2726 : instr_time elapsed;
2727 :
2728 124 : INSTR_TIME_SET_CURRENT(currenttime);
2729 124 : elapsed = currenttime;
2730 124 : INSTR_TIME_SUBTRACT(elapsed, starttime);
2731 124 : if ((INSTR_TIME_GET_MICROSEC(elapsed) / 1000)
2732 : >= VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL)
2733 : {
2734 0 : if (LockHasWaitersRelation(vacrel->rel, AccessExclusiveLock))
2735 : {
2736 0 : ereport(vacrel->verbose ? INFO : DEBUG2,
2737 : (errmsg("table \"%s\": suspending truncate due to conflicting lock request",
2738 : vacrel->relname)));
2739 :
2740 0 : *lock_waiter_detected = true;
2741 0 : return blkno;
2742 : }
2743 0 : starttime = currenttime;
2744 : }
2745 : }
2746 :
2747 : /*
2748 : * We don't insert a vacuum delay point here, because we have an
2749 : * exclusive lock on the table which we want to hold for as short a
2750 : * time as possible. We still need to check for interrupts however.
2751 : */
2752 3856 : CHECK_FOR_INTERRUPTS();
2753 :
2754 3856 : blkno--;
2755 :
2756 : /* If we haven't prefetched this lot yet, do so now. */
2757 3856 : if (prefetchedUntil > blkno)
2758 : {
2759 : BlockNumber prefetchStart;
2760 : BlockNumber pblkno;
2761 :
2762 354 : prefetchStart = blkno & ~(PREFETCH_SIZE - 1);
2763 5766 : for (pblkno = prefetchStart; pblkno <= blkno; pblkno++)
2764 : {
2765 5412 : PrefetchBuffer(vacrel->rel, MAIN_FORKNUM, pblkno);
2766 5412 : CHECK_FOR_INTERRUPTS();
2767 : }
2768 354 : prefetchedUntil = prefetchStart;
2769 : }
2770 :
2771 3856 : buf = ReadBufferExtended(vacrel->rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
2772 : vacrel->bstrategy);
2773 :
2774 : /* In this phase we only need shared access to the buffer */
2775 3856 : LockBuffer(buf, BUFFER_LOCK_SHARE);
2776 :
2777 3856 : page = BufferGetPage(buf);
2778 :
2779 3856 : if (PageIsNew(page) || PageIsEmpty(page))
2780 : {
2781 1640 : UnlockReleaseBuffer(buf);
2782 1640 : continue;
2783 : }
2784 :
2785 2216 : hastup = false;
2786 2216 : maxoff = PageGetMaxOffsetNumber(page);
2787 4422 : for (offnum = FirstOffsetNumber;
2788 : offnum <= maxoff;
2789 2206 : offnum = OffsetNumberNext(offnum))
2790 : {
2791 : ItemId itemid;
2792 :
2793 2216 : itemid = PageGetItemId(page, offnum);
2794 :
2795 : /*
2796 : * Note: any non-unused item should be taken as a reason to keep
2797 : * this page. Even an LP_DEAD item makes truncation unsafe, since
2798 : * we must not have cleaned out its index entries.
2799 : */
2800 2216 : if (ItemIdIsUsed(itemid))
2801 : {
2802 10 : hastup = true;
2803 10 : break; /* can stop scanning */
2804 : }
2805 : } /* scan along page */
2806 :
2807 2216 : UnlockReleaseBuffer(buf);
2808 :
2809 : /* Done scanning if we found a tuple here */
2810 2216 : if (hastup)
2811 10 : return blkno + 1;
2812 : }
2813 :
2814 : /*
2815 : * If we fall out of the loop, all the previously-thought-to-be-empty
2816 : * pages still are; we need not bother to look at the last known-nonempty
2817 : * page.
2818 : */
2819 254 : return vacrel->nonempty_pages;
2820 : }
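
The prefetch logic above rounds the current block down to a power-of-two boundary so a whole window can be prefetched in forward order for OS readahead. A standalone sketch, assuming PREFETCH_SIZE is 32 (it must be a power of two, per the StaticAssert above):

#include <stdint.h>
#include <stdio.h>

typedef uint32_t BlockNumber;

/* Assumed power-of-two prefetch window; the mask trick below only works
 * for powers of two */
#define PREFETCH_SIZE ((BlockNumber) 32)

int
main(void)
{
	BlockNumber blkno = 1000;

	/* Round blkno down to the start of its 32-block prefetch window */
	BlockNumber prefetchStart = blkno & ~(PREFETCH_SIZE - 1);

	printf("prefetch blocks %u..%u before reading backwards\n",
		   (unsigned) prefetchStart, (unsigned) blkno);	/* 992..1000 */
	return 0;
}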
2821 :
2822 : /*
2823 : * Allocate dead_items and dead_items_info (either using palloc, or in dynamic
2824 : * shared memory). Sets both in vacrel for caller.
2825 : *
2826 : * Also handles parallel initialization as part of allocating dead_items in
2827 : * DSM when required.
2828 : */
2829 : static void
2830 97130 : dead_items_alloc(LVRelState *vacrel, int nworkers)
2831 : {
2832 : VacDeadItemsInfo *dead_items_info;
2833 270648 : int vac_work_mem = AmAutoVacuumWorkerProcess() &&
2834 76388 : autovacuum_work_mem != -1 ?
2835 173518 : autovacuum_work_mem : maintenance_work_mem;
2836 :
2837 : /*
2838 : * Initialize state for a parallel vacuum. As of now, only one worker can
2839 : * be used for an index, so we invoke parallelism only if there are at
2840 : * least two indexes on a table.
2841 : */
2842 97130 : if (nworkers >= 0 && vacrel->nindexes > 1 && vacrel->do_index_vacuuming)
2843 : {
2844 : /*
2845 : * Since parallel workers cannot access data in temporary tables, we
2846 : * can't perform parallel vacuum on them.
2847 : */
2848 7940 : if (RelationUsesLocalBuffers(vacrel->rel))
2849 : {
2850 : /*
2851 : * Give warning only if the user explicitly tries to perform a
2852 : * parallel vacuum on the temporary table.
2853 : */
2854 6 : if (nworkers > 0)
2855 6 : ereport(WARNING,
2856 : (errmsg("disabling parallel option of vacuum on \"%s\" --- cannot vacuum temporary tables in parallel",
2857 : vacrel->relname)));
2858 : }
2859 : else
2860 7934 : vacrel->pvs = parallel_vacuum_init(vacrel->rel, vacrel->indrels,
2861 : vacrel->nindexes, nworkers,
2862 : vac_work_mem,
2863 7934 : vacrel->verbose ? INFO : DEBUG2,
2864 : vacrel->bstrategy);
2865 :
2866 : /*
2867 : * If parallel mode was started, dead_items and dead_items_info have
2868 : * already been allocated in DSM.
2869 : */
2870 7940 : if (ParallelVacuumIsActive(vacrel))
2871 : {
2872 22 : vacrel->dead_items = parallel_vacuum_get_dead_items(vacrel->pvs,
2873 : &vacrel->dead_items_info);
2874 22 : return;
2875 : }
2876 : }
2877 :
2878 : /*
2879 : * Serial VACUUM case. Allocate both dead_items and dead_items_info
2880 : * locally.
2881 : */
2882 :
2883 97108 : dead_items_info = (VacDeadItemsInfo *) palloc(sizeof(VacDeadItemsInfo));
2884 97108 : dead_items_info->max_bytes = vac_work_mem * 1024L;
2885 97108 : dead_items_info->num_items = 0;
2886 97108 : vacrel->dead_items_info = dead_items_info;
2887 :
2888 97108 : vacrel->dead_items = TidStoreCreateLocal(dead_items_info->max_bytes, true);
2889 : }
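/*
 * Editor's note: illustrative sketch only.  It mirrors the budget selection
 * at the top of dead_items_alloc(): autovacuum workers honour
 * autovacuum_work_mem when it is set (!= -1) and fall back to
 * maintenance_work_mem otherwise.  Both settings are expressed in kilobytes,
 * so the dead-TID store's byte cap is the chosen value times 1024.  All names
 * here are hypothetical stand-ins, not the real GUC machinery.
 */
#include <stdbool.h>
#include <stddef.h>

size_t
choose_dead_items_budget_sketch(bool is_autovacuum_worker,
                                int autovacuum_work_mem_kb,  /* -1 = not set */
                                int maintenance_work_mem_kb)
{
    int kb = (is_autovacuum_worker && autovacuum_work_mem_kb != -1)
        ? autovacuum_work_mem_kb
        : maintenance_work_mem_kb;

    return (size_t) kb * 1024;  /* bytes available for storing dead TIDs */
}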
2890 :
2891 : /*
2892 : * Add the given block number and offset numbers to dead_items.
2893 : */
2894 : static void
2895 26732 : dead_items_add(LVRelState *vacrel, BlockNumber blkno, OffsetNumber *offsets,
2896 : int num_offsets)
2897 : {
2898 26732 : TidStore *dead_items = vacrel->dead_items;
2899 26732 : const int prog_index[2] = {
2900 : PROGRESS_VACUUM_NUM_DEAD_ITEM_IDS,
2901 : PROGRESS_VACUUM_DEAD_TUPLE_BYTES
2902 : };
2903 : int64 prog_val[2];
2904 :
2905 26732 : TidStoreSetBlockOffsets(dead_items, blkno, offsets, num_offsets);
2906 26732 : vacrel->dead_items_info->num_items += num_offsets;
2907 :
2908 : /* update the progress information */
2909 26732 : prog_val[0] = vacrel->dead_items_info->num_items;
2910 26732 : prog_val[1] = TidStoreMemoryUsage(dead_items);
2911 26732 : pgstat_progress_update_multi_param(2, prog_index, prog_val);
2912 26732 : }
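/*
 * Editor's note: illustrative sketch only.  dead_items_add() stores all of a
 * block's dead offsets in one call, bumps the running item count, and then
 * reports two progress values (number of dead item IDs, memory used)
 * together.  The model below uses a hypothetical flat array instead of the
 * radix-tree TID store and a printf in place of pgstat progress reporting;
 * allocation-failure handling is omitted for brevity.
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

typedef struct DeadItemSketch
{
    uint32_t   *blknos;         /* parallel arrays of block/offset pairs */
    uint16_t   *offsets;
    size_t      nitems;
    size_t      capacity;
} DeadItemSketch;

void
dead_items_add_sketch(DeadItemSketch *store, uint32_t blkno,
                      const uint16_t *offsets, int num_offsets)
{
    for (int i = 0; i < num_offsets; i++)
    {
        if (store->nitems == store->capacity)
        {
            store->capacity = store->capacity ? store->capacity * 2 : 64;
            store->blknos = realloc(store->blknos,
                                    store->capacity * sizeof(uint32_t));
            store->offsets = realloc(store->offsets,
                                     store->capacity * sizeof(uint16_t));
        }
        store->blknos[store->nitems] = blkno;
        store->offsets[store->nitems] = offsets[i];
        store->nitems++;
    }

    /* Report both progress values in one step, as the real code does. */
    printf("dead item ids: %zu, bytes used: %zu\n",
           store->nitems,
           store->nitems * (sizeof(uint32_t) + sizeof(uint16_t)));
}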
2913 :
2914 : /*
2915 : * Forget all collected dead items.
2916 : */
2917 : static void
2918 1012 : dead_items_reset(LVRelState *vacrel)
2919 : {
2920 1012 : TidStore *dead_items = vacrel->dead_items;
2921 :
2922 1012 : if (ParallelVacuumIsActive(vacrel))
2923 : {
2924 14 : parallel_vacuum_reset_dead_items(vacrel->pvs);
2925 14 : return;
2926 : }
2927 :
2928 : /* Recreate the tidstore with the same max_bytes limitation */
2929 998 : TidStoreDestroy(dead_items);
2930 998 : vacrel->dead_items = TidStoreCreateLocal(vacrel->dead_items_info->max_bytes, true);
2931 :
2932 : /* Reset the counter */
2933 998 : vacrel->dead_items_info->num_items = 0;
2934 : }
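/*
 * Editor's note: illustrative sketch only.  In the serial case,
 * dead_items_reset() does not try to empty the TID store in place; it
 * destroys the whole structure and creates a fresh one with the same byte
 * limit, releasing all of its memory at once (the parallel case delegates to
 * the parallel-vacuum code instead).  The standalone model below applies the
 * same "destroy and recreate, keeping only the limit" pattern to a
 * hypothetical store type.
 */
#include <stdlib.h>
#include <stddef.h>

typedef struct SketchStore
{
    unsigned char *buf;         /* stand-in for the store's memory */
    size_t         used;
    size_t         max_bytes;   /* limit carried across resets */
} SketchStore;

SketchStore *
sketch_store_create(size_t max_bytes)
{
    SketchStore *s = calloc(1, sizeof(SketchStore));

    if (s != NULL)
        s->max_bytes = max_bytes;
    return s;
}

void
sketch_store_destroy(SketchStore *s)
{
    if (s != NULL)
    {
        free(s->buf);
        free(s);
    }
}

/* Reset by recreation, preserving only the configured memory limit. */
SketchStore *
sketch_store_reset(SketchStore *s)
{
    size_t max_bytes = s->max_bytes;

    sketch_store_destroy(s);
    return sketch_store_create(max_bytes);
}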
2935 :
2936 : /*
2937 : * Perform cleanup for resources allocated in dead_items_alloc
2938 : */
2939 : static void
2940 97130 : dead_items_cleanup(LVRelState *vacrel)
2941 : {
2942 97130 : if (!ParallelVacuumIsActive(vacrel))
2943 : {
2944 : /* Don't bother with pfree here */
2945 97108 : return;
2946 : }
2947 :
2948 : /* End parallel mode */
2949 22 : parallel_vacuum_end(vacrel->pvs, vacrel->indstats);
2950 22 : vacrel->pvs = NULL;
2951 : }
2952 :
2953 : /*
2954 : * Check whether every tuple in the given page is visible to all current and
2955 : * future transactions. Also returns, in *visibility_cutoff_xid, the highest
2956 : * xmin among the visible tuples. Sets *all_frozen to true if every tuple
2957 : * on this page is frozen.
2958 : *
2959 : * This is a stripped-down version of lazy_scan_prune(). If you change
2960 : * anything here, make sure that everything stays in sync. Note that an
2961 : * assertion elsewhere in this file calls this function to verify that the
2962 : * two code paths still agree. Be sure to avoid introducing new side-effects here.
2963 : */
2964 : static bool
2965 22030 : heap_page_is_all_visible(LVRelState *vacrel, Buffer buf,
2966 : TransactionId *visibility_cutoff_xid,
2967 : bool *all_frozen)
2968 : {
2969 22030 : Page page = BufferGetPage(buf);
2970 22030 : BlockNumber blockno = BufferGetBlockNumber(buf);
2971 : OffsetNumber offnum,
2972 : maxoff;
2973 22030 : bool all_visible = true;
2974 :
2975 22030 : *visibility_cutoff_xid = InvalidTransactionId;
2976 22030 : *all_frozen = true;
2977 :
2978 22030 : maxoff = PageGetMaxOffsetNumber(page);
2979 1116990 : for (offnum = FirstOffsetNumber;
2980 1095030 : offnum <= maxoff && all_visible;
2981 1094960 : offnum = OffsetNumberNext(offnum))
2982 : {
2983 : ItemId itemid;
2984 : HeapTupleData tuple;
2985 :
2986 : /*
2987 : * Set the offset number so that we can display it along with any
2988 : * error that occurred while processing this tuple.
2989 : */
2990 1094960 : vacrel->offnum = offnum;
2991 1094960 : itemid = PageGetItemId(page, offnum);
2992 :
2993 : /* Unused or redirect line pointers are of no interest */
2994 1094960 : if (!ItemIdIsUsed(itemid) || ItemIdIsRedirected(itemid))
2995 278412 : continue;
2996 :
2997 816548 : ItemPointerSet(&(tuple.t_self), blockno, offnum);
2998 :
2999 : /*
3000 : * Dead line pointers can have index entries pointing to them, so
3001 : * they can't be treated as visible.
3002 : */
3003 816548 : if (ItemIdIsDead(itemid))
3004 : {
3005 0 : all_visible = false;
3006 0 : *all_frozen = false;
3007 0 : break;
3008 : }
3009 :
3010 : Assert(ItemIdIsNormal(itemid));
3011 :
3012 816548 : tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
3013 816548 : tuple.t_len = ItemIdGetLength(itemid);
3014 816548 : tuple.t_tableOid = RelationGetRelid(vacrel->rel);
3015 :
3016 816548 : switch (HeapTupleSatisfiesVacuum(&tuple, vacrel->cutoffs.OldestXmin,
3017 : buf))
3018 : {
3019 816510 : case HEAPTUPLE_LIVE:
3020 : {
3021 : TransactionId xmin;
3022 :
3023 : /* Check comments in lazy_scan_prune. */
3024 816510 : if (!HeapTupleHeaderXminCommitted(tuple.t_data))
3025 : {
3026 0 : all_visible = false;
3027 0 : *all_frozen = false;
3028 0 : break;
3029 : }
3030 :
3031 : /*
3032 : * The inserter definitely committed. But is it old enough
3033 : * that everyone sees it as committed?
3034 : */
3035 816510 : xmin = HeapTupleHeaderGetXmin(tuple.t_data);
3036 816510 : if (!TransactionIdPrecedes(xmin,
3037 : vacrel->cutoffs.OldestXmin))
3038 : {
3039 32 : all_visible = false;
3040 32 : *all_frozen = false;
3041 32 : break;
3042 : }
3043 :
3044 : /* Track newest xmin on page. */
3045 816478 : if (TransactionIdFollows(xmin, *visibility_cutoff_xid) &&
3046 : TransactionIdIsNormal(xmin))
3047 18642 : *visibility_cutoff_xid = xmin;
3048 :
3049 : /* Check whether this tuple is already frozen or not */
3050 1019864 : if (all_visible && *all_frozen &&
3051 203386 : heap_tuple_needs_eventual_freeze(tuple.t_data))
3052 5010 : *all_frozen = false;
3053 : }
3054 816478 : break;
3055 :
3056 38 : case HEAPTUPLE_DEAD:
3057 : case HEAPTUPLE_RECENTLY_DEAD:
3058 : case HEAPTUPLE_INSERT_IN_PROGRESS:
3059 : case HEAPTUPLE_DELETE_IN_PROGRESS:
3060 : {
3061 38 : all_visible = false;
3062 38 : *all_frozen = false;
3063 38 : break;
3064 : }
3065 0 : default:
3066 0 : elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
3067 : break;
3068 : }
3069 : } /* scan along page */
3070 :
3071 : /* Clear the offset information once we have processed the given page. */
3072 22030 : vacrel->offnum = InvalidOffsetNumber;
3073 :
3074 22030 : return all_visible;
3075 : }
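/*
 * Editor's note: illustrative sketch only.  heap_page_is_all_visible()
 * answers three questions in a single pass: (1) is every remaining tuple
 * visible to all current and future transactions (its xmin is committed and
 * precedes OldestXmin), (2) is every such tuple already frozen, and (3) what
 * is the newest xmin among the visible tuples (the visibility cutoff).  The
 * model below uses plain 64-bit transaction IDs and a precomputed
 * "committed" flag, ignoring XID wraparound, hint bits, dead line pointers,
 * and in-progress transactions, so it is a simplification rather than the
 * real visibility rules.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stddef.h>

typedef struct SketchTuple
{
    uint64_t xmin;              /* inserting transaction */
    bool     xmin_committed;    /* inserter known committed? */
    bool     frozen;            /* already frozen? */
} SketchTuple;

bool
page_is_all_visible_sketch(const SketchTuple *tuples, size_t ntuples,
                           uint64_t oldest_xmin,
                           uint64_t *visibility_cutoff_xmin,
                           bool *all_frozen)
{
    bool all_visible = true;

    *visibility_cutoff_xmin = 0;
    *all_frozen = true;

    for (size_t i = 0; i < ntuples && all_visible; i++)
    {
        const SketchTuple *tup = &tuples[i];

        /* Not visible to everyone unless the inserter committed long ago. */
        if (!tup->xmin_committed || tup->xmin >= oldest_xmin)
        {
            all_visible = false;
            *all_frozen = false;
            break;
        }

        /* Track the newest xmin seen; callers use it as the cutoff. */
        if (tup->xmin > *visibility_cutoff_xmin)
            *visibility_cutoff_xmin = tup->xmin;

        if (!tup->frozen)
            *all_frozen = false;
    }

    return all_visible;
}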
3076 :
3077 : /*
3078 : * Update index statistics in pg_class if the statistics are accurate.
3079 : */
3080 : static void
3081 81978 : update_relstats_all_indexes(LVRelState *vacrel)
3082 : {
3083 81978 : Relation *indrels = vacrel->indrels;
3084 81978 : int nindexes = vacrel->nindexes;
3085 81978 : IndexBulkDeleteResult **indstats = vacrel->indstats;
3086 :
3087 : Assert(vacrel->do_index_cleanup);
3088 :
3089 203632 : for (int idx = 0; idx < nindexes; idx++)
3090 : {
3091 121654 : Relation indrel = indrels[idx];
3092 121654 : IndexBulkDeleteResult *istat = indstats[idx];
3093 :
3094 121654 : if (istat == NULL || istat->estimated_count)
3095 119472 : continue;
3096 :
3097 : /* Update index statistics */
3098 2182 : vac_update_relstats(indrel,
3099 : istat->num_pages,
3100 : istat->num_index_tuples,
3101 : 0,
3102 : false,
3103 : InvalidTransactionId,
3104 : InvalidMultiXactId,
3105 : NULL, NULL, false);
3106 : }
3107 81978 : }
3108 :
3109 : /*
3110 : * Error context callback for errors occurring during vacuum. The error
3111 : * context messages for index phases should match the messages set in parallel
3112 : * vacuum. If you change this function for those phases, change
3113 : * parallel_vacuum_error_callback() as well.
3114 : */
3115 : static void
3116 59116 : vacuum_error_callback(void *arg)
3117 : {
3118 59116 : LVRelState *errinfo = arg;
3119 :
3120 59116 : switch (errinfo->phase)
3121 : {
3122 0 : case VACUUM_ERRCB_PHASE_SCAN_HEAP:
3123 0 : if (BlockNumberIsValid(errinfo->blkno))
3124 : {
3125 0 : if (OffsetNumberIsValid(errinfo->offnum))
3126 0 : errcontext("while scanning block %u offset %u of relation \"%s.%s\"",
3127 0 : errinfo->blkno, errinfo->offnum, errinfo->relnamespace, errinfo->relname);
3128 : else
3129 0 : errcontext("while scanning block %u of relation \"%s.%s\"",
3130 : errinfo->blkno, errinfo->relnamespace, errinfo->relname);
3131 : }
3132 : else
3133 0 : errcontext("while scanning relation \"%s.%s\"",
3134 : errinfo->relnamespace, errinfo->relname);
3135 0 : break;
3136 :
3137 0 : case VACUUM_ERRCB_PHASE_VACUUM_HEAP:
3138 0 : if (BlockNumberIsValid(errinfo->blkno))
3139 : {
3140 0 : if (OffsetNumberIsValid(errinfo->offnum))
3141 0 : errcontext("while vacuuming block %u offset %u of relation \"%s.%s\"",
3142 0 : errinfo->blkno, errinfo->offnum, errinfo->relnamespace, errinfo->relname);
3143 : else
3144 0 : errcontext("while vacuuming block %u of relation \"%s.%s\"",
3145 : errinfo->blkno, errinfo->relnamespace, errinfo->relname);
3146 : }
3147 : else
3148 0 : errcontext("while vacuuming relation \"%s.%s\"",
3149 : errinfo->relnamespace, errinfo->relname);
3150 0 : break;
3151 :
3152 0 : case VACUUM_ERRCB_PHASE_VACUUM_INDEX:
3153 0 : errcontext("while vacuuming index \"%s\" of relation \"%s.%s\"",
3154 : errinfo->indname, errinfo->relnamespace, errinfo->relname);
3155 0 : break;
3156 :
3157 0 : case VACUUM_ERRCB_PHASE_INDEX_CLEANUP:
3158 0 : errcontext("while cleaning up index \"%s\" of relation \"%s.%s\"",
3159 : errinfo->indname, errinfo->relnamespace, errinfo->relname);
3160 0 : break;
3161 :
3162 6 : case VACUUM_ERRCB_PHASE_TRUNCATE:
3163 6 : if (BlockNumberIsValid(errinfo->blkno))
3164 6 : errcontext("while truncating relation \"%s.%s\" to %u blocks",
3165 : errinfo->relnamespace, errinfo->relname, errinfo->blkno);
3166 6 : break;
3167 :
3168 59110 : case VACUUM_ERRCB_PHASE_UNKNOWN:
3169 : default:
3170 59110 : return; /* do nothing; the errinfo may not be
3171 : * initialized */
3172 : }
3173 : }
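/*
 * Editor's note: illustrative sketch only.  vacuum_error_callback() adds a
 * line of context to whatever error is being reported, chosen by the phase
 * vacuum was in and whatever block/offset/index detail is valid at that
 * moment; unknown phases add nothing.  The model below renders a subset of
 * those decisions into a buffer with snprintf instead of errcontext(), using
 * hypothetical types.
 */
#include <stdio.h>
#include <stdint.h>

typedef enum
{
    SKETCH_PHASE_UNKNOWN,
    SKETCH_PHASE_SCAN_HEAP,
    SKETCH_PHASE_VACUUM_INDEX,
    SKETCH_PHASE_TRUNCATE
} SketchPhase;

typedef struct SketchErrInfo
{
    SketchPhase phase;
    const char *relname;
    const char *indname;
    uint32_t    blkno;          /* UINT32_MAX when not valid */
    uint16_t    offnum;         /* 0 when not valid */
} SketchErrInfo;

void
format_vacuum_context_sketch(const SketchErrInfo *e, char *buf, size_t buflen)
{
    switch (e->phase)
    {
        case SKETCH_PHASE_SCAN_HEAP:
            if (e->blkno != UINT32_MAX && e->offnum != 0)
                snprintf(buf, buflen,
                         "while scanning block %u offset %u of relation \"%s\"",
                         e->blkno, e->offnum, e->relname);
            else if (e->blkno != UINT32_MAX)
                snprintf(buf, buflen,
                         "while scanning block %u of relation \"%s\"",
                         e->blkno, e->relname);
            else
                snprintf(buf, buflen,
                         "while scanning relation \"%s\"", e->relname);
            break;
        case SKETCH_PHASE_VACUUM_INDEX:
            snprintf(buf, buflen,
                     "while vacuuming index \"%s\" of relation \"%s\"",
                     e->indname, e->relname);
            break;
        case SKETCH_PHASE_TRUNCATE:
            snprintf(buf, buflen,
                     "while truncating relation \"%s\" to %u blocks",
                     e->relname, e->blkno);
            break;
        case SKETCH_PHASE_UNKNOWN:
        default:
            buf[0] = '\0';      /* errinfo may not be initialized; add nothing */
            break;
    }
}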
3174 :
3175 : /*
3176 : * Updates the information required for the vacuum error callback; also saves
3177 : * the current information so it can later be restored via restore_vacuum_error_info().
3178 : */
3179 : static void
3180 557290 : update_vacuum_error_info(LVRelState *vacrel, LVSavedErrInfo *saved_vacrel,
3181 : int phase, BlockNumber blkno, OffsetNumber offnum)
3182 : {
3183 557290 : if (saved_vacrel)
3184 : {
3185 146476 : saved_vacrel->offnum = vacrel->offnum;
3186 146476 : saved_vacrel->blkno = vacrel->blkno;
3187 146476 : saved_vacrel->phase = vacrel->phase;
3188 : }
3189 :
3190 557290 : vacrel->blkno = blkno;
3191 557290 : vacrel->offnum = offnum;
3192 557290 : vacrel->phase = phase;
3193 557290 : }
3194 :
3195 : /*
3196 : * Restores the vacuum information saved via a prior call to update_vacuum_error_info.
3197 : */
3198 : static void
3199 146476 : restore_vacuum_error_info(LVRelState *vacrel,
3200 : const LVSavedErrInfo *saved_vacrel)
3201 : {
3202 146476 : vacrel->blkno = saved_vacrel->blkno;
3203 146476 : vacrel->offnum = saved_vacrel->offnum;
3204 146476 : vacrel->phase = saved_vacrel->phase;
3205 146476 : }
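/*
 * Editor's note: illustrative sketch only.  The two functions above form a
 * save/overwrite/restore pattern: update_vacuum_error_info() optionally
 * stashes the callback's current position before pointing it at a new phase
 * and block, and restore_vacuum_error_info() puts the saved position back
 * when the nested operation finishes.  The standalone model below shows the
 * same pattern with hypothetical types.
 */
#include <stdint.h>
#include <stddef.h>

typedef struct SketchErrPosition
{
    int      phase;
    uint32_t blkno;
    uint16_t offnum;
} SketchErrPosition;

/* Point the error context at a new location, optionally saving the old one. */
void
update_err_position_sketch(SketchErrPosition *current,
                           SketchErrPosition *saved,   /* may be NULL */
                           int phase, uint32_t blkno, uint16_t offnum)
{
    if (saved != NULL)
        *saved = *current;      /* remember where we were */

    current->phase = phase;
    current->blkno = blkno;
    current->offnum = offnum;
}

/* Put back a location previously stashed by update_err_position_sketch(). */
void
restore_err_position_sketch(SketchErrPosition *current,
                            const SketchErrPosition *saved)
{
    *current = *saved;
}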