Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * pg_visibility.c
4 : * display visibility map information and page-level visibility bits
5 : *
6 : * Copyright (c) 2016-2025, PostgreSQL Global Development Group
7 : *
8 : * contrib/pg_visibility/pg_visibility.c
9 : *-------------------------------------------------------------------------
10 : */
11 : #include "postgres.h"
12 :
13 : #include "access/heapam.h"
14 : #include "access/htup_details.h"
15 : #include "access/visibilitymap.h"
16 : #include "access/xloginsert.h"
17 : #include "catalog/pg_type.h"
18 : #include "catalog/storage_xlog.h"
19 : #include "funcapi.h"
20 : #include "miscadmin.h"
21 : #include "storage/bufmgr.h"
22 : #include "storage/proc.h"
23 : #include "storage/procarray.h"
24 : #include "storage/read_stream.h"
25 : #include "storage/smgr.h"
26 : #include "utils/rel.h"
27 :
/* Module magic block: records this module's name and the PG_VERSION it was built against. */
28 14 : PG_MODULE_MAGIC_EXT(
29 : .name = "pg_visibility",
30 : .version = PG_VERSION
31 : );
32 :
/*
 * Per-block visibility flags for a whole relation, built by
 * collect_visibility_data() and walked by the set-returning functions
 * pg_visibility_map_rel() and pg_visibility_rel().
 */
33 : typedef struct vbits
34 : {
35 : BlockNumber next; /* next block number to emit */
36 : BlockNumber count; /* number of entries in bits[] (relation size in blocks) */
37 : uint8 bits[FLEXIBLE_ARRAY_MEMBER]; /* per block: bit 0 = VM all-visible, bit 1 = VM all-frozen, bit 2 = page-level all-visible */
38 : } vbits;
39 :
/*
 * Growable array of TIDs that contradict the visibility map.
 *
 * While collect_corrupt_items() is filling the array, "next" is the number
 * of TIDs stored so far and "count" is the number of slots allocated.  Just
 * before returning, that function repurposes the fields: "next" becomes the
 * read cursor (reset to 0) and "count" the number of TIDs stored.
 */
40 : typedef struct corrupt_items
41 : {
42 : BlockNumber next; /* write offset while collecting, read offset afterwards */
43 : BlockNumber count; /* allocated slots while collecting, stored TIDs afterwards */
44 : ItemPointer tids; /* separately allocated array of ItemPointerData */
45 : } corrupt_items;
46 :
47 : /* for collect_corrupt_items_read_stream_next_block */
/*
 * State shared between collect_corrupt_items() and its read stream callback,
 * which uses it to decide which heap blocks are worth reading.
 */
48 : struct collect_corrupt_items_read_stream_private
49 : {
50 : bool all_frozen; /* select blocks whose VM all-frozen bit is set */
51 : bool all_visible; /* select blocks whose VM all-visible bit is set */
52 : BlockNumber current_blocknum; /* next block the callback will consider */
53 : BlockNumber last_exclusive; /* stop before this block number */
54 : Relation rel; /* relation being scanned */
55 : Buffer vmbuffer; /* VM buffer kept by the callback across calls; released by collect_corrupt_items() */
56 : };
57 :
/* Function-info declarations for the SQL-callable functions defined in this file. */
58 6 : PG_FUNCTION_INFO_V1(pg_visibility_map);
59 8 : PG_FUNCTION_INFO_V1(pg_visibility_map_rel);
60 8 : PG_FUNCTION_INFO_V1(pg_visibility);
61 8 : PG_FUNCTION_INFO_V1(pg_visibility_rel);
62 8 : PG_FUNCTION_INFO_V1(pg_visibility_map_summary);
63 10 : PG_FUNCTION_INFO_V1(pg_check_frozen);
64 12 : PG_FUNCTION_INFO_V1(pg_check_visible);
65 8 : PG_FUNCTION_INFO_V1(pg_truncate_visibility_map);
66 :
/* Forward declarations of the file-local helpers defined further down. */
67 : static TupleDesc pg_visibility_tupdesc(bool include_blkno, bool include_pd);
68 : static vbits *collect_visibility_data(Oid relid, bool include_pd);
69 : static corrupt_items *collect_corrupt_items(Oid relid, bool all_visible,
70 : bool all_frozen);
71 : static void record_corrupt_item(corrupt_items *items, ItemPointer tid);
72 : static bool tuple_all_visible(HeapTuple tup, TransactionId OldestXmin,
73 : Buffer buffer);
74 : static void check_relation_relkind(Relation rel);
75 :
76 : /*
77 : * Visibility map information for a single block of a relation.
78 : *
79 : * Note: the VM code will silently return zeroes for pages past the end
80 : * of the map, so we allow probes up to MaxBlockNumber regardless of the
81 : * actual relation size.
82 : */
83 : Datum
84 0 : pg_visibility_map(PG_FUNCTION_ARGS)
85 : {
86 0 : Oid relid = PG_GETARG_OID(0);
87 0 : int64 blkno = PG_GETARG_INT64(1);
88 : int32 mapbits;
89 : Relation rel;
90 0 : Buffer vmbuffer = InvalidBuffer;
91 : TupleDesc tupdesc;
92 : Datum values[2];
93 0 : bool nulls[2] = {0};
94 :
95 0 : rel = relation_open(relid, AccessShareLock);
96 :
97 : /* Only some relkinds have a visibility map */
98 0 : check_relation_relkind(rel);
99 :
100 0 : if (blkno < 0 || blkno > MaxBlockNumber)
101 0 : ereport(ERROR,
102 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
103 : errmsg("invalid block number")));
104 :
105 0 : tupdesc = pg_visibility_tupdesc(false, false);
106 :
107 0 : mapbits = (int32) visibilitymap_get_status(rel, blkno, &vmbuffer);
108 0 : if (vmbuffer != InvalidBuffer)
109 0 : ReleaseBuffer(vmbuffer);
110 0 : values[0] = BoolGetDatum((mapbits & VISIBILITYMAP_ALL_VISIBLE) != 0);
111 0 : values[1] = BoolGetDatum((mapbits & VISIBILITYMAP_ALL_FROZEN) != 0);
112 :
113 0 : relation_close(rel, AccessShareLock);
114 :
115 0 : PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls)));
116 : }
117 :
118 : /*
119 : * Visibility map information for a single block of a relation, plus the
120 : * page-level information for the same block.
121 : */
122 : Datum
123 12 : pg_visibility(PG_FUNCTION_ARGS)
124 : {
125 12 : Oid relid = PG_GETARG_OID(0);
126 12 : int64 blkno = PG_GETARG_INT64(1);
127 : int32 mapbits;
128 : Relation rel;
129 12 : Buffer vmbuffer = InvalidBuffer;
130 : Buffer buffer;
131 : Page page;
132 : TupleDesc tupdesc;
133 : Datum values[3];
134 12 : bool nulls[3] = {0};
135 :
136 12 : rel = relation_open(relid, AccessShareLock);
137 :
138 : /* Only some relkinds have a visibility map */
139 12 : check_relation_relkind(rel);
140 :
141 2 : if (blkno < 0 || blkno > MaxBlockNumber)
142 0 : ereport(ERROR,
143 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
144 : errmsg("invalid block number")));
145 :
146 2 : tupdesc = pg_visibility_tupdesc(false, true);
147 :
148 2 : mapbits = (int32) visibilitymap_get_status(rel, blkno, &vmbuffer);
149 2 : if (vmbuffer != InvalidBuffer)
150 2 : ReleaseBuffer(vmbuffer);
151 2 : values[0] = BoolGetDatum((mapbits & VISIBILITYMAP_ALL_VISIBLE) != 0);
152 2 : values[1] = BoolGetDatum((mapbits & VISIBILITYMAP_ALL_FROZEN) != 0);
153 :
154 : /* Here we have to explicitly check rel size ... */
155 2 : if (blkno < RelationGetNumberOfBlocks(rel))
156 : {
157 2 : buffer = ReadBuffer(rel, blkno);
158 2 : LockBuffer(buffer, BUFFER_LOCK_SHARE);
159 :
160 2 : page = BufferGetPage(buffer);
161 2 : values[2] = BoolGetDatum(PageIsAllVisible(page));
162 :
163 2 : UnlockReleaseBuffer(buffer);
164 : }
165 : else
166 : {
167 : /* As with the vismap, silently return 0 for pages past EOF */
168 0 : values[2] = BoolGetDatum(false);
169 : }
170 :
171 2 : relation_close(rel, AccessShareLock);
172 :
173 2 : PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls)));
174 : }
175 :
176 : /*
177 : * Visibility map information for every block in a relation.
178 : */
179 : Datum
180 40 : pg_visibility_map_rel(PG_FUNCTION_ARGS)
181 : {
182 : FuncCallContext *funcctx;
183 : vbits *info;
184 :
185 40 : if (SRF_IS_FIRSTCALL())
186 : {
187 22 : Oid relid = PG_GETARG_OID(0);
188 : MemoryContext oldcontext;
189 :
190 22 : funcctx = SRF_FIRSTCALL_INIT();
191 22 : oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
192 22 : funcctx->tuple_desc = pg_visibility_tupdesc(true, false);
193 : /* collect_visibility_data will verify the relkind */
194 22 : funcctx->user_fctx = collect_visibility_data(relid, false);
195 8 : MemoryContextSwitchTo(oldcontext);
196 : }
197 :
198 26 : funcctx = SRF_PERCALL_SETUP();
199 26 : info = (vbits *) funcctx->user_fctx;
200 :
201 26 : if (info->next < info->count)
202 : {
203 : Datum values[3];
204 18 : bool nulls[3] = {0};
205 : HeapTuple tuple;
206 :
207 18 : values[0] = Int64GetDatum(info->next);
208 18 : values[1] = BoolGetDatum((info->bits[info->next] & (1 << 0)) != 0);
209 18 : values[2] = BoolGetDatum((info->bits[info->next] & (1 << 1)) != 0);
210 18 : info->next++;
211 :
212 18 : tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls);
213 18 : SRF_RETURN_NEXT(funcctx, HeapTupleGetDatum(tuple));
214 : }
215 :
216 8 : SRF_RETURN_DONE(funcctx);
217 : }
218 :
219 : /*
220 : * Visibility map information for every block in a relation, plus the page
221 : * level information for each block.
222 : */
223 : Datum
224 18 : pg_visibility_rel(PG_FUNCTION_ARGS)
225 : {
226 : FuncCallContext *funcctx;
227 : vbits *info;
228 :
229 18 : if (SRF_IS_FIRSTCALL())
230 : {
231 12 : Oid relid = PG_GETARG_OID(0);
232 : MemoryContext oldcontext;
233 :
234 12 : funcctx = SRF_FIRSTCALL_INIT();
235 12 : oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
236 12 : funcctx->tuple_desc = pg_visibility_tupdesc(true, true);
237 : /* collect_visibility_data will verify the relkind */
238 12 : funcctx->user_fctx = collect_visibility_data(relid, true);
239 12 : MemoryContextSwitchTo(oldcontext);
240 : }
241 :
242 18 : funcctx = SRF_PERCALL_SETUP();
243 18 : info = (vbits *) funcctx->user_fctx;
244 :
245 18 : if (info->next < info->count)
246 : {
247 : Datum values[4];
248 6 : bool nulls[4] = {0};
249 : HeapTuple tuple;
250 :
251 6 : values[0] = Int64GetDatum(info->next);
252 6 : values[1] = BoolGetDatum((info->bits[info->next] & (1 << 0)) != 0);
253 6 : values[2] = BoolGetDatum((info->bits[info->next] & (1 << 1)) != 0);
254 6 : values[3] = BoolGetDatum((info->bits[info->next] & (1 << 2)) != 0);
255 6 : info->next++;
256 :
257 6 : tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls);
258 6 : SRF_RETURN_NEXT(funcctx, HeapTupleGetDatum(tuple));
259 : }
260 :
261 12 : SRF_RETURN_DONE(funcctx);
262 : }
263 :
264 : /*
265 : * Count the number of all-visible and all-frozen pages in the visibility
266 : * map for a particular relation.
267 : */
268 : Datum
269 12 : pg_visibility_map_summary(PG_FUNCTION_ARGS)
270 : {
271 12 : Oid relid = PG_GETARG_OID(0);
272 : Relation rel;
273 : BlockNumber nblocks;
274 : BlockNumber blkno;
275 12 : Buffer vmbuffer = InvalidBuffer;
276 12 : int64 all_visible = 0;
277 12 : int64 all_frozen = 0;
278 : TupleDesc tupdesc;
279 : Datum values[2];
280 12 : bool nulls[2] = {0};
281 :
282 12 : rel = relation_open(relid, AccessShareLock);
283 :
284 : /* Only some relkinds have a visibility map */
285 12 : check_relation_relkind(rel);
286 :
287 2 : nblocks = RelationGetNumberOfBlocks(rel);
288 :
289 4 : for (blkno = 0; blkno < nblocks; ++blkno)
290 : {
291 : int32 mapbits;
292 :
293 : /* Make sure we are interruptible. */
294 2 : CHECK_FOR_INTERRUPTS();
295 :
296 : /* Get map info. */
297 2 : mapbits = (int32) visibilitymap_get_status(rel, blkno, &vmbuffer);
298 2 : if ((mapbits & VISIBILITYMAP_ALL_VISIBLE) != 0)
299 2 : ++all_visible;
300 2 : if ((mapbits & VISIBILITYMAP_ALL_FROZEN) != 0)
301 0 : ++all_frozen;
302 : }
303 :
304 : /* Clean up. */
305 2 : if (vmbuffer != InvalidBuffer)
306 2 : ReleaseBuffer(vmbuffer);
307 2 : relation_close(rel, AccessShareLock);
308 :
309 2 : if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
310 0 : elog(ERROR, "return type must be a row type");
311 :
312 2 : values[0] = Int64GetDatum(all_visible);
313 2 : values[1] = Int64GetDatum(all_frozen);
314 :
315 2 : PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls)));
316 : }
317 :
318 : /*
319 : * Return the TIDs of non-frozen tuples present in pages marked all-frozen
320 : * in the visibility map. We hope no one will ever find any, but there could
321 : * be bugs, database corruption, etc.
322 : */
323 : Datum
324 30 : pg_check_frozen(PG_FUNCTION_ARGS)
325 : {
326 : FuncCallContext *funcctx;
327 : corrupt_items *items;
328 :
329 30 : if (SRF_IS_FIRSTCALL())
330 : {
331 20 : Oid relid = PG_GETARG_OID(0);
332 : MemoryContext oldcontext;
333 :
334 20 : funcctx = SRF_FIRSTCALL_INIT();
335 20 : oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
336 : /* collect_corrupt_items will verify the relkind */
337 20 : funcctx->user_fctx = collect_corrupt_items(relid, false, true);
338 10 : MemoryContextSwitchTo(oldcontext);
339 : }
340 :
341 20 : funcctx = SRF_PERCALL_SETUP();
342 20 : items = (corrupt_items *) funcctx->user_fctx;
343 :
344 20 : if (items->next < items->count)
345 10 : SRF_RETURN_NEXT(funcctx, PointerGetDatum(&items->tids[items->next++]));
346 :
347 10 : SRF_RETURN_DONE(funcctx);
348 : }
349 :
350 : /*
351 : * Return the TIDs of not-all-visible tuples in pages marked all-visible
352 : * in the visibility map. We hope no one will ever find any, but there could
353 : * be bugs, database corruption, etc.
354 : */
355 : Datum
356 16 : pg_check_visible(PG_FUNCTION_ARGS)
357 : {
358 : FuncCallContext *funcctx;
359 : corrupt_items *items;
360 :
361 16 : if (SRF_IS_FIRSTCALL())
362 : {
363 6 : Oid relid = PG_GETARG_OID(0);
364 : MemoryContext oldcontext;
365 :
366 6 : funcctx = SRF_FIRSTCALL_INIT();
367 6 : oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
368 : /* collect_corrupt_items will verify the relkind */
369 6 : funcctx->user_fctx = collect_corrupt_items(relid, true, false);
370 6 : MemoryContextSwitchTo(oldcontext);
371 : }
372 :
373 16 : funcctx = SRF_PERCALL_SETUP();
374 16 : items = (corrupt_items *) funcctx->user_fctx;
375 :
376 16 : if (items->next < items->count)
377 10 : SRF_RETURN_NEXT(funcctx, PointerGetDatum(&items->tids[items->next++]));
378 :
379 6 : SRF_RETURN_DONE(funcctx);
380 : }
381 :
382 : /*
383 : * Remove the visibility map fork for a relation. If there turn out to be
384 : * any bugs in the visibility map code that require rebuilding the VM, this
385 : * provides users with a way to do it that is cleaner than shutting down the
386 : * server and removing files by hand.
387 : *
388 : * This is a cut-down version of RelationTruncate.
389 : */
390 : Datum
391 12 : pg_truncate_visibility_map(PG_FUNCTION_ARGS)
392 : {
393 12 : Oid relid = PG_GETARG_OID(0);
394 : Relation rel;
395 : ForkNumber fork;
396 : BlockNumber block;
397 : BlockNumber old_block;
398 :
399 12 : rel = relation_open(relid, AccessExclusiveLock);
400 :
401 : /* Only some relkinds have a visibility map */
402 12 : check_relation_relkind(rel);
403 :
404 : /* Forcibly reset cached file size */
405 2 : RelationGetSmgr(rel)->smgr_cached_nblocks[VISIBILITYMAP_FORKNUM] = InvalidBlockNumber;
406 :
407 : /* Compute new and old size before entering critical section. */
408 2 : fork = VISIBILITYMAP_FORKNUM;
409 2 : block = visibilitymap_prepare_truncate(rel, 0);
410 2 : old_block = BlockNumberIsValid(block) ? smgrnblocks(RelationGetSmgr(rel), fork) : 0;
411 :
412 : /*
413 : * WAL-logging, buffer dropping, file truncation must be atomic and all on
414 : * one side of a checkpoint. See RelationTruncate() for discussion.
415 : */
416 : Assert((MyProc->delayChkptFlags & (DELAY_CHKPT_START | DELAY_CHKPT_COMPLETE)) == 0);
417 2 : MyProc->delayChkptFlags |= DELAY_CHKPT_START | DELAY_CHKPT_COMPLETE;
418 2 : START_CRIT_SECTION();
419 :
420 2 : if (RelationNeedsWAL(rel))
421 : {
422 : XLogRecPtr lsn;
423 : xl_smgr_truncate xlrec;
424 :
425 2 : xlrec.blkno = 0;
426 2 : xlrec.rlocator = rel->rd_locator;
427 2 : xlrec.flags = SMGR_TRUNCATE_VM;
428 :
429 2 : XLogBeginInsert();
430 2 : XLogRegisterData(&xlrec, sizeof(xlrec));
431 :
432 2 : lsn = XLogInsert(RM_SMGR_ID,
433 : XLOG_SMGR_TRUNCATE | XLR_SPECIAL_REL_UPDATE);
434 2 : XLogFlush(lsn);
435 : }
436 :
437 2 : if (BlockNumberIsValid(block))
438 2 : smgrtruncate(RelationGetSmgr(rel), &fork, 1, &old_block, &block);
439 :
440 2 : END_CRIT_SECTION();
441 2 : MyProc->delayChkptFlags &= ~(DELAY_CHKPT_START | DELAY_CHKPT_COMPLETE);
442 :
443 : /*
444 : * Release the lock right away, not at commit time.
445 : *
446 : * It would be a problem to release the lock prior to commit if this
447 : * truncate operation sends any transactional invalidation messages. Other
448 : * backends would potentially be able to lock the relation without
449 : * processing them in the window of time between when we release the lock
450 : * here and when we sent the messages at our eventual commit. However,
451 : * we're currently only sending a non-transactional smgr invalidation,
452 : * which will have been posted to shared memory immediately from within
453 : * smgr_truncate. Therefore, there should be no race here.
454 : *
455 : * The reason why it's desirable to release the lock early here is because
456 : * of the possibility that someone will need to use this to blow away many
457 : * visibility map forks at once. If we can't release the lock until
458 : * commit time, the transaction doing this will accumulate
459 : * AccessExclusiveLocks on all of those relations at the same time, which
460 : * is undesirable. However, if this turns out to be unsafe we may have no
461 : * choice...
462 : */
463 2 : relation_close(rel, AccessExclusiveLock);
464 :
465 : /* Nothing to return. */
466 2 : PG_RETURN_VOID();
467 : }
468 :
469 : /*
470 : * Helper function to construct whichever TupleDesc we need for a particular
471 : * call.
472 : */
473 : static TupleDesc
474 36 : pg_visibility_tupdesc(bool include_blkno, bool include_pd)
475 : {
476 : TupleDesc tupdesc;
477 36 : AttrNumber maxattr = 2;
478 36 : AttrNumber a = 0;
479 :
480 36 : if (include_blkno)
481 34 : ++maxattr;
482 36 : if (include_pd)
483 14 : ++maxattr;
484 36 : tupdesc = CreateTemplateTupleDesc(maxattr);
485 36 : if (include_blkno)
486 34 : TupleDescInitEntry(tupdesc, ++a, "blkno", INT8OID, -1, 0);
487 36 : TupleDescInitEntry(tupdesc, ++a, "all_visible", BOOLOID, -1, 0);
488 36 : TupleDescInitEntry(tupdesc, ++a, "all_frozen", BOOLOID, -1, 0);
489 36 : if (include_pd)
490 14 : TupleDescInitEntry(tupdesc, ++a, "pd_all_visible", BOOLOID, -1, 0);
491 : Assert(a == maxattr);
492 :
493 36 : return BlessTupleDesc(tupdesc);
494 : }
495 :
496 : /*
497 : * Collect visibility data about a relation.
498 : *
499 : * Checks relkind of relid and will throw an error if the relation does not
500 : * have a VM.
501 : */
502 : static vbits *
503 34 : collect_visibility_data(Oid relid, bool include_pd)
504 : {
505 : Relation rel;
506 : BlockNumber nblocks;
507 : vbits *info;
508 : BlockNumber blkno;
509 34 : Buffer vmbuffer = InvalidBuffer;
510 34 : BufferAccessStrategy bstrategy = GetAccessStrategy(BAS_BULKREAD);
511 : BlockRangeReadStreamPrivate p;
512 34 : ReadStream *stream = NULL;
513 :
514 34 : rel = relation_open(relid, AccessShareLock);
515 :
516 : /* Only some relkinds have a visibility map */
517 30 : check_relation_relkind(rel);
518 :
519 20 : nblocks = RelationGetNumberOfBlocks(rel);
520 20 : info = palloc0(offsetof(vbits, bits) + nblocks);
521 20 : info->next = 0;
522 20 : info->count = nblocks;
523 :
524 : /* Create a stream if reading main fork. */
525 20 : if (include_pd)
526 : {
527 12 : p.current_blocknum = 0;
528 12 : p.last_exclusive = nblocks;
529 :
530 : /*
531 : * It is safe to use batchmode as block_range_read_stream_cb takes no
532 : * locks.
533 : */
534 12 : stream = read_stream_begin_relation(READ_STREAM_FULL |
535 : READ_STREAM_USE_BATCHING,
536 : bstrategy,
537 : rel,
538 : MAIN_FORKNUM,
539 : block_range_read_stream_cb,
540 : &p,
541 : 0);
542 : }
543 :
544 44 : for (blkno = 0; blkno < nblocks; ++blkno)
545 : {
546 : int32 mapbits;
547 :
548 : /* Make sure we are interruptible. */
549 24 : CHECK_FOR_INTERRUPTS();
550 :
551 : /* Get map info. */
552 24 : mapbits = (int32) visibilitymap_get_status(rel, blkno, &vmbuffer);
553 24 : if ((mapbits & VISIBILITYMAP_ALL_VISIBLE) != 0)
554 16 : info->bits[blkno] |= (1 << 0);
555 24 : if ((mapbits & VISIBILITYMAP_ALL_FROZEN) != 0)
556 10 : info->bits[blkno] |= (1 << 1);
557 :
558 : /*
559 : * Page-level data requires reading every block, so only get it if the
560 : * caller needs it. Use a buffer access strategy, too, to prevent
561 : * cache-trashing.
562 : */
563 24 : if (include_pd)
564 : {
565 : Buffer buffer;
566 : Page page;
567 :
568 6 : buffer = read_stream_next_buffer(stream, NULL);
569 6 : LockBuffer(buffer, BUFFER_LOCK_SHARE);
570 :
571 6 : page = BufferGetPage(buffer);
572 6 : if (PageIsAllVisible(page))
573 4 : info->bits[blkno] |= (1 << 2);
574 :
575 6 : UnlockReleaseBuffer(buffer);
576 : }
577 : }
578 :
579 20 : if (include_pd)
580 : {
581 : Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer);
582 12 : read_stream_end(stream);
583 : }
584 :
585 : /* Clean up. */
586 20 : if (vmbuffer != InvalidBuffer)
587 14 : ReleaseBuffer(vmbuffer);
588 20 : relation_close(rel, AccessShareLock);
589 :
590 20 : return info;
591 : }
592 :
593 : /*
594 : * The "strict" version of GetOldestNonRemovableTransactionId(). The
595 : * pg_visibility check can tolerate false positives (don't report some of the
596 : * errors), but can't tolerate false negatives (report false errors). Normally,
597 : * horizons move forwards, but there are cases when it could move backward
598 : * (see comment for ComputeXidHorizons()).
599 : *
600 : * This is why we have to implement our own function for xid horizon, which
601 : * would be guaranteed to be newer or equal to any xid horizon computed before.
602 : * We have to do the following to achieve this.
603 : *
604 : * 1. Ignore processes xmin's, because they consider connection to other
605 : * databases that were ignored before.
606 : * 2. Ignore KnownAssignedXids, as they are not database-aware. Although we
607 : * now perform minimal checking on a standby by always using nextXid, this
608 : * approach is better than nothing and will at least catch extremely broken
609 : * cases where a xid is in the future.
610 : * 3. Ignore walsender xmin, because it could go backward if some replication
611 : * connections don't use replication slots.
612 : *
613 : * While it might seem like we could use KnownAssignedXids for shared
614 : * catalogs, since shared catalogs rely on a global horizon rather than a
615 : * database-specific one - there are potential edge cases. For example, a
616 : * transaction may crash on the primary without writing a commit/abort record.
617 : * This would lead to a situation where it appears to still be running on the
618 : * standby, even though it has already ended on the primary. For this reason,
619 : * it's safer to ignore KnownAssignedXids, even for shared catalogs.
620 : *
621 : * As a result, we're using only currently running xids to compute the horizon.
622 : * Surely these would significantly sacrifice accuracy. But we have to do so
623 : * to avoid reporting false errors.
624 : */
625 : static TransactionId
626 16 : GetStrictOldestNonRemovableTransactionId(Relation rel)
627 : {
628 : RunningTransactions runningTransactions;
629 :
630 16 : if (RecoveryInProgress())
631 : {
632 : TransactionId result;
633 :
634 : /* As we ignore KnownAssignedXids on standby, just pick nextXid */
635 2 : LWLockAcquire(XidGenLock, LW_SHARED);
636 2 : result = XidFromFullTransactionId(TransamVariables->nextXid);
637 2 : LWLockRelease(XidGenLock);
638 2 : return result;
639 : }
640 14 : else if (rel == NULL || rel->rd_rel->relisshared)
641 : {
642 : /* Shared relation: take into account all running xids */
643 0 : runningTransactions = GetRunningTransactionData();
644 0 : LWLockRelease(ProcArrayLock);
645 0 : LWLockRelease(XidGenLock);
646 0 : return runningTransactions->oldestRunningXid;
647 : }
648 14 : else if (!RELATION_IS_LOCAL(rel))
649 : {
650 : /*
651 : * Normal relation: take into account xids running within the current
652 : * database
653 : */
654 14 : runningTransactions = GetRunningTransactionData();
655 14 : LWLockRelease(ProcArrayLock);
656 14 : LWLockRelease(XidGenLock);
657 14 : return runningTransactions->oldestDatabaseRunningXid;
658 : }
659 : else
660 : {
661 : /*
662 : * For temporary relations, ComputeXidHorizons() uses only
663 : * TransamVariables->latestCompletedXid and MyProc->xid. These two
664 : * shouldn't go backwards. So we're fine with this horizon.
665 : */
666 0 : return GetOldestNonRemovableTransactionId(rel);
667 : }
668 : }
669 :
670 : /*
671 : * Callback function to get next block for read stream object used in
672 : * collect_corrupt_items() function.
673 : */
674 : static BlockNumber
675 206 : collect_corrupt_items_read_stream_next_block(ReadStream *stream,
676 : void *callback_private_data,
677 : void *per_buffer_data)
678 : {
679 206 : struct collect_corrupt_items_read_stream_private *p = callback_private_data;
680 :
681 218 : for (; p->current_blocknum < p->last_exclusive; p->current_blocknum++)
682 : {
683 202 : bool check_frozen = false;
684 202 : bool check_visible = false;
685 :
686 : /* Make sure we are interruptible. */
687 202 : CHECK_FOR_INTERRUPTS();
688 :
689 202 : if (p->all_frozen && VM_ALL_FROZEN(p->rel, p->current_blocknum, &p->vmbuffer))
690 98 : check_frozen = true;
691 202 : if (p->all_visible && VM_ALL_VISIBLE(p->rel, p->current_blocknum, &p->vmbuffer))
692 92 : check_visible = true;
693 202 : if (!check_visible && !check_frozen)
694 12 : continue;
695 :
696 190 : return p->current_blocknum++;
697 : }
698 :
699 16 : return InvalidBlockNumber;
700 : }
701 :
702 : /*
703 : * Returns a list of items whose visibility map information does not match
704 : * the status of the tuples on the page.
705 : *
706 : * If all_visible is passed as true, this will include all items which are
707 : * on pages marked as all-visible in the visibility map but which do not
708 : * seem to in fact be all-visible.
709 : *
710 : * If all_frozen is passed as true, this will include all items which are
711 : * on pages marked as all-frozen but which do not seem to in fact be frozen.
712 : *
713 : * Checks relkind of relid and will throw an error if the relation does not
714 : * have a VM.
715 : */
716 : static corrupt_items *
717 26 : collect_corrupt_items(Oid relid, bool all_visible, bool all_frozen)
718 : {
719 : Relation rel;
720 : corrupt_items *items;
721 26 : Buffer vmbuffer = InvalidBuffer;
722 26 : BufferAccessStrategy bstrategy = GetAccessStrategy(BAS_BULKREAD);
723 26 : TransactionId OldestXmin = InvalidTransactionId;
724 : struct collect_corrupt_items_read_stream_private p;
725 : ReadStream *stream;
726 : Buffer buffer;
727 :
728 26 : rel = relation_open(relid, AccessShareLock);
729 :
730 : /* Only some relkinds have a visibility map */
731 26 : check_relation_relkind(rel);
732 :
733 16 : if (all_visible)
734 6 : OldestXmin = GetStrictOldestNonRemovableTransactionId(rel);
735 :
736 : /*
737 : * Guess an initial array size. We don't expect many corrupted tuples, so
738 : * start with a small array. This function uses the "next" field to track
739 : * the next offset where we can store an item (which is the same thing as
740 : * the number of items found so far) and the "count" field to track the
741 : * number of entries allocated. We'll repurpose these fields before
742 : * returning.
743 : */
744 16 : items = palloc0(sizeof(corrupt_items));
745 16 : items->next = 0;
746 16 : items->count = 64;
747 16 : items->tids = palloc(items->count * sizeof(ItemPointerData));
748 :
749 16 : p.current_blocknum = 0;
750 16 : p.last_exclusive = RelationGetNumberOfBlocks(rel);
751 16 : p.rel = rel;
752 16 : p.vmbuffer = InvalidBuffer;
753 16 : p.all_frozen = all_frozen;
754 16 : p.all_visible = all_visible;
755 16 : stream = read_stream_begin_relation(READ_STREAM_FULL,
756 : bstrategy,
757 : rel,
758 : MAIN_FORKNUM,
759 : collect_corrupt_items_read_stream_next_block,
760 : &p,
761 : 0);
762 :
763 : /* Loop over every block in the relation. */
764 206 : while ((buffer = read_stream_next_buffer(stream, NULL)) != InvalidBuffer)
765 : {
766 190 : bool check_frozen = all_frozen;
767 190 : bool check_visible = all_visible;
768 : Page page;
769 : OffsetNumber offnum,
770 : maxoff;
771 : BlockNumber blkno;
772 :
773 : /* Make sure we are interruptible. */
774 190 : CHECK_FOR_INTERRUPTS();
775 :
776 190 : LockBuffer(buffer, BUFFER_LOCK_SHARE);
777 :
778 190 : page = BufferGetPage(buffer);
779 190 : maxoff = PageGetMaxOffsetNumber(page);
780 190 : blkno = BufferGetBlockNumber(buffer);
781 :
782 : /*
783 : * The visibility map bits might have changed while we were acquiring
784 : * the page lock. Recheck to avoid returning spurious results.
785 : */
786 190 : if (check_frozen && !VM_ALL_FROZEN(rel, blkno, &vmbuffer))
787 0 : check_frozen = false;
788 190 : if (check_visible && !VM_ALL_VISIBLE(rel, blkno, &vmbuffer))
789 0 : check_visible = false;
790 190 : if (!check_visible && !check_frozen)
791 : {
792 0 : UnlockReleaseBuffer(buffer);
793 0 : continue;
794 : }
795 :
796 : /* Iterate over each tuple on the page. */
797 32252 : for (offnum = FirstOffsetNumber;
798 : offnum <= maxoff;
799 32062 : offnum = OffsetNumberNext(offnum))
800 : {
801 : HeapTupleData tuple;
802 : ItemId itemid;
803 :
804 32062 : itemid = PageGetItemId(page, offnum);
805 :
806 : /* Unused or redirect line pointers are of no interest. */
807 32062 : if (!ItemIdIsUsed(itemid) || ItemIdIsRedirected(itemid))
808 0 : continue;
809 :
810 : /* Dead line pointers are neither all-visible nor frozen. */
811 32062 : if (ItemIdIsDead(itemid))
812 : {
813 0 : ItemPointerSet(&(tuple.t_self), blkno, offnum);
814 0 : record_corrupt_item(items, &tuple.t_self);
815 0 : continue;
816 : }
817 :
818 : /* Initialize a HeapTupleData structure for checks below. */
819 32062 : ItemPointerSet(&(tuple.t_self), blkno, offnum);
820 32062 : tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
821 32062 : tuple.t_len = ItemIdGetLength(itemid);
822 32062 : tuple.t_tableOid = relid;
823 :
824 : /*
825 : * If we're checking whether the page is all-visible, we expect
826 : * the tuple to be all-visible.
827 : */
828 32062 : if (check_visible &&
829 16018 : !tuple_all_visible(&tuple, OldestXmin, buffer))
830 : {
831 : TransactionId RecomputedOldestXmin;
832 :
833 : /*
834 : * Time has passed since we computed OldestXmin, so it's
835 : * possible that this tuple is all-visible in reality even
836 : * though it doesn't appear so based on our
837 : * previously-computed value. Let's compute a new value so we
838 : * can be certain whether there is a problem.
839 : *
840 : * From a concurrency point of view, it sort of sucks to
841 : * retake ProcArrayLock here while we're holding the buffer
842 : * exclusively locked, but it should be safe against
843 : * deadlocks, because surely
844 : * GetStrictOldestNonRemovableTransactionId() should never
845 : * take a buffer lock. And this shouldn't happen often, so
846 : * it's worth being careful so as to avoid false positives.
847 : */
848 10 : RecomputedOldestXmin = GetStrictOldestNonRemovableTransactionId(rel);
849 :
850 10 : if (!TransactionIdPrecedes(OldestXmin, RecomputedOldestXmin))
851 10 : record_corrupt_item(items, &tuple.t_self);
852 : else
853 : {
854 0 : OldestXmin = RecomputedOldestXmin;
855 0 : if (!tuple_all_visible(&tuple, OldestXmin, buffer))
856 0 : record_corrupt_item(items, &tuple.t_self);
857 : }
858 : }
859 :
860 : /*
861 : * If we're checking whether the page is all-frozen, we expect the
862 : * tuple to be in a state where it will never need freezing.
863 : */
864 32062 : if (check_frozen)
865 : {
866 16044 : if (heap_tuple_needs_eventual_freeze(tuple.t_data))
867 10 : record_corrupt_item(items, &tuple.t_self);
868 : }
869 : }
870 :
871 190 : UnlockReleaseBuffer(buffer);
872 : }
873 16 : read_stream_end(stream);
874 :
875 : /* Clean up. */
876 16 : if (vmbuffer != InvalidBuffer)
877 14 : ReleaseBuffer(vmbuffer);
878 16 : if (p.vmbuffer != InvalidBuffer)
879 16 : ReleaseBuffer(p.vmbuffer);
880 16 : relation_close(rel, AccessShareLock);
881 :
882 : /*
883 : * Before returning, repurpose the fields to match caller's expectations.
884 : * next is now the next item that should be read (rather than written) and
885 : * count is now the number of items we wrote (rather than the number we
886 : * allocated).
887 : */
888 16 : items->count = items->next;
889 16 : items->next = 0;
890 :
891 16 : return items;
892 : }
893 :
894 : /*
895 : * Remember one corrupt item.
896 : */
897 : static void
898 20 : record_corrupt_item(corrupt_items *items, ItemPointer tid)
899 : {
900 : /* enlarge output array if needed. */
901 20 : if (items->next >= items->count)
902 : {
903 0 : items->count *= 2;
904 0 : items->tids = repalloc(items->tids,
905 0 : items->count * sizeof(ItemPointerData));
906 : }
907 : /* and add the new item */
908 20 : items->tids[items->next++] = *tid;
909 20 : }
910 :
911 : /*
912 : * Check whether a tuple is all-visible relative to a given OldestXmin value.
913 : * The buffer should contain the tuple and should be locked and pinned.
914 : */
915 : static bool
916 16018 : tuple_all_visible(HeapTuple tup, TransactionId OldestXmin, Buffer buffer)
917 : {
918 : HTSV_Result state;
919 : TransactionId xmin;
920 :
921 16018 : state = HeapTupleSatisfiesVacuum(tup, OldestXmin, buffer);
922 16018 : if (state != HEAPTUPLE_LIVE)
923 10 : return false; /* all-visible implies live */
924 :
925 : /*
926 : * Neither lazy_scan_heap nor heap_page_is_all_visible will mark a page
927 : * all-visible unless every tuple is hinted committed. However, those hint
928 : * bits could be lost after a crash, so we can't be certain that they'll
929 : * be set here. So just check the xmin.
930 : */
931 :
932 16008 : xmin = HeapTupleHeaderGetXmin(tup->t_data);
933 16008 : if (!TransactionIdPrecedes(xmin, OldestXmin))
934 0 : return false; /* xmin not old enough for all to see */
935 :
936 16008 : return true;
937 : }
938 :
939 : /*
940 : * check_relation_relkind - convenience routine to check that relation
941 : * is of the relkind supported by the callers
942 : */
943 : static void
944 92 : check_relation_relkind(Relation rel)
945 : {
946 92 : if (!RELKIND_HAS_TABLE_AM(rel->rd_rel->relkind))
947 50 : ereport(ERROR,
948 : (errcode(ERRCODE_WRONG_OBJECT_TYPE),
949 : errmsg("relation \"%s\" is of wrong relation kind",
950 : RelationGetRelationName(rel)),
951 : errdetail_relkind_not_supported(rel->rd_rel->relkind)));
952 42 : }
|