Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * pg_visibility.c
4 : * display visibility map information and page-level visibility bits
5 : *
6 : * Copyright (c) 2016-2025, PostgreSQL Global Development Group
7 : *
8 : * contrib/pg_visibility/pg_visibility.c
9 : *-------------------------------------------------------------------------
10 : */
11 : #include "postgres.h"
12 :
13 : #include "access/heapam.h"
14 : #include "access/htup_details.h"
15 : #include "access/visibilitymap.h"
16 : #include "access/xloginsert.h"
17 : #include "catalog/pg_type.h"
18 : #include "catalog/storage_xlog.h"
19 : #include "funcapi.h"
20 : #include "miscadmin.h"
21 : #include "storage/bufmgr.h"
22 : #include "storage/proc.h"
23 : #include "storage/procarray.h"
24 : #include "storage/read_stream.h"
25 : #include "storage/smgr.h"
26 : #include "utils/rel.h"
27 :
28 14 : PG_MODULE_MAGIC;
29 :
/*
 * Per-block visibility flags for a whole relation, built by
 * collect_visibility_data() and drained one block per call by the
 * pg_visibility_map_rel() / pg_visibility_rel() SRFs.
 */
typedef struct vbits
{
	BlockNumber next;			/* next block index to return to the caller */
	BlockNumber count;			/* number of entries in bits[] */
	uint8		bits[FLEXIBLE_ARRAY_MEMBER];	/* per-block flags: bit 0 =
												 * all-visible, bit 1 =
												 * all-frozen, bit 2 =
												 * PD_ALL_VISIBLE */
} vbits;

/*
 * TIDs of apparently-corrupt tuples found by collect_corrupt_items().
 *
 * While collecting, "next" is the number of TIDs stored so far and "count"
 * is the allocated capacity of tids[]; collect_corrupt_items() repurposes
 * them before returning so that "next" is the next TID to emit and "count"
 * is the number of TIDs actually stored.
 */
typedef struct corrupt_items
{
	BlockNumber next;
	BlockNumber count;
	ItemPointer tids;
} corrupt_items;

/* callback state for collect_corrupt_items_read_stream_next_block */
struct collect_corrupt_items_read_stream_private
{
	bool		all_frozen;		/* consider pages marked all-frozen? */
	bool		all_visible;	/* consider pages marked all-visible? */
	BlockNumber current_blocknum;	/* next block to examine */
	BlockNumber last_exclusive; /* end of the block range to scan */
	Relation	rel;			/* relation being scanned */
	Buffer		vmbuffer;		/* pinned visibility map page, if any */
};
54 :
55 6 : PG_FUNCTION_INFO_V1(pg_visibility_map);
56 8 : PG_FUNCTION_INFO_V1(pg_visibility_map_rel);
57 8 : PG_FUNCTION_INFO_V1(pg_visibility);
58 8 : PG_FUNCTION_INFO_V1(pg_visibility_rel);
59 8 : PG_FUNCTION_INFO_V1(pg_visibility_map_summary);
60 10 : PG_FUNCTION_INFO_V1(pg_check_frozen);
61 12 : PG_FUNCTION_INFO_V1(pg_check_visible);
62 8 : PG_FUNCTION_INFO_V1(pg_truncate_visibility_map);
63 :
64 : static TupleDesc pg_visibility_tupdesc(bool include_blkno, bool include_pd);
65 : static vbits *collect_visibility_data(Oid relid, bool include_pd);
66 : static corrupt_items *collect_corrupt_items(Oid relid, bool all_visible,
67 : bool all_frozen);
68 : static void record_corrupt_item(corrupt_items *items, ItemPointer tid);
69 : static bool tuple_all_visible(HeapTuple tup, TransactionId OldestXmin,
70 : Buffer buffer);
71 : static void check_relation_relkind(Relation rel);
72 :
73 : /*
74 : * Visibility map information for a single block of a relation.
75 : *
76 : * Note: the VM code will silently return zeroes for pages past the end
77 : * of the map, so we allow probes up to MaxBlockNumber regardless of the
78 : * actual relation size.
79 : */
80 : Datum
81 0 : pg_visibility_map(PG_FUNCTION_ARGS)
82 : {
83 0 : Oid relid = PG_GETARG_OID(0);
84 0 : int64 blkno = PG_GETARG_INT64(1);
85 : int32 mapbits;
86 : Relation rel;
87 0 : Buffer vmbuffer = InvalidBuffer;
88 : TupleDesc tupdesc;
89 : Datum values[2];
90 0 : bool nulls[2] = {0};
91 :
92 0 : rel = relation_open(relid, AccessShareLock);
93 :
94 : /* Only some relkinds have a visibility map */
95 0 : check_relation_relkind(rel);
96 :
97 0 : if (blkno < 0 || blkno > MaxBlockNumber)
98 0 : ereport(ERROR,
99 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
100 : errmsg("invalid block number")));
101 :
102 0 : tupdesc = pg_visibility_tupdesc(false, false);
103 :
104 0 : mapbits = (int32) visibilitymap_get_status(rel, blkno, &vmbuffer);
105 0 : if (vmbuffer != InvalidBuffer)
106 0 : ReleaseBuffer(vmbuffer);
107 0 : values[0] = BoolGetDatum((mapbits & VISIBILITYMAP_ALL_VISIBLE) != 0);
108 0 : values[1] = BoolGetDatum((mapbits & VISIBILITYMAP_ALL_FROZEN) != 0);
109 :
110 0 : relation_close(rel, AccessShareLock);
111 :
112 0 : PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls)));
113 : }
114 :
115 : /*
116 : * Visibility map information for a single block of a relation, plus the
117 : * page-level information for the same block.
118 : */
119 : Datum
120 12 : pg_visibility(PG_FUNCTION_ARGS)
121 : {
122 12 : Oid relid = PG_GETARG_OID(0);
123 12 : int64 blkno = PG_GETARG_INT64(1);
124 : int32 mapbits;
125 : Relation rel;
126 12 : Buffer vmbuffer = InvalidBuffer;
127 : Buffer buffer;
128 : Page page;
129 : TupleDesc tupdesc;
130 : Datum values[3];
131 12 : bool nulls[3] = {0};
132 :
133 12 : rel = relation_open(relid, AccessShareLock);
134 :
135 : /* Only some relkinds have a visibility map */
136 12 : check_relation_relkind(rel);
137 :
138 2 : if (blkno < 0 || blkno > MaxBlockNumber)
139 0 : ereport(ERROR,
140 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
141 : errmsg("invalid block number")));
142 :
143 2 : tupdesc = pg_visibility_tupdesc(false, true);
144 :
145 2 : mapbits = (int32) visibilitymap_get_status(rel, blkno, &vmbuffer);
146 2 : if (vmbuffer != InvalidBuffer)
147 2 : ReleaseBuffer(vmbuffer);
148 2 : values[0] = BoolGetDatum((mapbits & VISIBILITYMAP_ALL_VISIBLE) != 0);
149 2 : values[1] = BoolGetDatum((mapbits & VISIBILITYMAP_ALL_FROZEN) != 0);
150 :
151 : /* Here we have to explicitly check rel size ... */
152 2 : if (blkno < RelationGetNumberOfBlocks(rel))
153 : {
154 2 : buffer = ReadBuffer(rel, blkno);
155 2 : LockBuffer(buffer, BUFFER_LOCK_SHARE);
156 :
157 2 : page = BufferGetPage(buffer);
158 2 : values[2] = BoolGetDatum(PageIsAllVisible(page));
159 :
160 2 : UnlockReleaseBuffer(buffer);
161 : }
162 : else
163 : {
164 : /* As with the vismap, silently return 0 for pages past EOF */
165 0 : values[2] = BoolGetDatum(false);
166 : }
167 :
168 2 : relation_close(rel, AccessShareLock);
169 :
170 2 : PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls)));
171 : }
172 :
173 : /*
174 : * Visibility map information for every block in a relation.
175 : */
176 : Datum
177 40 : pg_visibility_map_rel(PG_FUNCTION_ARGS)
178 : {
179 : FuncCallContext *funcctx;
180 : vbits *info;
181 :
182 40 : if (SRF_IS_FIRSTCALL())
183 : {
184 22 : Oid relid = PG_GETARG_OID(0);
185 : MemoryContext oldcontext;
186 :
187 22 : funcctx = SRF_FIRSTCALL_INIT();
188 22 : oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
189 22 : funcctx->tuple_desc = pg_visibility_tupdesc(true, false);
190 : /* collect_visibility_data will verify the relkind */
191 22 : funcctx->user_fctx = collect_visibility_data(relid, false);
192 8 : MemoryContextSwitchTo(oldcontext);
193 : }
194 :
195 26 : funcctx = SRF_PERCALL_SETUP();
196 26 : info = (vbits *) funcctx->user_fctx;
197 :
198 26 : if (info->next < info->count)
199 : {
200 : Datum values[3];
201 18 : bool nulls[3] = {0};
202 : HeapTuple tuple;
203 :
204 18 : values[0] = Int64GetDatum(info->next);
205 18 : values[1] = BoolGetDatum((info->bits[info->next] & (1 << 0)) != 0);
206 18 : values[2] = BoolGetDatum((info->bits[info->next] & (1 << 1)) != 0);
207 18 : info->next++;
208 :
209 18 : tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls);
210 18 : SRF_RETURN_NEXT(funcctx, HeapTupleGetDatum(tuple));
211 : }
212 :
213 8 : SRF_RETURN_DONE(funcctx);
214 : }
215 :
216 : /*
217 : * Visibility map information for every block in a relation, plus the page
218 : * level information for each block.
219 : */
220 : Datum
221 18 : pg_visibility_rel(PG_FUNCTION_ARGS)
222 : {
223 : FuncCallContext *funcctx;
224 : vbits *info;
225 :
226 18 : if (SRF_IS_FIRSTCALL())
227 : {
228 12 : Oid relid = PG_GETARG_OID(0);
229 : MemoryContext oldcontext;
230 :
231 12 : funcctx = SRF_FIRSTCALL_INIT();
232 12 : oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
233 12 : funcctx->tuple_desc = pg_visibility_tupdesc(true, true);
234 : /* collect_visibility_data will verify the relkind */
235 12 : funcctx->user_fctx = collect_visibility_data(relid, true);
236 12 : MemoryContextSwitchTo(oldcontext);
237 : }
238 :
239 18 : funcctx = SRF_PERCALL_SETUP();
240 18 : info = (vbits *) funcctx->user_fctx;
241 :
242 18 : if (info->next < info->count)
243 : {
244 : Datum values[4];
245 6 : bool nulls[4] = {0};
246 : HeapTuple tuple;
247 :
248 6 : values[0] = Int64GetDatum(info->next);
249 6 : values[1] = BoolGetDatum((info->bits[info->next] & (1 << 0)) != 0);
250 6 : values[2] = BoolGetDatum((info->bits[info->next] & (1 << 1)) != 0);
251 6 : values[3] = BoolGetDatum((info->bits[info->next] & (1 << 2)) != 0);
252 6 : info->next++;
253 :
254 6 : tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls);
255 6 : SRF_RETURN_NEXT(funcctx, HeapTupleGetDatum(tuple));
256 : }
257 :
258 12 : SRF_RETURN_DONE(funcctx);
259 : }
260 :
261 : /*
262 : * Count the number of all-visible and all-frozen pages in the visibility
263 : * map for a particular relation.
264 : */
265 : Datum
266 12 : pg_visibility_map_summary(PG_FUNCTION_ARGS)
267 : {
268 12 : Oid relid = PG_GETARG_OID(0);
269 : Relation rel;
270 : BlockNumber nblocks;
271 : BlockNumber blkno;
272 12 : Buffer vmbuffer = InvalidBuffer;
273 12 : int64 all_visible = 0;
274 12 : int64 all_frozen = 0;
275 : TupleDesc tupdesc;
276 : Datum values[2];
277 12 : bool nulls[2] = {0};
278 :
279 12 : rel = relation_open(relid, AccessShareLock);
280 :
281 : /* Only some relkinds have a visibility map */
282 12 : check_relation_relkind(rel);
283 :
284 2 : nblocks = RelationGetNumberOfBlocks(rel);
285 :
286 4 : for (blkno = 0; blkno < nblocks; ++blkno)
287 : {
288 : int32 mapbits;
289 :
290 : /* Make sure we are interruptible. */
291 2 : CHECK_FOR_INTERRUPTS();
292 :
293 : /* Get map info. */
294 2 : mapbits = (int32) visibilitymap_get_status(rel, blkno, &vmbuffer);
295 2 : if ((mapbits & VISIBILITYMAP_ALL_VISIBLE) != 0)
296 2 : ++all_visible;
297 2 : if ((mapbits & VISIBILITYMAP_ALL_FROZEN) != 0)
298 0 : ++all_frozen;
299 : }
300 :
301 : /* Clean up. */
302 2 : if (vmbuffer != InvalidBuffer)
303 2 : ReleaseBuffer(vmbuffer);
304 2 : relation_close(rel, AccessShareLock);
305 :
306 2 : if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
307 0 : elog(ERROR, "return type must be a row type");
308 :
309 2 : values[0] = Int64GetDatum(all_visible);
310 2 : values[1] = Int64GetDatum(all_frozen);
311 :
312 2 : PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls)));
313 : }
314 :
315 : /*
316 : * Return the TIDs of non-frozen tuples present in pages marked all-frozen
317 : * in the visibility map. We hope no one will ever find any, but there could
318 : * be bugs, database corruption, etc.
319 : */
320 : Datum
321 30 : pg_check_frozen(PG_FUNCTION_ARGS)
322 : {
323 : FuncCallContext *funcctx;
324 : corrupt_items *items;
325 :
326 30 : if (SRF_IS_FIRSTCALL())
327 : {
328 20 : Oid relid = PG_GETARG_OID(0);
329 : MemoryContext oldcontext;
330 :
331 20 : funcctx = SRF_FIRSTCALL_INIT();
332 20 : oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
333 : /* collect_corrupt_items will verify the relkind */
334 20 : funcctx->user_fctx = collect_corrupt_items(relid, false, true);
335 10 : MemoryContextSwitchTo(oldcontext);
336 : }
337 :
338 20 : funcctx = SRF_PERCALL_SETUP();
339 20 : items = (corrupt_items *) funcctx->user_fctx;
340 :
341 20 : if (items->next < items->count)
342 10 : SRF_RETURN_NEXT(funcctx, PointerGetDatum(&items->tids[items->next++]));
343 :
344 10 : SRF_RETURN_DONE(funcctx);
345 : }
346 :
347 : /*
348 : * Return the TIDs of not-all-visible tuples in pages marked all-visible
349 : * in the visibility map. We hope no one will ever find any, but there could
350 : * be bugs, database corruption, etc.
351 : */
352 : Datum
353 16 : pg_check_visible(PG_FUNCTION_ARGS)
354 : {
355 : FuncCallContext *funcctx;
356 : corrupt_items *items;
357 :
358 16 : if (SRF_IS_FIRSTCALL())
359 : {
360 6 : Oid relid = PG_GETARG_OID(0);
361 : MemoryContext oldcontext;
362 :
363 6 : funcctx = SRF_FIRSTCALL_INIT();
364 6 : oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
365 : /* collect_corrupt_items will verify the relkind */
366 6 : funcctx->user_fctx = collect_corrupt_items(relid, true, false);
367 6 : MemoryContextSwitchTo(oldcontext);
368 : }
369 :
370 16 : funcctx = SRF_PERCALL_SETUP();
371 16 : items = (corrupt_items *) funcctx->user_fctx;
372 :
373 16 : if (items->next < items->count)
374 10 : SRF_RETURN_NEXT(funcctx, PointerGetDatum(&items->tids[items->next++]));
375 :
376 6 : SRF_RETURN_DONE(funcctx);
377 : }
378 :
/*
 * Remove the visibility map fork for a relation. If there turn out to be
 * any bugs in the visibility map code that require rebuilding the VM, this
 * provides users with a way to do it that is cleaner than shutting down the
 * server and removing files by hand.
 *
 * This is a cut-down version of RelationTruncate.
 */
Datum
pg_truncate_visibility_map(PG_FUNCTION_ARGS)
{
	Oid			relid = PG_GETARG_OID(0);
	Relation	rel;
	ForkNumber	fork;
	BlockNumber block;			/* VM size after truncation (0 if fork kept) */
	BlockNumber old_block;		/* VM size before truncation */

	/* AccessExclusiveLock: no one else may touch the relation meanwhile. */
	rel = relation_open(relid, AccessExclusiveLock);

	/* Only some relkinds have a visibility map */
	check_relation_relkind(rel);

	/* Forcibly reset cached file size */
	RelationGetSmgr(rel)->smgr_cached_nblocks[VISIBILITYMAP_FORKNUM] = InvalidBlockNumber;

	/* Compute new and old size before entering critical section. */
	fork = VISIBILITYMAP_FORKNUM;
	block = visibilitymap_prepare_truncate(rel, 0);
	old_block = BlockNumberIsValid(block) ? smgrnblocks(RelationGetSmgr(rel), fork) : 0;

	/*
	 * WAL-logging, buffer dropping, file truncation must be atomic and all on
	 * one side of a checkpoint. See RelationTruncate() for discussion.
	 */
	Assert((MyProc->delayChkptFlags & (DELAY_CHKPT_START | DELAY_CHKPT_COMPLETE)) == 0);
	MyProc->delayChkptFlags |= DELAY_CHKPT_START | DELAY_CHKPT_COMPLETE;
	START_CRIT_SECTION();

	if (RelationNeedsWAL(rel))
	{
		XLogRecPtr	lsn;
		xl_smgr_truncate xlrec;

		xlrec.blkno = 0;
		xlrec.rlocator = rel->rd_locator;
		xlrec.flags = SMGR_TRUNCATE_VM;

		XLogBeginInsert();
		XLogRegisterData((char *) &xlrec, sizeof(xlrec));

		lsn = XLogInsert(RM_SMGR_ID,
						 XLOG_SMGR_TRUNCATE | XLR_SPECIAL_REL_UPDATE);
		/* Make sure the record hits disk before we shrink the file. */
		XLogFlush(lsn);
	}

	/* Actually shrink the VM fork, unless it was already empty/absent. */
	if (BlockNumberIsValid(block))
		smgrtruncate(RelationGetSmgr(rel), &fork, 1, &old_block, &block);

	END_CRIT_SECTION();
	MyProc->delayChkptFlags &= ~(DELAY_CHKPT_START | DELAY_CHKPT_COMPLETE);

	/*
	 * Release the lock right away, not at commit time.
	 *
	 * It would be a problem to release the lock prior to commit if this
	 * truncate operation sends any transactional invalidation messages. Other
	 * backends would potentially be able to lock the relation without
	 * processing them in the window of time between when we release the lock
	 * here and when we sent the messages at our eventual commit. However,
	 * we're currently only sending a non-transactional smgr invalidation,
	 * which will have been posted to shared memory immediately from within
	 * smgr_truncate. Therefore, there should be no race here.
	 *
	 * The reason why it's desirable to release the lock early here is because
	 * of the possibility that someone will need to use this to blow away many
	 * visibility map forks at once. If we can't release the lock until
	 * commit time, the transaction doing this will accumulate
	 * AccessExclusiveLocks on all of those relations at the same time, which
	 * is undesirable. However, if this turns out to be unsafe we may have no
	 * choice...
	 */
	relation_close(rel, AccessExclusiveLock);

	/* Nothing to return. */
	PG_RETURN_VOID();
}
465 :
466 : /*
467 : * Helper function to construct whichever TupleDesc we need for a particular
468 : * call.
469 : */
470 : static TupleDesc
471 36 : pg_visibility_tupdesc(bool include_blkno, bool include_pd)
472 : {
473 : TupleDesc tupdesc;
474 36 : AttrNumber maxattr = 2;
475 36 : AttrNumber a = 0;
476 :
477 36 : if (include_blkno)
478 34 : ++maxattr;
479 36 : if (include_pd)
480 14 : ++maxattr;
481 36 : tupdesc = CreateTemplateTupleDesc(maxattr);
482 36 : if (include_blkno)
483 34 : TupleDescInitEntry(tupdesc, ++a, "blkno", INT8OID, -1, 0);
484 36 : TupleDescInitEntry(tupdesc, ++a, "all_visible", BOOLOID, -1, 0);
485 36 : TupleDescInitEntry(tupdesc, ++a, "all_frozen", BOOLOID, -1, 0);
486 36 : if (include_pd)
487 14 : TupleDescInitEntry(tupdesc, ++a, "pd_all_visible", BOOLOID, -1, 0);
488 : Assert(a == maxattr);
489 :
490 36 : return BlessTupleDesc(tupdesc);
491 : }
492 :
/*
 * Collect visibility data about a relation.
 *
 * Returns a palloc'd vbits struct with one flag byte per heap block; bit 0
 * is the VM all-visible bit, bit 1 the VM all-frozen bit, and (only when
 * include_pd is true) bit 2 the page-level PD_ALL_VISIBLE flag.
 *
 * Checks relkind of relid and will throw an error if the relation does not
 * have a VM.
 */
static vbits *
collect_visibility_data(Oid relid, bool include_pd)
{
	Relation	rel;
	BlockNumber nblocks;
	vbits	   *info;
	BlockNumber blkno;
	Buffer		vmbuffer = InvalidBuffer;
	BufferAccessStrategy bstrategy = GetAccessStrategy(BAS_BULKREAD);
	BlockRangeReadStreamPrivate p;
	ReadStream *stream = NULL;

	rel = relation_open(relid, AccessShareLock);

	/* Only some relkinds have a visibility map */
	check_relation_relkind(rel);

	nblocks = RelationGetNumberOfBlocks(rel);

	/* One flag byte per block; palloc0 leaves every flag clear. */
	info = palloc0(offsetof(vbits, bits) + nblocks);
	info->next = 0;
	info->count = nblocks;

	/* Create a stream if reading main fork. */
	if (include_pd)
	{
		p.current_blocknum = 0;
		p.last_exclusive = nblocks;
		stream = read_stream_begin_relation(READ_STREAM_FULL,
											bstrategy,
											rel,
											MAIN_FORKNUM,
											block_range_read_stream_cb,
											&p,
											0);
	}

	for (blkno = 0; blkno < nblocks; ++blkno)
	{
		int32		mapbits;

		/* Make sure we are interruptible. */
		CHECK_FOR_INTERRUPTS();

		/* Get map info. */
		mapbits = (int32) visibilitymap_get_status(rel, blkno, &vmbuffer);
		if ((mapbits & VISIBILITYMAP_ALL_VISIBLE) != 0)
			info->bits[blkno] |= (1 << 0);
		if ((mapbits & VISIBILITYMAP_ALL_FROZEN) != 0)
			info->bits[blkno] |= (1 << 1);

		/*
		 * Page-level data requires reading every block, so only get it if the
		 * caller needs it. Use a buffer access strategy, too, to prevent
		 * cache-trashing.
		 */
		if (include_pd)
		{
			Buffer		buffer;
			Page		page;

			/* The stream yields blocks in order, so this buffer is blkno. */
			buffer = read_stream_next_buffer(stream, NULL);
			LockBuffer(buffer, BUFFER_LOCK_SHARE);

			page = BufferGetPage(buffer);
			if (PageIsAllVisible(page))
				info->bits[blkno] |= (1 << 2);

			UnlockReleaseBuffer(buffer);
		}
	}

	if (include_pd)
	{
		/* We consumed exactly nblocks buffers, so the stream must be done. */
		Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer);
		read_stream_end(stream);
	}

	/* Clean up. */
	if (vmbuffer != InvalidBuffer)
		ReleaseBuffer(vmbuffer);
	relation_close(rel, AccessShareLock);

	return info;
}
583 :
/*
 * The "strict" version of GetOldestNonRemovableTransactionId(). The
 * pg_visibility check can tolerate false positives (don't report some of the
 * errors), but can't tolerate false negatives (report false errors). Normally,
 * horizons move forwards, but there are cases when it could move backward
 * (see comment for ComputeXidHorizons()).
 *
 * This is why we have to implement our own function for xid horizon, which
 * would be guaranteed to be newer or equal to any xid horizon computed before.
 * We have to do the following to achieve this.
 *
 * 1. Ignore processes xmin's, because they consider connection to other
 * databases that were ignored before.
 * 2. Ignore KnownAssignedXids, as they are not database-aware. Although we
 * now perform minimal checking on a standby by always using nextXid, this
 * approach is better than nothing and will at least catch extremely broken
 * cases where a xid is in the future.
 * 3. Ignore walsender xmin, because it could go backward if some replication
 * connections don't use replication slots.
 *
 * While it might seem like we could use KnownAssignedXids for shared
 * catalogs, since shared catalogs rely on a global horizon rather than a
 * database-specific one - there are potential edge cases. For example, a
 * transaction may crash on the primary without writing a commit/abort record.
 * This would lead to a situation where it appears to still be running on the
 * standby, even though it has already ended on the primary. For this reason,
 * it's safer to ignore KnownAssignedXids, even for shared catalogs.
 *
 * As a result, we're using only currently running xids to compute the horizon.
 * Surely these would significantly sacrifice accuracy. But we have to do so
 * to avoid reporting false errors.
 */
static TransactionId
GetStrictOldestNonRemovableTransactionId(Relation rel)
{
	RunningTransactions runningTransactions;

	if (RecoveryInProgress())
	{
		TransactionId result;

		/* As we ignore KnownAssignedXids on standby, just pick nextXid */
		LWLockAcquire(XidGenLock, LW_SHARED);
		result = XidFromFullTransactionId(TransamVariables->nextXid);
		LWLockRelease(XidGenLock);
		return result;
	}
	else if (rel == NULL || rel->rd_rel->relisshared)
	{
		/* Shared relation: take into account all running xids */
		runningTransactions = GetRunningTransactionData();
		/* GetRunningTransactionData() returns with these two locks held */
		LWLockRelease(ProcArrayLock);
		LWLockRelease(XidGenLock);
		return runningTransactions->oldestRunningXid;
	}
	else if (!RELATION_IS_LOCAL(rel))
	{
		/*
		 * Normal relation: take into account xids running within the current
		 * database
		 */
		runningTransactions = GetRunningTransactionData();
		/* GetRunningTransactionData() returns with these two locks held */
		LWLockRelease(ProcArrayLock);
		LWLockRelease(XidGenLock);
		return runningTransactions->oldestDatabaseRunningXid;
	}
	else
	{
		/*
		 * For temporary relations, ComputeXidHorizons() uses only
		 * TransamVariables->latestCompletedXid and MyProc->xid. These two
		 * shouldn't go backwards. So we're fine with this horizon.
		 */
		return GetOldestNonRemovableTransactionId(rel);
	}
}
660 :
661 : /*
662 : * Callback function to get next block for read stream object used in
663 : * collect_corrupt_items() function.
664 : */
665 : static BlockNumber
666 206 : collect_corrupt_items_read_stream_next_block(ReadStream *stream,
667 : void *callback_private_data,
668 : void *per_buffer_data)
669 : {
670 206 : struct collect_corrupt_items_read_stream_private *p = callback_private_data;
671 :
672 218 : for (; p->current_blocknum < p->last_exclusive; p->current_blocknum++)
673 : {
674 202 : bool check_frozen = false;
675 202 : bool check_visible = false;
676 :
677 : /* Make sure we are interruptible. */
678 202 : CHECK_FOR_INTERRUPTS();
679 :
680 202 : if (p->all_frozen && VM_ALL_FROZEN(p->rel, p->current_blocknum, &p->vmbuffer))
681 98 : check_frozen = true;
682 202 : if (p->all_visible && VM_ALL_VISIBLE(p->rel, p->current_blocknum, &p->vmbuffer))
683 92 : check_visible = true;
684 202 : if (!check_visible && !check_frozen)
685 12 : continue;
686 :
687 190 : return p->current_blocknum++;
688 : }
689 :
690 16 : return InvalidBlockNumber;
691 : }
692 :
/*
 * Returns a list of items whose visibility map information does not match
 * the status of the tuples on the page.
 *
 * If all_visible is passed as true, this will include all items which are
 * on pages marked as all-visible in the visibility map but which do not
 * seem to in fact be all-visible.
 *
 * If all_frozen is passed as true, this will include all items which are
 * on pages marked as all-frozen but which do not seem to in fact be frozen.
 *
 * Checks relkind of relid and will throw an error if the relation does not
 * have a VM.
 */
static corrupt_items *
collect_corrupt_items(Oid relid, bool all_visible, bool all_frozen)
{
	Relation	rel;
	corrupt_items *items;
	Buffer		vmbuffer = InvalidBuffer;	/* VM pin for our own rechecks */
	BufferAccessStrategy bstrategy = GetAccessStrategy(BAS_BULKREAD);
	TransactionId OldestXmin = InvalidTransactionId;
	struct collect_corrupt_items_read_stream_private p;
	ReadStream *stream;
	Buffer		buffer;

	rel = relation_open(relid, AccessShareLock);

	/* Only some relkinds have a visibility map */
	check_relation_relkind(rel);

	/* Only the all-visible check needs a horizon to judge tuples against. */
	if (all_visible)
		OldestXmin = GetStrictOldestNonRemovableTransactionId(rel);

	/*
	 * Guess an initial array size. We don't expect many corrupted tuples, so
	 * start with a small array. This function uses the "next" field to track
	 * the next offset where we can store an item (which is the same thing as
	 * the number of items found so far) and the "count" field to track the
	 * number of entries allocated. We'll repurpose these fields before
	 * returning.
	 */
	items = palloc0(sizeof(corrupt_items));
	items->next = 0;
	items->count = 64;
	items->tids = palloc(items->count * sizeof(ItemPointerData));

	/*
	 * Set up a read stream whose callback skips blocks with no relevant VM
	 * bits set; only potentially-interesting heap pages are actually read.
	 * The callback keeps its own VM pin in p.vmbuffer.
	 */
	p.current_blocknum = 0;
	p.last_exclusive = RelationGetNumberOfBlocks(rel);
	p.rel = rel;
	p.vmbuffer = InvalidBuffer;
	p.all_frozen = all_frozen;
	p.all_visible = all_visible;
	stream = read_stream_begin_relation(READ_STREAM_FULL,
										bstrategy,
										rel,
										MAIN_FORKNUM,
										collect_corrupt_items_read_stream_next_block,
										&p,
										0);

	/* Loop over every block in the relation. */
	while ((buffer = read_stream_next_buffer(stream, NULL)) != InvalidBuffer)
	{
		bool		check_frozen = all_frozen;
		bool		check_visible = all_visible;
		Page		page;
		OffsetNumber offnum,
					maxoff;
		BlockNumber blkno;

		/* Make sure we are interruptible. */
		CHECK_FOR_INTERRUPTS();

		LockBuffer(buffer, BUFFER_LOCK_SHARE);

		page = BufferGetPage(buffer);
		maxoff = PageGetMaxOffsetNumber(page);
		blkno = BufferGetBlockNumber(buffer);

		/*
		 * The visibility map bits might have changed while we were acquiring
		 * the page lock. Recheck to avoid returning spurious results.
		 */
		if (check_frozen && !VM_ALL_FROZEN(rel, blkno, &vmbuffer))
			check_frozen = false;
		if (check_visible && !VM_ALL_VISIBLE(rel, blkno, &vmbuffer))
			check_visible = false;
		if (!check_visible && !check_frozen)
		{
			UnlockReleaseBuffer(buffer);
			continue;
		}

		/* Iterate over each tuple on the page. */
		for (offnum = FirstOffsetNumber;
			 offnum <= maxoff;
			 offnum = OffsetNumberNext(offnum))
		{
			HeapTupleData tuple;
			ItemId		itemid;

			itemid = PageGetItemId(page, offnum);

			/* Unused or redirect line pointers are of no interest. */
			if (!ItemIdIsUsed(itemid) || ItemIdIsRedirected(itemid))
				continue;

			/* Dead line pointers are neither all-visible nor frozen. */
			if (ItemIdIsDead(itemid))
			{
				ItemPointerSet(&(tuple.t_self), blkno, offnum);
				record_corrupt_item(items, &tuple.t_self);
				continue;
			}

			/* Initialize a HeapTupleData structure for checks below. */
			ItemPointerSet(&(tuple.t_self), blkno, offnum);
			tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
			tuple.t_len = ItemIdGetLength(itemid);
			tuple.t_tableOid = relid;

			/*
			 * If we're checking whether the page is all-visible, we expect
			 * the tuple to be all-visible.
			 */
			if (check_visible &&
				!tuple_all_visible(&tuple, OldestXmin, buffer))
			{
				TransactionId RecomputedOldestXmin;

				/*
				 * Time has passed since we computed OldestXmin, so it's
				 * possible that this tuple is all-visible in reality even
				 * though it doesn't appear so based on our
				 * previously-computed value. Let's compute a new value so we
				 * can be certain whether there is a problem.
				 *
				 * From a concurrency point of view, it sort of sucks to
				 * retake ProcArrayLock here while we're holding the buffer
				 * exclusively locked, but it should be safe against
				 * deadlocks, because surely
				 * GetStrictOldestNonRemovableTransactionId() should never
				 * take a buffer lock. And this shouldn't happen often, so
				 * it's worth being careful so as to avoid false positives.
				 */
				RecomputedOldestXmin = GetStrictOldestNonRemovableTransactionId(rel);

				if (!TransactionIdPrecedes(OldestXmin, RecomputedOldestXmin))
					record_corrupt_item(items, &tuple.t_self);
				else
				{
					/* Horizon advanced: retry with the fresher value. */
					OldestXmin = RecomputedOldestXmin;
					if (!tuple_all_visible(&tuple, OldestXmin, buffer))
						record_corrupt_item(items, &tuple.t_self);
				}
			}

			/*
			 * If we're checking whether the page is all-frozen, we expect the
			 * tuple to be in a state where it will never need freezing.
			 */
			if (check_frozen)
			{
				if (heap_tuple_needs_eventual_freeze(tuple.t_data))
					record_corrupt_item(items, &tuple.t_self);
			}
		}

		UnlockReleaseBuffer(buffer);
	}
	read_stream_end(stream);

	/* Clean up: both our VM pin and the stream callback's VM pin. */
	if (vmbuffer != InvalidBuffer)
		ReleaseBuffer(vmbuffer);
	if (p.vmbuffer != InvalidBuffer)
		ReleaseBuffer(p.vmbuffer);
	relation_close(rel, AccessShareLock);

	/*
	 * Before returning, repurpose the fields to match caller's expectations.
	 * next is now the next item that should be read (rather than written) and
	 * count is now the number of items we wrote (rather than the number we
	 * allocated).
	 */
	items->count = items->next;
	items->next = 0;

	return items;
}
884 :
885 : /*
886 : * Remember one corrupt item.
887 : */
888 : static void
889 20 : record_corrupt_item(corrupt_items *items, ItemPointer tid)
890 : {
891 : /* enlarge output array if needed. */
892 20 : if (items->next >= items->count)
893 : {
894 0 : items->count *= 2;
895 0 : items->tids = repalloc(items->tids,
896 0 : items->count * sizeof(ItemPointerData));
897 : }
898 : /* and add the new item */
899 20 : items->tids[items->next++] = *tid;
900 20 : }
901 :
902 : /*
903 : * Check whether a tuple is all-visible relative to a given OldestXmin value.
904 : * The buffer should contain the tuple and should be locked and pinned.
905 : */
906 : static bool
907 16018 : tuple_all_visible(HeapTuple tup, TransactionId OldestXmin, Buffer buffer)
908 : {
909 : HTSV_Result state;
910 : TransactionId xmin;
911 :
912 16018 : state = HeapTupleSatisfiesVacuum(tup, OldestXmin, buffer);
913 16018 : if (state != HEAPTUPLE_LIVE)
914 10 : return false; /* all-visible implies live */
915 :
916 : /*
917 : * Neither lazy_scan_heap nor heap_page_is_all_visible will mark a page
918 : * all-visible unless every tuple is hinted committed. However, those hint
919 : * bits could be lost after a crash, so we can't be certain that they'll
920 : * be set here. So just check the xmin.
921 : */
922 :
923 16008 : xmin = HeapTupleHeaderGetXmin(tup->t_data);
924 16008 : if (!TransactionIdPrecedes(xmin, OldestXmin))
925 0 : return false; /* xmin not old enough for all to see */
926 :
927 16008 : return true;
928 : }
929 :
930 : /*
931 : * check_relation_relkind - convenience routine to check that relation
932 : * is of the relkind supported by the callers
933 : */
934 : static void
935 92 : check_relation_relkind(Relation rel)
936 : {
937 92 : if (!RELKIND_HAS_TABLE_AM(rel->rd_rel->relkind))
938 50 : ereport(ERROR,
939 : (errcode(ERRCODE_WRONG_OBJECT_TYPE),
940 : errmsg("relation \"%s\" is of wrong relation kind",
941 : RelationGetRelationName(rel)),
942 : errdetail_relkind_not_supported(rel->rd_rel->relkind)));
943 42 : }
|