Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * pg_visibility.c
4 : * display visibility map information and page-level visibility bits
5 : *
6 : * Copyright (c) 2016-2024, PostgreSQL Global Development Group
7 : *
8 : * contrib/pg_visibility/pg_visibility.c
9 : *-------------------------------------------------------------------------
10 : */
11 : #include "postgres.h"
12 :
13 : #include "access/heapam.h"
14 : #include "access/htup_details.h"
15 : #include "access/visibilitymap.h"
16 : #include "access/xloginsert.h"
17 : #include "catalog/pg_type.h"
18 : #include "catalog/storage_xlog.h"
19 : #include "funcapi.h"
20 : #include "miscadmin.h"
21 : #include "storage/bufmgr.h"
22 : #include "storage/procarray.h"
23 : #include "storage/read_stream.h"
24 : #include "storage/smgr.h"
25 : #include "utils/rel.h"
26 :
27 14 : PG_MODULE_MAGIC;
28 :
/*
 * Per-relation visibility snapshot built by collect_visibility_data().
 *
 * One flag byte per heap block: bit 0 = VM all-visible, bit 1 = VM
 * all-frozen, bit 2 = page-level PD_ALL_VISIBLE (only populated when
 * page data was requested).
 */
typedef struct vbits
{
	BlockNumber next;			/* next block to emit from the SRF */
	BlockNumber count;			/* number of blocks stored in bits[] */
	uint8		bits[FLEXIBLE_ARRAY_MEMBER];	/* per-block flag bytes */
} vbits;
35 :
/*
 * Result of collect_corrupt_items(): TIDs of tuples that contradict their
 * visibility-map bits.  While collecting, "next" is the write offset and
 * "count" the allocated capacity; before the array is handed to the SRF
 * the fields are repurposed so "next" is the read cursor and "count" the
 * number of items actually stored.
 */
typedef struct corrupt_items
{
	BlockNumber next;
	BlockNumber count;
	ItemPointer tids;
} corrupt_items;
42 :
/*
 * Callback state for collect_corrupt_items_read_stream_next_block.
 *
 * The callback scans [current_blocknum, last_exclusive) and hands the read
 * stream only blocks whose VM bits match the kind of check requested.
 */
struct collect_corrupt_items_read_stream_private
{
	bool		all_frozen;		/* consider blocks marked all-frozen? */
	bool		all_visible;	/* consider blocks marked all-visible? */
	BlockNumber current_blocknum;	/* next block to examine */
	BlockNumber last_exclusive; /* scan stops before this block */
	Relation	rel;
	Buffer		vmbuffer;		/* pinned VM page, released by caller */
};
53 :
54 6 : PG_FUNCTION_INFO_V1(pg_visibility_map);
55 8 : PG_FUNCTION_INFO_V1(pg_visibility_map_rel);
56 8 : PG_FUNCTION_INFO_V1(pg_visibility);
57 8 : PG_FUNCTION_INFO_V1(pg_visibility_rel);
58 8 : PG_FUNCTION_INFO_V1(pg_visibility_map_summary);
59 10 : PG_FUNCTION_INFO_V1(pg_check_frozen);
60 12 : PG_FUNCTION_INFO_V1(pg_check_visible);
61 8 : PG_FUNCTION_INFO_V1(pg_truncate_visibility_map);
62 :
63 : static TupleDesc pg_visibility_tupdesc(bool include_blkno, bool include_pd);
64 : static vbits *collect_visibility_data(Oid relid, bool include_pd);
65 : static corrupt_items *collect_corrupt_items(Oid relid, bool all_visible,
66 : bool all_frozen);
67 : static void record_corrupt_item(corrupt_items *items, ItemPointer tid);
68 : static bool tuple_all_visible(HeapTuple tup, TransactionId OldestXmin,
69 : Buffer buffer);
70 : static void check_relation_relkind(Relation rel);
71 :
72 : /*
73 : * Visibility map information for a single block of a relation.
74 : *
75 : * Note: the VM code will silently return zeroes for pages past the end
76 : * of the map, so we allow probes up to MaxBlockNumber regardless of the
77 : * actual relation size.
78 : */
79 : Datum
80 0 : pg_visibility_map(PG_FUNCTION_ARGS)
81 : {
82 0 : Oid relid = PG_GETARG_OID(0);
83 0 : int64 blkno = PG_GETARG_INT64(1);
84 : int32 mapbits;
85 : Relation rel;
86 0 : Buffer vmbuffer = InvalidBuffer;
87 : TupleDesc tupdesc;
88 : Datum values[2];
89 0 : bool nulls[2] = {0};
90 :
91 0 : rel = relation_open(relid, AccessShareLock);
92 :
93 : /* Only some relkinds have a visibility map */
94 0 : check_relation_relkind(rel);
95 :
96 0 : if (blkno < 0 || blkno > MaxBlockNumber)
97 0 : ereport(ERROR,
98 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
99 : errmsg("invalid block number")));
100 :
101 0 : tupdesc = pg_visibility_tupdesc(false, false);
102 :
103 0 : mapbits = (int32) visibilitymap_get_status(rel, blkno, &vmbuffer);
104 0 : if (vmbuffer != InvalidBuffer)
105 0 : ReleaseBuffer(vmbuffer);
106 0 : values[0] = BoolGetDatum((mapbits & VISIBILITYMAP_ALL_VISIBLE) != 0);
107 0 : values[1] = BoolGetDatum((mapbits & VISIBILITYMAP_ALL_FROZEN) != 0);
108 :
109 0 : relation_close(rel, AccessShareLock);
110 :
111 0 : PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls)));
112 : }
113 :
114 : /*
115 : * Visibility map information for a single block of a relation, plus the
116 : * page-level information for the same block.
117 : */
118 : Datum
119 12 : pg_visibility(PG_FUNCTION_ARGS)
120 : {
121 12 : Oid relid = PG_GETARG_OID(0);
122 12 : int64 blkno = PG_GETARG_INT64(1);
123 : int32 mapbits;
124 : Relation rel;
125 12 : Buffer vmbuffer = InvalidBuffer;
126 : Buffer buffer;
127 : Page page;
128 : TupleDesc tupdesc;
129 : Datum values[3];
130 12 : bool nulls[3] = {0};
131 :
132 12 : rel = relation_open(relid, AccessShareLock);
133 :
134 : /* Only some relkinds have a visibility map */
135 12 : check_relation_relkind(rel);
136 :
137 2 : if (blkno < 0 || blkno > MaxBlockNumber)
138 0 : ereport(ERROR,
139 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
140 : errmsg("invalid block number")));
141 :
142 2 : tupdesc = pg_visibility_tupdesc(false, true);
143 :
144 2 : mapbits = (int32) visibilitymap_get_status(rel, blkno, &vmbuffer);
145 2 : if (vmbuffer != InvalidBuffer)
146 2 : ReleaseBuffer(vmbuffer);
147 2 : values[0] = BoolGetDatum((mapbits & VISIBILITYMAP_ALL_VISIBLE) != 0);
148 2 : values[1] = BoolGetDatum((mapbits & VISIBILITYMAP_ALL_FROZEN) != 0);
149 :
150 : /* Here we have to explicitly check rel size ... */
151 2 : if (blkno < RelationGetNumberOfBlocks(rel))
152 : {
153 2 : buffer = ReadBuffer(rel, blkno);
154 2 : LockBuffer(buffer, BUFFER_LOCK_SHARE);
155 :
156 2 : page = BufferGetPage(buffer);
157 2 : values[2] = BoolGetDatum(PageIsAllVisible(page));
158 :
159 2 : UnlockReleaseBuffer(buffer);
160 : }
161 : else
162 : {
163 : /* As with the vismap, silently return 0 for pages past EOF */
164 0 : values[2] = BoolGetDatum(false);
165 : }
166 :
167 2 : relation_close(rel, AccessShareLock);
168 :
169 2 : PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls)));
170 : }
171 :
/*
 * Visibility map information for every block in a relation.
 *
 * Set-returning function: emits one (blkno, all_visible, all_frozen) row
 * per heap block.  All data is gathered up front on the first call and
 * doled out from the cached vbits array on subsequent calls.
 */
Datum
pg_visibility_map_rel(PG_FUNCTION_ARGS)
{
	FuncCallContext *funcctx;
	vbits	   *info;

	if (SRF_IS_FIRSTCALL())
	{
		Oid			relid = PG_GETARG_OID(0);
		MemoryContext oldcontext;

		funcctx = SRF_FIRSTCALL_INIT();
		/* Cross-call state must live in the multi-call memory context. */
		oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
		funcctx->tuple_desc = pg_visibility_tupdesc(true, false);
		/* collect_visibility_data will verify the relkind */
		funcctx->user_fctx = collect_visibility_data(relid, false);
		MemoryContextSwitchTo(oldcontext);
	}

	funcctx = SRF_PERCALL_SETUP();
	info = (vbits *) funcctx->user_fctx;

	if (info->next < info->count)
	{
		Datum		values[3];
		bool		nulls[3] = {0};
		HeapTuple	tuple;

		/* Bit 0 = all-visible, bit 1 = all-frozen (see collect_visibility_data) */
		values[0] = Int64GetDatum(info->next);
		values[1] = BoolGetDatum((info->bits[info->next] & (1 << 0)) != 0);
		values[2] = BoolGetDatum((info->bits[info->next] & (1 << 1)) != 0);
		info->next++;

		tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls);
		SRF_RETURN_NEXT(funcctx, HeapTupleGetDatum(tuple));
	}

	SRF_RETURN_DONE(funcctx);
}
214 :
/*
 * Visibility map information for every block in a relation, plus the page
 * level information for each block.
 *
 * Same SRF shape as pg_visibility_map_rel(), but the first call also reads
 * every heap page so the PD_ALL_VISIBLE bit can be reported as a fourth
 * column.
 */
Datum
pg_visibility_rel(PG_FUNCTION_ARGS)
{
	FuncCallContext *funcctx;
	vbits	   *info;

	if (SRF_IS_FIRSTCALL())
	{
		Oid			relid = PG_GETARG_OID(0);
		MemoryContext oldcontext;

		funcctx = SRF_FIRSTCALL_INIT();
		/* Cross-call state must live in the multi-call memory context. */
		oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
		funcctx->tuple_desc = pg_visibility_tupdesc(true, true);
		/* collect_visibility_data will verify the relkind */
		funcctx->user_fctx = collect_visibility_data(relid, true);
		MemoryContextSwitchTo(oldcontext);
	}

	funcctx = SRF_PERCALL_SETUP();
	info = (vbits *) funcctx->user_fctx;

	if (info->next < info->count)
	{
		Datum		values[4];
		bool		nulls[4] = {0};
		HeapTuple	tuple;

		/* Bits: 0 = all-visible, 1 = all-frozen, 2 = PD_ALL_VISIBLE */
		values[0] = Int64GetDatum(info->next);
		values[1] = BoolGetDatum((info->bits[info->next] & (1 << 0)) != 0);
		values[2] = BoolGetDatum((info->bits[info->next] & (1 << 1)) != 0);
		values[3] = BoolGetDatum((info->bits[info->next] & (1 << 2)) != 0);
		info->next++;

		tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls);
		SRF_RETURN_NEXT(funcctx, HeapTupleGetDatum(tuple));
	}

	SRF_RETURN_DONE(funcctx);
}
259 :
260 : /*
261 : * Count the number of all-visible and all-frozen pages in the visibility
262 : * map for a particular relation.
263 : */
264 : Datum
265 12 : pg_visibility_map_summary(PG_FUNCTION_ARGS)
266 : {
267 12 : Oid relid = PG_GETARG_OID(0);
268 : Relation rel;
269 : BlockNumber nblocks;
270 : BlockNumber blkno;
271 12 : Buffer vmbuffer = InvalidBuffer;
272 12 : int64 all_visible = 0;
273 12 : int64 all_frozen = 0;
274 : TupleDesc tupdesc;
275 : Datum values[2];
276 12 : bool nulls[2] = {0};
277 :
278 12 : rel = relation_open(relid, AccessShareLock);
279 :
280 : /* Only some relkinds have a visibility map */
281 12 : check_relation_relkind(rel);
282 :
283 2 : nblocks = RelationGetNumberOfBlocks(rel);
284 :
285 4 : for (blkno = 0; blkno < nblocks; ++blkno)
286 : {
287 : int32 mapbits;
288 :
289 : /* Make sure we are interruptible. */
290 2 : CHECK_FOR_INTERRUPTS();
291 :
292 : /* Get map info. */
293 2 : mapbits = (int32) visibilitymap_get_status(rel, blkno, &vmbuffer);
294 2 : if ((mapbits & VISIBILITYMAP_ALL_VISIBLE) != 0)
295 2 : ++all_visible;
296 2 : if ((mapbits & VISIBILITYMAP_ALL_FROZEN) != 0)
297 0 : ++all_frozen;
298 : }
299 :
300 : /* Clean up. */
301 2 : if (vmbuffer != InvalidBuffer)
302 2 : ReleaseBuffer(vmbuffer);
303 2 : relation_close(rel, AccessShareLock);
304 :
305 2 : if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
306 0 : elog(ERROR, "return type must be a row type");
307 :
308 2 : values[0] = Int64GetDatum(all_visible);
309 2 : values[1] = Int64GetDatum(all_frozen);
310 :
311 2 : PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls)));
312 : }
313 :
/*
 * Return the TIDs of non-frozen tuples present in pages marked all-frozen
 * in the visibility map. We hope no one will ever find any, but there could
 * be bugs, database corruption, etc.
 *
 * Set-returning function: all corrupt TIDs are gathered on the first call
 * by collect_corrupt_items() and then returned one per call.
 */
Datum
pg_check_frozen(PG_FUNCTION_ARGS)
{
	FuncCallContext *funcctx;
	corrupt_items *items;

	if (SRF_IS_FIRSTCALL())
	{
		Oid			relid = PG_GETARG_OID(0);
		MemoryContext oldcontext;

		funcctx = SRF_FIRSTCALL_INIT();
		/* Result array must survive across calls. */
		oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
		/* collect_corrupt_items will verify the relkind */
		funcctx->user_fctx = collect_corrupt_items(relid, false, true);
		MemoryContextSwitchTo(oldcontext);
	}

	funcctx = SRF_PERCALL_SETUP();
	items = (corrupt_items *) funcctx->user_fctx;

	/* tid is a by-reference type, so we can return array elements directly */
	if (items->next < items->count)
		SRF_RETURN_NEXT(funcctx, PointerGetDatum(&items->tids[items->next++]));

	SRF_RETURN_DONE(funcctx);
}
345 :
/*
 * Return the TIDs of not-all-visible tuples in pages marked all-visible
 * in the visibility map. We hope no one will ever find any, but there could
 * be bugs, database corruption, etc.
 *
 * Set-returning function: all corrupt TIDs are gathered on the first call
 * by collect_corrupt_items() and then returned one per call.
 */
Datum
pg_check_visible(PG_FUNCTION_ARGS)
{
	FuncCallContext *funcctx;
	corrupt_items *items;

	if (SRF_IS_FIRSTCALL())
	{
		Oid			relid = PG_GETARG_OID(0);
		MemoryContext oldcontext;

		funcctx = SRF_FIRSTCALL_INIT();
		/* Result array must survive across calls. */
		oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
		/* collect_corrupt_items will verify the relkind */
		funcctx->user_fctx = collect_corrupt_items(relid, true, false);
		MemoryContextSwitchTo(oldcontext);
	}

	funcctx = SRF_PERCALL_SETUP();
	items = (corrupt_items *) funcctx->user_fctx;

	/* tid is a by-reference type, so we can return array elements directly */
	if (items->next < items->count)
		SRF_RETURN_NEXT(funcctx, PointerGetDatum(&items->tids[items->next++]));

	SRF_RETURN_DONE(funcctx);
}
377 :
/*
 * Remove the visibility map fork for a relation.  If there turn out to be
 * any bugs in the visibility map code that require rebuilding the VM, this
 * provides users with a way to do it that is cleaner than shutting down the
 * server and removing files by hand.
 *
 * This is a cut-down version of RelationTruncate.
 */
Datum
pg_truncate_visibility_map(PG_FUNCTION_ARGS)
{
	Oid			relid = PG_GETARG_OID(0);
	Relation	rel;
	ForkNumber	fork;
	BlockNumber block;

	/* AccessExclusiveLock: nobody else may use the VM while we zap it */
	rel = relation_open(relid, AccessExclusiveLock);

	/* Only some relkinds have a visibility map */
	check_relation_relkind(rel);

	/* Forcibly reset cached file size */
	RelationGetSmgr(rel)->smgr_cached_nblocks[VISIBILITYMAP_FORKNUM] = InvalidBlockNumber;

	/* Truncate the VM fork to zero blocks at the storage-manager level. */
	block = visibilitymap_prepare_truncate(rel, 0);
	if (BlockNumberIsValid(block))
	{
		fork = VISIBILITYMAP_FORKNUM;
		smgrtruncate(RelationGetSmgr(rel), &fork, 1, &block);
	}

	/* WAL-log the truncation so it is redone after a crash or on standbys. */
	if (RelationNeedsWAL(rel))
	{
		xl_smgr_truncate xlrec;

		xlrec.blkno = 0;
		xlrec.rlocator = rel->rd_locator;
		xlrec.flags = SMGR_TRUNCATE_VM;

		XLogBeginInsert();
		XLogRegisterData((char *) &xlrec, sizeof(xlrec));

		XLogInsert(RM_SMGR_ID, XLOG_SMGR_TRUNCATE | XLR_SPECIAL_REL_UPDATE);
	}

	/*
	 * Release the lock right away, not at commit time.
	 *
	 * It would be a problem to release the lock prior to commit if this
	 * truncate operation sends any transactional invalidation messages. Other
	 * backends would potentially be able to lock the relation without
	 * processing them in the window of time between when we release the lock
	 * here and when we sent the messages at our eventual commit. However,
	 * we're currently only sending a non-transactional smgr invalidation,
	 * which will have been posted to shared memory immediately from within
	 * smgr_truncate. Therefore, there should be no race here.
	 *
	 * The reason why it's desirable to release the lock early here is because
	 * of the possibility that someone will need to use this to blow away many
	 * visibility map forks at once. If we can't release the lock until
	 * commit time, the transaction doing this will accumulate
	 * AccessExclusiveLocks on all of those relations at the same time, which
	 * is undesirable. However, if this turns out to be unsafe we may have no
	 * choice...
	 */
	relation_close(rel, AccessExclusiveLock);

	/* Nothing to return. */
	PG_RETURN_VOID();
}
448 :
449 : /*
450 : * Helper function to construct whichever TupleDesc we need for a particular
451 : * call.
452 : */
453 : static TupleDesc
454 36 : pg_visibility_tupdesc(bool include_blkno, bool include_pd)
455 : {
456 : TupleDesc tupdesc;
457 36 : AttrNumber maxattr = 2;
458 36 : AttrNumber a = 0;
459 :
460 36 : if (include_blkno)
461 34 : ++maxattr;
462 36 : if (include_pd)
463 14 : ++maxattr;
464 36 : tupdesc = CreateTemplateTupleDesc(maxattr);
465 36 : if (include_blkno)
466 34 : TupleDescInitEntry(tupdesc, ++a, "blkno", INT8OID, -1, 0);
467 36 : TupleDescInitEntry(tupdesc, ++a, "all_visible", BOOLOID, -1, 0);
468 36 : TupleDescInitEntry(tupdesc, ++a, "all_frozen", BOOLOID, -1, 0);
469 36 : if (include_pd)
470 14 : TupleDescInitEntry(tupdesc, ++a, "pd_all_visible", BOOLOID, -1, 0);
471 : Assert(a == maxattr);
472 :
473 36 : return BlessTupleDesc(tupdesc);
474 : }
475 :
/*
 * Collect visibility data about a relation.
 *
 * Returns a palloc'd vbits array with one flag byte per block: bit 0 = VM
 * all-visible, bit 1 = VM all-frozen, and (only when include_pd is true)
 * bit 2 = page-level PD_ALL_VISIBLE.
 *
 * Checks relkind of relid and will throw an error if the relation does not
 * have a VM.
 */
static vbits *
collect_visibility_data(Oid relid, bool include_pd)
{
	Relation	rel;
	BlockNumber nblocks;
	vbits	   *info;
	BlockNumber blkno;
	Buffer		vmbuffer = InvalidBuffer;
	BufferAccessStrategy bstrategy = GetAccessStrategy(BAS_BULKREAD);
	BlockRangeReadStreamPrivate p;
	ReadStream *stream = NULL;

	rel = relation_open(relid, AccessShareLock);

	/* Only some relkinds have a visibility map */
	check_relation_relkind(rel);

	nblocks = RelationGetNumberOfBlocks(rel);
	info = palloc0(offsetof(vbits, bits) + nblocks);
	info->next = 0;
	info->count = nblocks;

	/* Create a stream if reading main fork. */
	if (include_pd)
	{
		/* Stream sequentially over the whole main fork, [0, nblocks). */
		p.current_blocknum = 0;
		p.last_exclusive = nblocks;
		stream = read_stream_begin_relation(READ_STREAM_FULL,
											bstrategy,
											rel,
											MAIN_FORKNUM,
											block_range_read_stream_cb,
											&p,
											0);
	}

	for (blkno = 0; blkno < nblocks; ++blkno)
	{
		int32		mapbits;

		/* Make sure we are interruptible. */
		CHECK_FOR_INTERRUPTS();

		/* Get map info. */
		mapbits = (int32) visibilitymap_get_status(rel, blkno, &vmbuffer);
		if ((mapbits & VISIBILITYMAP_ALL_VISIBLE) != 0)
			info->bits[blkno] |= (1 << 0);
		if ((mapbits & VISIBILITYMAP_ALL_FROZEN) != 0)
			info->bits[blkno] |= (1 << 1);

		/*
		 * Page-level data requires reading every block, so only get it if the
		 * caller needs it. Use a buffer access strategy, too, to prevent
		 * cache-trashing.
		 */
		if (include_pd)
		{
			Buffer		buffer;
			Page		page;

			/* The stream delivers blocks in order, so this is block blkno. */
			buffer = read_stream_next_buffer(stream, NULL);
			LockBuffer(buffer, BUFFER_LOCK_SHARE);

			page = BufferGetPage(buffer);
			if (PageIsAllVisible(page))
				info->bits[blkno] |= (1 << 2);

			UnlockReleaseBuffer(buffer);
		}
	}

	if (include_pd)
	{
		/* The loop must have drained the stream exactly. */
		Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer);
		read_stream_end(stream);
	}

	/* Clean up. */
	if (vmbuffer != InvalidBuffer)
		ReleaseBuffer(vmbuffer);
	relation_close(rel, AccessShareLock);

	return info;
}
566 :
/*
 * The "strict" version of GetOldestNonRemovableTransactionId(). The
 * pg_visibility check can tolerate false positives (don't report some of the
 * errors), but can't tolerate false negatives (report false errors). Normally,
 * horizons move forwards, but there are cases when it could move backward
 * (see comment for ComputeXidHorizons()).
 *
 * This is why we have to implement our own function for xid horizon, which
 * would be guaranteed to be newer or equal to any xid horizon computed before.
 * We have to do the following to achieve this.
 *
 * 1. Ignore processes xmin's, because they consider connection to other
 * databases that were ignored before.
 * 2. Ignore KnownAssignedXids, as they are not database-aware. Although we
 * now perform minimal checking on a standby by always using nextXid, this
 * approach is better than nothing and will at least catch extremely broken
 * cases where a xid is in the future.
 * 3. Ignore walsender xmin, because it could go backward if some replication
 * connections don't use replication slots.
 *
 * While it might seem like we could use KnownAssignedXids for shared
 * catalogs, since shared catalogs rely on a global horizon rather than a
 * database-specific one - there are potential edge cases. For example, a
 * transaction may crash on the primary without writing a commit/abort record.
 * This would lead to a situation where it appears to still be running on the
 * standby, even though it has already ended on the primary. For this reason,
 * it's safer to ignore KnownAssignedXids, even for shared catalogs.
 *
 * As a result, we're using only currently running xids to compute the horizon.
 * Surely these would significantly sacrifice accuracy. But we have to do so
 * to avoid reporting false errors.
 */
static TransactionId
GetStrictOldestNonRemovableTransactionId(Relation rel)
{
	RunningTransactions runningTransactions;

	if (RecoveryInProgress())
	{
		TransactionId result;

		/* As we ignore KnownAssignedXids on standby, just pick nextXid */
		LWLockAcquire(XidGenLock, LW_SHARED);
		result = XidFromFullTransactionId(TransamVariables->nextXid);
		LWLockRelease(XidGenLock);
		return result;
	}
	else if (rel == NULL || rel->rd_rel->relisshared)
	{
		/* Shared relation: take into account all running xids */
		runningTransactions = GetRunningTransactionData();
		/* GetRunningTransactionData returns with both locks held; drop them */
		LWLockRelease(ProcArrayLock);
		LWLockRelease(XidGenLock);
		return runningTransactions->oldestRunningXid;
	}
	else if (!RELATION_IS_LOCAL(rel))
	{
		/*
		 * Normal relation: take into account xids running within the current
		 * database
		 */
		runningTransactions = GetRunningTransactionData();
		/* GetRunningTransactionData returns with both locks held; drop them */
		LWLockRelease(ProcArrayLock);
		LWLockRelease(XidGenLock);
		return runningTransactions->oldestDatabaseRunningXid;
	}
	else
	{
		/*
		 * For temporary relations, ComputeXidHorizons() uses only
		 * TransamVariables->latestCompletedXid and MyProc->xid. These two
		 * shouldn't go backwards. So we're fine with this horizon.
		 */
		return GetOldestNonRemovableTransactionId(rel);
	}
}
643 :
644 : /*
645 : * Callback function to get next block for read stream object used in
646 : * collect_corrupt_items() function.
647 : */
648 : static BlockNumber
649 206 : collect_corrupt_items_read_stream_next_block(ReadStream *stream,
650 : void *callback_private_data,
651 : void *per_buffer_data)
652 : {
653 206 : struct collect_corrupt_items_read_stream_private *p = callback_private_data;
654 :
655 218 : for (; p->current_blocknum < p->last_exclusive; p->current_blocknum++)
656 : {
657 202 : bool check_frozen = false;
658 202 : bool check_visible = false;
659 :
660 : /* Make sure we are interruptible. */
661 202 : CHECK_FOR_INTERRUPTS();
662 :
663 202 : if (p->all_frozen && VM_ALL_FROZEN(p->rel, p->current_blocknum, &p->vmbuffer))
664 98 : check_frozen = true;
665 202 : if (p->all_visible && VM_ALL_VISIBLE(p->rel, p->current_blocknum, &p->vmbuffer))
666 92 : check_visible = true;
667 202 : if (!check_visible && !check_frozen)
668 12 : continue;
669 :
670 190 : return p->current_blocknum++;
671 : }
672 :
673 16 : return InvalidBlockNumber;
674 : }
675 :
/*
 * Returns a list of items whose visibility map information does not match
 * the status of the tuples on the page.
 *
 * If all_visible is passed as true, this will include all items which are
 * on pages marked as all-visible in the visibility map but which do not
 * seem to in fact be all-visible.
 *
 * If all_frozen is passed as true, this will include all items which are
 * on pages marked as all-frozen but which do not seem to in fact be frozen.
 *
 * Checks relkind of relid and will throw an error if the relation does not
 * have a VM.
 */
static corrupt_items *
collect_corrupt_items(Oid relid, bool all_visible, bool all_frozen)
{
	Relation	rel;
	corrupt_items *items;
	Buffer		vmbuffer = InvalidBuffer;
	BufferAccessStrategy bstrategy = GetAccessStrategy(BAS_BULKREAD);
	TransactionId OldestXmin = InvalidTransactionId;
	struct collect_corrupt_items_read_stream_private p;
	ReadStream *stream;
	Buffer		buffer;

	rel = relation_open(relid, AccessShareLock);

	/* Only some relkinds have a visibility map */
	check_relation_relkind(rel);

	/* Only the all-visible check needs an xmin horizon to compare against. */
	if (all_visible)
		OldestXmin = GetStrictOldestNonRemovableTransactionId(rel);

	/*
	 * Guess an initial array size. We don't expect many corrupted tuples, so
	 * start with a small array. This function uses the "next" field to track
	 * the next offset where we can store an item (which is the same thing as
	 * the number of items found so far) and the "count" field to track the
	 * number of entries allocated. We'll repurpose these fields before
	 * returning.
	 */
	items = palloc0(sizeof(corrupt_items));
	items->next = 0;
	items->count = 64;
	items->tids = palloc(items->count * sizeof(ItemPointerData));

	/* Stream only blocks whose VM bits make them worth checking. */
	p.current_blocknum = 0;
	p.last_exclusive = RelationGetNumberOfBlocks(rel);
	p.rel = rel;
	p.vmbuffer = InvalidBuffer;
	p.all_frozen = all_frozen;
	p.all_visible = all_visible;
	stream = read_stream_begin_relation(READ_STREAM_FULL,
										bstrategy,
										rel,
										MAIN_FORKNUM,
										collect_corrupt_items_read_stream_next_block,
										&p,
										0);

	/* Loop over every block in the relation. */
	while ((buffer = read_stream_next_buffer(stream, NULL)) != InvalidBuffer)
	{
		bool		check_frozen = all_frozen;
		bool		check_visible = all_visible;
		Page		page;
		OffsetNumber offnum,
					maxoff;
		BlockNumber blkno;

		/* Make sure we are interruptible. */
		CHECK_FOR_INTERRUPTS();

		LockBuffer(buffer, BUFFER_LOCK_SHARE);

		page = BufferGetPage(buffer);
		maxoff = PageGetMaxOffsetNumber(page);
		blkno = BufferGetBlockNumber(buffer);

		/*
		 * The visibility map bits might have changed while we were acquiring
		 * the page lock. Recheck to avoid returning spurious results.
		 */
		if (check_frozen && !VM_ALL_FROZEN(rel, blkno, &vmbuffer))
			check_frozen = false;
		if (check_visible && !VM_ALL_VISIBLE(rel, blkno, &vmbuffer))
			check_visible = false;
		if (!check_visible && !check_frozen)
		{
			UnlockReleaseBuffer(buffer);
			continue;
		}

		/* Iterate over each tuple on the page. */
		for (offnum = FirstOffsetNumber;
			 offnum <= maxoff;
			 offnum = OffsetNumberNext(offnum))
		{
			HeapTupleData tuple;
			ItemId		itemid;

			itemid = PageGetItemId(page, offnum);

			/* Unused or redirect line pointers are of no interest. */
			if (!ItemIdIsUsed(itemid) || ItemIdIsRedirected(itemid))
				continue;

			/* Dead line pointers are neither all-visible nor frozen. */
			if (ItemIdIsDead(itemid))
			{
				ItemPointerSet(&(tuple.t_self), blkno, offnum);
				record_corrupt_item(items, &tuple.t_self);
				continue;
			}

			/* Initialize a HeapTupleData structure for checks below. */
			ItemPointerSet(&(tuple.t_self), blkno, offnum);
			tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
			tuple.t_len = ItemIdGetLength(itemid);
			tuple.t_tableOid = relid;

			/*
			 * If we're checking whether the page is all-visible, we expect
			 * the tuple to be all-visible.
			 */
			if (check_visible &&
				!tuple_all_visible(&tuple, OldestXmin, buffer))
			{
				TransactionId RecomputedOldestXmin;

				/*
				 * Time has passed since we computed OldestXmin, so it's
				 * possible that this tuple is all-visible in reality even
				 * though it doesn't appear so based on our
				 * previously-computed value. Let's compute a new value so we
				 * can be certain whether there is a problem.
				 *
				 * From a concurrency point of view, it sort of sucks to
				 * retake ProcArrayLock here while we're holding the buffer
				 * exclusively locked, but it should be safe against
				 * deadlocks, because surely
				 * GetStrictOldestNonRemovableTransactionId() should never
				 * take a buffer lock. And this shouldn't happen often, so
				 * it's worth being careful so as to avoid false positives.
				 */
				RecomputedOldestXmin = GetStrictOldestNonRemovableTransactionId(rel);

				if (!TransactionIdPrecedes(OldestXmin, RecomputedOldestXmin))
					record_corrupt_item(items, &tuple.t_self);
				else
				{
					/* Horizon advanced; retry with the newer value. */
					OldestXmin = RecomputedOldestXmin;
					if (!tuple_all_visible(&tuple, OldestXmin, buffer))
						record_corrupt_item(items, &tuple.t_self);
				}
			}

			/*
			 * If we're checking whether the page is all-frozen, we expect the
			 * tuple to be in a state where it will never need freezing.
			 */
			if (check_frozen)
			{
				if (heap_tuple_needs_eventual_freeze(tuple.t_data))
					record_corrupt_item(items, &tuple.t_self);
			}
		}

		UnlockReleaseBuffer(buffer);
	}
	read_stream_end(stream);

	/* Clean up. */
	if (vmbuffer != InvalidBuffer)
		ReleaseBuffer(vmbuffer);
	if (p.vmbuffer != InvalidBuffer)
		ReleaseBuffer(p.vmbuffer);
	relation_close(rel, AccessShareLock);

	/*
	 * Before returning, repurpose the fields to match caller's expectations.
	 * next is now the next item that should be read (rather than written) and
	 * count is now the number of items we wrote (rather than the number we
	 * allocated).
	 */
	items->count = items->next;
	items->next = 0;

	return items;
}
867 :
868 : /*
869 : * Remember one corrupt item.
870 : */
871 : static void
872 20 : record_corrupt_item(corrupt_items *items, ItemPointer tid)
873 : {
874 : /* enlarge output array if needed. */
875 20 : if (items->next >= items->count)
876 : {
877 0 : items->count *= 2;
878 0 : items->tids = repalloc(items->tids,
879 0 : items->count * sizeof(ItemPointerData));
880 : }
881 : /* and add the new item */
882 20 : items->tids[items->next++] = *tid;
883 20 : }
884 :
885 : /*
886 : * Check whether a tuple is all-visible relative to a given OldestXmin value.
887 : * The buffer should contain the tuple and should be locked and pinned.
888 : */
889 : static bool
890 16018 : tuple_all_visible(HeapTuple tup, TransactionId OldestXmin, Buffer buffer)
891 : {
892 : HTSV_Result state;
893 : TransactionId xmin;
894 :
895 16018 : state = HeapTupleSatisfiesVacuum(tup, OldestXmin, buffer);
896 16018 : if (state != HEAPTUPLE_LIVE)
897 10 : return false; /* all-visible implies live */
898 :
899 : /*
900 : * Neither lazy_scan_heap nor heap_page_is_all_visible will mark a page
901 : * all-visible unless every tuple is hinted committed. However, those hint
902 : * bits could be lost after a crash, so we can't be certain that they'll
903 : * be set here. So just check the xmin.
904 : */
905 :
906 16008 : xmin = HeapTupleHeaderGetXmin(tup->t_data);
907 16008 : if (!TransactionIdPrecedes(xmin, OldestXmin))
908 0 : return false; /* xmin not old enough for all to see */
909 :
910 16008 : return true;
911 : }
912 :
913 : /*
914 : * check_relation_relkind - convenience routine to check that relation
915 : * is of the relkind supported by the callers
916 : */
917 : static void
918 92 : check_relation_relkind(Relation rel)
919 : {
920 92 : if (!RELKIND_HAS_TABLE_AM(rel->rd_rel->relkind))
921 50 : ereport(ERROR,
922 : (errcode(ERRCODE_WRONG_OBJECT_TYPE),
923 : errmsg("relation \"%s\" is of wrong relation kind",
924 : RelationGetRelationName(rel)),
925 : errdetail_relkind_not_supported(rel->rd_rel->relkind)));
926 42 : }
|