Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * pg_visibility.c
4 : * display visibility map information and page-level visibility bits
5 : *
6 : * Copyright (c) 2016-2024, PostgreSQL Global Development Group
7 : *
8 : * contrib/pg_visibility/pg_visibility.c
9 : *-------------------------------------------------------------------------
10 : */
11 : #include "postgres.h"
12 :
13 : #include "access/heapam.h"
14 : #include "access/htup_details.h"
15 : #include "access/visibilitymap.h"
16 : #include "access/xloginsert.h"
17 : #include "catalog/pg_type.h"
18 : #include "catalog/storage_xlog.h"
19 : #include "funcapi.h"
20 : #include "miscadmin.h"
21 : #include "storage/bufmgr.h"
22 : #include "storage/proc.h"
23 : #include "storage/procarray.h"
24 : #include "storage/smgr.h"
25 : #include "utils/rel.h"
26 : #include "utils/snapmgr.h"
27 :
/* Module magic block; emitted once for this loadable module. */
PG_MODULE_MAGIC;
29 :
30 : typedef struct vbits
31 : {
32 : BlockNumber next;
33 : BlockNumber count;
34 : uint8 bits[FLEXIBLE_ARRAY_MEMBER];
35 : } vbits;
36 :
37 : typedef struct corrupt_items
38 : {
39 : BlockNumber next;
40 : BlockNumber count;
41 : ItemPointer tids;
42 : } corrupt_items;
43 :
44 4 : PG_FUNCTION_INFO_V1(pg_visibility_map);
45 6 : PG_FUNCTION_INFO_V1(pg_visibility_map_rel);
46 6 : PG_FUNCTION_INFO_V1(pg_visibility);
47 6 : PG_FUNCTION_INFO_V1(pg_visibility_rel);
48 6 : PG_FUNCTION_INFO_V1(pg_visibility_map_summary);
49 6 : PG_FUNCTION_INFO_V1(pg_check_frozen);
50 6 : PG_FUNCTION_INFO_V1(pg_check_visible);
51 6 : PG_FUNCTION_INFO_V1(pg_truncate_visibility_map);
52 :
53 : static TupleDesc pg_visibility_tupdesc(bool include_blkno, bool include_pd);
54 : static vbits *collect_visibility_data(Oid relid, bool include_pd);
55 : static corrupt_items *collect_corrupt_items(Oid relid, bool all_visible,
56 : bool all_frozen);
57 : static void record_corrupt_item(corrupt_items *items, ItemPointer tid);
58 : static bool tuple_all_visible(HeapTuple tup, TransactionId OldestXmin,
59 : Buffer buffer);
60 : static void check_relation_relkind(Relation rel);
61 :
62 : /*
63 : * Visibility map information for a single block of a relation.
64 : *
65 : * Note: the VM code will silently return zeroes for pages past the end
66 : * of the map, so we allow probes up to MaxBlockNumber regardless of the
67 : * actual relation size.
68 : */
69 : Datum
70 0 : pg_visibility_map(PG_FUNCTION_ARGS)
71 : {
72 0 : Oid relid = PG_GETARG_OID(0);
73 0 : int64 blkno = PG_GETARG_INT64(1);
74 : int32 mapbits;
75 : Relation rel;
76 0 : Buffer vmbuffer = InvalidBuffer;
77 : TupleDesc tupdesc;
78 : Datum values[2];
79 0 : bool nulls[2] = {0};
80 :
81 0 : rel = relation_open(relid, AccessShareLock);
82 :
83 : /* Only some relkinds have a visibility map */
84 0 : check_relation_relkind(rel);
85 :
86 0 : if (blkno < 0 || blkno > MaxBlockNumber)
87 0 : ereport(ERROR,
88 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
89 : errmsg("invalid block number")));
90 :
91 0 : tupdesc = pg_visibility_tupdesc(false, false);
92 :
93 0 : mapbits = (int32) visibilitymap_get_status(rel, blkno, &vmbuffer);
94 0 : if (vmbuffer != InvalidBuffer)
95 0 : ReleaseBuffer(vmbuffer);
96 0 : values[0] = BoolGetDatum((mapbits & VISIBILITYMAP_ALL_VISIBLE) != 0);
97 0 : values[1] = BoolGetDatum((mapbits & VISIBILITYMAP_ALL_FROZEN) != 0);
98 :
99 0 : relation_close(rel, AccessShareLock);
100 :
101 0 : PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls)));
102 : }
103 :
104 : /*
105 : * Visibility map information for a single block of a relation, plus the
106 : * page-level information for the same block.
107 : */
108 : Datum
109 12 : pg_visibility(PG_FUNCTION_ARGS)
110 : {
111 12 : Oid relid = PG_GETARG_OID(0);
112 12 : int64 blkno = PG_GETARG_INT64(1);
113 : int32 mapbits;
114 : Relation rel;
115 12 : Buffer vmbuffer = InvalidBuffer;
116 : Buffer buffer;
117 : Page page;
118 : TupleDesc tupdesc;
119 : Datum values[3];
120 12 : bool nulls[3] = {0};
121 :
122 12 : rel = relation_open(relid, AccessShareLock);
123 :
124 : /* Only some relkinds have a visibility map */
125 12 : check_relation_relkind(rel);
126 :
127 2 : if (blkno < 0 || blkno > MaxBlockNumber)
128 0 : ereport(ERROR,
129 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
130 : errmsg("invalid block number")));
131 :
132 2 : tupdesc = pg_visibility_tupdesc(false, true);
133 :
134 2 : mapbits = (int32) visibilitymap_get_status(rel, blkno, &vmbuffer);
135 2 : if (vmbuffer != InvalidBuffer)
136 2 : ReleaseBuffer(vmbuffer);
137 2 : values[0] = BoolGetDatum((mapbits & VISIBILITYMAP_ALL_VISIBLE) != 0);
138 2 : values[1] = BoolGetDatum((mapbits & VISIBILITYMAP_ALL_FROZEN) != 0);
139 :
140 : /* Here we have to explicitly check rel size ... */
141 2 : if (blkno < RelationGetNumberOfBlocks(rel))
142 : {
143 2 : buffer = ReadBuffer(rel, blkno);
144 2 : LockBuffer(buffer, BUFFER_LOCK_SHARE);
145 :
146 2 : page = BufferGetPage(buffer);
147 2 : values[2] = BoolGetDatum(PageIsAllVisible(page));
148 :
149 2 : UnlockReleaseBuffer(buffer);
150 : }
151 : else
152 : {
153 : /* As with the vismap, silently return 0 for pages past EOF */
154 0 : values[2] = BoolGetDatum(false);
155 : }
156 :
157 2 : relation_close(rel, AccessShareLock);
158 :
159 2 : PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls)));
160 : }
161 :
162 : /*
163 : * Visibility map information for every block in a relation.
164 : */
165 : Datum
166 40 : pg_visibility_map_rel(PG_FUNCTION_ARGS)
167 : {
168 : FuncCallContext *funcctx;
169 : vbits *info;
170 :
171 40 : if (SRF_IS_FIRSTCALL())
172 : {
173 22 : Oid relid = PG_GETARG_OID(0);
174 : MemoryContext oldcontext;
175 :
176 22 : funcctx = SRF_FIRSTCALL_INIT();
177 22 : oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
178 22 : funcctx->tuple_desc = pg_visibility_tupdesc(true, false);
179 : /* collect_visibility_data will verify the relkind */
180 22 : funcctx->user_fctx = collect_visibility_data(relid, false);
181 8 : MemoryContextSwitchTo(oldcontext);
182 : }
183 :
184 26 : funcctx = SRF_PERCALL_SETUP();
185 26 : info = (vbits *) funcctx->user_fctx;
186 :
187 26 : if (info->next < info->count)
188 : {
189 : Datum values[3];
190 18 : bool nulls[3] = {0};
191 : HeapTuple tuple;
192 :
193 18 : values[0] = Int64GetDatum(info->next);
194 18 : values[1] = BoolGetDatum((info->bits[info->next] & (1 << 0)) != 0);
195 18 : values[2] = BoolGetDatum((info->bits[info->next] & (1 << 1)) != 0);
196 18 : info->next++;
197 :
198 18 : tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls);
199 18 : SRF_RETURN_NEXT(funcctx, HeapTupleGetDatum(tuple));
200 : }
201 :
202 8 : SRF_RETURN_DONE(funcctx);
203 : }
204 :
205 : /*
206 : * Visibility map information for every block in a relation, plus the page
207 : * level information for each block.
208 : */
209 : Datum
210 18 : pg_visibility_rel(PG_FUNCTION_ARGS)
211 : {
212 : FuncCallContext *funcctx;
213 : vbits *info;
214 :
215 18 : if (SRF_IS_FIRSTCALL())
216 : {
217 12 : Oid relid = PG_GETARG_OID(0);
218 : MemoryContext oldcontext;
219 :
220 12 : funcctx = SRF_FIRSTCALL_INIT();
221 12 : oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
222 12 : funcctx->tuple_desc = pg_visibility_tupdesc(true, true);
223 : /* collect_visibility_data will verify the relkind */
224 12 : funcctx->user_fctx = collect_visibility_data(relid, true);
225 12 : MemoryContextSwitchTo(oldcontext);
226 : }
227 :
228 18 : funcctx = SRF_PERCALL_SETUP();
229 18 : info = (vbits *) funcctx->user_fctx;
230 :
231 18 : if (info->next < info->count)
232 : {
233 : Datum values[4];
234 6 : bool nulls[4] = {0};
235 : HeapTuple tuple;
236 :
237 6 : values[0] = Int64GetDatum(info->next);
238 6 : values[1] = BoolGetDatum((info->bits[info->next] & (1 << 0)) != 0);
239 6 : values[2] = BoolGetDatum((info->bits[info->next] & (1 << 1)) != 0);
240 6 : values[3] = BoolGetDatum((info->bits[info->next] & (1 << 2)) != 0);
241 6 : info->next++;
242 :
243 6 : tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls);
244 6 : SRF_RETURN_NEXT(funcctx, HeapTupleGetDatum(tuple));
245 : }
246 :
247 12 : SRF_RETURN_DONE(funcctx);
248 : }
249 :
250 : /*
251 : * Count the number of all-visible and all-frozen pages in the visibility
252 : * map for a particular relation.
253 : */
254 : Datum
255 12 : pg_visibility_map_summary(PG_FUNCTION_ARGS)
256 : {
257 12 : Oid relid = PG_GETARG_OID(0);
258 : Relation rel;
259 : BlockNumber nblocks;
260 : BlockNumber blkno;
261 12 : Buffer vmbuffer = InvalidBuffer;
262 12 : int64 all_visible = 0;
263 12 : int64 all_frozen = 0;
264 : TupleDesc tupdesc;
265 : Datum values[2];
266 12 : bool nulls[2] = {0};
267 :
268 12 : rel = relation_open(relid, AccessShareLock);
269 :
270 : /* Only some relkinds have a visibility map */
271 12 : check_relation_relkind(rel);
272 :
273 2 : nblocks = RelationGetNumberOfBlocks(rel);
274 :
275 4 : for (blkno = 0; blkno < nblocks; ++blkno)
276 : {
277 : int32 mapbits;
278 :
279 : /* Make sure we are interruptible. */
280 2 : CHECK_FOR_INTERRUPTS();
281 :
282 : /* Get map info. */
283 2 : mapbits = (int32) visibilitymap_get_status(rel, blkno, &vmbuffer);
284 2 : if ((mapbits & VISIBILITYMAP_ALL_VISIBLE) != 0)
285 2 : ++all_visible;
286 2 : if ((mapbits & VISIBILITYMAP_ALL_FROZEN) != 0)
287 0 : ++all_frozen;
288 : }
289 :
290 : /* Clean up. */
291 2 : if (vmbuffer != InvalidBuffer)
292 2 : ReleaseBuffer(vmbuffer);
293 2 : relation_close(rel, AccessShareLock);
294 :
295 2 : if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
296 0 : elog(ERROR, "return type must be a row type");
297 :
298 2 : values[0] = Int64GetDatum(all_visible);
299 2 : values[1] = Int64GetDatum(all_frozen);
300 :
301 2 : PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls)));
302 : }
303 :
304 : /*
305 : * Return the TIDs of non-frozen tuples present in pages marked all-frozen
306 : * in the visibility map. We hope no one will ever find any, but there could
307 : * be bugs, database corruption, etc.
308 : */
309 : Datum
310 18 : pg_check_frozen(PG_FUNCTION_ARGS)
311 : {
312 : FuncCallContext *funcctx;
313 : corrupt_items *items;
314 :
315 18 : if (SRF_IS_FIRSTCALL())
316 : {
317 18 : Oid relid = PG_GETARG_OID(0);
318 : MemoryContext oldcontext;
319 :
320 18 : funcctx = SRF_FIRSTCALL_INIT();
321 18 : oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
322 : /* collect_corrupt_items will verify the relkind */
323 18 : funcctx->user_fctx = collect_corrupt_items(relid, false, true);
324 8 : MemoryContextSwitchTo(oldcontext);
325 : }
326 :
327 8 : funcctx = SRF_PERCALL_SETUP();
328 8 : items = (corrupt_items *) funcctx->user_fctx;
329 :
330 8 : if (items->next < items->count)
331 0 : SRF_RETURN_NEXT(funcctx, PointerGetDatum(&items->tids[items->next++]));
332 :
333 8 : SRF_RETURN_DONE(funcctx);
334 : }
335 :
336 : /*
337 : * Return the TIDs of not-all-visible tuples in pages marked all-visible
338 : * in the visibility map. We hope no one will ever find any, but there could
339 : * be bugs, database corruption, etc.
340 : */
341 : Datum
342 2 : pg_check_visible(PG_FUNCTION_ARGS)
343 : {
344 : FuncCallContext *funcctx;
345 : corrupt_items *items;
346 :
347 2 : if (SRF_IS_FIRSTCALL())
348 : {
349 2 : Oid relid = PG_GETARG_OID(0);
350 : MemoryContext oldcontext;
351 :
352 2 : funcctx = SRF_FIRSTCALL_INIT();
353 2 : oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
354 : /* collect_corrupt_items will verify the relkind */
355 2 : funcctx->user_fctx = collect_corrupt_items(relid, true, false);
356 2 : MemoryContextSwitchTo(oldcontext);
357 : }
358 :
359 2 : funcctx = SRF_PERCALL_SETUP();
360 2 : items = (corrupt_items *) funcctx->user_fctx;
361 :
362 2 : if (items->next < items->count)
363 0 : SRF_RETURN_NEXT(funcctx, PointerGetDatum(&items->tids[items->next++]));
364 :
365 2 : SRF_RETURN_DONE(funcctx);
366 : }
367 :
368 : /*
369 : * Remove the visibility map fork for a relation. If there turn out to be
370 : * any bugs in the visibility map code that require rebuilding the VM, this
371 : * provides users with a way to do it that is cleaner than shutting down the
372 : * server and removing files by hand.
373 : *
374 : * This is a cut-down version of RelationTruncate.
375 : */
376 : Datum
377 12 : pg_truncate_visibility_map(PG_FUNCTION_ARGS)
378 : {
379 12 : Oid relid = PG_GETARG_OID(0);
380 : Relation rel;
381 : ForkNumber fork;
382 : BlockNumber block;
383 :
384 12 : rel = relation_open(relid, AccessExclusiveLock);
385 :
386 : /* Only some relkinds have a visibility map */
387 12 : check_relation_relkind(rel);
388 :
389 : /* Forcibly reset cached file size */
390 2 : RelationGetSmgr(rel)->smgr_cached_nblocks[VISIBILITYMAP_FORKNUM] = InvalidBlockNumber;
391 :
392 2 : block = visibilitymap_prepare_truncate(rel, 0);
393 2 : if (BlockNumberIsValid(block))
394 : {
395 2 : fork = VISIBILITYMAP_FORKNUM;
396 2 : smgrtruncate(RelationGetSmgr(rel), &fork, 1, &block);
397 : }
398 :
399 2 : if (RelationNeedsWAL(rel))
400 : {
401 : xl_smgr_truncate xlrec;
402 :
403 2 : xlrec.blkno = 0;
404 2 : xlrec.rlocator = rel->rd_locator;
405 2 : xlrec.flags = SMGR_TRUNCATE_VM;
406 :
407 2 : XLogBeginInsert();
408 2 : XLogRegisterData((char *) &xlrec, sizeof(xlrec));
409 :
410 2 : XLogInsert(RM_SMGR_ID, XLOG_SMGR_TRUNCATE | XLR_SPECIAL_REL_UPDATE);
411 : }
412 :
413 : /*
414 : * Release the lock right away, not at commit time.
415 : *
416 : * It would be a problem to release the lock prior to commit if this
417 : * truncate operation sends any transactional invalidation messages. Other
418 : * backends would potentially be able to lock the relation without
419 : * processing them in the window of time between when we release the lock
420 : * here and when we sent the messages at our eventual commit. However,
421 : * we're currently only sending a non-transactional smgr invalidation,
422 : * which will have been posted to shared memory immediately from within
423 : * smgr_truncate. Therefore, there should be no race here.
424 : *
425 : * The reason why it's desirable to release the lock early here is because
426 : * of the possibility that someone will need to use this to blow away many
427 : * visibility map forks at once. If we can't release the lock until
428 : * commit time, the transaction doing this will accumulate
429 : * AccessExclusiveLocks on all of those relations at the same time, which
430 : * is undesirable. However, if this turns out to be unsafe we may have no
431 : * choice...
432 : */
433 2 : relation_close(rel, AccessExclusiveLock);
434 :
435 : /* Nothing to return. */
436 2 : PG_RETURN_VOID();
437 : }
438 :
439 : /*
440 : * Helper function to construct whichever TupleDesc we need for a particular
441 : * call.
442 : */
443 : static TupleDesc
444 36 : pg_visibility_tupdesc(bool include_blkno, bool include_pd)
445 : {
446 : TupleDesc tupdesc;
447 36 : AttrNumber maxattr = 2;
448 36 : AttrNumber a = 0;
449 :
450 36 : if (include_blkno)
451 34 : ++maxattr;
452 36 : if (include_pd)
453 14 : ++maxattr;
454 36 : tupdesc = CreateTemplateTupleDesc(maxattr);
455 36 : if (include_blkno)
456 34 : TupleDescInitEntry(tupdesc, ++a, "blkno", INT8OID, -1, 0);
457 36 : TupleDescInitEntry(tupdesc, ++a, "all_visible", BOOLOID, -1, 0);
458 36 : TupleDescInitEntry(tupdesc, ++a, "all_frozen", BOOLOID, -1, 0);
459 36 : if (include_pd)
460 14 : TupleDescInitEntry(tupdesc, ++a, "pd_all_visible", BOOLOID, -1, 0);
461 : Assert(a == maxattr);
462 :
463 36 : return BlessTupleDesc(tupdesc);
464 : }
465 :
466 : /*
467 : * Collect visibility data about a relation.
468 : *
469 : * Checks relkind of relid and will throw an error if the relation does not
470 : * have a VM.
471 : */
472 : static vbits *
473 34 : collect_visibility_data(Oid relid, bool include_pd)
474 : {
475 : Relation rel;
476 : BlockNumber nblocks;
477 : vbits *info;
478 : BlockNumber blkno;
479 34 : Buffer vmbuffer = InvalidBuffer;
480 34 : BufferAccessStrategy bstrategy = GetAccessStrategy(BAS_BULKREAD);
481 :
482 34 : rel = relation_open(relid, AccessShareLock);
483 :
484 : /* Only some relkinds have a visibility map */
485 30 : check_relation_relkind(rel);
486 :
487 20 : nblocks = RelationGetNumberOfBlocks(rel);
488 20 : info = palloc0(offsetof(vbits, bits) + nblocks);
489 20 : info->next = 0;
490 20 : info->count = nblocks;
491 :
492 44 : for (blkno = 0; blkno < nblocks; ++blkno)
493 : {
494 : int32 mapbits;
495 :
496 : /* Make sure we are interruptible. */
497 24 : CHECK_FOR_INTERRUPTS();
498 :
499 : /* Get map info. */
500 24 : mapbits = (int32) visibilitymap_get_status(rel, blkno, &vmbuffer);
501 24 : if ((mapbits & VISIBILITYMAP_ALL_VISIBLE) != 0)
502 16 : info->bits[blkno] |= (1 << 0);
503 24 : if ((mapbits & VISIBILITYMAP_ALL_FROZEN) != 0)
504 10 : info->bits[blkno] |= (1 << 1);
505 :
506 : /*
507 : * Page-level data requires reading every block, so only get it if the
508 : * caller needs it. Use a buffer access strategy, too, to prevent
509 : * cache-trashing.
510 : */
511 24 : if (include_pd)
512 : {
513 : Buffer buffer;
514 : Page page;
515 :
516 6 : buffer = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
517 : bstrategy);
518 6 : LockBuffer(buffer, BUFFER_LOCK_SHARE);
519 :
520 6 : page = BufferGetPage(buffer);
521 6 : if (PageIsAllVisible(page))
522 4 : info->bits[blkno] |= (1 << 2);
523 :
524 6 : UnlockReleaseBuffer(buffer);
525 : }
526 : }
527 :
528 : /* Clean up. */
529 20 : if (vmbuffer != InvalidBuffer)
530 14 : ReleaseBuffer(vmbuffer);
531 20 : relation_close(rel, AccessShareLock);
532 :
533 20 : return info;
534 : }
535 :
536 : /*
537 : * The "strict" version of GetOldestNonRemovableTransactionId(). The
538 : * pg_visibility check can tolerate false positives (don't report some of the
539 : * errors), but can't tolerate false negatives (report false errors). Normally,
540 : * horizons move forwards, but there are cases when it could move backward
541 : * (see comment for ComputeXidHorizons()).
542 : *
543 : * This is why we have to implement our own function for xid horizon, which
544 : * would be guaranteed to be newer or equal to any xid horizon computed before.
545 : * We have to do the following to achieve this.
546 : *
547 : * 1. Ignore processes xmin's, because they consider connection to other
548 : * databases that were ignored before.
549 : * 2. Ignore KnownAssignedXids, because they are not database-aware. At the
550 : * same time, the primary could compute its horizons database-aware.
551 : * 3. Ignore walsender xmin, because it could go backward if some replication
552 : * connections don't use replication slots.
553 : *
554 : * As a result, we're using only currently running xids to compute the horizon.
555 : * Surely these would significantly sacrifice accuracy. But we have to do so
556 : * to avoid reporting false errors.
557 : */
558 : static TransactionId
559 2 : GetStrictOldestNonRemovableTransactionId(Relation rel)
560 : {
561 : RunningTransactions runningTransactions;
562 :
563 2 : if (rel == NULL || rel->rd_rel->relisshared || RecoveryInProgress())
564 : {
565 : /* Shared relation: take into account all running xids */
566 0 : runningTransactions = GetRunningTransactionData();
567 0 : LWLockRelease(ProcArrayLock);
568 0 : LWLockRelease(XidGenLock);
569 0 : return runningTransactions->oldestRunningXid;
570 : }
571 2 : else if (!RELATION_IS_LOCAL(rel))
572 : {
573 : /*
574 : * Normal relation: take into account xids running within the current
575 : * database
576 : */
577 2 : runningTransactions = GetRunningTransactionData();
578 2 : LWLockRelease(ProcArrayLock);
579 2 : LWLockRelease(XidGenLock);
580 2 : return runningTransactions->oldestDatabaseRunningXid;
581 : }
582 : else
583 : {
584 : /*
585 : * For temporary relations, ComputeXidHorizons() uses only
586 : * TransamVariables->latestCompletedXid and MyProc->xid. These two
587 : * shouldn't go backwards. So we're fine with this horizon.
588 : */
589 0 : return GetOldestNonRemovableTransactionId(rel);
590 : }
591 : }
592 :
593 : /*
594 : * Returns a list of items whose visibility map information does not match
595 : * the status of the tuples on the page.
596 : *
597 : * If all_visible is passed as true, this will include all items which are
598 : * on pages marked as all-visible in the visibility map but which do not
599 : * seem to in fact be all-visible.
600 : *
601 : * If all_frozen is passed as true, this will include all items which are
602 : * on pages marked as all-frozen but which do not seem to in fact be frozen.
603 : *
604 : * Checks relkind of relid and will throw an error if the relation does not
605 : * have a VM.
606 : */
607 : static corrupt_items *
608 20 : collect_corrupt_items(Oid relid, bool all_visible, bool all_frozen)
609 : {
610 : Relation rel;
611 : BlockNumber nblocks;
612 : corrupt_items *items;
613 : BlockNumber blkno;
614 20 : Buffer vmbuffer = InvalidBuffer;
615 20 : BufferAccessStrategy bstrategy = GetAccessStrategy(BAS_BULKREAD);
616 20 : TransactionId OldestXmin = InvalidTransactionId;
617 :
618 20 : rel = relation_open(relid, AccessShareLock);
619 :
620 : /* Only some relkinds have a visibility map */
621 20 : check_relation_relkind(rel);
622 :
623 10 : if (all_visible)
624 2 : OldestXmin = GetStrictOldestNonRemovableTransactionId(rel);
625 :
626 10 : nblocks = RelationGetNumberOfBlocks(rel);
627 :
628 : /*
629 : * Guess an initial array size. We don't expect many corrupted tuples, so
630 : * start with a small array. This function uses the "next" field to track
631 : * the next offset where we can store an item (which is the same thing as
632 : * the number of items found so far) and the "count" field to track the
633 : * number of entries allocated. We'll repurpose these fields before
634 : * returning.
635 : */
636 10 : items = palloc0(sizeof(corrupt_items));
637 10 : items->next = 0;
638 10 : items->count = 64;
639 10 : items->tids = palloc(items->count * sizeof(ItemPointerData));
640 :
641 : /* Loop over every block in the relation. */
642 30 : for (blkno = 0; blkno < nblocks; ++blkno)
643 : {
644 20 : bool check_frozen = false;
645 20 : bool check_visible = false;
646 : Buffer buffer;
647 : Page page;
648 : OffsetNumber offnum,
649 : maxoff;
650 :
651 : /* Make sure we are interruptible. */
652 20 : CHECK_FOR_INTERRUPTS();
653 :
654 : /* Use the visibility map to decide whether to check this page. */
655 20 : if (all_frozen && VM_ALL_FROZEN(rel, blkno, &vmbuffer))
656 10 : check_frozen = true;
657 20 : if (all_visible && VM_ALL_VISIBLE(rel, blkno, &vmbuffer))
658 2 : check_visible = true;
659 20 : if (!check_visible && !check_frozen)
660 8 : continue;
661 :
662 : /* Read and lock the page. */
663 12 : buffer = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
664 : bstrategy);
665 12 : LockBuffer(buffer, BUFFER_LOCK_SHARE);
666 :
667 12 : page = BufferGetPage(buffer);
668 12 : maxoff = PageGetMaxOffsetNumber(page);
669 :
670 : /*
671 : * The visibility map bits might have changed while we were acquiring
672 : * the page lock. Recheck to avoid returning spurious results.
673 : */
674 12 : if (check_frozen && !VM_ALL_FROZEN(rel, blkno, &vmbuffer))
675 0 : check_frozen = false;
676 12 : if (check_visible && !VM_ALL_VISIBLE(rel, blkno, &vmbuffer))
677 0 : check_visible = false;
678 12 : if (!check_visible && !check_frozen)
679 : {
680 0 : UnlockReleaseBuffer(buffer);
681 0 : continue;
682 : }
683 :
684 : /* Iterate over each tuple on the page. */
685 44 : for (offnum = FirstOffsetNumber;
686 : offnum <= maxoff;
687 32 : offnum = OffsetNumberNext(offnum))
688 : {
689 : HeapTupleData tuple;
690 : ItemId itemid;
691 :
692 32 : itemid = PageGetItemId(page, offnum);
693 :
694 : /* Unused or redirect line pointers are of no interest. */
695 32 : if (!ItemIdIsUsed(itemid) || ItemIdIsRedirected(itemid))
696 0 : continue;
697 :
698 : /* Dead line pointers are neither all-visible nor frozen. */
699 32 : if (ItemIdIsDead(itemid))
700 : {
701 0 : ItemPointerSet(&(tuple.t_self), blkno, offnum);
702 0 : record_corrupt_item(items, &tuple.t_self);
703 0 : continue;
704 : }
705 :
706 : /* Initialize a HeapTupleData structure for checks below. */
707 32 : ItemPointerSet(&(tuple.t_self), blkno, offnum);
708 32 : tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
709 32 : tuple.t_len = ItemIdGetLength(itemid);
710 32 : tuple.t_tableOid = relid;
711 :
712 : /*
713 : * If we're checking whether the page is all-visible, we expect
714 : * the tuple to be all-visible.
715 : */
716 32 : if (check_visible &&
717 2 : !tuple_all_visible(&tuple, OldestXmin, buffer))
718 : {
719 : TransactionId RecomputedOldestXmin;
720 :
721 : /*
722 : * Time has passed since we computed OldestXmin, so it's
723 : * possible that this tuple is all-visible in reality even
724 : * though it doesn't appear so based on our
725 : * previously-computed value. Let's compute a new value so we
726 : * can be certain whether there is a problem.
727 : *
728 : * From a concurrency point of view, it sort of sucks to
729 : * retake ProcArrayLock here while we're holding the buffer
730 : * exclusively locked, but it should be safe against
731 : * deadlocks, because surely
732 : * GetStrictOldestNonRemovableTransactionId() should never
733 : * take a buffer lock. And this shouldn't happen often, so
734 : * it's worth being careful so as to avoid false positives.
735 : */
736 0 : RecomputedOldestXmin = GetStrictOldestNonRemovableTransactionId(rel);
737 :
738 0 : if (!TransactionIdPrecedes(OldestXmin, RecomputedOldestXmin))
739 0 : record_corrupt_item(items, &tuple.t_self);
740 : else
741 : {
742 0 : OldestXmin = RecomputedOldestXmin;
743 0 : if (!tuple_all_visible(&tuple, OldestXmin, buffer))
744 0 : record_corrupt_item(items, &tuple.t_self);
745 : }
746 : }
747 :
748 : /*
749 : * If we're checking whether the page is all-frozen, we expect the
750 : * tuple to be in a state where it will never need freezing.
751 : */
752 32 : if (check_frozen)
753 : {
754 30 : if (heap_tuple_needs_eventual_freeze(tuple.t_data))
755 0 : record_corrupt_item(items, &tuple.t_self);
756 : }
757 : }
758 :
759 12 : UnlockReleaseBuffer(buffer);
760 : }
761 :
762 : /* Clean up. */
763 10 : if (vmbuffer != InvalidBuffer)
764 10 : ReleaseBuffer(vmbuffer);
765 10 : relation_close(rel, AccessShareLock);
766 :
767 : /*
768 : * Before returning, repurpose the fields to match caller's expectations.
769 : * next is now the next item that should be read (rather than written) and
770 : * count is now the number of items we wrote (rather than the number we
771 : * allocated).
772 : */
773 10 : items->count = items->next;
774 10 : items->next = 0;
775 :
776 10 : return items;
777 : }
778 :
779 : /*
780 : * Remember one corrupt item.
781 : */
782 : static void
783 0 : record_corrupt_item(corrupt_items *items, ItemPointer tid)
784 : {
785 : /* enlarge output array if needed. */
786 0 : if (items->next >= items->count)
787 : {
788 0 : items->count *= 2;
789 0 : items->tids = repalloc(items->tids,
790 0 : items->count * sizeof(ItemPointerData));
791 : }
792 : /* and add the new item */
793 0 : items->tids[items->next++] = *tid;
794 0 : }
795 :
796 : /*
797 : * Check whether a tuple is all-visible relative to a given OldestXmin value.
798 : * The buffer should contain the tuple and should be locked and pinned.
799 : */
800 : static bool
801 2 : tuple_all_visible(HeapTuple tup, TransactionId OldestXmin, Buffer buffer)
802 : {
803 : HTSV_Result state;
804 : TransactionId xmin;
805 :
806 2 : state = HeapTupleSatisfiesVacuum(tup, OldestXmin, buffer);
807 2 : if (state != HEAPTUPLE_LIVE)
808 0 : return false; /* all-visible implies live */
809 :
810 : /*
811 : * Neither lazy_scan_heap nor heap_page_is_all_visible will mark a page
812 : * all-visible unless every tuple is hinted committed. However, those hint
813 : * bits could be lost after a crash, so we can't be certain that they'll
814 : * be set here. So just check the xmin.
815 : */
816 :
817 2 : xmin = HeapTupleHeaderGetXmin(tup->t_data);
818 2 : if (!TransactionIdPrecedes(xmin, OldestXmin))
819 0 : return false; /* xmin not old enough for all to see */
820 :
821 2 : return true;
822 : }
823 :
824 : /*
825 : * check_relation_relkind - convenience routine to check that relation
826 : * is of the relkind supported by the callers
827 : */
828 : static void
829 86 : check_relation_relkind(Relation rel)
830 : {
831 86 : if (!RELKIND_HAS_TABLE_AM(rel->rd_rel->relkind))
832 50 : ereport(ERROR,
833 : (errcode(ERRCODE_WRONG_OBJECT_TYPE),
834 : errmsg("relation \"%s\" is of wrong relation kind",
835 : RelationGetRelationName(rel)),
836 : errdetail_relkind_not_supported(rel->rd_rel->relkind)));
837 36 : }
|