LCOV - code coverage report
Current view: top level - contrib/pg_visibility - pg_visibility.c (source / functions) Coverage Total Hit
Test: PostgreSQL 19devel Lines: 87.5 % 319 279
Test Date: 2026-03-04 20:14:49 Functions: 96.0 % 25 24
Legend: Lines:     hit not hit

            Line data    Source code
       1              : /*-------------------------------------------------------------------------
       2              :  *
       3              :  * pg_visibility.c
       4              :  *    display visibility map information and page-level visibility bits
       5              :  *
       6              :  * Copyright (c) 2016-2026, PostgreSQL Global Development Group
       7              :  *
       8              :  *    contrib/pg_visibility/pg_visibility.c
       9              :  *-------------------------------------------------------------------------
      10              :  */
      11              : #include "postgres.h"
      12              : 
      13              : #include "access/heapam.h"
      14              : #include "access/htup_details.h"
      15              : #include "access/visibilitymap.h"
      16              : #include "access/xloginsert.h"
      17              : #include "catalog/pg_type.h"
      18              : #include "catalog/storage_xlog.h"
      19              : #include "funcapi.h"
      20              : #include "miscadmin.h"
      21              : #include "storage/bufmgr.h"
      22              : #include "storage/proc.h"
      23              : #include "storage/procarray.h"
      24              : #include "storage/read_stream.h"
      25              : #include "storage/smgr.h"
      26              : #include "utils/rel.h"
      27              : 
      28            7 : PG_MODULE_MAGIC_EXT(
      29              :                     .name = "pg_visibility",
      30              :                     .version = PG_VERSION
      31              : );
      32              : 
      33              : typedef struct vbits
      34              : {
      35              :     BlockNumber next;
      36              :     BlockNumber count;
      37              :     uint8       bits[FLEXIBLE_ARRAY_MEMBER];
      38              : } vbits;
      39              : 
      40              : typedef struct corrupt_items
      41              : {
      42              :     BlockNumber next;
      43              :     BlockNumber count;
      44              :     ItemPointer tids;
      45              : } corrupt_items;
      46              : 
      47              : /* for collect_corrupt_items_read_stream_next_block */
      48              : struct collect_corrupt_items_read_stream_private
      49              : {
      50              :     bool        all_frozen;
      51              :     bool        all_visible;
      52              :     BlockNumber current_blocknum;
      53              :     BlockNumber last_exclusive;
      54              :     Relation    rel;
      55              :     Buffer      vmbuffer;
      56              : };
      57              : 
      58            3 : PG_FUNCTION_INFO_V1(pg_visibility_map);
      59            4 : PG_FUNCTION_INFO_V1(pg_visibility_map_rel);
      60            4 : PG_FUNCTION_INFO_V1(pg_visibility);
      61            4 : PG_FUNCTION_INFO_V1(pg_visibility_rel);
      62            4 : PG_FUNCTION_INFO_V1(pg_visibility_map_summary);
      63            5 : PG_FUNCTION_INFO_V1(pg_check_frozen);
      64            6 : PG_FUNCTION_INFO_V1(pg_check_visible);
      65            4 : PG_FUNCTION_INFO_V1(pg_truncate_visibility_map);
      66              : 
      67              : static TupleDesc pg_visibility_tupdesc(bool include_blkno, bool include_pd);
      68              : static vbits *collect_visibility_data(Oid relid, bool include_pd);
      69              : static corrupt_items *collect_corrupt_items(Oid relid, bool all_visible,
      70              :                                             bool all_frozen);
      71              : static void record_corrupt_item(corrupt_items *items, ItemPointer tid);
      72              : static bool tuple_all_visible(HeapTuple tup, TransactionId OldestXmin,
      73              :                               Buffer buffer);
      74              : static void check_relation_relkind(Relation rel);
      75              : 
      76              : /*
      77              :  * Visibility map information for a single block of a relation.
      78              :  *
      79              :  * Note: the VM code will silently return zeroes for pages past the end
      80              :  * of the map, so we allow probes up to MaxBlockNumber regardless of the
      81              :  * actual relation size.
      82              :  */
      83              : Datum
      84            0 : pg_visibility_map(PG_FUNCTION_ARGS)
      85              : {
      86            0 :     Oid         relid = PG_GETARG_OID(0);
      87            0 :     int64       blkno = PG_GETARG_INT64(1);
      88              :     int32       mapbits;
      89              :     Relation    rel;
      90            0 :     Buffer      vmbuffer = InvalidBuffer;
      91              :     TupleDesc   tupdesc;
      92              :     Datum       values[2];
      93            0 :     bool        nulls[2] = {0};
      94              : 
      95            0 :     rel = relation_open(relid, AccessShareLock);
      96              : 
      97              :     /* Only some relkinds have a visibility map */
      98            0 :     check_relation_relkind(rel);
      99              : 
     100            0 :     if (blkno < 0 || blkno > MaxBlockNumber)
     101            0 :         ereport(ERROR,
     102              :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     103              :                  errmsg("invalid block number")));
     104              : 
     105            0 :     tupdesc = pg_visibility_tupdesc(false, false);
     106              : 
     107            0 :     mapbits = (int32) visibilitymap_get_status(rel, blkno, &vmbuffer);
     108            0 :     if (vmbuffer != InvalidBuffer)
     109            0 :         ReleaseBuffer(vmbuffer);
     110            0 :     values[0] = BoolGetDatum((mapbits & VISIBILITYMAP_ALL_VISIBLE) != 0);
     111            0 :     values[1] = BoolGetDatum((mapbits & VISIBILITYMAP_ALL_FROZEN) != 0);
     112              : 
     113            0 :     relation_close(rel, AccessShareLock);
     114              : 
     115            0 :     PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls)));
     116              : }
     117              : 
     118              : /*
     119              :  * Visibility map information for a single block of a relation, plus the
     120              :  * page-level information for the same block.
     121              :  */
     122              : Datum
     123            6 : pg_visibility(PG_FUNCTION_ARGS)
     124              : {
     125            6 :     Oid         relid = PG_GETARG_OID(0);
     126            6 :     int64       blkno = PG_GETARG_INT64(1);
     127              :     int32       mapbits;
     128              :     Relation    rel;
     129            6 :     Buffer      vmbuffer = InvalidBuffer;
     130              :     Buffer      buffer;
     131              :     Page        page;
     132              :     TupleDesc   tupdesc;
     133              :     Datum       values[3];
     134            6 :     bool        nulls[3] = {0};
     135              : 
     136            6 :     rel = relation_open(relid, AccessShareLock);
     137              : 
     138              :     /* Only some relkinds have a visibility map */
     139            6 :     check_relation_relkind(rel);
     140              : 
     141            1 :     if (blkno < 0 || blkno > MaxBlockNumber)
     142            0 :         ereport(ERROR,
     143              :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     144              :                  errmsg("invalid block number")));
     145              : 
     146            1 :     tupdesc = pg_visibility_tupdesc(false, true);
     147              : 
     148            1 :     mapbits = (int32) visibilitymap_get_status(rel, blkno, &vmbuffer);
     149            1 :     if (vmbuffer != InvalidBuffer)
     150            1 :         ReleaseBuffer(vmbuffer);
     151            1 :     values[0] = BoolGetDatum((mapbits & VISIBILITYMAP_ALL_VISIBLE) != 0);
     152            1 :     values[1] = BoolGetDatum((mapbits & VISIBILITYMAP_ALL_FROZEN) != 0);
     153              : 
     154              :     /* Here we have to explicitly check rel size ... */
     155            1 :     if (blkno < RelationGetNumberOfBlocks(rel))
     156              :     {
     157            1 :         buffer = ReadBuffer(rel, blkno);
     158            1 :         LockBuffer(buffer, BUFFER_LOCK_SHARE);
     159              : 
     160            1 :         page = BufferGetPage(buffer);
     161            1 :         values[2] = BoolGetDatum(PageIsAllVisible(page));
     162              : 
     163            1 :         UnlockReleaseBuffer(buffer);
     164              :     }
     165              :     else
     166              :     {
     167              :         /* As with the vismap, silently return 0 for pages past EOF */
     168            0 :         values[2] = BoolGetDatum(false);
     169              :     }
     170              : 
     171            1 :     relation_close(rel, AccessShareLock);
     172              : 
     173            1 :     PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls)));
     174              : }
     175              : 
     176              : /*
     177              :  * Visibility map information for every block in a relation.
     178              :  */
     179              : Datum
     180           20 : pg_visibility_map_rel(PG_FUNCTION_ARGS)
     181              : {
     182              :     FuncCallContext *funcctx;
     183              :     vbits      *info;
     184              : 
     185           20 :     if (SRF_IS_FIRSTCALL())
     186              :     {
     187           11 :         Oid         relid = PG_GETARG_OID(0);
     188              :         MemoryContext oldcontext;
     189              : 
     190           11 :         funcctx = SRF_FIRSTCALL_INIT();
     191           11 :         oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
     192           11 :         funcctx->tuple_desc = pg_visibility_tupdesc(true, false);
     193              :         /* collect_visibility_data will verify the relkind */
     194           11 :         funcctx->user_fctx = collect_visibility_data(relid, false);
     195            4 :         MemoryContextSwitchTo(oldcontext);
     196              :     }
     197              : 
     198           13 :     funcctx = SRF_PERCALL_SETUP();
     199           13 :     info = (vbits *) funcctx->user_fctx;
     200              : 
     201           13 :     if (info->next < info->count)
     202              :     {
     203              :         Datum       values[3];
     204            9 :         bool        nulls[3] = {0};
     205              :         HeapTuple   tuple;
     206              : 
     207            9 :         values[0] = Int64GetDatum(info->next);
     208            9 :         values[1] = BoolGetDatum((info->bits[info->next] & (1 << 0)) != 0);
     209            9 :         values[2] = BoolGetDatum((info->bits[info->next] & (1 << 1)) != 0);
     210            9 :         info->next++;
     211              : 
     212            9 :         tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls);
     213            9 :         SRF_RETURN_NEXT(funcctx, HeapTupleGetDatum(tuple));
     214              :     }
     215              : 
     216            4 :     SRF_RETURN_DONE(funcctx);
     217              : }
     218              : 
     219              : /*
     220              :  * Visibility map information for every block in a relation, plus the page
     221              :  * level information for each block.
     222              :  */
     223              : Datum
     224            9 : pg_visibility_rel(PG_FUNCTION_ARGS)
     225              : {
     226              :     FuncCallContext *funcctx;
     227              :     vbits      *info;
     228              : 
     229            9 :     if (SRF_IS_FIRSTCALL())
     230              :     {
     231            6 :         Oid         relid = PG_GETARG_OID(0);
     232              :         MemoryContext oldcontext;
     233              : 
     234            6 :         funcctx = SRF_FIRSTCALL_INIT();
     235            6 :         oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
     236            6 :         funcctx->tuple_desc = pg_visibility_tupdesc(true, true);
     237              :         /* collect_visibility_data will verify the relkind */
     238            6 :         funcctx->user_fctx = collect_visibility_data(relid, true);
     239            6 :         MemoryContextSwitchTo(oldcontext);
     240              :     }
     241              : 
     242            9 :     funcctx = SRF_PERCALL_SETUP();
     243            9 :     info = (vbits *) funcctx->user_fctx;
     244              : 
     245            9 :     if (info->next < info->count)
     246              :     {
     247              :         Datum       values[4];
     248            3 :         bool        nulls[4] = {0};
     249              :         HeapTuple   tuple;
     250              : 
     251            3 :         values[0] = Int64GetDatum(info->next);
     252            3 :         values[1] = BoolGetDatum((info->bits[info->next] & (1 << 0)) != 0);
     253            3 :         values[2] = BoolGetDatum((info->bits[info->next] & (1 << 1)) != 0);
     254            3 :         values[3] = BoolGetDatum((info->bits[info->next] & (1 << 2)) != 0);
     255            3 :         info->next++;
     256              : 
     257            3 :         tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls);
     258            3 :         SRF_RETURN_NEXT(funcctx, HeapTupleGetDatum(tuple));
     259              :     }
     260              : 
     261            6 :     SRF_RETURN_DONE(funcctx);
     262              : }
     263              : 
     264              : /*
     265              :  * Count the number of all-visible and all-frozen pages in the visibility
     266              :  * map for a particular relation.
     267              :  */
     268              : Datum
     269            9 : pg_visibility_map_summary(PG_FUNCTION_ARGS)
     270              : {
     271            9 :     Oid         relid = PG_GETARG_OID(0);
     272              :     Relation    rel;
     273            9 :     BlockNumber all_visible = 0;
     274            9 :     BlockNumber all_frozen = 0;
     275              :     TupleDesc   tupdesc;
     276              :     Datum       values[2];
     277            9 :     bool        nulls[2] = {0};
     278              : 
     279            9 :     rel = relation_open(relid, AccessShareLock);
     280              : 
     281              :     /* Only some relkinds have a visibility map */
     282            9 :     check_relation_relkind(rel);
     283              : 
     284            4 :     visibilitymap_count(rel, &all_visible, &all_frozen);
     285              : 
     286            4 :     relation_close(rel, AccessShareLock);
     287              : 
     288            4 :     if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
     289            0 :         elog(ERROR, "return type must be a row type");
     290              : 
     291            4 :     values[0] = Int64GetDatum((int64) all_visible);
     292            4 :     values[1] = Int64GetDatum((int64) all_frozen);
     293              : 
     294            4 :     PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls)));
     295              : }
     296              : 
     297              : /*
     298              :  * Return the TIDs of non-frozen tuples present in pages marked all-frozen
     299              :  * in the visibility map.  We hope no one will ever find any, but there could
     300              :  * be bugs, database corruption, etc.
     301              :  */
     302              : Datum
     303           15 : pg_check_frozen(PG_FUNCTION_ARGS)
     304              : {
     305              :     FuncCallContext *funcctx;
     306              :     corrupt_items *items;
     307              : 
     308           15 :     if (SRF_IS_FIRSTCALL())
     309              :     {
     310           10 :         Oid         relid = PG_GETARG_OID(0);
     311              :         MemoryContext oldcontext;
     312              : 
     313           10 :         funcctx = SRF_FIRSTCALL_INIT();
     314           10 :         oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
     315              :         /* collect_corrupt_items will verify the relkind */
     316           10 :         funcctx->user_fctx = collect_corrupt_items(relid, false, true);
     317            5 :         MemoryContextSwitchTo(oldcontext);
     318              :     }
     319              : 
     320           10 :     funcctx = SRF_PERCALL_SETUP();
     321           10 :     items = (corrupt_items *) funcctx->user_fctx;
     322              : 
     323           10 :     if (items->next < items->count)
     324            5 :         SRF_RETURN_NEXT(funcctx, PointerGetDatum(&items->tids[items->next++]));
     325              : 
     326            5 :     SRF_RETURN_DONE(funcctx);
     327              : }
     328              : 
     329              : /*
     330              :  * Return the TIDs of not-all-visible tuples in pages marked all-visible
     331              :  * in the visibility map.  We hope no one will ever find any, but there could
     332              :  * be bugs, database corruption, etc.
     333              :  */
     334              : Datum
     335            8 : pg_check_visible(PG_FUNCTION_ARGS)
     336              : {
     337              :     FuncCallContext *funcctx;
     338              :     corrupt_items *items;
     339              : 
     340            8 :     if (SRF_IS_FIRSTCALL())
     341              :     {
     342            3 :         Oid         relid = PG_GETARG_OID(0);
     343              :         MemoryContext oldcontext;
     344              : 
     345            3 :         funcctx = SRF_FIRSTCALL_INIT();
     346            3 :         oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
     347              :         /* collect_corrupt_items will verify the relkind */
     348            3 :         funcctx->user_fctx = collect_corrupt_items(relid, true, false);
     349            3 :         MemoryContextSwitchTo(oldcontext);
     350              :     }
     351              : 
     352            8 :     funcctx = SRF_PERCALL_SETUP();
     353            8 :     items = (corrupt_items *) funcctx->user_fctx;
     354              : 
     355            8 :     if (items->next < items->count)
     356            5 :         SRF_RETURN_NEXT(funcctx, PointerGetDatum(&items->tids[items->next++]));
     357              : 
     358            3 :     SRF_RETURN_DONE(funcctx);
     359              : }
     360              : 
     361              : /*
     362              :  * Remove the visibility map fork for a relation.  If there turn out to be
     363              :  * any bugs in the visibility map code that require rebuilding the VM, this
     364              :  * provides users with a way to do it that is cleaner than shutting down the
     365              :  * server and removing files by hand.
     366              :  *
     367              :  * This is a cut-down version of RelationTruncate.
     368              :  */
     369              : Datum
     370            7 : pg_truncate_visibility_map(PG_FUNCTION_ARGS)
     371              : {
     372            7 :     Oid         relid = PG_GETARG_OID(0);
     373              :     Relation    rel;
     374              :     ForkNumber  fork;
     375              :     BlockNumber block;
     376              :     BlockNumber old_block;
     377              : 
     378            7 :     rel = relation_open(relid, AccessExclusiveLock);
     379              : 
     380              :     /* Only some relkinds have a visibility map */
     381            7 :     check_relation_relkind(rel);
     382              : 
     383              :     /* Forcibly reset cached file size */
     384            2 :     RelationGetSmgr(rel)->smgr_cached_nblocks[VISIBILITYMAP_FORKNUM] = InvalidBlockNumber;
     385              : 
     386              :     /* Compute new and old size before entering critical section. */
     387            2 :     fork = VISIBILITYMAP_FORKNUM;
     388            2 :     block = visibilitymap_prepare_truncate(rel, 0);
     389            2 :     old_block = BlockNumberIsValid(block) ? smgrnblocks(RelationGetSmgr(rel), fork) : 0;
     390              : 
     391              :     /*
     392              :      * WAL-logging, buffer dropping, file truncation must be atomic and all on
     393              :      * one side of a checkpoint.  See RelationTruncate() for discussion.
     394              :      */
     395              :     Assert((MyProc->delayChkptFlags & (DELAY_CHKPT_START | DELAY_CHKPT_COMPLETE)) == 0);
     396            2 :     MyProc->delayChkptFlags |= DELAY_CHKPT_START | DELAY_CHKPT_COMPLETE;
     397            2 :     START_CRIT_SECTION();
     398              : 
     399            2 :     if (RelationNeedsWAL(rel))
     400              :     {
     401              :         XLogRecPtr  lsn;
     402              :         xl_smgr_truncate xlrec;
     403              : 
     404            1 :         xlrec.blkno = 0;
     405            1 :         xlrec.rlocator = rel->rd_locator;
     406            1 :         xlrec.flags = SMGR_TRUNCATE_VM;
     407              : 
     408            1 :         XLogBeginInsert();
     409            1 :         XLogRegisterData(&xlrec, sizeof(xlrec));
     410              : 
     411            1 :         lsn = XLogInsert(RM_SMGR_ID,
     412              :                          XLOG_SMGR_TRUNCATE | XLR_SPECIAL_REL_UPDATE);
     413            1 :         XLogFlush(lsn);
     414              :     }
     415              : 
     416            2 :     if (BlockNumberIsValid(block))
     417            2 :         smgrtruncate(RelationGetSmgr(rel), &fork, 1, &old_block, &block);
     418              : 
     419            2 :     END_CRIT_SECTION();
     420            2 :     MyProc->delayChkptFlags &= ~(DELAY_CHKPT_START | DELAY_CHKPT_COMPLETE);
     421              : 
     422              :     /*
     423              :      * Release the lock right away, not at commit time.
     424              :      *
     425              :      * It would be a problem to release the lock prior to commit if this
     426              :      * truncate operation sends any transactional invalidation messages. Other
     427              :      * backends would potentially be able to lock the relation without
     428              :      * processing them in the window of time between when we release the lock
     429              :      * here and when we send the messages at our eventual commit.  However,
     430              :      * we're currently only sending a non-transactional smgr invalidation,
     431              :      * which will have been posted to shared memory immediately from within
     432              :      * smgrtruncate.  Therefore, there should be no race here.
     433              :      *
     434              :      * The reason why it's desirable to release the lock early here is because
     435              :      * of the possibility that someone will need to use this to blow away many
     436              :      * visibility map forks at once.  If we can't release the lock until
     437              :      * commit time, the transaction doing this will accumulate
     438              :      * AccessExclusiveLocks on all of those relations at the same time, which
     439              :      * is undesirable. However, if this turns out to be unsafe we may have no
     440              :      * choice...
     441              :      */
     442            2 :     relation_close(rel, AccessExclusiveLock);
     443              : 
     444              :     /* Nothing to return. */
     445            2 :     PG_RETURN_VOID();
     446              : }
     447              : 
     448              : /*
     449              :  * Helper function to construct whichever TupleDesc we need for a particular
     450              :  * call.
     451              :  */
     452              : static TupleDesc
     453           18 : pg_visibility_tupdesc(bool include_blkno, bool include_pd)
     454              : {
     455              :     TupleDesc   tupdesc;
     456           18 :     AttrNumber  maxattr = 2;
     457           18 :     AttrNumber  a = 0;
     458              : 
     459           18 :     if (include_blkno)
     460           17 :         ++maxattr;
     461           18 :     if (include_pd)
     462            7 :         ++maxattr;
     463           18 :     tupdesc = CreateTemplateTupleDesc(maxattr);
     464           18 :     if (include_blkno)
     465           17 :         TupleDescInitEntry(tupdesc, ++a, "blkno", INT8OID, -1, 0);
     466           18 :     TupleDescInitEntry(tupdesc, ++a, "all_visible", BOOLOID, -1, 0);
     467           18 :     TupleDescInitEntry(tupdesc, ++a, "all_frozen", BOOLOID, -1, 0);
     468           18 :     if (include_pd)
     469            7 :         TupleDescInitEntry(tupdesc, ++a, "pd_all_visible", BOOLOID, -1, 0);
     470              :     Assert(a == maxattr);
     471              : 
     472           18 :     return BlessTupleDesc(tupdesc);
     473              : }
     474              : 
     475              : /*
     476              :  * Collect visibility data about a relation.
     477              :  *
     478              :  * Checks relkind of relid and will throw an error if the relation does not
     479              :  * have a VM.
     480              :  */
     481              : static vbits *
     482           17 : collect_visibility_data(Oid relid, bool include_pd)
     483              : {
     484              :     Relation    rel;
     485              :     BlockNumber nblocks;
     486              :     vbits      *info;
     487              :     BlockNumber blkno;
     488           17 :     Buffer      vmbuffer = InvalidBuffer;
     489           17 :     BufferAccessStrategy bstrategy = GetAccessStrategy(BAS_BULKREAD);
     490              :     BlockRangeReadStreamPrivate p;
     491           17 :     ReadStream *stream = NULL;
     492              : 
     493           17 :     rel = relation_open(relid, AccessShareLock);
     494              : 
     495              :     /* Only some relkinds have a visibility map */
     496           15 :     check_relation_relkind(rel);
     497              : 
     498           10 :     nblocks = RelationGetNumberOfBlocks(rel);
     499           10 :     info = palloc0(offsetof(vbits, bits) + nblocks);
     500           10 :     info->next = 0;
     501           10 :     info->count = nblocks;
     502              : 
     503              :     /* Create a stream if reading main fork. */
     504           10 :     if (include_pd)
     505              :     {
     506            6 :         p.current_blocknum = 0;
     507            6 :         p.last_exclusive = nblocks;
     508              : 
     509              :         /*
     510              :          * It is safe to use batchmode as block_range_read_stream_cb takes no
     511              :          * locks.
     512              :          */
     513            6 :         stream = read_stream_begin_relation(READ_STREAM_FULL |
     514              :                                             READ_STREAM_USE_BATCHING,
     515              :                                             bstrategy,
     516              :                                             rel,
     517              :                                             MAIN_FORKNUM,
     518              :                                             block_range_read_stream_cb,
     519              :                                             &p,
     520              :                                             0);
     521              :     }
     522              : 
     523           22 :     for (blkno = 0; blkno < nblocks; ++blkno)
     524              :     {
     525              :         int32       mapbits;
     526              : 
     527              :         /* Make sure we are interruptible. */
     528           12 :         CHECK_FOR_INTERRUPTS();
     529              : 
     530              :         /* Get map info. */
     531           12 :         mapbits = (int32) visibilitymap_get_status(rel, blkno, &vmbuffer);
     532           12 :         if ((mapbits & VISIBILITYMAP_ALL_VISIBLE) != 0)
     533            8 :             info->bits[blkno] |= (1 << 0);
     534           12 :         if ((mapbits & VISIBILITYMAP_ALL_FROZEN) != 0)
     535            5 :             info->bits[blkno] |= (1 << 1);
     536              : 
     537              :         /*
     538              :          * Page-level data requires reading every block, so only get it if the
     539              :          * caller needs it.  Use a buffer access strategy, too, to prevent
     540              :          * cache-trashing.
     541              :          */
     542           12 :         if (include_pd)
     543              :         {
     544              :             Buffer      buffer;
     545              :             Page        page;
     546              : 
     547            3 :             buffer = read_stream_next_buffer(stream, NULL);
     548            3 :             LockBuffer(buffer, BUFFER_LOCK_SHARE);
     549              : 
     550            3 :             page = BufferGetPage(buffer);
     551            3 :             if (PageIsAllVisible(page))
     552            2 :                 info->bits[blkno] |= (1 << 2);
     553              : 
     554            3 :             UnlockReleaseBuffer(buffer);
     555              :         }
     556              :     }
     557              : 
     558           10 :     if (include_pd)
     559              :     {
     560              :         Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer);
     561            6 :         read_stream_end(stream);
     562              :     }
     563              : 
     564              :     /* Clean up. */
     565           10 :     if (vmbuffer != InvalidBuffer)
     566            7 :         ReleaseBuffer(vmbuffer);
     567           10 :     relation_close(rel, AccessShareLock);
     568              : 
     569           10 :     return info;
     570              : }
     571              : 
     572              : /*
     573              :  * The "strict" version of GetOldestNonRemovableTransactionId().  The
     574              :  * pg_visibility check can tolerate missed errors (not reporting some of the
     575              :  * errors), but can't tolerate false alarms (reporting false errors). Normally,
     576              :  * horizons move forwards, but there are cases when it could move backward
     577              :  * (see comment for ComputeXidHorizons()).
     578              :  *
     579              :  * This is why we have to implement our own function for xid horizon, which
     580              :  * would be guaranteed to be newer or equal to any xid horizon computed before.
     581              :  * We have to do the following to achieve this.
     582              :  *
     583              :  * 1. Ignore processes xmin's, because they consider connection to other
     584              :  *    databases that were ignored before.
     585              :  * 2. Ignore KnownAssignedXids, as they are not database-aware. Although we
     586              :  *    now perform minimal checking on a standby by always using nextXid, this
     587              :  *    approach is better than nothing and will at least catch extremely broken
     588              :  *    cases where a xid is in the future.
     589              :  * 3. Ignore walsender xmin, because it could go backward if some replication
     590              :  *    connections don't use replication slots.
     591              :  *
     592              :  * While it might seem like we could use KnownAssignedXids for shared
     593              :  * catalogs, since shared catalogs rely on a global horizon rather than a
     594              :  * database-specific one - there are potential edge cases.  For example, a
     595              :  * transaction may crash on the primary without writing a commit/abort record.
     596              :  * This would lead to a situation where it appears to still be running on the
     597              :  * standby, even though it has already ended on the primary.  For this reason,
     598              :  * it's safer to ignore KnownAssignedXids, even for shared catalogs.
     599              :  *
     600              :  * As a result, we're using only currently running xids to compute the horizon.
     601              :  * Surely these would significantly sacrifice accuracy.  But we have to do so
     602              :  * to avoid reporting false errors.
     603              :  */
     604              : static TransactionId
     605            8 : GetStrictOldestNonRemovableTransactionId(Relation rel)
     606              : {
     607              :     RunningTransactions runningTransactions;
     608              : 
     609            8 :     if (RecoveryInProgress())
     610              :     {
     611              :         TransactionId result;
     612              : 
     613              :         /* As we ignore KnownAssignedXids on standby, just pick nextXid */
     614            1 :         LWLockAcquire(XidGenLock, LW_SHARED);
     615            1 :         result = XidFromFullTransactionId(TransamVariables->nextXid);
     616            1 :         LWLockRelease(XidGenLock);
     617            1 :         return result;
     618              :     }
     619            7 :     else if (rel == NULL || rel->rd_rel->relisshared)
     620              :     {
     621              :         /* Shared relation: take into account all running xids */
     622            0 :         runningTransactions = GetRunningTransactionData();
     623            0 :         LWLockRelease(ProcArrayLock);
     624            0 :         LWLockRelease(XidGenLock);
     625            0 :         return runningTransactions->oldestRunningXid;
     626              :     }
     627            7 :     else if (!RELATION_IS_LOCAL(rel))
     628              :     {
     629              :         /*
     630              :          * Normal relation: take into account xids running within the current
     631              :          * database
     632              :          */
     633            7 :         runningTransactions = GetRunningTransactionData();
     634            7 :         LWLockRelease(ProcArrayLock);
     635            7 :         LWLockRelease(XidGenLock);
     636            7 :         return runningTransactions->oldestDatabaseRunningXid;
     637              :     }
     638              :     else
     639              :     {
     640              :         /*
     641              :          * For temporary relations, ComputeXidHorizons() uses only
     642              :          * TransamVariables->latestCompletedXid and MyProc->xid.  These two
     643              :          * shouldn't go backwards.  So we're fine with this horizon.
     644              :          */
     645            0 :         return GetOldestNonRemovableTransactionId(rel);
     646              :     }
     647              : }
     648              : 
     649              : /*
     650              :  * Read stream callback for collect_corrupt_items(): returns the next block
     651              :  * whose visibility map bits match a requested check, or InvalidBlockNumber.
     652              :  */
     653              : static BlockNumber
     654          103 : collect_corrupt_items_read_stream_next_block(ReadStream *stream,
     655              :                                              void *callback_private_data,
     656              :                                              void *per_buffer_data)
     657              : {
     658          103 :     struct collect_corrupt_items_read_stream_private *p = callback_private_data;
     659              : 
     660          109 :     for (; p->current_blocknum < p->last_exclusive; p->current_blocknum++)
     661              :     {
     662          101 :         bool        check_frozen = false;
     663          101 :         bool        check_visible = false;
     664              : 
     665              :         /* Make sure we are interruptible. */
     666          101 :         CHECK_FOR_INTERRUPTS();
     667              : 
     668          101 :         if (p->all_frozen && VM_ALL_FROZEN(p->rel, p->current_blocknum, &p->vmbuffer))
     669           49 :             check_frozen = true;
     670          101 :         if (p->all_visible && VM_ALL_VISIBLE(p->rel, p->current_blocknum, &p->vmbuffer))
     671           46 :             check_visible = true;
     672          101 :         if (!check_visible && !check_frozen)
     673            6 :             continue;           /* no requested VM bit is set; skip this block */
     674              : 
     675           95 :         return p->current_blocknum++;   /* post-increment: next call resumes after this block */
     676              :     }
     677              : 
     678            8 :     return InvalidBlockNumber;  /* reached last_exclusive: no more blocks */
     679              : }
     680              : 
     681              : /*
     682              :  * Returns a list of items whose visibility map information does not match
     683              :  * the status of the tuples on the page.
     684              :  *
     685              :  * If all_visible is passed as true, this will include all items which are
     686              :  * on pages marked as all-visible in the visibility map but which do not
     687              :  * seem to in fact be all-visible.
     688              :  *
     689              :  * If all_frozen is passed as true, this will include all items which are
     690              :  * on pages marked as all-frozen but which do not seem to in fact be frozen.
     691              :  *
     692              :  * Checks relkind of relid and will throw an error if the relation does not
     693              :  * have a VM.
     694              :  */
     695              : static corrupt_items *
     696           13 : collect_corrupt_items(Oid relid, bool all_visible, bool all_frozen)
     697              : {
     698              :     Relation    rel;
     699              :     corrupt_items *items;
     700           13 :     Buffer      vmbuffer = InvalidBuffer;
     701           13 :     BufferAccessStrategy bstrategy = GetAccessStrategy(BAS_BULKREAD);
     702           13 :     TransactionId OldestXmin = InvalidTransactionId;
     703              :     struct collect_corrupt_items_read_stream_private p;
     704              :     ReadStream *stream;
     705              :     Buffer      buffer;
     706              : 
     707           13 :     rel = relation_open(relid, AccessShareLock);
     708              : 
     709              :     /* Only some relkinds have a visibility map */
     710           13 :     check_relation_relkind(rel);
     711              : 
     712            8 :     if (all_visible)
     713            3 :         OldestXmin = GetStrictOldestNonRemovableTransactionId(rel);
     714              : 
     715              :     /*
     716              :      * Guess an initial array size. We don't expect many corrupted tuples, so
     717              :      * start with a small array.  This function uses the "next" field to track
     718              :      * the next offset where we can store an item (which is the same thing as
     719              :      * the number of items found so far) and the "count" field to track the
     720              :      * number of entries allocated.  We'll repurpose these fields before
     721              :      * returning.
     722              :      */
     723            8 :     items = palloc0_object(corrupt_items);
     724            8 :     items->next = 0;
     725            8 :     items->count = 64;
     726            8 :     items->tids = palloc(items->count * sizeof(ItemPointerData));
     727              : 
     728            8 :     p.current_blocknum = 0;
     729            8 :     p.last_exclusive = RelationGetNumberOfBlocks(rel);
     730            8 :     p.rel = rel;
     731            8 :     p.vmbuffer = InvalidBuffer;
     732            8 :     p.all_frozen = all_frozen;
     733            8 :     p.all_visible = all_visible;
     734            8 :     stream = read_stream_begin_relation(READ_STREAM_FULL,
     735              :                                         bstrategy,
     736              :                                         rel,
     737              :                                         MAIN_FORKNUM,
     738              :                                         collect_corrupt_items_read_stream_next_block,
     739              :                                         &p,
     740              :                                         0);
     741              : 
     742              :     /* Loop over every block returned by the read stream. */
     743          103 :     while ((buffer = read_stream_next_buffer(stream, NULL)) != InvalidBuffer)
     744              :     {
     745           95 :         bool        check_frozen = all_frozen;
     746           95 :         bool        check_visible = all_visible;
     747              :         Page        page;
     748              :         OffsetNumber offnum,
     749              :                     maxoff;
     750              :         BlockNumber blkno;
     751              : 
     752              :         /* Make sure we are interruptible. */
     753           95 :         CHECK_FOR_INTERRUPTS();
     754              : 
     755           95 :         LockBuffer(buffer, BUFFER_LOCK_SHARE);
     756              : 
     757           95 :         page = BufferGetPage(buffer);
     758           95 :         maxoff = PageGetMaxOffsetNumber(page);
     759           95 :         blkno = BufferGetBlockNumber(buffer);
     760              : 
     761              :         /*
     762              :          * The visibility map bits might have changed while we were acquiring
     763              :          * the page lock.  Recheck to avoid returning spurious results.
     764              :          */
     765           95 :         if (check_frozen && !VM_ALL_FROZEN(rel, blkno, &vmbuffer))
     766            0 :             check_frozen = false;
     767           95 :         if (check_visible && !VM_ALL_VISIBLE(rel, blkno, &vmbuffer))
     768            0 :             check_visible = false;
     769           95 :         if (!check_visible && !check_frozen)
     770              :         {
     771            0 :             UnlockReleaseBuffer(buffer);
     772            0 :             continue;
     773              :         }
     774              : 
     775              :         /* Iterate over each tuple on the page. */
     776           95 :         for (offnum = FirstOffsetNumber;
     777        16126 :              offnum <= maxoff;
     778        16031 :              offnum = OffsetNumberNext(offnum))
     779              :         {
     780              :             HeapTupleData tuple;
     781              :             ItemId      itemid;
     782              : 
     783        16031 :             itemid = PageGetItemId(page, offnum);
     784              : 
     785              :             /* Unused or redirect line pointers are of no interest. */
     786        16031 :             if (!ItemIdIsUsed(itemid) || ItemIdIsRedirected(itemid))
     787            0 :                 continue;
     788              : 
     789              :             /* Dead line pointers are neither all-visible nor frozen. */
     790        16031 :             if (ItemIdIsDead(itemid))
     791              :             {
     792            0 :                 ItemPointerSet(&(tuple.t_self), blkno, offnum);
     793            0 :                 record_corrupt_item(items, &tuple.t_self);
     794            0 :                 continue;
     795              :             }
     796              : 
     797              :             /* Initialize a HeapTupleData structure for checks below. */
     798        16031 :             ItemPointerSet(&(tuple.t_self), blkno, offnum);
     799        16031 :             tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
     800        16031 :             tuple.t_len = ItemIdGetLength(itemid);
     801        16031 :             tuple.t_tableOid = relid;
     802              : 
     803              :             /*
     804              :              * If we're checking whether the page is all-visible, we expect
     805              :              * the tuple to be all-visible.
     806              :              */
     807        16031 :             if (check_visible &&
     808         8009 :                 !tuple_all_visible(&tuple, OldestXmin, buffer))
     809              :             {
     810              :                 TransactionId RecomputedOldestXmin;
     811              : 
     812              :                 /*
     813              :                  * Time has passed since we computed OldestXmin, so it's
     814              :                  * possible that this tuple is all-visible in reality even
     815              :                  * though it doesn't appear so based on our
     816              :                  * previously-computed value.  Let's compute a new value so we
     817              :                  * can be certain whether there is a problem.
     818              :                  *
     819              :                  * From a concurrency point of view, it sort of sucks to
     820              :                  * retake ProcArrayLock here while we're holding the buffer
     821              :                  * locked in shared mode, but it should be safe against
     822              :                  * deadlocks, because surely
     823              :                  * GetStrictOldestNonRemovableTransactionId() should never
     824              :                  * take a buffer lock. And this shouldn't happen often, so
     825              :                  * it's worth being careful so as to avoid false positives.
     826              :                  */
     827            5 :                 RecomputedOldestXmin = GetStrictOldestNonRemovableTransactionId(rel);
     828              : 
     829            5 :                 if (!TransactionIdPrecedes(OldestXmin, RecomputedOldestXmin)) /* horizon did not advance */
     830            5 :                     record_corrupt_item(items, &tuple.t_self);
     831              :                 else
     832              :                 {
     833            0 :                     OldestXmin = RecomputedOldestXmin;
     834            0 :                     if (!tuple_all_visible(&tuple, OldestXmin, buffer))
     835            0 :                         record_corrupt_item(items, &tuple.t_self);
     836              :                 }
     837              :             }
     838              : 
     839              :             /*
     840              :              * If we're checking whether the page is all-frozen, we expect the
     841              :              * tuple to be in a state where it will never need freezing.
     842              :              */
     843        16031 :             if (check_frozen)
     844              :             {
     845         8022 :                 if (heap_tuple_needs_eventual_freeze(tuple.t_data))
     846            5 :                     record_corrupt_item(items, &tuple.t_self);
     847              :             }
     848              :         }
     849              : 
     850           95 :         UnlockReleaseBuffer(buffer);
     851              :     }
     852            8 :     read_stream_end(stream);
     853              : 
     854              :     /* Clean up; p.vmbuffer is the separate VM buffer used by the stream callback. */
     855            8 :     if (vmbuffer != InvalidBuffer)
     856            7 :         ReleaseBuffer(vmbuffer);
     857            8 :     if (p.vmbuffer != InvalidBuffer)
     858            8 :         ReleaseBuffer(p.vmbuffer);
     859            8 :     relation_close(rel, AccessShareLock);
     860              : 
     861              :     /*
     862              :      * Before returning, repurpose the fields to match caller's expectations.
     863              :      * next is now the next item that should be read (rather than written) and
     864              :      * count is now the number of items we wrote (rather than the number we
     865              :      * allocated).
     866              :      */
     867            8 :     items->count = items->next;
     868            8 :     items->next = 0;
     869              : 
     870            8 :     return items;
     871              : }
     872              : 
     873              : /*
     874              :  * Remember one corrupt item, doubling the tids array first if it is full.
     875              :  */
     876              : static void
     877           10 : record_corrupt_item(corrupt_items *items, ItemPointer tid)
     878              : {
     879              :     /* enlarge output array if needed. */
     880           10 :     if (items->next >= items->count)
     881              :     {
     882            0 :         items->count *= 2;
     883            0 :         items->tids = repalloc(items->tids,
     884            0 :                                items->count * sizeof(ItemPointerData));
     885              :     }
     886              :     /* and add the new item */
     887           10 :     items->tids[items->next++] = *tid;
     888           10 : }
     889              : 
     890              : /*
     891              :  * Check whether a tuple is all-visible relative to OldestXmin (live, and xmin
     892              :  * precedes it).  The buffer should contain the tuple, locked and pinned.
     893              :  */
     894              : static bool
     895         8009 : tuple_all_visible(HeapTuple tup, TransactionId OldestXmin, Buffer buffer)
     896              : {
     897              :     HTSV_Result state;
     898              :     TransactionId xmin;
     899              : 
     900         8009 :     state = HeapTupleSatisfiesVacuum(tup, OldestXmin, buffer);
     901         8009 :     if (state != HEAPTUPLE_LIVE)
     902            5 :         return false;           /* all-visible implies live */
     903              : 
     904              :     /*
     905              :      * Neither lazy_scan_heap nor heap_page_is_all_visible will mark a page
     906              :      * all-visible unless every tuple is hinted committed. However, those hint
     907              :      * bits could be lost after a crash, so we can't be certain that they'll
     908              :      * be set here.  So just check the xmin.
     909              :      */
     910              : 
     911         8004 :     xmin = HeapTupleHeaderGetXmin(tup->t_data);
     912         8004 :     if (!TransactionIdPrecedes(xmin, OldestXmin))
     913            0 :         return false;           /* xmin not old enough for all to see */
     914              : 
     915         8004 :     return true;
     916              : }
     917              : 
     918              : /*
     919              :  * check_relation_relkind - raise an ERROR unless the relation's relkind
     920              :  * passes RELKIND_HAS_TABLE_AM, i.e. is a relkind supported by the callers
     921              :  */
     922              : static void
     923           50 : check_relation_relkind(Relation rel)
     924              : {
     925           50 :     if (!RELKIND_HAS_TABLE_AM(rel->rd_rel->relkind))
     926           25 :         ereport(ERROR,
     927              :                 (errcode(ERRCODE_WRONG_OBJECT_TYPE),
     928              :                  errmsg("relation \"%s\" is of wrong relation kind",
     929              :                         RelationGetRelationName(rel)),
     930              :                  errdetail_relkind_not_supported(rel->rd_rel->relkind)));
     931           25 : }
        

Generated by: LCOV version 2.0-1