LCOV - code coverage report
Current view: top level - contrib/pg_visibility - pg_visibility.c (source / functions) Coverage Total Hit
Test: PostgreSQL 19devel Lines: 87.5 % 320 280
Test Date: 2026-04-06 21:16:29 Functions: 96.0 % 25 24
Legend: Lines:     hit not hit

            Line data    Source code
       1              : /*-------------------------------------------------------------------------
       2              :  *
       3              :  * pg_visibility.c
       4              :  *    display visibility map information and page-level visibility bits
       5              :  *
       6              :  * Copyright (c) 2016-2026, PostgreSQL Global Development Group
       7              :  *
       8              :  *    contrib/pg_visibility/pg_visibility.c
       9              :  *-------------------------------------------------------------------------
      10              :  */
      11              : #include "postgres.h"
      12              : 
      13              : #include "access/heapam.h"
      14              : #include "access/htup_details.h"
      15              : #include "access/visibilitymap.h"
      16              : #include "access/xloginsert.h"
      17              : #include "catalog/pg_type.h"
      18              : #include "catalog/storage_xlog.h"
      19              : #include "funcapi.h"
      20              : #include "miscadmin.h"
      21              : #include "storage/bufmgr.h"
      22              : #include "storage/proc.h"
      23              : #include "storage/procarray.h"
      24              : #include "storage/read_stream.h"
      25              : #include "storage/smgr.h"
      26              : #include "utils/rel.h"
      27              : 
      28            7 : PG_MODULE_MAGIC_EXT(
      29              :                     .name = "pg_visibility",
      30              :                     .version = PG_VERSION
      31              : );
      32              : 
      33              : typedef struct vbits
      34              : {
      35              :     BlockNumber next;
      36              :     BlockNumber count;
      37              :     uint8       bits[FLEXIBLE_ARRAY_MEMBER];
      38              : } vbits;
      39              : 
      40              : typedef struct corrupt_items
      41              : {
      42              :     BlockNumber next;
      43              :     BlockNumber count;
      44              :     ItemPointer tids;
      45              : } corrupt_items;
      46              : 
      47              : /* for collect_corrupt_items_read_stream_next_block */
      48              : struct collect_corrupt_items_read_stream_private
      49              : {
      50              :     bool        all_frozen;
      51              :     bool        all_visible;
      52              :     BlockNumber current_blocknum;
      53              :     BlockNumber last_exclusive;
      54              :     Relation    rel;
      55              :     Buffer      vmbuffer;
      56              : };
      57              : 
      58            3 : PG_FUNCTION_INFO_V1(pg_visibility_map);
      59            4 : PG_FUNCTION_INFO_V1(pg_visibility_map_rel);
      60            4 : PG_FUNCTION_INFO_V1(pg_visibility);
      61            4 : PG_FUNCTION_INFO_V1(pg_visibility_rel);
      62            4 : PG_FUNCTION_INFO_V1(pg_visibility_map_summary);
      63            5 : PG_FUNCTION_INFO_V1(pg_check_frozen);
      64            6 : PG_FUNCTION_INFO_V1(pg_check_visible);
      65            4 : PG_FUNCTION_INFO_V1(pg_truncate_visibility_map);
      66              : 
      67              : static TupleDesc pg_visibility_tupdesc(bool include_blkno, bool include_pd);
      68              : static vbits *collect_visibility_data(Oid relid, bool include_pd);
      69              : static corrupt_items *collect_corrupt_items(Oid relid, bool all_visible,
      70              :                                             bool all_frozen);
      71              : static void record_corrupt_item(corrupt_items *items, ItemPointer tid);
      72              : static bool tuple_all_visible(HeapTuple tup, TransactionId OldestXmin,
      73              :                               Buffer buffer);
      74              : static void check_relation_relkind(Relation rel);
      75              : 
      76              : /*
      77              :  * Visibility map information for a single block of a relation.
      78              :  *
      79              :  * Note: the VM code will silently return zeroes for pages past the end
      80              :  * of the map, so we allow probes up to MaxBlockNumber regardless of the
      81              :  * actual relation size.
      82              :  */
      83              : Datum
      84            0 : pg_visibility_map(PG_FUNCTION_ARGS)
      85              : {
      86            0 :     Oid         relid = PG_GETARG_OID(0);
      87            0 :     int64       blkno = PG_GETARG_INT64(1);
      88              :     int32       mapbits;
      89              :     Relation    rel;
      90            0 :     Buffer      vmbuffer = InvalidBuffer;
      91              :     TupleDesc   tupdesc;
      92              :     Datum       values[2];
      93            0 :     bool        nulls[2] = {0};
      94              : 
      95            0 :     rel = relation_open(relid, AccessShareLock);
      96              : 
      97              :     /* Only some relkinds have a visibility map */
      98            0 :     check_relation_relkind(rel);
      99              : 
     100            0 :     if (blkno < 0 || blkno > MaxBlockNumber)
     101            0 :         ereport(ERROR,
     102              :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     103              :                  errmsg("invalid block number")));
     104              : 
     105            0 :     tupdesc = pg_visibility_tupdesc(false, false);
     106              : 
     107            0 :     mapbits = (int32) visibilitymap_get_status(rel, blkno, &vmbuffer);
     108            0 :     if (vmbuffer != InvalidBuffer)
     109            0 :         ReleaseBuffer(vmbuffer);
     110            0 :     values[0] = BoolGetDatum((mapbits & VISIBILITYMAP_ALL_VISIBLE) != 0);
     111            0 :     values[1] = BoolGetDatum((mapbits & VISIBILITYMAP_ALL_FROZEN) != 0);
     112              : 
     113            0 :     relation_close(rel, AccessShareLock);
     114              : 
     115            0 :     PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls)));
     116              : }
     117              : 
     118              : /*
     119              :  * Visibility map information for a single block of a relation, plus the
     120              :  * page-level information for the same block.
     121              :  */
     122              : Datum
     123            6 : pg_visibility(PG_FUNCTION_ARGS)
     124              : {
     125            6 :     Oid         relid = PG_GETARG_OID(0);
     126            6 :     int64       blkno = PG_GETARG_INT64(1);
     127              :     int32       mapbits;
     128              :     Relation    rel;
     129            6 :     Buffer      vmbuffer = InvalidBuffer;
     130              :     Buffer      buffer;
     131              :     Page        page;
     132              :     TupleDesc   tupdesc;
     133              :     Datum       values[3];
     134            6 :     bool        nulls[3] = {0};
     135              : 
     136            6 :     rel = relation_open(relid, AccessShareLock);
     137              : 
     138              :     /* Only some relkinds have a visibility map */
     139            6 :     check_relation_relkind(rel);
     140              : 
     141            1 :     if (blkno < 0 || blkno > MaxBlockNumber)
     142            0 :         ereport(ERROR,
     143              :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     144              :                  errmsg("invalid block number")));
     145              : 
     146            1 :     tupdesc = pg_visibility_tupdesc(false, true);
     147              : 
     148            1 :     mapbits = (int32) visibilitymap_get_status(rel, blkno, &vmbuffer);
     149            1 :     if (vmbuffer != InvalidBuffer)
     150            1 :         ReleaseBuffer(vmbuffer);
     151            1 :     values[0] = BoolGetDatum((mapbits & VISIBILITYMAP_ALL_VISIBLE) != 0);
     152            1 :     values[1] = BoolGetDatum((mapbits & VISIBILITYMAP_ALL_FROZEN) != 0);
     153              : 
     154              :     /* Here we have to explicitly check rel size ... */
     155            1 :     if (blkno < RelationGetNumberOfBlocks(rel))
     156              :     {
     157            1 :         buffer = ReadBuffer(rel, blkno);
     158            1 :         LockBuffer(buffer, BUFFER_LOCK_SHARE);
     159              : 
     160            1 :         page = BufferGetPage(buffer);
     161            1 :         values[2] = BoolGetDatum(PageIsAllVisible(page));
     162              : 
     163            1 :         UnlockReleaseBuffer(buffer);
     164              :     }
     165              :     else
     166              :     {
     167              :         /* As with the vismap, silently return 0 for pages past EOF */
     168            0 :         values[2] = BoolGetDatum(false);
     169              :     }
     170              : 
     171            1 :     relation_close(rel, AccessShareLock);
     172              : 
     173            1 :     PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls)));
     174              : }
     175              : 
     176              : /*
     177              :  * Visibility map information for every block in a relation.
     178              :  */
     179              : Datum
     180           20 : pg_visibility_map_rel(PG_FUNCTION_ARGS)
     181              : {
     182              :     FuncCallContext *funcctx;
     183              :     vbits      *info;
     184              : 
     185           20 :     if (SRF_IS_FIRSTCALL())
     186              :     {
     187           11 :         Oid         relid = PG_GETARG_OID(0);
     188              :         MemoryContext oldcontext;
     189              : 
     190           11 :         funcctx = SRF_FIRSTCALL_INIT();
     191           11 :         oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
     192           11 :         funcctx->tuple_desc = pg_visibility_tupdesc(true, false);
     193              :         /* collect_visibility_data will verify the relkind */
     194           11 :         funcctx->user_fctx = collect_visibility_data(relid, false);
     195            4 :         MemoryContextSwitchTo(oldcontext);
     196              :     }
     197              : 
     198           13 :     funcctx = SRF_PERCALL_SETUP();
     199           13 :     info = (vbits *) funcctx->user_fctx;
     200              : 
     201           13 :     if (info->next < info->count)
     202              :     {
     203              :         Datum       values[3];
     204            9 :         bool        nulls[3] = {0};
     205              :         HeapTuple   tuple;
     206              : 
     207            9 :         values[0] = Int64GetDatum(info->next);
     208            9 :         values[1] = BoolGetDatum((info->bits[info->next] & (1 << 0)) != 0);
     209            9 :         values[2] = BoolGetDatum((info->bits[info->next] & (1 << 1)) != 0);
     210            9 :         info->next++;
     211              : 
     212            9 :         tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls);
     213            9 :         SRF_RETURN_NEXT(funcctx, HeapTupleGetDatum(tuple));
     214              :     }
     215              : 
     216            4 :     SRF_RETURN_DONE(funcctx);
     217              : }
     218              : 
     219              : /*
     220              :  * Visibility map information for every block in a relation, plus the page
     221              :  * level information for each block.
     222              :  */
     223              : Datum
     224            9 : pg_visibility_rel(PG_FUNCTION_ARGS)
     225              : {
     226              :     FuncCallContext *funcctx;
     227              :     vbits      *info;
     228              : 
     229            9 :     if (SRF_IS_FIRSTCALL())
     230              :     {
     231            6 :         Oid         relid = PG_GETARG_OID(0);
     232              :         MemoryContext oldcontext;
     233              : 
     234            6 :         funcctx = SRF_FIRSTCALL_INIT();
     235            6 :         oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
     236            6 :         funcctx->tuple_desc = pg_visibility_tupdesc(true, true);
     237              :         /* collect_visibility_data will verify the relkind */
     238            6 :         funcctx->user_fctx = collect_visibility_data(relid, true);
     239            6 :         MemoryContextSwitchTo(oldcontext);
     240              :     }
     241              : 
     242            9 :     funcctx = SRF_PERCALL_SETUP();
     243            9 :     info = (vbits *) funcctx->user_fctx;
     244              : 
     245            9 :     if (info->next < info->count)
     246              :     {
     247              :         Datum       values[4];
     248            3 :         bool        nulls[4] = {0};
     249              :         HeapTuple   tuple;
     250              : 
     251            3 :         values[0] = Int64GetDatum(info->next);
     252            3 :         values[1] = BoolGetDatum((info->bits[info->next] & (1 << 0)) != 0);
     253            3 :         values[2] = BoolGetDatum((info->bits[info->next] & (1 << 1)) != 0);
     254            3 :         values[3] = BoolGetDatum((info->bits[info->next] & (1 << 2)) != 0);
     255            3 :         info->next++;
     256              : 
     257            3 :         tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls);
     258            3 :         SRF_RETURN_NEXT(funcctx, HeapTupleGetDatum(tuple));
     259              :     }
     260              : 
     261            6 :     SRF_RETURN_DONE(funcctx);
     262              : }
     263              : 
     264              : /*
     265              :  * Count the number of all-visible and all-frozen pages in the visibility
     266              :  * map for a particular relation.
     267              :  */
     268              : Datum
     269            9 : pg_visibility_map_summary(PG_FUNCTION_ARGS)
     270              : {
     271            9 :     Oid         relid = PG_GETARG_OID(0);
     272              :     Relation    rel;
     273            9 :     BlockNumber all_visible = 0;
     274            9 :     BlockNumber all_frozen = 0;
     275              :     TupleDesc   tupdesc;
     276              :     Datum       values[2];
     277            9 :     bool        nulls[2] = {0};
     278              : 
     279            9 :     rel = relation_open(relid, AccessShareLock);
     280              : 
     281              :     /* Only some relkinds have a visibility map */
     282            9 :     check_relation_relkind(rel);
     283              : 
     284            4 :     visibilitymap_count(rel, &all_visible, &all_frozen);
     285              : 
     286            4 :     relation_close(rel, AccessShareLock);
     287              : 
     288            4 :     if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
     289            0 :         elog(ERROR, "return type must be a row type");
     290              : 
     291            4 :     values[0] = Int64GetDatum((int64) all_visible);
     292            4 :     values[1] = Int64GetDatum((int64) all_frozen);
     293              : 
     294            4 :     PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls)));
     295              : }
     296              : 
     297              : /*
     298              :  * Return the TIDs of non-frozen tuples present in pages marked all-frozen
     299              :  * in the visibility map.  We hope no one will ever find any, but there could
     300              :  * be bugs, database corruption, etc.
     301              :  */
     302              : Datum
     303           15 : pg_check_frozen(PG_FUNCTION_ARGS)
     304              : {
     305              :     FuncCallContext *funcctx;
     306              :     corrupt_items *items;
     307              : 
     308           15 :     if (SRF_IS_FIRSTCALL())
     309              :     {
     310           10 :         Oid         relid = PG_GETARG_OID(0);
     311              :         MemoryContext oldcontext;
     312              : 
     313           10 :         funcctx = SRF_FIRSTCALL_INIT();
     314           10 :         oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
     315              :         /* collect_corrupt_items will verify the relkind */
     316           10 :         funcctx->user_fctx = collect_corrupt_items(relid, false, true);
     317            5 :         MemoryContextSwitchTo(oldcontext);
     318              :     }
     319              : 
     320           10 :     funcctx = SRF_PERCALL_SETUP();
     321           10 :     items = (corrupt_items *) funcctx->user_fctx;
     322              : 
     323           10 :     if (items->next < items->count)
     324            5 :         SRF_RETURN_NEXT(funcctx, PointerGetDatum(&items->tids[items->next++]));
     325              : 
     326            5 :     SRF_RETURN_DONE(funcctx);
     327              : }
     328              : 
     329              : /*
     330              :  * Return the TIDs of not-all-visible tuples in pages marked all-visible
     331              :  * in the visibility map.  We hope no one will ever find any, but there could
     332              :  * be bugs, database corruption, etc.
     333              :  */
     334              : Datum
     335            8 : pg_check_visible(PG_FUNCTION_ARGS)
     336              : {
     337              :     FuncCallContext *funcctx;
     338              :     corrupt_items *items;
     339              : 
     340            8 :     if (SRF_IS_FIRSTCALL())
     341              :     {
     342            3 :         Oid         relid = PG_GETARG_OID(0);
     343              :         MemoryContext oldcontext;
     344              : 
     345            3 :         funcctx = SRF_FIRSTCALL_INIT();
     346            3 :         oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
     347              :         /* collect_corrupt_items will verify the relkind */
     348            3 :         funcctx->user_fctx = collect_corrupt_items(relid, true, false);
     349            3 :         MemoryContextSwitchTo(oldcontext);
     350              :     }
     351              : 
     352            8 :     funcctx = SRF_PERCALL_SETUP();
     353            8 :     items = (corrupt_items *) funcctx->user_fctx;
     354              : 
     355            8 :     if (items->next < items->count)
     356            5 :         SRF_RETURN_NEXT(funcctx, PointerGetDatum(&items->tids[items->next++]));
     357              : 
     358            3 :     SRF_RETURN_DONE(funcctx);
     359              : }
     360              : 
     361              : /*
     362              :  * Remove the visibility map fork for a relation.  If there turn out to be
     363              :  * any bugs in the visibility map code that require rebuilding the VM, this
     364              :  * provides users with a way to do it that is cleaner than shutting down the
     365              :  * server and removing files by hand.
     366              :  *
     367              :  * This is a cut-down version of RelationTruncate.
     368              :  */
     369              : Datum
     370            7 : pg_truncate_visibility_map(PG_FUNCTION_ARGS)
     371              : {
     372            7 :     Oid         relid = PG_GETARG_OID(0);
     373              :     Relation    rel;
     374              :     ForkNumber  fork;
     375              :     BlockNumber block;
     376              :     BlockNumber old_block;
     377              : 
     378            7 :     rel = relation_open(relid, AccessExclusiveLock);
     379              : 
     380              :     /* Only some relkinds have a visibility map */
     381            7 :     check_relation_relkind(rel);
     382              : 
     383              :     /* Forcibly reset cached file size */
     384            2 :     RelationGetSmgr(rel)->smgr_cached_nblocks[VISIBILITYMAP_FORKNUM] = InvalidBlockNumber;
     385              : 
     386              :     /* Compute new and old size before entering critical section. */
     387            2 :     fork = VISIBILITYMAP_FORKNUM;
     388            2 :     block = visibilitymap_prepare_truncate(rel, 0);
     389            2 :     old_block = BlockNumberIsValid(block) ? smgrnblocks(RelationGetSmgr(rel), fork) : 0;
     390              : 
     391              :     /*
     392              :      * WAL-logging, buffer dropping, file truncation must be atomic and all on
     393              :      * one side of a checkpoint.  See RelationTruncate() for discussion.
     394              :      */
     395              :     Assert((MyProc->delayChkptFlags & (DELAY_CHKPT_START | DELAY_CHKPT_COMPLETE)) == 0);
     396            2 :     MyProc->delayChkptFlags |= DELAY_CHKPT_START | DELAY_CHKPT_COMPLETE;
     397            2 :     START_CRIT_SECTION();
     398              : 
     399            2 :     if (RelationNeedsWAL(rel))
     400              :     {
     401              :         XLogRecPtr  lsn;
     402              :         xl_smgr_truncate xlrec;
     403              : 
     404            1 :         xlrec.blkno = 0;
     405            1 :         xlrec.rlocator = rel->rd_locator;
     406            1 :         xlrec.flags = SMGR_TRUNCATE_VM;
     407              : 
     408            1 :         XLogBeginInsert();
     409            1 :         XLogRegisterData(&xlrec, sizeof(xlrec));
     410              : 
     411            1 :         lsn = XLogInsert(RM_SMGR_ID,
     412              :                          XLOG_SMGR_TRUNCATE | XLR_SPECIAL_REL_UPDATE);
     413            1 :         XLogFlush(lsn);
     414              :     }
     415              : 
     416            2 :     if (BlockNumberIsValid(block))
     417            2 :         smgrtruncate(RelationGetSmgr(rel), &fork, 1, &old_block, &block);
     418              : 
     419            2 :     END_CRIT_SECTION();
     420            2 :     MyProc->delayChkptFlags &= ~(DELAY_CHKPT_START | DELAY_CHKPT_COMPLETE);
     421              : 
     422              :     /*
     423              :      * Release the lock right away, not at commit time.
     424              :      *
     425              :      * It would be a problem to release the lock prior to commit if this
     426              :      * truncate operation sends any transactional invalidation messages. Other
     427              :      * backends would potentially be able to lock the relation without
     428              :      * processing them in the window of time between when we release the lock
     429              :      * here and when we sent the messages at our eventual commit.  However,
     430              :      * we're currently only sending a non-transactional smgr invalidation,
     431              :      * which will have been posted to shared memory immediately from within
     432              :      * smgr_truncate.  Therefore, there should be no race here.
     433              :      *
     434              :      * The reason why it's desirable to release the lock early here is because
     435              :      * of the possibility that someone will need to use this to blow away many
     436              :      * visibility map forks at once.  If we can't release the lock until
     437              :      * commit time, the transaction doing this will accumulate
     438              :      * AccessExclusiveLocks on all of those relations at the same time, which
     439              :      * is undesirable. However, if this turns out to be unsafe we may have no
     440              :      * choice...
     441              :      */
     442            2 :     relation_close(rel, AccessExclusiveLock);
     443              : 
     444              :     /* Nothing to return. */
     445            2 :     PG_RETURN_VOID();
     446              : }
     447              : 
     448              : /*
     449              :  * Helper function to construct whichever TupleDesc we need for a particular
     450              :  * call.
     451              :  */
     452              : static TupleDesc
     453           18 : pg_visibility_tupdesc(bool include_blkno, bool include_pd)
     454              : {
     455              :     TupleDesc   tupdesc;
     456           18 :     AttrNumber  maxattr = 2;
     457           18 :     AttrNumber  a = 0;
     458              : 
     459           18 :     if (include_blkno)
     460           17 :         ++maxattr;
     461           18 :     if (include_pd)
     462            7 :         ++maxattr;
     463           18 :     tupdesc = CreateTemplateTupleDesc(maxattr);
     464           18 :     if (include_blkno)
     465           17 :         TupleDescInitEntry(tupdesc, ++a, "blkno", INT8OID, -1, 0);
     466           18 :     TupleDescInitEntry(tupdesc, ++a, "all_visible", BOOLOID, -1, 0);
     467           18 :     TupleDescInitEntry(tupdesc, ++a, "all_frozen", BOOLOID, -1, 0);
     468           18 :     if (include_pd)
     469            7 :         TupleDescInitEntry(tupdesc, ++a, "pd_all_visible", BOOLOID, -1, 0);
     470              :     Assert(a == maxattr);
     471              : 
     472           18 :     TupleDescFinalize(tupdesc);
     473              : 
     474           18 :     return BlessTupleDesc(tupdesc);
     475              : }
     476              : 
     477              : /*
     478              :  * Collect visibility data about a relation.
     479              :  *
     480              :  * Checks relkind of relid and will throw an error if the relation does not
     481              :  * have a VM.
     482              :  */
     483              : static vbits *
     484           17 : collect_visibility_data(Oid relid, bool include_pd)
     485              : {
     486              :     Relation    rel;
     487              :     BlockNumber nblocks;
     488              :     vbits      *info;
     489              :     BlockNumber blkno;
     490           17 :     Buffer      vmbuffer = InvalidBuffer;
     491           17 :     BufferAccessStrategy bstrategy = GetAccessStrategy(BAS_BULKREAD);
     492              :     BlockRangeReadStreamPrivate p;
     493           17 :     ReadStream *stream = NULL;
     494              : 
     495           17 :     rel = relation_open(relid, AccessShareLock);
     496              : 
     497              :     /* Only some relkinds have a visibility map */
     498           15 :     check_relation_relkind(rel);
     499              : 
     500           10 :     nblocks = RelationGetNumberOfBlocks(rel);
     501           10 :     info = palloc0(offsetof(vbits, bits) + nblocks);
     502           10 :     info->next = 0;
     503           10 :     info->count = nblocks;
     504              : 
     505              :     /* Create a stream if reading main fork. */
     506           10 :     if (include_pd)
     507              :     {
     508            6 :         p.current_blocknum = 0;
     509            6 :         p.last_exclusive = nblocks;
     510              : 
     511              :         /*
     512              :          * It is safe to use batchmode as block_range_read_stream_cb takes no
     513              :          * locks.
     514              :          */
     515            6 :         stream = read_stream_begin_relation(READ_STREAM_FULL |
     516              :                                             READ_STREAM_USE_BATCHING,
     517              :                                             bstrategy,
     518              :                                             rel,
     519              :                                             MAIN_FORKNUM,
     520              :                                             block_range_read_stream_cb,
     521              :                                             &p,
     522              :                                             0);
     523              :     }
     524              : 
     525           22 :     for (blkno = 0; blkno < nblocks; ++blkno)
     526              :     {
     527              :         int32       mapbits;
     528              : 
     529              :         /* Make sure we are interruptible. */
     530           12 :         CHECK_FOR_INTERRUPTS();
     531              : 
     532              :         /* Get map info. */
     533           12 :         mapbits = (int32) visibilitymap_get_status(rel, blkno, &vmbuffer);
     534           12 :         if ((mapbits & VISIBILITYMAP_ALL_VISIBLE) != 0)
     535            8 :             info->bits[blkno] |= (1 << 0);
     536           12 :         if ((mapbits & VISIBILITYMAP_ALL_FROZEN) != 0)
     537            5 :             info->bits[blkno] |= (1 << 1);
     538              : 
     539              :         /*
     540              :          * Page-level data requires reading every block, so only get it if the
     541              :          * caller needs it.  Use a buffer access strategy, too, to prevent
     542              :          * cache-trashing.
     543              :          */
     544           12 :         if (include_pd)
     545              :         {
     546              :             Buffer      buffer;
     547              :             Page        page;
     548              : 
     549            3 :             buffer = read_stream_next_buffer(stream, NULL);
     550            3 :             LockBuffer(buffer, BUFFER_LOCK_SHARE);
     551              : 
     552            3 :             page = BufferGetPage(buffer);
     553            3 :             if (PageIsAllVisible(page))
     554            2 :                 info->bits[blkno] |= (1 << 2);
     555              : 
     556            3 :             UnlockReleaseBuffer(buffer);
     557              :         }
     558              :     }
     559              : 
     560           10 :     if (include_pd)
     561              :     {
     562              :         Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer);
     563            6 :         read_stream_end(stream);
     564              :     }
     565              : 
     566              :     /* Clean up. */
     567           10 :     if (vmbuffer != InvalidBuffer)
     568            7 :         ReleaseBuffer(vmbuffer);
     569           10 :     relation_close(rel, AccessShareLock);
     570              : 
     571           10 :     return info;
     572              : }
     573              : 
     574              : /*
     575              :  * The "strict" version of GetOldestNonRemovableTransactionId().  The
     576              :  * pg_visibility check can tolerate false positives (don't report some of the
     577              :  * errors), but can't tolerate false negatives (report false errors). Normally,
     578              :  * horizons move forwards, but there are cases when it could move backward
     579              :  * (see comment for ComputeXidHorizons()).
     580              :  *
     581              :  * This is why we have to implement our own function for xid horizon, which
     582              :  * would be guaranteed to be newer or equal to any xid horizon computed before.
     583              :  * We have to do the following to achieve this.
     584              :  *
     585              :  * 1. Ignore processes xmin's, because they consider connection to other
     586              :  *    databases that were ignored before.
     587              :  * 2. Ignore KnownAssignedXids, as they are not database-aware. Although we
     588              :  *    now perform minimal checking on a standby by always using nextXid, this
     589              :  *    approach is better than nothing and will at least catch extremely broken
     590              :  *    cases where a xid is in the future.
     591              :  * 3. Ignore walsender xmin, because it could go backward if some replication
     592              :  *    connections don't use replication slots.
     593              :  *
     594              :  * While it might seem like we could use KnownAssignedXids for shared
     595              :  * catalogs, since shared catalogs rely on a global horizon rather than a
     596              :  * database-specific one - there are potential edge cases.  For example, a
     597              :  * transaction may crash on the primary without writing a commit/abort record.
     598              :  * This would lead to a situation where it appears to still be running on the
     599              :  * standby, even though it has already ended on the primary.  For this reason,
     600              :  * it's safer to ignore KnownAssignedXids, even for shared catalogs.
     601              :  *
     602              :  * As a result, we're using only currently running xids to compute the horizon.
     603              :  * Surely these would significantly sacrifice accuracy.  But we have to do so
     604              :  * to avoid reporting false errors.
     605              :  */
     606              : static TransactionId
     607            8 : GetStrictOldestNonRemovableTransactionId(Relation rel)
     608              : {
     609              :     RunningTransactions runningTransactions;
     610              : 
     611            8 :     if (RecoveryInProgress())
     612              :     {
     613              :         TransactionId result;
     614              : 
     615              :         /* As we ignore KnownAssignedXids on standby, just pick nextXid */
     616            1 :         LWLockAcquire(XidGenLock, LW_SHARED);
     617            1 :         result = XidFromFullTransactionId(TransamVariables->nextXid);
     618            1 :         LWLockRelease(XidGenLock);
     619            1 :         return result;
     620              :     }
     621            7 :     else if (rel == NULL || rel->rd_rel->relisshared)
     622              :     {
     623              :         /* Shared relation: take into account all running xids */
     624            0 :         runningTransactions = GetRunningTransactionData();
     625            0 :         LWLockRelease(ProcArrayLock);
     626            0 :         LWLockRelease(XidGenLock);
     627            0 :         return runningTransactions->oldestRunningXid;
     628              :     }
     629            7 :     else if (!RELATION_IS_LOCAL(rel))
     630              :     {
     631              :         /*
     632              :          * Normal relation: take into account xids running within the current
     633              :          * database
     634              :          */
     635            7 :         runningTransactions = GetRunningTransactionData();
     636            7 :         LWLockRelease(ProcArrayLock);
     637            7 :         LWLockRelease(XidGenLock);
     638            7 :         return runningTransactions->oldestDatabaseRunningXid;
     639              :     }
     640              :     else
     641              :     {
     642              :         /*
     643              :          * For temporary relations, ComputeXidHorizons() uses only
     644              :          * TransamVariables->latestCompletedXid and MyProc->xid.  These two
     645              :          * shouldn't go backwards.  So we're fine with this horizon.
     646              :          */
     647            0 :         return GetOldestNonRemovableTransactionId(rel);
     648              :     }
     649              : }
     650              : 
     651              : /*
     652              :  * Callback function to get next block for read stream object used in
     653              :  * collect_corrupt_items() function.
     654              :  */
     655              : static BlockNumber
     656          103 : collect_corrupt_items_read_stream_next_block(ReadStream *stream,
     657              :                                              void *callback_private_data,
     658              :                                              void *per_buffer_data)
     659              : {
     660          103 :     struct collect_corrupt_items_read_stream_private *p = callback_private_data;
     661              : 
     662          109 :     for (; p->current_blocknum < p->last_exclusive; p->current_blocknum++)
     663              :     {
     664          101 :         bool        check_frozen = false;
     665          101 :         bool        check_visible = false;
     666              : 
     667              :         /* Make sure we are interruptible. */
     668          101 :         CHECK_FOR_INTERRUPTS();
     669              : 
     670          101 :         if (p->all_frozen && VM_ALL_FROZEN(p->rel, p->current_blocknum, &p->vmbuffer))
     671           49 :             check_frozen = true;
     672          101 :         if (p->all_visible && VM_ALL_VISIBLE(p->rel, p->current_blocknum, &p->vmbuffer))
     673           46 :             check_visible = true;
     674          101 :         if (!check_visible && !check_frozen)
     675            6 :             continue;
     676              : 
     677           95 :         return p->current_blocknum++;
     678              :     }
     679              : 
     680            8 :     return InvalidBlockNumber;
     681              : }
     682              : 
     683              : /*
     684              :  * Returns a list of items whose visibility map information does not match
     685              :  * the status of the tuples on the page.
     686              :  *
     687              :  * If all_visible is passed as true, this will include all items which are
     688              :  * on pages marked as all-visible in the visibility map but which do not
     689              :  * seem to in fact be all-visible.
     690              :  *
     691              :  * If all_frozen is passed as true, this will include all items which are
     692              :  * on pages marked as all-frozen but which do not seem to in fact be frozen.
     693              :  *
     694              :  * Checks relkind of relid and will throw an error if the relation does not
     695              :  * have a VM.
     696              :  */
     697              : static corrupt_items *
     698           13 : collect_corrupt_items(Oid relid, bool all_visible, bool all_frozen)
     699              : {
     700              :     Relation    rel;
     701              :     corrupt_items *items;
     702           13 :     Buffer      vmbuffer = InvalidBuffer;
     703           13 :     BufferAccessStrategy bstrategy = GetAccessStrategy(BAS_BULKREAD);
     704           13 :     TransactionId OldestXmin = InvalidTransactionId;
     705              :     struct collect_corrupt_items_read_stream_private p;
     706              :     ReadStream *stream;
     707              :     Buffer      buffer;
     708              : 
     709           13 :     rel = relation_open(relid, AccessShareLock);
     710              : 
     711              :     /* Only some relkinds have a visibility map */
     712           13 :     check_relation_relkind(rel);
     713              : 
     714            8 :     if (all_visible)
     715            3 :         OldestXmin = GetStrictOldestNonRemovableTransactionId(rel);
     716              : 
     717              :     /*
     718              :      * Guess an initial array size. We don't expect many corrupted tuples, so
     719              :      * start with a small array.  This function uses the "next" field to track
     720              :      * the next offset where we can store an item (which is the same thing as
     721              :      * the number of items found so far) and the "count" field to track the
     722              :      * number of entries allocated.  We'll repurpose these fields before
     723              :      * returning.
     724              :      */
     725            8 :     items = palloc0_object(corrupt_items);
     726            8 :     items->next = 0;
     727            8 :     items->count = 64;
     728            8 :     items->tids = palloc(items->count * sizeof(ItemPointerData));
     729              : 
     730            8 :     p.current_blocknum = 0;
     731            8 :     p.last_exclusive = RelationGetNumberOfBlocks(rel);
     732            8 :     p.rel = rel;
     733            8 :     p.vmbuffer = InvalidBuffer;
     734            8 :     p.all_frozen = all_frozen;
     735            8 :     p.all_visible = all_visible;
     736            8 :     stream = read_stream_begin_relation(READ_STREAM_FULL,
     737              :                                         bstrategy,
     738              :                                         rel,
     739              :                                         MAIN_FORKNUM,
     740              :                                         collect_corrupt_items_read_stream_next_block,
     741              :                                         &p,
     742              :                                         0);
     743              : 
     744              :     /* Loop over every block in the relation. */
     745          103 :     while ((buffer = read_stream_next_buffer(stream, NULL)) != InvalidBuffer)
     746              :     {
     747           95 :         bool        check_frozen = all_frozen;
     748           95 :         bool        check_visible = all_visible;
     749              :         Page        page;
     750              :         OffsetNumber offnum,
     751              :                     maxoff;
     752              :         BlockNumber blkno;
     753              : 
     754              :         /* Make sure we are interruptible. */
     755           95 :         CHECK_FOR_INTERRUPTS();
     756              : 
     757           95 :         LockBuffer(buffer, BUFFER_LOCK_SHARE);
     758              : 
     759           95 :         page = BufferGetPage(buffer);
     760           95 :         maxoff = PageGetMaxOffsetNumber(page);
     761           95 :         blkno = BufferGetBlockNumber(buffer);
     762              : 
     763              :         /*
     764              :          * The visibility map bits might have changed while we were acquiring
     765              :          * the page lock.  Recheck to avoid returning spurious results.
     766              :          */
     767           95 :         if (check_frozen && !VM_ALL_FROZEN(rel, blkno, &vmbuffer))
     768            0 :             check_frozen = false;
     769           95 :         if (check_visible && !VM_ALL_VISIBLE(rel, blkno, &vmbuffer))
     770            0 :             check_visible = false;
     771           95 :         if (!check_visible && !check_frozen)
     772              :         {
     773            0 :             UnlockReleaseBuffer(buffer);
     774            0 :             continue;
     775              :         }
     776              : 
     777              :         /* Iterate over each tuple on the page. */
     778           95 :         for (offnum = FirstOffsetNumber;
     779        16126 :              offnum <= maxoff;
     780        16031 :              offnum = OffsetNumberNext(offnum))
     781              :         {
     782              :             HeapTupleData tuple;
     783              :             ItemId      itemid;
     784              : 
     785        16031 :             itemid = PageGetItemId(page, offnum);
     786              : 
     787              :             /* Unused or redirect line pointers are of no interest. */
     788        16031 :             if (!ItemIdIsUsed(itemid) || ItemIdIsRedirected(itemid))
     789            0 :                 continue;
     790              : 
     791              :             /* Dead line pointers are neither all-visible nor frozen. */
     792        16031 :             if (ItemIdIsDead(itemid))
     793              :             {
     794            0 :                 ItemPointerSet(&(tuple.t_self), blkno, offnum);
     795            0 :                 record_corrupt_item(items, &tuple.t_self);
     796            0 :                 continue;
     797              :             }
     798              : 
     799              :             /* Initialize a HeapTupleData structure for checks below. */
     800        16031 :             ItemPointerSet(&(tuple.t_self), blkno, offnum);
     801        16031 :             tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
     802        16031 :             tuple.t_len = ItemIdGetLength(itemid);
     803        16031 :             tuple.t_tableOid = relid;
     804              : 
     805              :             /*
     806              :              * If we're checking whether the page is all-visible, we expect
     807              :              * the tuple to be all-visible.
     808              :              */
     809        16031 :             if (check_visible &&
     810         8009 :                 !tuple_all_visible(&tuple, OldestXmin, buffer))
     811              :             {
     812              :                 TransactionId RecomputedOldestXmin;
     813              : 
     814              :                 /*
     815              :                  * Time has passed since we computed OldestXmin, so it's
     816              :                  * possible that this tuple is all-visible in reality even
     817              :                  * though it doesn't appear so based on our
     818              :                  * previously-computed value.  Let's compute a new value so we
     819              :                  * can be certain whether there is a problem.
     820              :                  *
     821              :                  * From a concurrency point of view, it sort of sucks to
     822              :                  * retake ProcArrayLock here while we're holding the buffer
     823              :                  * locked in shared mode, but it should be safe against
     824              :                  * deadlocks, because surely
     825              :                  * GetStrictOldestNonRemovableTransactionId() should never
     826              :                  * take a buffer lock. And this shouldn't happen often, so
     827              :                  * it's worth being careful so as to avoid false positives.
     828              :                  */
     829            5 :                 RecomputedOldestXmin = GetStrictOldestNonRemovableTransactionId(rel);
     830              : 
     831            5 :                 if (!TransactionIdPrecedes(OldestXmin, RecomputedOldestXmin))
     832            5 :                     record_corrupt_item(items, &tuple.t_self);
     833              :                 else
     834              :                 {
     835            0 :                     OldestXmin = RecomputedOldestXmin;
     836            0 :                     if (!tuple_all_visible(&tuple, OldestXmin, buffer))
     837            0 :                         record_corrupt_item(items, &tuple.t_self);
     838              :                 }
     839              :             }
     840              : 
     841              :             /*
     842              :              * If we're checking whether the page is all-frozen, we expect the
     843              :              * tuple to be in a state where it will never need freezing.
     844              :              */
     845        16031 :             if (check_frozen)
     846              :             {
     847         8022 :                 if (heap_tuple_needs_eventual_freeze(tuple.t_data))
     848            5 :                     record_corrupt_item(items, &tuple.t_self);
     849              :             }
     850              :         }
     851              : 
     852           95 :         UnlockReleaseBuffer(buffer);
     853              :     }
     854            8 :     read_stream_end(stream);
     855              : 
     856              :     /* Clean up. */
     857            8 :     if (vmbuffer != InvalidBuffer)
     858            7 :         ReleaseBuffer(vmbuffer);
     859            8 :     if (p.vmbuffer != InvalidBuffer)
     860            8 :         ReleaseBuffer(p.vmbuffer);
     861            8 :     relation_close(rel, AccessShareLock);
     862              : 
     863              :     /*
     864              :      * Before returning, repurpose the fields to match caller's expectations.
     865              :      * next is now the next item that should be read (rather than written) and
     866              :      * count is now the number of items we wrote (rather than the number we
     867              :      * allocated).
     868              :      */
     869            8 :     items->count = items->next;
     870            8 :     items->next = 0;
     871              : 
     872            8 :     return items;
     873              : }
     874              : 
     875              : /*
     876              :  * Remember one corrupt item.
     877              :  */
     878              : static void
     879           10 : record_corrupt_item(corrupt_items *items, ItemPointer tid)
     880              : {
     881              :     /* enlarge output array if needed. */
     882           10 :     if (items->next >= items->count)
     883              :     {
     884            0 :         items->count *= 2;
     885            0 :         items->tids = repalloc(items->tids,
     886            0 :                                items->count * sizeof(ItemPointerData));
     887              :     }
     888              :     /* and add the new item */
     889           10 :     items->tids[items->next++] = *tid;
     890           10 : }
     891              : 
     892              : /*
     893              :  * Check whether a tuple is all-visible relative to a given OldestXmin value.
     894              :  * The buffer should contain the tuple and should be locked and pinned.
     895              :  */
     896              : static bool
     897         8009 : tuple_all_visible(HeapTuple tup, TransactionId OldestXmin, Buffer buffer)
     898              : {
     899              :     HTSV_Result state;
     900              :     TransactionId xmin;
     901              : 
     902         8009 :     state = HeapTupleSatisfiesVacuum(tup, OldestXmin, buffer);
     903         8009 :     if (state != HEAPTUPLE_LIVE)
     904            5 :         return false;           /* all-visible implies live */
     905              : 
     906              :     /*
     907              :      * Neither lazy_scan_heap nor heap_page_is_all_visible will mark a page
     908              :      * all-visible unless every tuple is hinted committed. However, those hint
     909              :      * bits could be lost after a crash, so we can't be certain that they'll
     910              :      * be set here.  So just check the xmin.
     911              :      */
     912              : 
     913         8004 :     xmin = HeapTupleHeaderGetXmin(tup->t_data);
     914         8004 :     if (!TransactionIdPrecedes(xmin, OldestXmin))
     915            0 :         return false;           /* xmin not old enough for all to see */
     916              : 
     917         8004 :     return true;
     918              : }
     919              : 
     920              : /*
     921              :  * check_relation_relkind - convenience routine to check that relation
     922              :  * is of the relkind supported by the callers
     923              :  */
     924              : static void
     925           50 : check_relation_relkind(Relation rel)
     926              : {
     927           50 :     if (!RELKIND_HAS_TABLE_AM(rel->rd_rel->relkind))
     928           25 :         ereport(ERROR,
     929              :                 (errcode(ERRCODE_WRONG_OBJECT_TYPE),
     930              :                  errmsg("relation \"%s\" is of wrong relation kind",
     931              :                         RelationGetRelationName(rel)),
     932              :                  errdetail_relkind_not_supported(rel->rd_rel->relkind)));
     933           25 : }
        

Generated by: LCOV version 2.0-1