LCOV - code coverage report
Current view: top level - contrib/pg_visibility - pg_visibility.c (source / functions) Hit Total Coverage
Test: PostgreSQL 18devel Lines: 287 328 87.5 %
Date: 2025-01-18 04:15:08 Functions: 24 25 96.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * pg_visibility.c
       4             :  *    display visibility map information and page-level visibility bits
       5             :  *
       6             :  * Copyright (c) 2016-2025, PostgreSQL Global Development Group
       7             :  *
       8             :  *    contrib/pg_visibility/pg_visibility.c
       9             :  *-------------------------------------------------------------------------
      10             :  */
      11             : #include "postgres.h"
      12             : 
      13             : #include "access/heapam.h"
      14             : #include "access/htup_details.h"
      15             : #include "access/visibilitymap.h"
      16             : #include "access/xloginsert.h"
      17             : #include "catalog/pg_type.h"
      18             : #include "catalog/storage_xlog.h"
      19             : #include "funcapi.h"
      20             : #include "miscadmin.h"
      21             : #include "storage/bufmgr.h"
      22             : #include "storage/proc.h"
      23             : #include "storage/procarray.h"
      24             : #include "storage/read_stream.h"
      25             : #include "storage/smgr.h"
      26             : #include "utils/rel.h"
      27             : 
      28          14 : PG_MODULE_MAGIC;
      29             : 
      30             : typedef struct vbits
      31             : {
      32             :     BlockNumber next;
      33             :     BlockNumber count;
      34             :     uint8       bits[FLEXIBLE_ARRAY_MEMBER];
      35             : } vbits;
      36             : 
      37             : typedef struct corrupt_items
      38             : {
      39             :     BlockNumber next;
      40             :     BlockNumber count;
      41             :     ItemPointer tids;
      42             : } corrupt_items;
      43             : 
      44             : /* for collect_corrupt_items_read_stream_next_block */
      45             : struct collect_corrupt_items_read_stream_private
      46             : {
      47             :     bool        all_frozen;
      48             :     bool        all_visible;
      49             :     BlockNumber current_blocknum;
      50             :     BlockNumber last_exclusive;
      51             :     Relation    rel;
      52             :     Buffer      vmbuffer;
      53             : };
      54             : 
      55           6 : PG_FUNCTION_INFO_V1(pg_visibility_map);
      56           8 : PG_FUNCTION_INFO_V1(pg_visibility_map_rel);
      57           8 : PG_FUNCTION_INFO_V1(pg_visibility);
      58           8 : PG_FUNCTION_INFO_V1(pg_visibility_rel);
      59           8 : PG_FUNCTION_INFO_V1(pg_visibility_map_summary);
      60          10 : PG_FUNCTION_INFO_V1(pg_check_frozen);
      61          12 : PG_FUNCTION_INFO_V1(pg_check_visible);
      62           8 : PG_FUNCTION_INFO_V1(pg_truncate_visibility_map);
      63             : 
      64             : static TupleDesc pg_visibility_tupdesc(bool include_blkno, bool include_pd);
      65             : static vbits *collect_visibility_data(Oid relid, bool include_pd);
      66             : static corrupt_items *collect_corrupt_items(Oid relid, bool all_visible,
      67             :                                             bool all_frozen);
      68             : static void record_corrupt_item(corrupt_items *items, ItemPointer tid);
      69             : static bool tuple_all_visible(HeapTuple tup, TransactionId OldestXmin,
      70             :                               Buffer buffer);
      71             : static void check_relation_relkind(Relation rel);
      72             : 
      73             : /*
      74             :  * Visibility map information for a single block of a relation.
      75             :  *
      76             :  * Note: the VM code will silently return zeroes for pages past the end
      77             :  * of the map, so we allow probes up to MaxBlockNumber regardless of the
      78             :  * actual relation size.
      79             :  */
      80             : Datum
      81           0 : pg_visibility_map(PG_FUNCTION_ARGS)
      82             : {
      83           0 :     Oid         relid = PG_GETARG_OID(0);
      84           0 :     int64       blkno = PG_GETARG_INT64(1);
      85             :     int32       mapbits;
      86             :     Relation    rel;
      87           0 :     Buffer      vmbuffer = InvalidBuffer;
      88             :     TupleDesc   tupdesc;
      89             :     Datum       values[2];
      90           0 :     bool        nulls[2] = {0};
      91             : 
      92           0 :     rel = relation_open(relid, AccessShareLock);
      93             : 
      94             :     /* Only some relkinds have a visibility map */
      95           0 :     check_relation_relkind(rel);
      96             : 
      97           0 :     if (blkno < 0 || blkno > MaxBlockNumber)
      98           0 :         ereport(ERROR,
      99             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     100             :                  errmsg("invalid block number")));
     101             : 
     102           0 :     tupdesc = pg_visibility_tupdesc(false, false);
     103             : 
     104           0 :     mapbits = (int32) visibilitymap_get_status(rel, blkno, &vmbuffer);
     105           0 :     if (vmbuffer != InvalidBuffer)
     106           0 :         ReleaseBuffer(vmbuffer);
     107           0 :     values[0] = BoolGetDatum((mapbits & VISIBILITYMAP_ALL_VISIBLE) != 0);
     108           0 :     values[1] = BoolGetDatum((mapbits & VISIBILITYMAP_ALL_FROZEN) != 0);
     109             : 
     110           0 :     relation_close(rel, AccessShareLock);
     111             : 
     112           0 :     PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls)));
     113             : }
     114             : 
     115             : /*
     116             :  * Visibility map information for a single block of a relation, plus the
     117             :  * page-level information for the same block.
     118             :  */
     119             : Datum
     120          12 : pg_visibility(PG_FUNCTION_ARGS)
     121             : {
     122          12 :     Oid         relid = PG_GETARG_OID(0);
     123          12 :     int64       blkno = PG_GETARG_INT64(1);
     124             :     int32       mapbits;
     125             :     Relation    rel;
     126          12 :     Buffer      vmbuffer = InvalidBuffer;
     127             :     Buffer      buffer;
     128             :     Page        page;
     129             :     TupleDesc   tupdesc;
     130             :     Datum       values[3];
     131          12 :     bool        nulls[3] = {0};
     132             : 
     133          12 :     rel = relation_open(relid, AccessShareLock);
     134             : 
     135             :     /* Only some relkinds have a visibility map */
     136          12 :     check_relation_relkind(rel);
     137             : 
     138           2 :     if (blkno < 0 || blkno > MaxBlockNumber)
     139           0 :         ereport(ERROR,
     140             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     141             :                  errmsg("invalid block number")));
     142             : 
     143           2 :     tupdesc = pg_visibility_tupdesc(false, true);
     144             : 
     145           2 :     mapbits = (int32) visibilitymap_get_status(rel, blkno, &vmbuffer);
     146           2 :     if (vmbuffer != InvalidBuffer)
     147           2 :         ReleaseBuffer(vmbuffer);
     148           2 :     values[0] = BoolGetDatum((mapbits & VISIBILITYMAP_ALL_VISIBLE) != 0);
     149           2 :     values[1] = BoolGetDatum((mapbits & VISIBILITYMAP_ALL_FROZEN) != 0);
     150             : 
     151             :     /* Here we have to explicitly check rel size ... */
     152           2 :     if (blkno < RelationGetNumberOfBlocks(rel))
     153             :     {
     154           2 :         buffer = ReadBuffer(rel, blkno);
     155           2 :         LockBuffer(buffer, BUFFER_LOCK_SHARE);
     156             : 
     157           2 :         page = BufferGetPage(buffer);
     158           2 :         values[2] = BoolGetDatum(PageIsAllVisible(page));
     159             : 
     160           2 :         UnlockReleaseBuffer(buffer);
     161             :     }
     162             :     else
     163             :     {
     164             :         /* As with the vismap, silently return 0 for pages past EOF */
     165           0 :         values[2] = BoolGetDatum(false);
     166             :     }
     167             : 
     168           2 :     relation_close(rel, AccessShareLock);
     169             : 
     170           2 :     PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls)));
     171             : }
     172             : 
     173             : /*
     174             :  * Visibility map information for every block in a relation.
     175             :  */
     176             : Datum
     177          40 : pg_visibility_map_rel(PG_FUNCTION_ARGS)
     178             : {
     179             :     FuncCallContext *funcctx;
     180             :     vbits      *info;
     181             : 
     182          40 :     if (SRF_IS_FIRSTCALL())
     183             :     {
     184          22 :         Oid         relid = PG_GETARG_OID(0);
     185             :         MemoryContext oldcontext;
     186             : 
     187          22 :         funcctx = SRF_FIRSTCALL_INIT();
     188          22 :         oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
     189          22 :         funcctx->tuple_desc = pg_visibility_tupdesc(true, false);
     190             :         /* collect_visibility_data will verify the relkind */
     191          22 :         funcctx->user_fctx = collect_visibility_data(relid, false);
     192           8 :         MemoryContextSwitchTo(oldcontext);
     193             :     }
     194             : 
     195          26 :     funcctx = SRF_PERCALL_SETUP();
     196          26 :     info = (vbits *) funcctx->user_fctx;
     197             : 
     198          26 :     if (info->next < info->count)
     199             :     {
     200             :         Datum       values[3];
     201          18 :         bool        nulls[3] = {0};
     202             :         HeapTuple   tuple;
     203             : 
     204          18 :         values[0] = Int64GetDatum(info->next);
     205          18 :         values[1] = BoolGetDatum((info->bits[info->next] & (1 << 0)) != 0);
     206          18 :         values[2] = BoolGetDatum((info->bits[info->next] & (1 << 1)) != 0);
     207          18 :         info->next++;
     208             : 
     209          18 :         tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls);
     210          18 :         SRF_RETURN_NEXT(funcctx, HeapTupleGetDatum(tuple));
     211             :     }
     212             : 
     213           8 :     SRF_RETURN_DONE(funcctx);
     214             : }
     215             : 
     216             : /*
     217             :  * Visibility map information for every block in a relation, plus the page
     218             :  * level information for each block.
     219             :  */
     220             : Datum
     221          18 : pg_visibility_rel(PG_FUNCTION_ARGS)
     222             : {
     223             :     FuncCallContext *funcctx;
     224             :     vbits      *info;
     225             : 
     226          18 :     if (SRF_IS_FIRSTCALL())
     227             :     {
     228          12 :         Oid         relid = PG_GETARG_OID(0);
     229             :         MemoryContext oldcontext;
     230             : 
     231          12 :         funcctx = SRF_FIRSTCALL_INIT();
     232          12 :         oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
     233          12 :         funcctx->tuple_desc = pg_visibility_tupdesc(true, true);
     234             :         /* collect_visibility_data will verify the relkind */
     235          12 :         funcctx->user_fctx = collect_visibility_data(relid, true);
     236          12 :         MemoryContextSwitchTo(oldcontext);
     237             :     }
     238             : 
     239          18 :     funcctx = SRF_PERCALL_SETUP();
     240          18 :     info = (vbits *) funcctx->user_fctx;
     241             : 
     242          18 :     if (info->next < info->count)
     243             :     {
     244             :         Datum       values[4];
     245           6 :         bool        nulls[4] = {0};
     246             :         HeapTuple   tuple;
     247             : 
     248           6 :         values[0] = Int64GetDatum(info->next);
     249           6 :         values[1] = BoolGetDatum((info->bits[info->next] & (1 << 0)) != 0);
     250           6 :         values[2] = BoolGetDatum((info->bits[info->next] & (1 << 1)) != 0);
     251           6 :         values[3] = BoolGetDatum((info->bits[info->next] & (1 << 2)) != 0);
     252           6 :         info->next++;
     253             : 
     254           6 :         tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls);
     255           6 :         SRF_RETURN_NEXT(funcctx, HeapTupleGetDatum(tuple));
     256             :     }
     257             : 
     258          12 :     SRF_RETURN_DONE(funcctx);
     259             : }
     260             : 
     261             : /*
     262             :  * Count the number of all-visible and all-frozen pages in the visibility
     263             :  * map for a particular relation.
     264             :  */
     265             : Datum
     266          12 : pg_visibility_map_summary(PG_FUNCTION_ARGS)
     267             : {
     268          12 :     Oid         relid = PG_GETARG_OID(0);
     269             :     Relation    rel;
     270             :     BlockNumber nblocks;
     271             :     BlockNumber blkno;
     272          12 :     Buffer      vmbuffer = InvalidBuffer;
     273          12 :     int64       all_visible = 0;
     274          12 :     int64       all_frozen = 0;
     275             :     TupleDesc   tupdesc;
     276             :     Datum       values[2];
     277          12 :     bool        nulls[2] = {0};
     278             : 
     279          12 :     rel = relation_open(relid, AccessShareLock);
     280             : 
     281             :     /* Only some relkinds have a visibility map */
     282          12 :     check_relation_relkind(rel);
     283             : 
     284           2 :     nblocks = RelationGetNumberOfBlocks(rel);
     285             : 
     286           4 :     for (blkno = 0; blkno < nblocks; ++blkno)
     287             :     {
     288             :         int32       mapbits;
     289             : 
     290             :         /* Make sure we are interruptible. */
     291           2 :         CHECK_FOR_INTERRUPTS();
     292             : 
     293             :         /* Get map info. */
     294           2 :         mapbits = (int32) visibilitymap_get_status(rel, blkno, &vmbuffer);
     295           2 :         if ((mapbits & VISIBILITYMAP_ALL_VISIBLE) != 0)
     296           2 :             ++all_visible;
     297           2 :         if ((mapbits & VISIBILITYMAP_ALL_FROZEN) != 0)
     298           0 :             ++all_frozen;
     299             :     }
     300             : 
     301             :     /* Clean up. */
     302           2 :     if (vmbuffer != InvalidBuffer)
     303           2 :         ReleaseBuffer(vmbuffer);
     304           2 :     relation_close(rel, AccessShareLock);
     305             : 
     306           2 :     if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
     307           0 :         elog(ERROR, "return type must be a row type");
     308             : 
     309           2 :     values[0] = Int64GetDatum(all_visible);
     310           2 :     values[1] = Int64GetDatum(all_frozen);
     311             : 
     312           2 :     PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls)));
     313             : }
     314             : 
     315             : /*
     316             :  * Return the TIDs of non-frozen tuples present in pages marked all-frozen
     317             :  * in the visibility map.  We hope no one will ever find any, but there could
     318             :  * be bugs, database corruption, etc.
     319             :  */
     320             : Datum
     321          30 : pg_check_frozen(PG_FUNCTION_ARGS)
     322             : {
     323             :     FuncCallContext *funcctx;
     324             :     corrupt_items *items;
     325             : 
     326          30 :     if (SRF_IS_FIRSTCALL())
     327             :     {
     328          20 :         Oid         relid = PG_GETARG_OID(0);
     329             :         MemoryContext oldcontext;
     330             : 
     331          20 :         funcctx = SRF_FIRSTCALL_INIT();
     332          20 :         oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
     333             :         /* collect_corrupt_items will verify the relkind */
     334          20 :         funcctx->user_fctx = collect_corrupt_items(relid, false, true);
     335          10 :         MemoryContextSwitchTo(oldcontext);
     336             :     }
     337             : 
     338          20 :     funcctx = SRF_PERCALL_SETUP();
     339          20 :     items = (corrupt_items *) funcctx->user_fctx;
     340             : 
     341          20 :     if (items->next < items->count)
     342          10 :         SRF_RETURN_NEXT(funcctx, PointerGetDatum(&items->tids[items->next++]));
     343             : 
     344          10 :     SRF_RETURN_DONE(funcctx);
     345             : }
     346             : 
     347             : /*
     348             :  * Return the TIDs of not-all-visible tuples in pages marked all-visible
     349             :  * in the visibility map.  We hope no one will ever find any, but there could
     350             :  * be bugs, database corruption, etc.
     351             :  */
     352             : Datum
     353          16 : pg_check_visible(PG_FUNCTION_ARGS)
     354             : {
     355             :     FuncCallContext *funcctx;
     356             :     corrupt_items *items;
     357             : 
     358          16 :     if (SRF_IS_FIRSTCALL())
     359             :     {
     360           6 :         Oid         relid = PG_GETARG_OID(0);
     361             :         MemoryContext oldcontext;
     362             : 
     363           6 :         funcctx = SRF_FIRSTCALL_INIT();
     364           6 :         oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
     365             :         /* collect_corrupt_items will verify the relkind */
     366           6 :         funcctx->user_fctx = collect_corrupt_items(relid, true, false);
     367           6 :         MemoryContextSwitchTo(oldcontext);
     368             :     }
     369             : 
     370          16 :     funcctx = SRF_PERCALL_SETUP();
     371          16 :     items = (corrupt_items *) funcctx->user_fctx;
     372             : 
     373          16 :     if (items->next < items->count)
     374          10 :         SRF_RETURN_NEXT(funcctx, PointerGetDatum(&items->tids[items->next++]));
     375             : 
     376           6 :     SRF_RETURN_DONE(funcctx);
     377             : }
     378             : 
     379             : /*
     380             :  * Remove the visibility map fork for a relation.  If there turn out to be
     381             :  * any bugs in the visibility map code that require rebuilding the VM, this
     382             :  * provides users with a way to do it that is cleaner than shutting down the
     383             :  * server and removing files by hand.
     384             :  *
     385             :  * This is a cut-down version of RelationTruncate.
     386             :  */
     387             : Datum
     388          12 : pg_truncate_visibility_map(PG_FUNCTION_ARGS)
     389             : {
     390          12 :     Oid         relid = PG_GETARG_OID(0);
     391             :     Relation    rel;
     392             :     ForkNumber  fork;
     393             :     BlockNumber block;
     394             :     BlockNumber old_block;
     395             : 
     396          12 :     rel = relation_open(relid, AccessExclusiveLock);
     397             : 
     398             :     /* Only some relkinds have a visibility map */
     399          12 :     check_relation_relkind(rel);
     400             : 
     401             :     /* Forcibly reset cached file size */
     402           2 :     RelationGetSmgr(rel)->smgr_cached_nblocks[VISIBILITYMAP_FORKNUM] = InvalidBlockNumber;
     403             : 
     404             :     /* Compute new and old size before entering critical section. */
     405           2 :     fork = VISIBILITYMAP_FORKNUM;
     406           2 :     block = visibilitymap_prepare_truncate(rel, 0);
     407           2 :     old_block = BlockNumberIsValid(block) ? smgrnblocks(RelationGetSmgr(rel), fork) : 0;
     408             : 
     409             :     /*
     410             :      * WAL-logging, buffer dropping, file truncation must be atomic and all on
     411             :      * one side of a checkpoint.  See RelationTruncate() for discussion.
     412             :      */
     413             :     Assert((MyProc->delayChkptFlags & (DELAY_CHKPT_START | DELAY_CHKPT_COMPLETE)) == 0);
     414           2 :     MyProc->delayChkptFlags |= DELAY_CHKPT_START | DELAY_CHKPT_COMPLETE;
     415           2 :     START_CRIT_SECTION();
     416             : 
     417           2 :     if (RelationNeedsWAL(rel))
     418             :     {
     419             :         XLogRecPtr  lsn;
     420             :         xl_smgr_truncate xlrec;
     421             : 
     422           2 :         xlrec.blkno = 0;
     423           2 :         xlrec.rlocator = rel->rd_locator;
     424           2 :         xlrec.flags = SMGR_TRUNCATE_VM;
     425             : 
     426           2 :         XLogBeginInsert();
     427           2 :         XLogRegisterData((char *) &xlrec, sizeof(xlrec));
     428             : 
     429           2 :         lsn = XLogInsert(RM_SMGR_ID,
     430             :                          XLOG_SMGR_TRUNCATE | XLR_SPECIAL_REL_UPDATE);
     431           2 :         XLogFlush(lsn);
     432             :     }
     433             : 
     434           2 :     if (BlockNumberIsValid(block))
     435           2 :         smgrtruncate(RelationGetSmgr(rel), &fork, 1, &old_block, &block);
     436             : 
     437           2 :     END_CRIT_SECTION();
     438           2 :     MyProc->delayChkptFlags &= ~(DELAY_CHKPT_START | DELAY_CHKPT_COMPLETE);
     439             : 
     440             :     /*
     441             :      * Release the lock right away, not at commit time.
     442             :      *
     443             :      * It would be a problem to release the lock prior to commit if this
     444             :      * truncate operation sends any transactional invalidation messages. Other
     445             :      * backends would potentially be able to lock the relation without
     446             :      * processing them in the window of time between when we release the lock
     447             :      * here and when we sent the messages at our eventual commit.  However,
     448             :      * we're currently only sending a non-transactional smgr invalidation,
     449             :      * which will have been posted to shared memory immediately from within
     450             :      * smgr_truncate.  Therefore, there should be no race here.
     451             :      *
     452             :      * The reason why it's desirable to release the lock early here is because
     453             :      * of the possibility that someone will need to use this to blow away many
     454             :      * visibility map forks at once.  If we can't release the lock until
     455             :      * commit time, the transaction doing this will accumulate
     456             :      * AccessExclusiveLocks on all of those relations at the same time, which
     457             :      * is undesirable. However, if this turns out to be unsafe we may have no
     458             :      * choice...
     459             :      */
     460           2 :     relation_close(rel, AccessExclusiveLock);
     461             : 
     462             :     /* Nothing to return. */
     463           2 :     PG_RETURN_VOID();
     464             : }
     465             : 
     466             : /*
     467             :  * Helper function to construct whichever TupleDesc we need for a particular
     468             :  * call.
     469             :  */
     470             : static TupleDesc
     471          36 : pg_visibility_tupdesc(bool include_blkno, bool include_pd)
     472             : {
     473             :     TupleDesc   tupdesc;
     474          36 :     AttrNumber  maxattr = 2;
     475          36 :     AttrNumber  a = 0;
     476             : 
     477          36 :     if (include_blkno)
     478          34 :         ++maxattr;
     479          36 :     if (include_pd)
     480          14 :         ++maxattr;
     481          36 :     tupdesc = CreateTemplateTupleDesc(maxattr);
     482          36 :     if (include_blkno)
     483          34 :         TupleDescInitEntry(tupdesc, ++a, "blkno", INT8OID, -1, 0);
     484          36 :     TupleDescInitEntry(tupdesc, ++a, "all_visible", BOOLOID, -1, 0);
     485          36 :     TupleDescInitEntry(tupdesc, ++a, "all_frozen", BOOLOID, -1, 0);
     486          36 :     if (include_pd)
     487          14 :         TupleDescInitEntry(tupdesc, ++a, "pd_all_visible", BOOLOID, -1, 0);
     488             :     Assert(a == maxattr);
     489             : 
     490          36 :     return BlessTupleDesc(tupdesc);
     491             : }
     492             : 
     493             : /*
     494             :  * Collect visibility data about a relation.
     495             :  *
     496             :  * Checks relkind of relid and will throw an error if the relation does not
     497             :  * have a VM.
     498             :  */
     499             : static vbits *
     500          34 : collect_visibility_data(Oid relid, bool include_pd)
     501             : {
     502             :     Relation    rel;
     503             :     BlockNumber nblocks;
     504             :     vbits      *info;
     505             :     BlockNumber blkno;
     506          34 :     Buffer      vmbuffer = InvalidBuffer;
     507          34 :     BufferAccessStrategy bstrategy = GetAccessStrategy(BAS_BULKREAD);
     508             :     BlockRangeReadStreamPrivate p;
     509          34 :     ReadStream *stream = NULL;
     510             : 
     511          34 :     rel = relation_open(relid, AccessShareLock);
     512             : 
     513             :     /* Only some relkinds have a visibility map */
     514          30 :     check_relation_relkind(rel);
     515             : 
     516          20 :     nblocks = RelationGetNumberOfBlocks(rel);
     517          20 :     info = palloc0(offsetof(vbits, bits) + nblocks);
     518          20 :     info->next = 0;
     519          20 :     info->count = nblocks;
     520             : 
     521             :     /* Create a stream if reading main fork. */
     522          20 :     if (include_pd)
     523             :     {
     524          12 :         p.current_blocknum = 0;
     525          12 :         p.last_exclusive = nblocks;
     526          12 :         stream = read_stream_begin_relation(READ_STREAM_FULL,
     527             :                                             bstrategy,
     528             :                                             rel,
     529             :                                             MAIN_FORKNUM,
     530             :                                             block_range_read_stream_cb,
     531             :                                             &p,
     532             :                                             0);
     533             :     }
     534             : 
     535          44 :     for (blkno = 0; blkno < nblocks; ++blkno)
     536             :     {
     537             :         int32       mapbits;
     538             : 
     539             :         /* Make sure we are interruptible. */
     540          24 :         CHECK_FOR_INTERRUPTS();
     541             : 
     542             :         /* Get map info. */
     543          24 :         mapbits = (int32) visibilitymap_get_status(rel, blkno, &vmbuffer);
     544          24 :         if ((mapbits & VISIBILITYMAP_ALL_VISIBLE) != 0)
     545          16 :             info->bits[blkno] |= (1 << 0);
     546          24 :         if ((mapbits & VISIBILITYMAP_ALL_FROZEN) != 0)
     547          10 :             info->bits[blkno] |= (1 << 1);
     548             : 
     549             :         /*
     550             :          * Page-level data requires reading every block, so only get it if the
     551             :          * caller needs it.  Use a buffer access strategy, too, to prevent
     552             :          * cache-trashing.
     553             :          */
     554          24 :         if (include_pd)
     555             :         {
     556             :             Buffer      buffer;
     557             :             Page        page;
     558             : 
     559           6 :             buffer = read_stream_next_buffer(stream, NULL);
     560           6 :             LockBuffer(buffer, BUFFER_LOCK_SHARE);
     561             : 
     562           6 :             page = BufferGetPage(buffer);
     563           6 :             if (PageIsAllVisible(page))
     564           4 :                 info->bits[blkno] |= (1 << 2);
     565             : 
     566           6 :             UnlockReleaseBuffer(buffer);
     567             :         }
     568             :     }
     569             : 
     570          20 :     if (include_pd)
     571             :     {
     572             :         Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer);
     573          12 :         read_stream_end(stream);
     574             :     }
     575             : 
     576             :     /* Clean up. */
     577          20 :     if (vmbuffer != InvalidBuffer)
     578          14 :         ReleaseBuffer(vmbuffer);
     579          20 :     relation_close(rel, AccessShareLock);
     580             : 
     581          20 :     return info;
     582             : }
     583             : 
     584             : /*
     585             :  * The "strict" version of GetOldestNonRemovableTransactionId().  The
     586             :  * pg_visibility check can tolerate missing some real errors, but it must
     587             :  * never report an error that is not real (a false positive).  Normally,
     588             :  * horizons move forwards, but there are cases when they could move backward
     589             :  * (see comment for ComputeXidHorizons()).
     590             :  *
     591             :  * This is why we have to implement our own function for xid horizon, which
     592             :  * would be guaranteed to be newer or equal to any xid horizon computed before.
     593             :  * We have to do the following to achieve this.
     594             :  *
     595             :  * 1. Ignore processes' xmins, because they take into account connections to
     596             :  *    other databases that were ignored before.
     597             :  * 2. Ignore KnownAssignedXids, as they are not database-aware. Although we
     598             :  *    now perform minimal checking on a standby by always using nextXid, this
     599             :  *    approach is better than nothing and will at least catch extremely broken
     600             :  *    cases where a xid is in the future.
     601             :  * 3. Ignore walsender xmin, because it could go backward if some replication
     602             :  *    connections don't use replication slots.
     603             :  *
     604             :  * While it might seem like we could use KnownAssignedXids for shared
     605             :  * catalogs, since shared catalogs rely on a global horizon rather than a
     606             :  * database-specific one - there are potential edge cases.  For example, a
     607             :  * transaction may crash on the primary without writing a commit/abort record.
     608             :  * This would lead to a situation where it appears to still be running on the
     609             :  * standby, even though it has already ended on the primary.  For this reason,
     610             :  * it's safer to ignore KnownAssignedXids, even for shared catalogs.
     611             :  *
     612             :  * As a result, we're using only currently running xids to compute the horizon.
     613             :  * This surely sacrifices a significant amount of accuracy, but we have to do
     614             :  * so to avoid reporting false errors.
                     :  *
                     :  * Returns, depending on the situation:
                     :  * - in recovery: the xid part of TransamVariables->nextXid;
                     :  * - rel is NULL or a shared relation: oldestRunningXid from
                     :  *   GetRunningTransactionData();
                     :  * - any other non-local relation: oldestDatabaseRunningXid from
                     :  *   GetRunningTransactionData();
                     :  * - otherwise (RELATION_IS_LOCAL): GetOldestNonRemovableTransactionId(rel).
                     :  *
                     :  * NOTE(review): the LWLockRelease() calls that follow
                     :  * GetRunningTransactionData() imply that it returns with ProcArrayLock and
                     :  * XidGenLock still held -- confirm against its header comment.
     615             :  */
     616             : static TransactionId
     617          16 : GetStrictOldestNonRemovableTransactionId(Relation rel)
     618             : {
     619             :     RunningTransactions runningTransactions;
     620             : 
     621          16 :     if (RecoveryInProgress())
     622             :     {
     623             :         TransactionId result;
     624             : 
     625             :         /* As we ignore KnownAssignedXids on standby, just pick nextXid */
     626           2 :         LWLockAcquire(XidGenLock, LW_SHARED);
     627           2 :         result = XidFromFullTransactionId(TransamVariables->nextXid);
     628           2 :         LWLockRelease(XidGenLock);
     629           2 :         return result;
     630             :     }
     631          14 :     else if (rel == NULL || rel->rd_rel->relisshared)
     632             :     {
     633             :         /* No relation or a shared one: take into account all running xids */
     634           0 :         runningTransactions = GetRunningTransactionData();
     635           0 :         LWLockRelease(ProcArrayLock);
     636           0 :         LWLockRelease(XidGenLock);
     637           0 :         return runningTransactions->oldestRunningXid;
     638             :     }
     639          14 :     else if (!RELATION_IS_LOCAL(rel))
     640             :     {
     641             :         /*
     642             :          * Normal relation: take into account xids running within the current
     643             :          * database
     644             :          */
     645          14 :         runningTransactions = GetRunningTransactionData();
     646          14 :         LWLockRelease(ProcArrayLock);
     647          14 :         LWLockRelease(XidGenLock);
     648          14 :         return runningTransactions->oldestDatabaseRunningXid;
     649             :     }
     650             :     else
     651             :     {
     652             :         /*
     653             :          * For temporary relations, ComputeXidHorizons() uses only
     654             :          * TransamVariables->latestCompletedXid and MyProc->xid.  These two
     655             :          * shouldn't go backwards.  So we're fine with this horizon.
     656             :          */
     657           0 :         return GetOldestNonRemovableTransactionId(rel);
     658             :     }
     659             : }
     660             : 
     661             : /*
     662             :  * Callback function to get next block for read stream object used in
     663             :  * collect_corrupt_items() function.
                     :  *
                     :  * callback_private_data points to a
                     :  * struct collect_corrupt_items_read_stream_private.  Scanning forward from
                     :  * its current_blocknum, return the first block for which a requested check
                     :  * applies: all_frozen is requested and VM_ALL_FROZEN() is true, or
                     :  * all_visible is requested and VM_ALL_VISIBLE() is true.  Blocks where
                     :  * neither holds are skipped.  current_blocknum is left pointing one past the
                     :  * returned block, so the next call resumes from there.
                     :  *
                     :  * Returns InvalidBlockNumber once last_exclusive has been reached.  The
                     :  * stream and per_buffer_data arguments are not used.
     664             :  */
     665             : static BlockNumber
     666         206 : collect_corrupt_items_read_stream_next_block(ReadStream *stream,
     667             :                                              void *callback_private_data,
     668             :                                              void *per_buffer_data)
     669             : {
     670         206 :     struct collect_corrupt_items_read_stream_private *p = callback_private_data;
     671             : 
     672         218 :     for (; p->current_blocknum < p->last_exclusive; p->current_blocknum++)
     673             :     {
     674         202 :         bool        check_frozen = false;
     675         202 :         bool        check_visible = false;
     676             : 
     677             :         /* Make sure we are interruptible. */
     678         202 :         CHECK_FOR_INTERRUPTS();
     679             : 
     680         202 :         if (p->all_frozen && VM_ALL_FROZEN(p->rel, p->current_blocknum, &p->vmbuffer))
     681          98 :             check_frozen = true;
     682         202 :         if (p->all_visible && VM_ALL_VISIBLE(p->rel, p->current_blocknum, &p->vmbuffer))
     683          92 :             check_visible = true;
     684         202 :         if (!check_visible && !check_frozen)
     685          12 :             continue;
     686             : 
     687         190 :         return p->current_blocknum++;
     688             :     }
     689             : 
     690          16 :     return InvalidBlockNumber;
     691             : }
     692             : 
     693             : /*
     694             :  * Returns a list of items whose visibility map information does not match
     695             :  * the status of the tuples on the page.
     696             :  *
     697             :  * If all_visible is passed as true, this will include all items which are
     698             :  * on pages marked as all-visible in the visibility map but which do not
     699             :  * seem to in fact be all-visible.
     700             :  *
     701             :  * If all_frozen is passed as true, this will include all items which are
     702             :  * on pages marked as all-frozen but which do not seem to in fact be frozen.
     703             :  *
     704             :  * Checks relkind of relid and will throw an error if the relation does not
     705             :  * have a VM.
                     :  *
                     :  * The relation is opened and closed here with AccessShareLock.  In the
                     :  * returned struct, "count" is the number of TIDs stored in "tids" and "next"
                     :  * is reset to 0 for the caller to use as a read position.
     706             :  */
     707             : static corrupt_items *
     708          26 : collect_corrupt_items(Oid relid, bool all_visible, bool all_frozen)
     709             : {
     710             :     Relation    rel;
     711             :     corrupt_items *items;
     712          26 :     Buffer      vmbuffer = InvalidBuffer;
     713          26 :     BufferAccessStrategy bstrategy = GetAccessStrategy(BAS_BULKREAD);
     714          26 :     TransactionId OldestXmin = InvalidTransactionId;
     715             :     struct collect_corrupt_items_read_stream_private p;
     716             :     ReadStream *stream;
     717             :     Buffer      buffer;
     718             : 
     719          26 :     rel = relation_open(relid, AccessShareLock);
     720             : 
     721             :     /* Only some relkinds have a visibility map */
     722          26 :     check_relation_relkind(rel);
     723             : 
     724          16 :     if (all_visible)
     725           6 :         OldestXmin = GetStrictOldestNonRemovableTransactionId(rel);
     726             : 
     727             :     /*
     728             :      * Guess an initial array size. We don't expect many corrupted tuples, so
     729             :      * start with a small array.  This function uses the "next" field to track
     730             :      * the next offset where we can store an item (which is the same thing as
     731             :      * the number of items found so far) and the "count" field to track the
     732             :      * number of entries allocated.  We'll repurpose these fields before
     733             :      * returning.
     734             :      */
     735          16 :     items = palloc0(sizeof(corrupt_items));
     736          16 :     items->next = 0;
     737          16 :     items->count = 64;
     738          16 :     items->tids = palloc(items->count * sizeof(ItemPointerData));
     739             : 
     740          16 :     p.current_blocknum = 0;
     741          16 :     p.last_exclusive = RelationGetNumberOfBlocks(rel);
     742          16 :     p.rel = rel;
     743          16 :     p.vmbuffer = InvalidBuffer;
     744          16 :     p.all_frozen = all_frozen;
     745          16 :     p.all_visible = all_visible;
     746          16 :     stream = read_stream_begin_relation(READ_STREAM_FULL,
     747             :                                         bstrategy,
     748             :                                         rel,
     749             :                                         MAIN_FORKNUM,
     750             :                                         collect_corrupt_items_read_stream_next_block,
     751             :                                         &p,
     752             :                                         0);
     753             : 
     754             :     /* Loop over every block that the read stream callback selected. */
     755         206 :     while ((buffer = read_stream_next_buffer(stream, NULL)) != InvalidBuffer)
     756             :     {
     757         190 :         bool        check_frozen = all_frozen;
     758         190 :         bool        check_visible = all_visible;
     759             :         Page        page;
     760             :         OffsetNumber offnum,
     761             :                     maxoff;
     762             :         BlockNumber blkno;
     763             : 
     764             :         /* Make sure we are interruptible. */
     765         190 :         CHECK_FOR_INTERRUPTS();
     766             : 
     767         190 :         LockBuffer(buffer, BUFFER_LOCK_SHARE);
     768             : 
     769         190 :         page = BufferGetPage(buffer);
     770         190 :         maxoff = PageGetMaxOffsetNumber(page);
     771         190 :         blkno = BufferGetBlockNumber(buffer);
     772             : 
     773             :         /*
     774             :          * The visibility map bits might have changed while we were acquiring
     775             :          * the page lock.  Recheck to avoid returning spurious results.
     776             :          */
     777         190 :         if (check_frozen && !VM_ALL_FROZEN(rel, blkno, &vmbuffer))
     778           0 :             check_frozen = false;
     779         190 :         if (check_visible && !VM_ALL_VISIBLE(rel, blkno, &vmbuffer))
     780           0 :             check_visible = false;
     781         190 :         if (!check_visible && !check_frozen)
     782             :         {
     783           0 :             UnlockReleaseBuffer(buffer);
     784           0 :             continue;
     785             :         }
     786             : 
     787             :         /* Iterate over each tuple on the page. */
     788       32252 :         for (offnum = FirstOffsetNumber;
     789             :              offnum <= maxoff;
     790       32062 :              offnum = OffsetNumberNext(offnum))
     791             :         {
     792             :             HeapTupleData tuple;
     793             :             ItemId      itemid;
     794             : 
     795       32062 :             itemid = PageGetItemId(page, offnum);
     796             : 
     797             :             /* Unused or redirect line pointers are of no interest. */
     798       32062 :             if (!ItemIdIsUsed(itemid) || ItemIdIsRedirected(itemid))
     799           0 :                 continue;
     800             : 
     801             :             /* Dead line pointers are neither all-visible nor frozen. */
     802       32062 :             if (ItemIdIsDead(itemid))
     803             :             {
     804           0 :                 ItemPointerSet(&(tuple.t_self), blkno, offnum);
     805           0 :                 record_corrupt_item(items, &tuple.t_self);
     806           0 :                 continue;
     807             :             }
     808             : 
     809             :             /* Initialize a HeapTupleData structure for checks below. */
     810       32062 :             ItemPointerSet(&(tuple.t_self), blkno, offnum);
     811       32062 :             tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
     812       32062 :             tuple.t_len = ItemIdGetLength(itemid);
     813       32062 :             tuple.t_tableOid = relid;
     814             : 
     815             :             /*
     816             :              * If we're checking whether the page is all-visible, we expect
     817             :              * the tuple to be all-visible.
     818             :              */
     819       32062 :             if (check_visible &&
     820       16018 :                 !tuple_all_visible(&tuple, OldestXmin, buffer))
     821             :             {
     822             :                 TransactionId RecomputedOldestXmin;
     823             : 
     824             :                 /*
     825             :                  * Time has passed since we computed OldestXmin, so it's
     826             :                  * possible that this tuple is all-visible in reality even
     827             :                  * though it doesn't appear so based on our
     828             :                  * previously-computed value.  Let's compute a new value so we
     829             :                  * can be certain whether there is a problem.
     830             :                  *
     831             :                  * From a concurrency point of view, it sort of sucks to
     832             :                  * retake ProcArrayLock here while we're holding the buffer
     833             :                  * share-locked, but it should be safe against deadlocks,
     834             :                  * because surely
     835             :                  * GetStrictOldestNonRemovableTransactionId() should never
     836             :                  * take a buffer lock. And this shouldn't happen often, so
     837             :                  * it's worth being careful so as to avoid false positives.
     838             :                  */
     839          10 :                 RecomputedOldestXmin = GetStrictOldestNonRemovableTransactionId(rel);
     840             : 
     841          10 :                 if (!TransactionIdPrecedes(OldestXmin, RecomputedOldestXmin))
     842          10 :                     record_corrupt_item(items, &tuple.t_self);
     843             :                 else
     844             :                 {
     845           0 :                     OldestXmin = RecomputedOldestXmin;
     846           0 :                     if (!tuple_all_visible(&tuple, OldestXmin, buffer))
     847           0 :                         record_corrupt_item(items, &tuple.t_self);
     848             :                 }
     849             :             }
     850             : 
     851             :             /*
     852             :              * If we're checking whether the page is all-frozen, we expect the
     853             :              * tuple to be in a state where it will never need freezing.
     854             :              */
     855       32062 :             if (check_frozen)
     856             :             {
     857       16044 :                 if (heap_tuple_needs_eventual_freeze(tuple.t_data))
     858          10 :                     record_corrupt_item(items, &tuple.t_self);
     859             :             }
     860             :         }
     861             : 
     862         190 :         UnlockReleaseBuffer(buffer);
     863             :     }
     864          16 :     read_stream_end(stream);
     865             : 
     866             :     /* Clean up: release our VM buffer and the one used by the callback. */
     867          16 :     if (vmbuffer != InvalidBuffer)
     868          14 :         ReleaseBuffer(vmbuffer);
     869          16 :     if (p.vmbuffer != InvalidBuffer)
     870          16 :         ReleaseBuffer(p.vmbuffer);
     871          16 :     relation_close(rel, AccessShareLock);
     872             : 
     873             :     /*
     874             :      * Before returning, repurpose the fields to match caller's expectations.
     875             :      * next is now the next item that should be read (rather than written) and
     876             :      * count is now the number of items we wrote (rather than the number we
     877             :      * allocated).
     878             :      */
     879          16 :     items->count = items->next;
     880          16 :     items->next = 0;
     881             : 
     882          16 :     return items;
     883             : }
     884             : 
     885             : /*
     886             :  * Remember one corrupt item.
                     :  *
                     :  * Copies *tid into items->tids at position items->next and advances next.
                     :  * While items are being collected, items->count is the number of entries
                     :  * allocated; when the array is full it is doubled with repalloc() first.
     887             :  */
     888             : static void
     889          20 : record_corrupt_item(corrupt_items *items, ItemPointer tid)
     890             : {
     891             :     /* enlarge output array if needed. */
     892          20 :     if (items->next >= items->count)
     893             :     {
     894           0 :         items->count *= 2;
     895           0 :         items->tids = repalloc(items->tids,
     896           0 :                                items->count * sizeof(ItemPointerData));
     897             :     }
     898             :     /* and add the new item */
     899          20 :     items->tids[items->next++] = *tid;
     900          20 : }
     901             : 
     902             : /*
     903             :  * Check whether a tuple is all-visible relative to a given OldestXmin value.
     904             :  * The buffer should contain the tuple and should be locked and pinned.
                     :  *
                     :  * Returns true only if HeapTupleSatisfiesVacuum() reports HEAPTUPLE_LIVE
                     :  * for the tuple and the tuple's xmin precedes OldestXmin; false otherwise.
     905             :  */
     906             : static bool
     907       16018 : tuple_all_visible(HeapTuple tup, TransactionId OldestXmin, Buffer buffer)
     908             : {
     909             :     HTSV_Result state;
     910             :     TransactionId xmin;
     911             : 
     912       16018 :     state = HeapTupleSatisfiesVacuum(tup, OldestXmin, buffer);
     913       16018 :     if (state != HEAPTUPLE_LIVE)
     914          10 :         return false;           /* all-visible implies live */
     915             : 
     916             :     /*
     917             :      * Neither lazy_scan_heap nor heap_page_is_all_visible will mark a page
     918             :      * all-visible unless every tuple is hinted committed. However, those hint
     919             :      * bits could be lost after a crash, so we can't be certain that they'll
     920             :      * be set here.  So just check the xmin.
     921             :      */
     922             : 
     923       16008 :     xmin = HeapTupleHeaderGetXmin(tup->t_data);
     924       16008 :     if (!TransactionIdPrecedes(xmin, OldestXmin))
     925           0 :         return false;           /* xmin not old enough for all to see */
     926             : 
     927       16008 :     return true;
     928             : }
     929             : 
     930             : /*
     931             :  * check_relation_relkind - convenience routine to check that relation
     932             :  * is of the relkind supported by the callers
                     :  *
                     :  * Raises an ERROR with ERRCODE_WRONG_OBJECT_TYPE, naming the relation, when
                     :  * RELKIND_HAS_TABLE_AM() is false for its relkind; otherwise returns
                     :  * normally.
     933             :  */
     934             : static void
     935          92 : check_relation_relkind(Relation rel)
     936             : {
     937          92 :     if (!RELKIND_HAS_TABLE_AM(rel->rd_rel->relkind))
     938          50 :         ereport(ERROR,
     939             :                 (errcode(ERRCODE_WRONG_OBJECT_TYPE),
     940             :                  errmsg("relation \"%s\" is of wrong relation kind",
     941             :                         RelationGetRelationName(rel)),
     942             :                  errdetail_relkind_not_supported(rel->rd_rel->relkind)));
     943          42 : }

Generated by: LCOV version 1.14