LCOV - code coverage report
Current view: top level - contrib/pg_visibility - pg_visibility.c (source / functions) Hit Total Coverage
Test: PostgreSQL 18devel Lines: 287 328 87.5 %
Date: 2025-04-01 16:15:31 Functions: 24 25 96.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * pg_visibility.c
       4             :  *    display visibility map information and page-level visibility bits
       5             :  *
       6             :  * Copyright (c) 2016-2025, PostgreSQL Global Development Group
       7             :  *
       8             :  *    contrib/pg_visibility/pg_visibility.c
       9             :  *-------------------------------------------------------------------------
      10             :  */
      11             : #include "postgres.h"
      12             : 
      13             : #include "access/heapam.h"
      14             : #include "access/htup_details.h"
      15             : #include "access/visibilitymap.h"
      16             : #include "access/xloginsert.h"
      17             : #include "catalog/pg_type.h"
      18             : #include "catalog/storage_xlog.h"
      19             : #include "funcapi.h"
      20             : #include "miscadmin.h"
      21             : #include "storage/bufmgr.h"
      22             : #include "storage/proc.h"
      23             : #include "storage/procarray.h"
      24             : #include "storage/read_stream.h"
      25             : #include "storage/smgr.h"
      26             : #include "utils/rel.h"
      27             : 
      28          14 : PG_MODULE_MAGIC_EXT(
      29             :                     .name = "pg_visibility",
      30             :                     .version = PG_VERSION
      31             : );
      32             : 
      33             : typedef struct vbits
      34             : {
      35             :     BlockNumber next;
      36             :     BlockNumber count;
      37             :     uint8       bits[FLEXIBLE_ARRAY_MEMBER];
      38             : } vbits;
      39             : 
      40             : typedef struct corrupt_items
      41             : {
      42             :     BlockNumber next;
      43             :     BlockNumber count;
      44             :     ItemPointer tids;
      45             : } corrupt_items;
      46             : 
      47             : /* for collect_corrupt_items_read_stream_next_block */
      48             : struct collect_corrupt_items_read_stream_private
      49             : {
      50             :     bool        all_frozen;
      51             :     bool        all_visible;
      52             :     BlockNumber current_blocknum;
      53             :     BlockNumber last_exclusive;
      54             :     Relation    rel;
      55             :     Buffer      vmbuffer;
      56             : };
      57             : 
      58           6 : PG_FUNCTION_INFO_V1(pg_visibility_map);
      59           8 : PG_FUNCTION_INFO_V1(pg_visibility_map_rel);
      60           8 : PG_FUNCTION_INFO_V1(pg_visibility);
      61           8 : PG_FUNCTION_INFO_V1(pg_visibility_rel);
      62           8 : PG_FUNCTION_INFO_V1(pg_visibility_map_summary);
      63          10 : PG_FUNCTION_INFO_V1(pg_check_frozen);
      64          12 : PG_FUNCTION_INFO_V1(pg_check_visible);
      65           8 : PG_FUNCTION_INFO_V1(pg_truncate_visibility_map);
      66             : 
      67             : static TupleDesc pg_visibility_tupdesc(bool include_blkno, bool include_pd);
      68             : static vbits *collect_visibility_data(Oid relid, bool include_pd);
      69             : static corrupt_items *collect_corrupt_items(Oid relid, bool all_visible,
      70             :                                             bool all_frozen);
      71             : static void record_corrupt_item(corrupt_items *items, ItemPointer tid);
      72             : static bool tuple_all_visible(HeapTuple tup, TransactionId OldestXmin,
      73             :                               Buffer buffer);
      74             : static void check_relation_relkind(Relation rel);
      75             : 
      76             : /*
      77             :  * Visibility map information for a single block of a relation.
      78             :  *
      79             :  * Note: the VM code will silently return zeroes for pages past the end
      80             :  * of the map, so we allow probes up to MaxBlockNumber regardless of the
      81             :  * actual relation size.
      82             :  */
      83             : Datum
      84           0 : pg_visibility_map(PG_FUNCTION_ARGS)
      85             : {
      86           0 :     Oid         relid = PG_GETARG_OID(0);
      87           0 :     int64       blkno = PG_GETARG_INT64(1);
      88             :     int32       mapbits;
      89             :     Relation    rel;
      90           0 :     Buffer      vmbuffer = InvalidBuffer;
      91             :     TupleDesc   tupdesc;
      92             :     Datum       values[2];
      93           0 :     bool        nulls[2] = {0};
      94             : 
      95           0 :     rel = relation_open(relid, AccessShareLock);
      96             : 
      97             :     /* Only some relkinds have a visibility map */
      98           0 :     check_relation_relkind(rel);
      99             : 
     100           0 :     if (blkno < 0 || blkno > MaxBlockNumber)
     101           0 :         ereport(ERROR,
     102             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     103             :                  errmsg("invalid block number")));
     104             : 
     105           0 :     tupdesc = pg_visibility_tupdesc(false, false);
     106             : 
     107           0 :     mapbits = (int32) visibilitymap_get_status(rel, blkno, &vmbuffer);
     108           0 :     if (vmbuffer != InvalidBuffer)
     109           0 :         ReleaseBuffer(vmbuffer);
     110           0 :     values[0] = BoolGetDatum((mapbits & VISIBILITYMAP_ALL_VISIBLE) != 0);
     111           0 :     values[1] = BoolGetDatum((mapbits & VISIBILITYMAP_ALL_FROZEN) != 0);
     112             : 
     113           0 :     relation_close(rel, AccessShareLock);
     114             : 
     115           0 :     PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls)));
     116             : }
     117             : 
     118             : /*
     119             :  * Visibility map information for a single block of a relation, plus the
     120             :  * page-level information for the same block.
     121             :  */
     122             : Datum
     123          12 : pg_visibility(PG_FUNCTION_ARGS)
     124             : {
     125          12 :     Oid         relid = PG_GETARG_OID(0);
     126          12 :     int64       blkno = PG_GETARG_INT64(1);
     127             :     int32       mapbits;
     128             :     Relation    rel;
     129          12 :     Buffer      vmbuffer = InvalidBuffer;
     130             :     Buffer      buffer;
     131             :     Page        page;
     132             :     TupleDesc   tupdesc;
     133             :     Datum       values[3];
     134          12 :     bool        nulls[3] = {0};
     135             : 
     136          12 :     rel = relation_open(relid, AccessShareLock);
     137             : 
     138             :     /* Only some relkinds have a visibility map */
     139          12 :     check_relation_relkind(rel);
     140             : 
     141           2 :     if (blkno < 0 || blkno > MaxBlockNumber)
     142           0 :         ereport(ERROR,
     143             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     144             :                  errmsg("invalid block number")));
     145             : 
     146           2 :     tupdesc = pg_visibility_tupdesc(false, true);
     147             : 
     148           2 :     mapbits = (int32) visibilitymap_get_status(rel, blkno, &vmbuffer);
     149           2 :     if (vmbuffer != InvalidBuffer)
     150           2 :         ReleaseBuffer(vmbuffer);
     151           2 :     values[0] = BoolGetDatum((mapbits & VISIBILITYMAP_ALL_VISIBLE) != 0);
     152           2 :     values[1] = BoolGetDatum((mapbits & VISIBILITYMAP_ALL_FROZEN) != 0);
     153             : 
     154             :     /* Here we have to explicitly check rel size ... */
     155           2 :     if (blkno < RelationGetNumberOfBlocks(rel))
     156             :     {
     157           2 :         buffer = ReadBuffer(rel, blkno);
     158           2 :         LockBuffer(buffer, BUFFER_LOCK_SHARE);
     159             : 
     160           2 :         page = BufferGetPage(buffer);
     161           2 :         values[2] = BoolGetDatum(PageIsAllVisible(page));
     162             : 
     163           2 :         UnlockReleaseBuffer(buffer);
     164             :     }
     165             :     else
     166             :     {
     167             :         /* As with the vismap, silently return 0 for pages past EOF */
     168           0 :         values[2] = BoolGetDatum(false);
     169             :     }
     170             : 
     171           2 :     relation_close(rel, AccessShareLock);
     172             : 
     173           2 :     PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls)));
     174             : }
     175             : 
     176             : /*
     177             :  * Visibility map information for every block in a relation.
     178             :  */
     179             : Datum
     180          40 : pg_visibility_map_rel(PG_FUNCTION_ARGS)
     181             : {
     182             :     FuncCallContext *funcctx;
     183             :     vbits      *info;
     184             : 
     185          40 :     if (SRF_IS_FIRSTCALL())
     186             :     {
     187          22 :         Oid         relid = PG_GETARG_OID(0);
     188             :         MemoryContext oldcontext;
     189             : 
     190          22 :         funcctx = SRF_FIRSTCALL_INIT();
     191          22 :         oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
     192          22 :         funcctx->tuple_desc = pg_visibility_tupdesc(true, false);
     193             :         /* collect_visibility_data will verify the relkind */
     194          22 :         funcctx->user_fctx = collect_visibility_data(relid, false);
     195           8 :         MemoryContextSwitchTo(oldcontext);
     196             :     }
     197             : 
     198          26 :     funcctx = SRF_PERCALL_SETUP();
     199          26 :     info = (vbits *) funcctx->user_fctx;
     200             : 
     201          26 :     if (info->next < info->count)
     202             :     {
     203             :         Datum       values[3];
     204          18 :         bool        nulls[3] = {0};
     205             :         HeapTuple   tuple;
     206             : 
     207          18 :         values[0] = Int64GetDatum(info->next);
     208          18 :         values[1] = BoolGetDatum((info->bits[info->next] & (1 << 0)) != 0);
     209          18 :         values[2] = BoolGetDatum((info->bits[info->next] & (1 << 1)) != 0);
     210          18 :         info->next++;
     211             : 
     212          18 :         tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls);
     213          18 :         SRF_RETURN_NEXT(funcctx, HeapTupleGetDatum(tuple));
     214             :     }
     215             : 
     216           8 :     SRF_RETURN_DONE(funcctx);
     217             : }
     218             : 
     219             : /*
     220             :  * Visibility map information for every block in a relation, plus the page
     221             :  * level information for each block.
     222             :  */
     223             : Datum
     224          18 : pg_visibility_rel(PG_FUNCTION_ARGS)
     225             : {
     226             :     FuncCallContext *funcctx;
     227             :     vbits      *info;
     228             : 
     229          18 :     if (SRF_IS_FIRSTCALL())
     230             :     {
     231          12 :         Oid         relid = PG_GETARG_OID(0);
     232             :         MemoryContext oldcontext;
     233             : 
     234          12 :         funcctx = SRF_FIRSTCALL_INIT();
     235          12 :         oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
     236          12 :         funcctx->tuple_desc = pg_visibility_tupdesc(true, true);
     237             :         /* collect_visibility_data will verify the relkind */
     238          12 :         funcctx->user_fctx = collect_visibility_data(relid, true);
     239          12 :         MemoryContextSwitchTo(oldcontext);
     240             :     }
     241             : 
     242          18 :     funcctx = SRF_PERCALL_SETUP();
     243          18 :     info = (vbits *) funcctx->user_fctx;
     244             : 
     245          18 :     if (info->next < info->count)
     246             :     {
     247             :         Datum       values[4];
     248           6 :         bool        nulls[4] = {0};
     249             :         HeapTuple   tuple;
     250             : 
     251           6 :         values[0] = Int64GetDatum(info->next);
     252           6 :         values[1] = BoolGetDatum((info->bits[info->next] & (1 << 0)) != 0);
     253           6 :         values[2] = BoolGetDatum((info->bits[info->next] & (1 << 1)) != 0);
     254           6 :         values[3] = BoolGetDatum((info->bits[info->next] & (1 << 2)) != 0);
     255           6 :         info->next++;
     256             : 
     257           6 :         tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls);
     258           6 :         SRF_RETURN_NEXT(funcctx, HeapTupleGetDatum(tuple));
     259             :     }
     260             : 
     261          12 :     SRF_RETURN_DONE(funcctx);
     262             : }
     263             : 
     264             : /*
     265             :  * Count the number of all-visible and all-frozen pages in the visibility
     266             :  * map for a particular relation.
     267             :  */
     268             : Datum
     269          12 : pg_visibility_map_summary(PG_FUNCTION_ARGS)
     270             : {
     271          12 :     Oid         relid = PG_GETARG_OID(0);
     272             :     Relation    rel;
     273             :     BlockNumber nblocks;
     274             :     BlockNumber blkno;
     275          12 :     Buffer      vmbuffer = InvalidBuffer;
     276          12 :     int64       all_visible = 0;
     277          12 :     int64       all_frozen = 0;
     278             :     TupleDesc   tupdesc;
     279             :     Datum       values[2];
     280          12 :     bool        nulls[2] = {0};
     281             : 
     282          12 :     rel = relation_open(relid, AccessShareLock);
     283             : 
     284             :     /* Only some relkinds have a visibility map */
     285          12 :     check_relation_relkind(rel);
     286             : 
     287           2 :     nblocks = RelationGetNumberOfBlocks(rel);
     288             : 
     289           4 :     for (blkno = 0; blkno < nblocks; ++blkno)
     290             :     {
     291             :         int32       mapbits;
     292             : 
     293             :         /* Make sure we are interruptible. */
     294           2 :         CHECK_FOR_INTERRUPTS();
     295             : 
     296             :         /* Get map info. */
     297           2 :         mapbits = (int32) visibilitymap_get_status(rel, blkno, &vmbuffer);
     298           2 :         if ((mapbits & VISIBILITYMAP_ALL_VISIBLE) != 0)
     299           2 :             ++all_visible;
     300           2 :         if ((mapbits & VISIBILITYMAP_ALL_FROZEN) != 0)
     301           0 :             ++all_frozen;
     302             :     }
     303             : 
     304             :     /* Clean up. */
     305           2 :     if (vmbuffer != InvalidBuffer)
     306           2 :         ReleaseBuffer(vmbuffer);
     307           2 :     relation_close(rel, AccessShareLock);
     308             : 
     309           2 :     if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
     310           0 :         elog(ERROR, "return type must be a row type");
     311             : 
     312           2 :     values[0] = Int64GetDatum(all_visible);
     313           2 :     values[1] = Int64GetDatum(all_frozen);
     314             : 
     315           2 :     PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls)));
     316             : }
     317             : 
     318             : /*
     319             :  * Return the TIDs of non-frozen tuples present in pages marked all-frozen
     320             :  * in the visibility map.  We hope no one will ever find any, but there could
     321             :  * be bugs, database corruption, etc.
     322             :  */
     323             : Datum
     324          30 : pg_check_frozen(PG_FUNCTION_ARGS)
     325             : {
     326             :     FuncCallContext *funcctx;
     327             :     corrupt_items *items;
     328             : 
     329          30 :     if (SRF_IS_FIRSTCALL())
     330             :     {
     331          20 :         Oid         relid = PG_GETARG_OID(0);
     332             :         MemoryContext oldcontext;
     333             : 
     334          20 :         funcctx = SRF_FIRSTCALL_INIT();
     335          20 :         oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
     336             :         /* collect_corrupt_items will verify the relkind */
     337          20 :         funcctx->user_fctx = collect_corrupt_items(relid, false, true);
     338          10 :         MemoryContextSwitchTo(oldcontext);
     339             :     }
     340             : 
     341          20 :     funcctx = SRF_PERCALL_SETUP();
     342          20 :     items = (corrupt_items *) funcctx->user_fctx;
     343             : 
     344          20 :     if (items->next < items->count)
     345          10 :         SRF_RETURN_NEXT(funcctx, PointerGetDatum(&items->tids[items->next++]));
     346             : 
     347          10 :     SRF_RETURN_DONE(funcctx);
     348             : }
     349             : 
     350             : /*
     351             :  * Return the TIDs of not-all-visible tuples in pages marked all-visible
     352             :  * in the visibility map.  We hope no one will ever find any, but there could
     353             :  * be bugs, database corruption, etc.
     354             :  */
     355             : Datum
     356          16 : pg_check_visible(PG_FUNCTION_ARGS)
     357             : {
     358             :     FuncCallContext *funcctx;
     359             :     corrupt_items *items;
     360             : 
     361          16 :     if (SRF_IS_FIRSTCALL())
     362             :     {
     363           6 :         Oid         relid = PG_GETARG_OID(0);
     364             :         MemoryContext oldcontext;
     365             : 
     366           6 :         funcctx = SRF_FIRSTCALL_INIT();
     367           6 :         oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
     368             :         /* collect_corrupt_items will verify the relkind */
     369           6 :         funcctx->user_fctx = collect_corrupt_items(relid, true, false);
     370           6 :         MemoryContextSwitchTo(oldcontext);
     371             :     }
     372             : 
     373          16 :     funcctx = SRF_PERCALL_SETUP();
     374          16 :     items = (corrupt_items *) funcctx->user_fctx;
     375             : 
     376          16 :     if (items->next < items->count)
     377          10 :         SRF_RETURN_NEXT(funcctx, PointerGetDatum(&items->tids[items->next++]));
     378             : 
     379           6 :     SRF_RETURN_DONE(funcctx);
     380             : }
     381             : 
     382             : /*
     383             :  * Remove the visibility map fork for a relation.  If there turn out to be
     384             :  * any bugs in the visibility map code that require rebuilding the VM, this
     385             :  * provides users with a way to do it that is cleaner than shutting down the
     386             :  * server and removing files by hand.
     387             :  *
     388             :  * This is a cut-down version of RelationTruncate.
     389             :  */
     390             : Datum
     391          12 : pg_truncate_visibility_map(PG_FUNCTION_ARGS)
     392             : {
     393          12 :     Oid         relid = PG_GETARG_OID(0);
     394             :     Relation    rel;
     395             :     ForkNumber  fork;
     396             :     BlockNumber block;
     397             :     BlockNumber old_block;
     398             : 
     399          12 :     rel = relation_open(relid, AccessExclusiveLock);
     400             : 
     401             :     /* Only some relkinds have a visibility map */
     402          12 :     check_relation_relkind(rel);
     403             : 
     404             :     /* Forcibly reset cached file size */
     405           2 :     RelationGetSmgr(rel)->smgr_cached_nblocks[VISIBILITYMAP_FORKNUM] = InvalidBlockNumber;
     406             : 
     407             :     /* Compute new and old size before entering critical section. */
     408           2 :     fork = VISIBILITYMAP_FORKNUM;
     409           2 :     block = visibilitymap_prepare_truncate(rel, 0);
     410           2 :     old_block = BlockNumberIsValid(block) ? smgrnblocks(RelationGetSmgr(rel), fork) : 0;
     411             : 
     412             :     /*
     413             :      * WAL-logging, buffer dropping, file truncation must be atomic and all on
     414             :      * one side of a checkpoint.  See RelationTruncate() for discussion.
     415             :      */
     416             :     Assert((MyProc->delayChkptFlags & (DELAY_CHKPT_START | DELAY_CHKPT_COMPLETE)) == 0);
     417           2 :     MyProc->delayChkptFlags |= DELAY_CHKPT_START | DELAY_CHKPT_COMPLETE;
     418           2 :     START_CRIT_SECTION();
     419             : 
     420           2 :     if (RelationNeedsWAL(rel))
     421             :     {
     422             :         XLogRecPtr  lsn;
     423             :         xl_smgr_truncate xlrec;
     424             : 
     425           2 :         xlrec.blkno = 0;
     426           2 :         xlrec.rlocator = rel->rd_locator;
     427           2 :         xlrec.flags = SMGR_TRUNCATE_VM;
     428             : 
     429           2 :         XLogBeginInsert();
     430           2 :         XLogRegisterData(&xlrec, sizeof(xlrec));
     431             : 
     432           2 :         lsn = XLogInsert(RM_SMGR_ID,
     433             :                          XLOG_SMGR_TRUNCATE | XLR_SPECIAL_REL_UPDATE);
     434           2 :         XLogFlush(lsn);
     435             :     }
     436             : 
     437           2 :     if (BlockNumberIsValid(block))
     438           2 :         smgrtruncate(RelationGetSmgr(rel), &fork, 1, &old_block, &block);
     439             : 
     440           2 :     END_CRIT_SECTION();
     441           2 :     MyProc->delayChkptFlags &= ~(DELAY_CHKPT_START | DELAY_CHKPT_COMPLETE);
     442             : 
     443             :     /*
     444             :      * Release the lock right away, not at commit time.
     445             :      *
     446             :      * It would be a problem to release the lock prior to commit if this
     447             :      * truncate operation sends any transactional invalidation messages. Other
     448             :      * backends would potentially be able to lock the relation without
     449             :      * processing them in the window of time between when we release the lock
     450             :      * here and when we sent the messages at our eventual commit.  However,
     451             :      * we're currently only sending a non-transactional smgr invalidation,
     452             :      * which will have been posted to shared memory immediately from within
     453             :      * smgr_truncate.  Therefore, there should be no race here.
     454             :      *
     455             :      * The reason why it's desirable to release the lock early here is because
     456             :      * of the possibility that someone will need to use this to blow away many
     457             :      * visibility map forks at once.  If we can't release the lock until
     458             :      * commit time, the transaction doing this will accumulate
     459             :      * AccessExclusiveLocks on all of those relations at the same time, which
     460             :      * is undesirable. However, if this turns out to be unsafe we may have no
     461             :      * choice...
     462             :      */
     463           2 :     relation_close(rel, AccessExclusiveLock);
     464             : 
     465             :     /* Nothing to return. */
     466           2 :     PG_RETURN_VOID();
     467             : }
     468             : 
     469             : /*
     470             :  * Helper function to construct whichever TupleDesc we need for a particular
     471             :  * call.
     472             :  */
     473             : static TupleDesc
     474          36 : pg_visibility_tupdesc(bool include_blkno, bool include_pd)
     475             : {
     476             :     TupleDesc   tupdesc;
     477          36 :     AttrNumber  maxattr = 2;
     478          36 :     AttrNumber  a = 0;
     479             : 
     480          36 :     if (include_blkno)
     481          34 :         ++maxattr;
     482          36 :     if (include_pd)
     483          14 :         ++maxattr;
     484          36 :     tupdesc = CreateTemplateTupleDesc(maxattr);
     485          36 :     if (include_blkno)
     486          34 :         TupleDescInitEntry(tupdesc, ++a, "blkno", INT8OID, -1, 0);
     487          36 :     TupleDescInitEntry(tupdesc, ++a, "all_visible", BOOLOID, -1, 0);
     488          36 :     TupleDescInitEntry(tupdesc, ++a, "all_frozen", BOOLOID, -1, 0);
     489          36 :     if (include_pd)
     490          14 :         TupleDescInitEntry(tupdesc, ++a, "pd_all_visible", BOOLOID, -1, 0);
     491             :     Assert(a == maxattr);
     492             : 
     493          36 :     return BlessTupleDesc(tupdesc);
     494             : }
     495             : 
     496             : /*
     497             :  * Collect visibility data about a relation.
     498             :  *
     499             :  * Checks relkind of relid and will throw an error if the relation does not
     500             :  * have a VM.
     501             :  */
     502             : static vbits *
     503          34 : collect_visibility_data(Oid relid, bool include_pd)
     504             : {
     505             :     Relation    rel;
     506             :     BlockNumber nblocks;
     507             :     vbits      *info;
     508             :     BlockNumber blkno;
     509          34 :     Buffer      vmbuffer = InvalidBuffer;
     510          34 :     BufferAccessStrategy bstrategy = GetAccessStrategy(BAS_BULKREAD);
     511             :     BlockRangeReadStreamPrivate p;
     512          34 :     ReadStream *stream = NULL;
     513             : 
     514          34 :     rel = relation_open(relid, AccessShareLock);
     515             : 
     516             :     /* Only some relkinds have a visibility map */
     517          30 :     check_relation_relkind(rel);
     518             : 
     519          20 :     nblocks = RelationGetNumberOfBlocks(rel);
     520          20 :     info = palloc0(offsetof(vbits, bits) + nblocks);
     521          20 :     info->next = 0;
     522          20 :     info->count = nblocks;
     523             : 
     524             :     /* Create a stream if reading main fork. */
     525          20 :     if (include_pd)
     526             :     {
     527          12 :         p.current_blocknum = 0;
     528          12 :         p.last_exclusive = nblocks;
     529             : 
     530             :         /*
     531             :          * It is safe to use batchmode as block_range_read_stream_cb takes no
     532             :          * locks.
     533             :          */
     534          12 :         stream = read_stream_begin_relation(READ_STREAM_FULL |
     535             :                                             READ_STREAM_USE_BATCHING,
     536             :                                             bstrategy,
     537             :                                             rel,
     538             :                                             MAIN_FORKNUM,
     539             :                                             block_range_read_stream_cb,
     540             :                                             &p,
     541             :                                             0);
     542             :     }
     543             : 
     544          44 :     for (blkno = 0; blkno < nblocks; ++blkno)
     545             :     {
     546             :         int32       mapbits;
     547             : 
     548             :         /* Make sure we are interruptible. */
     549          24 :         CHECK_FOR_INTERRUPTS();
     550             : 
     551             :         /* Get map info. */
     552          24 :         mapbits = (int32) visibilitymap_get_status(rel, blkno, &vmbuffer);
     553          24 :         if ((mapbits & VISIBILITYMAP_ALL_VISIBLE) != 0)
     554          16 :             info->bits[blkno] |= (1 << 0);
     555          24 :         if ((mapbits & VISIBILITYMAP_ALL_FROZEN) != 0)
     556          10 :             info->bits[blkno] |= (1 << 1);
     557             : 
     558             :         /*
     559             :          * Page-level data requires reading every block, so only get it if the
     560             :          * caller needs it.  Use a buffer access strategy, too, to prevent
     561             :          * cache-trashing.
     562             :          */
     563          24 :         if (include_pd)
     564             :         {
     565             :             Buffer      buffer;
     566             :             Page        page;
     567             : 
     568           6 :             buffer = read_stream_next_buffer(stream, NULL);
     569           6 :             LockBuffer(buffer, BUFFER_LOCK_SHARE);
     570             : 
     571           6 :             page = BufferGetPage(buffer);
     572           6 :             if (PageIsAllVisible(page))
     573           4 :                 info->bits[blkno] |= (1 << 2);
     574             : 
     575           6 :             UnlockReleaseBuffer(buffer);
     576             :         }
     577             :     }
     578             : 
     579          20 :     if (include_pd)
     580             :     {
     581             :         Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer);
     582          12 :         read_stream_end(stream);
     583             :     }
     584             : 
     585             :     /* Clean up. */
     586          20 :     if (vmbuffer != InvalidBuffer)
     587          14 :         ReleaseBuffer(vmbuffer);
     588          20 :     relation_close(rel, AccessShareLock);
     589             : 
     590          20 :     return info;
     591             : }
     592             : 
     593             : /*
     594             :  * The "strict" version of GetOldestNonRemovableTransactionId().  The
     595             :  * pg_visibility check can tolerate false negatives (failing to report some
     596             :  * real errors), but can't tolerate false positives (reporting errors that
     597             :  * don't exist).  Normally, horizons move forward, but there are cases when
     598             :  * they could move backward (see comment for ComputeXidHorizons()).
     599             :  *
     600             :  * This is why we have to implement our own function for xid horizon, which
     601             :  * would be guaranteed to be newer than or equal to any xid horizon computed
     602             :  * before.  We have to do the following to achieve this.
     603             :  *
     604             :  * 1. Ignore processes' xmins, because they take into account connections to
     605             :  *    other databases that were ignored before.
     606             :  * 2. Ignore KnownAssignedXids, as they are not database-aware.  Although we
     607             :  *    now perform only minimal checking on a standby by always using nextXid,
     608             :  *    this approach is better than nothing and will at least catch extremely
     609             :  *    broken cases where an xid is in the future.
     610             :  * 3. Ignore walsender xmin, because it could go backward if some replication
     611             :  *    connections don't use replication slots.
     612             :  *
     613             :  * It might seem like we could use KnownAssignedXids for shared catalogs,
     614             :  * since shared catalogs rely on a global horizon rather than a
     615             :  * database-specific one, but there are potential edge cases.  For example, a
     616             :  * transaction may crash on the primary without writing a commit/abort record.
     617             :  * This would lead to a situation where it appears to still be running on the
     618             :  * standby, even though it has already ended on the primary.  For this reason,
     619             :  * it's safer to ignore KnownAssignedXids, even for shared catalogs.
     620             :  *
     621             :  * As a result, we're using only currently running xids to compute the horizon.
     622             :  * Surely this significantly sacrifices accuracy, but we have to do so to
     623             :  * avoid reporting false errors.
     624             :  */
     625             : static TransactionId
     626          16 : GetStrictOldestNonRemovableTransactionId(Relation rel)
     627             : {
     628             :     RunningTransactions runningTransactions;
     629             : 
     630          16 :     if (RecoveryInProgress())
     631             :     {
     632             :         TransactionId result;
     633             : 
     634             :         /* As we ignore KnownAssignedXids on standby, just pick nextXid */
     635           2 :         LWLockAcquire(XidGenLock, LW_SHARED);
     636           2 :         result = XidFromFullTransactionId(TransamVariables->nextXid);
     637           2 :         LWLockRelease(XidGenLock);
     638           2 :         return result;
     639             :     }
     640          14 :     else if (rel == NULL || rel->rd_rel->relisshared)
     641             :     {
     642             :         /* No relation or shared relation: consider all running xids */
     643           0 :         runningTransactions = GetRunningTransactionData();
     644           0 :         LWLockRelease(ProcArrayLock);   /* held by the call above */
     645           0 :         LWLockRelease(XidGenLock);  /* likewise */
     646           0 :         return runningTransactions->oldestRunningXid;
     647             :     }
     648          14 :     else if (!RELATION_IS_LOCAL(rel))
     649             :     {
     650             :         /*
     651             :          * Normal relation: take into account xids running within the current
     652             :          * database
     653             :          */
     654          14 :         runningTransactions = GetRunningTransactionData();
     655          14 :         LWLockRelease(ProcArrayLock);   /* held by the call above */
     656          14 :         LWLockRelease(XidGenLock);  /* likewise */
     657          14 :         return runningTransactions->oldestDatabaseRunningXid;
     658             :     }
     659             :     else
     660             :     {
     661             :         /*
     662             :          * For temporary relations, ComputeXidHorizons() uses only
     663             :          * TransamVariables->latestCompletedXid and MyProc->xid.  These two
     664             :          * shouldn't go backwards.  So we're fine with this horizon.
     665             :          */
     666           0 :         return GetOldestNonRemovableTransactionId(rel);
     667             :     }
     668             : }
     669             : 
     670             : /*
     671             :  * Read stream callback for collect_corrupt_items(): return the next block the
     672             :  * VM marks all-frozen or all-visible (as requested), else InvalidBlockNumber.
     673             :  */
     674             : static BlockNumber
     675         206 : collect_corrupt_items_read_stream_next_block(ReadStream *stream,
     676             :                                              void *callback_private_data,
     677             :                                              void *per_buffer_data)
     678             : {
     679         206 :     struct collect_corrupt_items_read_stream_private *p = callback_private_data;
     680             : 
     681         218 :     for (; p->current_blocknum < p->last_exclusive; p->current_blocknum++)
     682             :     {
     683         202 :         bool        check_frozen = false;
     684         202 :         bool        check_visible = false;
     685             : 
     686             :         /* Make sure we are interruptible while skipping blocks. */
     687         202 :         CHECK_FOR_INTERRUPTS();
     688             : 
     689         202 :         if (p->all_frozen && VM_ALL_FROZEN(p->rel, p->current_blocknum, &p->vmbuffer))
     690          98 :             check_frozen = true;
     691         202 :         if (p->all_visible && VM_ALL_VISIBLE(p->rel, p->current_blocknum, &p->vmbuffer))
     692          92 :             check_visible = true;
     693         202 :         if (!check_visible && !check_frozen)
     694          12 :             continue;
     695             : 
     696         190 :         return p->current_blocknum++;   /* next call resumes after this block */
     697             :     }
     698             : 
     699          16 :     return InvalidBlockNumber;
     700             : }
     701             : 
     702             : /*
     703             :  * Returns a palloc'd list of the TIDs of items whose visibility map
     704             :  * information does not match the status of the tuples on the page.
     705             :  *
     706             :  * If all_visible is passed as true, this will include all items which are
     707             :  * on pages marked as all-visible in the visibility map but which do not
     708             :  * in fact seem to be all-visible.
     709             :  *
     710             :  * If all_frozen is passed as true, this will include all items which are
     711             :  * on pages marked as all-frozen but which do not in fact seem to be frozen.
     712             :  *
     713             :  * Checks relkind of relid and will throw an error if the relation does not
     714             :  * have a VM.  On return, count is the number of TIDs and next is zero.
     715             :  */
     716             : static corrupt_items *
     717          26 : collect_corrupt_items(Oid relid, bool all_visible, bool all_frozen)
     718             : {
     719             :     Relation    rel;
     720             :     corrupt_items *items;
     721          26 :     Buffer      vmbuffer = InvalidBuffer;
     722          26 :     BufferAccessStrategy bstrategy = GetAccessStrategy(BAS_BULKREAD);
     723          26 :     TransactionId OldestXmin = InvalidTransactionId;
     724             :     struct collect_corrupt_items_read_stream_private p;
     725             :     ReadStream *stream;
     726             :     Buffer      buffer;
     727             : 
     728          26 :     rel = relation_open(relid, AccessShareLock);
     729             : 
     730             :     /* Only some relkinds have a visibility map */
     731          26 :     check_relation_relkind(rel);
     732             : 
     733          16 :     if (all_visible)
     734           6 :         OldestXmin = GetStrictOldestNonRemovableTransactionId(rel);
     735             : 
     736             :     /*
     737             :      * Guess an initial array size. We don't expect many corrupted tuples, so
     738             :      * start with a small array.  This function uses the "next" field to track
     739             :      * the next offset where we can store an item (which is the same thing as
     740             :      * the number of items found so far) and the "count" field to track the
     741             :      * number of entries allocated.  We'll repurpose these fields before
     742             :      * returning.
     743             :      */
     744          16 :     items = palloc0(sizeof(corrupt_items));
     745          16 :     items->next = 0;
     746          16 :     items->count = 64;
     747          16 :     items->tids = palloc(items->count * sizeof(ItemPointerData));
     748             : 
     749          16 :     p.current_blocknum = 0;
     750          16 :     p.last_exclusive = RelationGetNumberOfBlocks(rel);
     751          16 :     p.rel = rel;
     752          16 :     p.vmbuffer = InvalidBuffer;
     753          16 :     p.all_frozen = all_frozen;
     754          16 :     p.all_visible = all_visible;
     755          16 :     stream = read_stream_begin_relation(READ_STREAM_FULL,
     756             :                                         bstrategy,
     757             :                                         rel,
     758             :                                         MAIN_FORKNUM,
     759             :                                         collect_corrupt_items_read_stream_next_block,
     760             :                                         &p,
     761             :                                         0);
     762             : 
     763             :     /* Loop over the blocks selected by the stream callback. */
     764         206 :     while ((buffer = read_stream_next_buffer(stream, NULL)) != InvalidBuffer)
     765             :     {
     766         190 :         bool        check_frozen = all_frozen;
     767         190 :         bool        check_visible = all_visible;
     768             :         Page        page;
     769             :         OffsetNumber offnum,
     770             :                     maxoff;
     771             :         BlockNumber blkno;
     772             : 
     773             :         /* Make sure we are interruptible. */
     774         190 :         CHECK_FOR_INTERRUPTS();
     775             : 
     776         190 :         LockBuffer(buffer, BUFFER_LOCK_SHARE);
     777             : 
     778         190 :         page = BufferGetPage(buffer);
     779         190 :         maxoff = PageGetMaxOffsetNumber(page);
     780         190 :         blkno = BufferGetBlockNumber(buffer);
     781             : 
     782             :         /*
     783             :          * The visibility map bits might have changed while we were acquiring
     784             :          * the page lock.  Recheck to avoid returning spurious results.
     785             :          */
     786         190 :         if (check_frozen && !VM_ALL_FROZEN(rel, blkno, &vmbuffer))
     787           0 :             check_frozen = false;
     788         190 :         if (check_visible && !VM_ALL_VISIBLE(rel, blkno, &vmbuffer))
     789           0 :             check_visible = false;
     790         190 :         if (!check_visible && !check_frozen)
     791             :         {
     792           0 :             UnlockReleaseBuffer(buffer);
     793           0 :             continue;
     794             :         }
     795             : 
     796             :         /* Iterate over each line pointer on the page. */
     797       32252 :         for (offnum = FirstOffsetNumber;
     798             :              offnum <= maxoff;
     799       32062 :              offnum = OffsetNumberNext(offnum))
     800             :         {
     801             :             HeapTupleData tuple;
     802             :             ItemId      itemid;
     803             : 
     804       32062 :             itemid = PageGetItemId(page, offnum);
     805             : 
     806             :             /* Unused or redirect line pointers are of no interest. */
     807       32062 :             if (!ItemIdIsUsed(itemid) || ItemIdIsRedirected(itemid))
     808           0 :                 continue;
     809             : 
     810             :             /* Dead line pointers are neither all-visible nor frozen. */
     811       32062 :             if (ItemIdIsDead(itemid))
     812             :             {
     813           0 :                 ItemPointerSet(&(tuple.t_self), blkno, offnum);
     814           0 :                 record_corrupt_item(items, &tuple.t_self);
     815           0 :                 continue;
     816             :             }
     817             : 
     818             :             /* Initialize a HeapTupleData structure for checks below. */
     819       32062 :             ItemPointerSet(&(tuple.t_self), blkno, offnum);
     820       32062 :             tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
     821       32062 :             tuple.t_len = ItemIdGetLength(itemid);
     822       32062 :             tuple.t_tableOid = relid;
     823             : 
     824             :             /*
     825             :              * If we're checking whether the page is all-visible, we expect
     826             :              * the tuple to be all-visible.
     827             :              */
     828       32062 :             if (check_visible &&
     829       16018 :                 !tuple_all_visible(&tuple, OldestXmin, buffer))
     830             :             {
     831             :                 TransactionId RecomputedOldestXmin;
     832             : 
     833             :                 /*
     834             :                  * Time has passed since we computed OldestXmin, so it's
     835             :                  * possible that this tuple is all-visible in reality even
     836             :                  * though it doesn't appear so based on our previously-computed
     837             :                  * value.  Recompute it: if the horizon has not advanced the
     838             :                  * verdict stands; otherwise retest the tuple against it.
     839             :                  *
     840             :                  * From a concurrency point of view, it sort of sucks to
     841             :                  * retake ProcArrayLock here while we're holding the buffer
     842             :                  * share-locked, but it should be safe against deadlocks,
     843             :                  * because surely
     844             :                  * GetStrictOldestNonRemovableTransactionId() should never
     845             :                  * take a buffer lock.  And this shouldn't happen often, so
     846             :                  * it's worth being careful so as to avoid false positives.
     847             :                  */
     848          10 :                 RecomputedOldestXmin = GetStrictOldestNonRemovableTransactionId(rel);
     849             : 
     850          10 :                 if (!TransactionIdPrecedes(OldestXmin, RecomputedOldestXmin))
     851          10 :                     record_corrupt_item(items, &tuple.t_self);
     852             :                 else
     853             :                 {
     854           0 :                     OldestXmin = RecomputedOldestXmin;
     855           0 :                     if (!tuple_all_visible(&tuple, OldestXmin, buffer))
     856           0 :                         record_corrupt_item(items, &tuple.t_self);
     857             :                 }
     858             :             }
     859             : 
     860             :             /*
     861             :              * If we're checking whether the page is all-frozen, we expect the
     862             :              * tuple to be in a state where it will never need freezing.
     863             :              */
     864       32062 :             if (check_frozen)
     865             :             {
     866       16044 :                 if (heap_tuple_needs_eventual_freeze(tuple.t_data))
     867          10 :                     record_corrupt_item(items, &tuple.t_self);
     868             :             }
     869             :         }
     870             : 
     871         190 :         UnlockReleaseBuffer(buffer);
     872             :     }
     873          16 :     read_stream_end(stream);
     874             : 
     875             :     /* Clean up; the stream callback kept its own VM buffer in p. */
     876          16 :     if (vmbuffer != InvalidBuffer)
     877          14 :         ReleaseBuffer(vmbuffer);
     878          16 :     if (p.vmbuffer != InvalidBuffer)
     879          16 :         ReleaseBuffer(p.vmbuffer);
     880          16 :     relation_close(rel, AccessShareLock);
     881             : 
     882             :     /*
     883             :      * Before returning, repurpose the fields to match caller's expectations.
     884             :      * next is now the next item that should be read (rather than written) and
     885             :      * count is now the number of items we wrote (rather than the number we
     886             :      * allocated).
     887             :      */
     888          16 :     items->count = items->next;
     889          16 :     items->next = 0;
     890             : 
     891          16 :     return items;
     892             : }
     893             : 
     894             : /*
     895             :  * Append one TID to items->tids, doubling the array first if it is full.
     896             :  */
     897             : static void
     898          20 : record_corrupt_item(corrupt_items *items, ItemPointer tid)
     899             : {
     900             :     /* Array full (next has reached the allocated count): double it. */
     901          20 :     if (items->next >= items->count)
     902             :     {
     903           0 :         items->count *= 2;
     904           0 :         items->tids = repalloc(items->tids,
     905           0 :                                items->count * sizeof(ItemPointerData));
     906             :     }
     907             :     /* Store a copy of the TID and advance the write position. */
     908          20 :     items->tids[items->next++] = *tid;
     909          20 : }
     910             : 
     911             : /*
     912             :  * Check whether a tuple is all-visible (HEAPTUPLE_LIVE, xmin preceding the
     913             :  * given OldestXmin).  The buffer holding it should be locked and pinned.
     914             :  */
     915             : static bool
     916       16018 : tuple_all_visible(HeapTuple tup, TransactionId OldestXmin, Buffer buffer)
     917             : {
     918             :     HTSV_Result state;
     919             :     TransactionId xmin;
     920             : 
     921       16018 :     state = HeapTupleSatisfiesVacuum(tup, OldestXmin, buffer);
     922       16018 :     if (state != HEAPTUPLE_LIVE)
     923          10 :         return false;           /* all-visible implies live */
     924             : 
     925             :     /*
     926             :      * Neither lazy_scan_heap nor heap_page_is_all_visible will mark a page
     927             :      * all-visible unless every tuple is hinted committed.  However, those
     928             :      * hint bits could be lost after a crash, so we can't be certain that
     929             :      * they'll be set here.  So just check the xmin.
     930             :      */
     931             : 
     932       16008 :     xmin = HeapTupleHeaderGetXmin(tup->t_data);
     933       16008 :     if (!TransactionIdPrecedes(xmin, OldestXmin))
     934           0 :         return false;           /* xmin not old enough for all to see */
     935             : 
     936       16008 :     return true;
     937             : }
     938             : 
     939             : /*
     940             :  * check_relation_relkind - raise an ERROR unless the relation's relkind
     941             :  * satisfies RELKIND_HAS_TABLE_AM()
     942             :  */
     943             : static void
     944          92 : check_relation_relkind(Relation rel)
     945             : {
     946          92 :     if (!RELKIND_HAS_TABLE_AM(rel->rd_rel->relkind))
     947          50 :         ereport(ERROR,
     948             :                 (errcode(ERRCODE_WRONG_OBJECT_TYPE),
     949             :                  errmsg("relation \"%s\" is of wrong relation kind",
     950             :                         RelationGetRelationName(rel)),
     951             :                  errdetail_relkind_not_supported(rel->rd_rel->relkind)));
     952          42 : }

Generated by: LCOV version 1.14