LCOV - code coverage report
Current view: top level - src/backend/access/table - tableam.c (source / functions)
Test: PostgreSQL 19devel
Date: 2025-12-12 09:18:47

                 Hit     Total    Coverage
Lines:           179       199      89.9 %
Functions:        19        19     100.0 %

Legend: Lines: hit | not hit

          Line data    Source code
       1             : /*----------------------------------------------------------------------
       2             :  *
       3             :  * tableam.c
       4             :  *      Table access method routines too big to be inline functions.
       5             :  *
       6             :  * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
       7             :  * Portions Copyright (c) 1994, Regents of the University of California
       8             :  *
       9             :  *
      10             :  * IDENTIFICATION
      11             :  *    src/backend/access/table/tableam.c
      12             :  *
      13             :  * NOTES
      14             :  *    Note that most functions in here are documented in tableam.h, rather than
      15             :  *    here. That's because there are a lot of inline functions in tableam.h and
      16             :  *    it'd be harder to understand if one constantly had to switch between files.
      17             :  *
      18             :  *----------------------------------------------------------------------
      19             :  */
      20             : #include "postgres.h"
      21             : 
      22             : #include <math.h>
      23             : 
      24             : #include "access/syncscan.h"
      25             : #include "access/tableam.h"
      26             : #include "access/xact.h"
      27             : #include "optimizer/optimizer.h"
      28             : #include "optimizer/plancat.h"
      29             : #include "port/pg_bitutils.h"
      30             : #include "storage/bufmgr.h"
      31             : #include "storage/shmem.h"
      32             : #include "storage/smgr.h"
      33             : 
      34             : /*
      35             :  * Constants to control the behavior of block allocation to parallel workers
      36             :  * during a parallel seqscan.  Technically these values do not need to be
      37             :  * powers of 2, but having them as powers of 2 makes the math simpler
      38             :  * and makes the ramp-down stepping more even.
      39             :  */
      40             : 
      41             : /* The number of I/O chunks we try to break a parallel seqscan down into */
      42             : #define PARALLEL_SEQSCAN_NCHUNKS            2048
      43             : /* Ramp down size of allocations when we've only this number of chunks left */
      44             : #define PARALLEL_SEQSCAN_RAMPDOWN_CHUNKS    64
      45             : /* Cap the size of parallel I/O chunks to this number of blocks */
      46             : #define PARALLEL_SEQSCAN_MAX_CHUNK_SIZE     8192
      47             : 
      48             : /* GUC variables */
      49             : char       *default_table_access_method = DEFAULT_TABLE_ACCESS_METHOD;
      50             : bool        synchronize_seqscans = true;
      51             : 
      52             : 
      53             : /* ----------------------------------------------------------------------------
      54             :  * Slot functions.
      55             :  * ----------------------------------------------------------------------------
      56             :  */
      57             : 
      58             : const TupleTableSlotOps *
      59    28786952 : table_slot_callbacks(Relation relation)
      60             : {
      61             :     const TupleTableSlotOps *tts_cb;
      62             : 
      63    28786952 :     if (relation->rd_tableam)
      64    28779874 :         tts_cb = relation->rd_tableam->slot_callbacks(relation);
      65        7078 :     else if (relation->rd_rel->relkind == RELKIND_FOREIGN_TABLE)
      66             :     {
      67             :         /*
      68             :          * Historically FDWs expect to store heap tuples in slots. Continue
      69             :          * handing them one, to make it less painful to adapt FDWs to new
      70             :          * versions. The cost of a heap slot over a virtual slot is pretty
      71             :          * small.
      72             :          */
      73         444 :         tts_cb = &TTSOpsHeapTuple;
      74             :     }
      75             :     else
      76             :     {
      77             :         /*
      78             :          * These need to be supported, as some parts of the code (like COPY)
      79             :          * need to create slots for such relations too. It seems better to
      80             :          * centralize the knowledge that a virtual slot is the right thing in
      81             :          * that case here.
      82             :          */
      83             :         Assert(relation->rd_rel->relkind == RELKIND_VIEW ||
      84             :                relation->rd_rel->relkind == RELKIND_PARTITIONED_TABLE);
      85        6634 :         tts_cb = &TTSOpsVirtual;
      86             :     }
      87             : 
      88    28786952 :     return tts_cb;
      89             : }
      90             : 
      91             : TupleTableSlot *
      92    28315842 : table_slot_create(Relation relation, List **reglist)
      93             : {
      94             :     const TupleTableSlotOps *tts_cb;
      95             :     TupleTableSlot *slot;
      96             : 
      97    28315842 :     tts_cb = table_slot_callbacks(relation);
      98    28315842 :     slot = MakeSingleTupleTableSlot(RelationGetDescr(relation), tts_cb);
      99             : 
     100    28315842 :     if (reglist)
     101      280236 :         *reglist = lappend(*reglist, slot);
     102             : 
     103    28315842 :     return slot;
     104             : }
     105             : 
     106             : 
     107             : /* ----------------------------------------------------------------------------
     108             :  * Table scan functions.
     109             :  * ----------------------------------------------------------------------------
     110             :  */
     111             : 
     112             : TableScanDesc
     113       78844 : table_beginscan_catalog(Relation relation, int nkeys, ScanKeyData *key)
     114             : {
     115       78844 :     uint32      flags = SO_TYPE_SEQSCAN |
     116             :         SO_ALLOW_STRAT | SO_ALLOW_SYNC | SO_ALLOW_PAGEMODE | SO_TEMP_SNAPSHOT;
     117       78844 :     Oid         relid = RelationGetRelid(relation);
     118       78844 :     Snapshot    snapshot = RegisterSnapshot(GetCatalogSnapshot(relid));
     119             : 
     120       78844 :     return relation->rd_tableam->scan_begin(relation, snapshot, nkeys, key,
     121             :                                             NULL, flags);
     122             : }
     123             : 
     124             : 
     125             : /* ----------------------------------------------------------------------------
     126             :  * Parallel table scan related functions.
     127             :  * ----------------------------------------------------------------------------
     128             :  */
     129             : 
     130             : Size
     131        1112 : table_parallelscan_estimate(Relation rel, Snapshot snapshot)
     132             : {
     133        1112 :     Size        sz = 0;
     134             : 
     135        1112 :     if (IsMVCCSnapshot(snapshot))
     136         948 :         sz = add_size(sz, EstimateSnapshotSpace(snapshot));
     137             :     else
     138             :         Assert(snapshot == SnapshotAny);
     139             : 
     140        1112 :     sz = add_size(sz, rel->rd_tableam->parallelscan_estimate(rel));
     141             : 
     142        1112 :     return sz;
     143             : }
     144             : 
     145             : void
     146        1112 : table_parallelscan_initialize(Relation rel, ParallelTableScanDesc pscan,
     147             :                               Snapshot snapshot)
     148             : {
     149        1112 :     Size        snapshot_off = rel->rd_tableam->parallelscan_initialize(rel, pscan);
     150             : 
     151        1112 :     pscan->phs_snapshot_off = snapshot_off;
     152             : 
     153        1112 :     if (IsMVCCSnapshot(snapshot))
     154             :     {
     155         948 :         SerializeSnapshot(snapshot, (char *) pscan + pscan->phs_snapshot_off);
     156         948 :         pscan->phs_snapshot_any = false;
     157             :     }
     158             :     else
     159             :     {
     160             :         Assert(snapshot == SnapshotAny);
     161         164 :         pscan->phs_snapshot_any = true;
     162             :     }
     163        1112 : }
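
A minimal sketch of how the two routines above are meant to be paired by a caller, assuming the descriptor has been placed in shared memory; shm_alloc is a hypothetical placeholder for whatever allocator the caller actually uses, and the variable names are illustrative only:

    /* Illustrative sketch; shm_alloc is a hypothetical allocator. */
    Size        sz = table_parallelscan_estimate(rel, snapshot);
    ParallelTableScanDesc pscan = (ParallelTableScanDesc) shm_alloc(sz);

    /* leader: AM-specific state is laid out first, the serialized snapshot follows at phs_snapshot_off */
    table_parallelscan_initialize(rel, pscan, snapshot);

    /* each worker: attach to the shared state and begin scanning */
    TableScanDesc scan = table_beginscan_parallel(rel, pscan);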
     164             : 
     165             : TableScanDesc
     166        3984 : table_beginscan_parallel(Relation relation, ParallelTableScanDesc pscan)
     167             : {
     168             :     Snapshot    snapshot;
     169        3984 :     uint32      flags = SO_TYPE_SEQSCAN |
     170             :         SO_ALLOW_STRAT | SO_ALLOW_SYNC | SO_ALLOW_PAGEMODE;
     171             : 
     172             :     Assert(RelFileLocatorEquals(relation->rd_locator, pscan->phs_locator));
     173             : 
     174        3984 :     if (!pscan->phs_snapshot_any)
     175             :     {
     176             :         /* Snapshot was serialized -- restore it */
     177        3656 :         snapshot = RestoreSnapshot((char *) pscan + pscan->phs_snapshot_off);
     178        3656 :         RegisterSnapshot(snapshot);
     179        3656 :         flags |= SO_TEMP_SNAPSHOT;
     180             :     }
     181             :     else
     182             :     {
     183             :         /* SnapshotAny passed by caller (not serialized) */
     184         328 :         snapshot = SnapshotAny;
     185             :     }
     186             : 
     187        3984 :     return relation->rd_tableam->scan_begin(relation, snapshot, 0, NULL,
     188             :                                             pscan, flags);
     189             : }
     190             : 
     191             : TableScanDesc
     192         120 : table_beginscan_parallel_tidrange(Relation relation,
     193             :                                   ParallelTableScanDesc pscan)
     194             : {
     195             :     Snapshot    snapshot;
     196         120 :     uint32      flags = SO_TYPE_TIDRANGESCAN | SO_ALLOW_PAGEMODE;
     197             :     TableScanDesc sscan;
     198             : 
     199             :     Assert(RelFileLocatorEquals(relation->rd_locator, pscan->phs_locator));
     200             : 
     201             :     /* disable syncscan in parallel tid range scan. */
     202         120 :     pscan->phs_syncscan = false;
     203             : 
     204         120 :     if (!pscan->phs_snapshot_any)
     205             :     {
     206             :         /* Snapshot was serialized -- restore it */
     207         120 :         snapshot = RestoreSnapshot((char *) pscan + pscan->phs_snapshot_off);
     208         120 :         RegisterSnapshot(snapshot);
     209         120 :         flags |= SO_TEMP_SNAPSHOT;
     210             :     }
     211             :     else
     212             :     {
     213             :         /* SnapshotAny passed by caller (not serialized) */
     214           0 :         snapshot = SnapshotAny;
     215             :     }
     216             : 
     217         120 :     sscan = relation->rd_tableam->scan_begin(relation, snapshot, 0, NULL,
     218             :                                              pscan, flags);
     219         120 :     return sscan;
     220             : }
     221             : 
     222             : 
     223             : /* ----------------------------------------------------------------------------
     224             :  * Index scan related functions.
     225             :  * ----------------------------------------------------------------------------
     226             :  */
     227             : 
     228             : /*
     229             :  * To perform that check simply start an index scan, create the necessary
     230             :  * slot, do the heap lookup, and shut everything down again. This could be
     231             :  * optimized, but is unlikely to matter from a performance POV. If there
     232             :  * frequently are live index pointers also matching a unique index key, the
     233             :  * CPU overhead of this routine is unlikely to matter.
     234             :  *
     235             :  * Note that *tid may be modified when we return true if the AM supports
     236             :  * storing multiple row versions reachable via a single index entry (like
     237             :  * heap's HOT).
     238             :  */
     239             : bool
     240    11475444 : table_index_fetch_tuple_check(Relation rel,
     241             :                               ItemPointer tid,
     242             :                               Snapshot snapshot,
     243             :                               bool *all_dead)
     244             : {
     245             :     IndexFetchTableData *scan;
     246             :     TupleTableSlot *slot;
     247    11475444 :     bool        call_again = false;
     248             :     bool        found;
     249             : 
     250    11475444 :     slot = table_slot_create(rel, NULL);
     251    11475444 :     scan = table_index_fetch_begin(rel);
     252    11475444 :     found = table_index_fetch_tuple(scan, tid, snapshot, slot, &call_again,
     253             :                                     all_dead);
     254    11475444 :     table_index_fetch_end(scan);
     255    11475444 :     ExecDropSingleTupleTableSlot(slot);
     256             : 
     257    11475444 :     return found;
     258             : }
     259             : 
     260             : 
     261             : /* ------------------------------------------------------------------------
     262             :  * Functions for non-modifying operations on individual tuples
     263             :  * ------------------------------------------------------------------------
     264             :  */
     265             : 
     266             : void
     267         312 : table_tuple_get_latest_tid(TableScanDesc scan, ItemPointer tid)
     268             : {
     269         312 :     Relation    rel = scan->rs_rd;
     270         312 :     const TableAmRoutine *tableam = rel->rd_tableam;
     271             : 
     272             :     /*
     273             :      * We don't expect direct calls to table_tuple_get_latest_tid with valid
     274             :      * CheckXidAlive for catalog or regular tables.  See detailed comments in
     275             :      * xact.c where these variables are declared.
     276             :      */
     277         312 :     if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan))
     278           0 :         elog(ERROR, "unexpected table_tuple_get_latest_tid call during logical decoding");
     279             : 
     280             :     /*
     281             :      * Since this can be called with user-supplied TID, don't trust the input
     282             :      * too much.
     283             :      */
     284         312 :     if (!tableam->tuple_tid_valid(scan, tid))
     285          12 :         ereport(ERROR,
     286             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     287             :                  errmsg("tid (%u, %u) is not valid for relation \"%s\"",
     288             :                         ItemPointerGetBlockNumberNoCheck(tid),
     289             :                         ItemPointerGetOffsetNumberNoCheck(tid),
     290             :                         RelationGetRelationName(rel))));
     291             : 
     292         300 :     tableam->tuple_get_latest_tid(scan, tid);
     293         300 : }
     294             : 
     295             : 
     296             : /* ----------------------------------------------------------------------------
     297             :  * Functions to make modifications a bit simpler.
     298             :  * ----------------------------------------------------------------------------
     299             :  */
     300             : 
     301             : /*
     302             :  * simple_table_tuple_insert - insert a tuple
     303             :  *
     304             :  * Currently, this routine differs from table_tuple_insert only in supplying a
     305             :  * default command ID and not allowing access to the speedup options.
     306             :  */
     307             : void
     308      152072 : simple_table_tuple_insert(Relation rel, TupleTableSlot *slot)
     309             : {
     310      152072 :     table_tuple_insert(rel, slot, GetCurrentCommandId(true), 0, NULL);
     311      152072 : }
     312             : 
     313             : /*
     314             :  * simple_table_tuple_delete - delete a tuple
     315             :  *
     316             :  * This routine may be used to delete a tuple when concurrent updates of
     317             :  * the target tuple are not expected (for example, because we have a lock
     318             :  * on the relation associated with the tuple).  Any failure is reported
     319             :  * via ereport().
     320             :  */
     321             : void
     322       80624 : simple_table_tuple_delete(Relation rel, ItemPointer tid, Snapshot snapshot)
     323             : {
     324             :     TM_Result   result;
     325             :     TM_FailureData tmfd;
     326             : 
     327       80624 :     result = table_tuple_delete(rel, tid,
     328             :                                 GetCurrentCommandId(true),
     329             :                                 snapshot, InvalidSnapshot,
     330             :                                 true /* wait for commit */ ,
     331             :                                 &tmfd, false /* changingPart */ );
     332             : 
     333       80624 :     switch (result)
     334             :     {
     335           0 :         case TM_SelfModified:
     336             :             /* Tuple was already updated in current command? */
     337           0 :             elog(ERROR, "tuple already updated by self");
     338             :             break;
     339             : 
     340       80624 :         case TM_Ok:
     341             :             /* done successfully */
     342       80624 :             break;
     343             : 
     344           0 :         case TM_Updated:
     345           0 :             elog(ERROR, "tuple concurrently updated");
     346             :             break;
     347             : 
     348           0 :         case TM_Deleted:
     349           0 :             elog(ERROR, "tuple concurrently deleted");
     350             :             break;
     351             : 
     352           0 :         default:
     353           0 :             elog(ERROR, "unrecognized table_tuple_delete status: %u", result);
     354             :             break;
     355             :     }
     356       80624 : }
     357             : 
     358             : /*
     359             :  * simple_table_tuple_update - replace a tuple
     360             :  *
     361             :  * This routine may be used to update a tuple when concurrent updates of
     362             :  * the target tuple are not expected (for example, because we have a lock
     363             :  * on the relation associated with the tuple).  Any failure is reported
     364             :  * via ereport().
     365             :  */
     366             : void
     367       63836 : simple_table_tuple_update(Relation rel, ItemPointer otid,
     368             :                           TupleTableSlot *slot,
     369             :                           Snapshot snapshot,
     370             :                           TU_UpdateIndexes *update_indexes)
     371             : {
     372             :     TM_Result   result;
     373             :     TM_FailureData tmfd;
     374             :     LockTupleMode lockmode;
     375             : 
     376       63836 :     result = table_tuple_update(rel, otid, slot,
     377             :                                 GetCurrentCommandId(true),
     378             :                                 snapshot, InvalidSnapshot,
     379             :                                 true /* wait for commit */ ,
     380             :                                 &tmfd, &lockmode, update_indexes);
     381             : 
     382       63836 :     switch (result)
     383             :     {
     384           0 :         case TM_SelfModified:
     385             :             /* Tuple was already updated in current command? */
     386           0 :             elog(ERROR, "tuple already updated by self");
     387             :             break;
     388             : 
     389       63836 :         case TM_Ok:
     390             :             /* done successfully */
     391       63836 :             break;
     392             : 
     393           0 :         case TM_Updated:
     394           0 :             elog(ERROR, "tuple concurrently updated");
     395             :             break;
     396             : 
     397           0 :         case TM_Deleted:
     398           0 :             elog(ERROR, "tuple concurrently deleted");
     399             :             break;
     400             : 
     401           0 :         default:
     402           0 :             elog(ERROR, "unrecognized table_tuple_update status: %u", result);
     403             :             break;
     404             :     }
     405       63836 : }
     406             : 
     407             : 
     408             : /* ----------------------------------------------------------------------------
     409             :  * Helper functions to implement parallel scans for block oriented AMs.
     410             :  * ----------------------------------------------------------------------------
     411             :  */
     412             : 
     413             : Size
     414        1112 : table_block_parallelscan_estimate(Relation rel)
     415             : {
     416        1112 :     return sizeof(ParallelBlockTableScanDescData);
     417             : }
     418             : 
     419             : Size
     420        1112 : table_block_parallelscan_initialize(Relation rel, ParallelTableScanDesc pscan)
     421             : {
     422        1112 :     ParallelBlockTableScanDesc bpscan = (ParallelBlockTableScanDesc) pscan;
     423             : 
     424        1112 :     bpscan->base.phs_locator = rel->rd_locator;
     425        1112 :     bpscan->phs_nblocks = RelationGetNumberOfBlocks(rel);
     426             :     /* compare phs_syncscan initialization to similar logic in initscan */
     427        2974 :     bpscan->base.phs_syncscan = synchronize_seqscans &&
     428        1862 :         !RelationUsesLocalBuffers(rel) &&
     429         750 :         bpscan->phs_nblocks > NBuffers / 4;
     430        1112 :     SpinLockInit(&bpscan->phs_mutex);
     431        1112 :     bpscan->phs_startblock = InvalidBlockNumber;
     432        1112 :     bpscan->phs_numblock = InvalidBlockNumber;
     433        1112 :     pg_atomic_init_u64(&bpscan->phs_nallocated, 0);
     434             : 
     435        1112 :     return sizeof(ParallelBlockTableScanDescData);
     436             : }
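
As a worked illustration of the phs_syncscan test above, assuming the default 8kB block size and the stock 128MB shared_buffers setting (both are configuration-dependent):

    /* Illustration only: numbers assume BLCKSZ = 8kB, shared_buffers = 128MB. */
    NBuffers      = 128MB / 8kB  = 16384 buffers
    NBuffers / 4  = 16384 / 4    = 4096 blocks   (32MB of table data)

So a relation becomes a candidate for a synchronized scan once it exceeds roughly 32MB, and only when synchronize_seqscans is on and the relation does not use local buffers (temporary tables never qualify).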
     437             : 
     438             : void
     439         228 : table_block_parallelscan_reinitialize(Relation rel, ParallelTableScanDesc pscan)
     440             : {
     441         228 :     ParallelBlockTableScanDesc bpscan = (ParallelBlockTableScanDesc) pscan;
     442             : 
     443         228 :     pg_atomic_write_u64(&bpscan->phs_nallocated, 0);
     444         228 : }
     445             : 
     446             : /*
     447             :  * find and set the scan's startblock
     448             :  *
     449             :  * Determine where the parallel seq scan should start.  This function may be
     450             :  * called many times, once by each parallel worker.  We must be careful only
     451             :  * to set the phs_startblock and phs_numblock fields once.
     452             :  *
     453             :  * Callers may optionally specify a non-InvalidBlockNumber value for
     454             :  * 'startblock' to force the scan to start at the given page.  Likewise,
     455             :  * 'numblocks' can be specified as a non-InvalidBlockNumber to limit the
     456             :  * number of blocks to scan to that many blocks.
     457             :  */
     458             : void
     459        3148 : table_block_parallelscan_startblock_init(Relation rel,
     460             :                                          ParallelBlockTableScanWorker pbscanwork,
     461             :                                          ParallelBlockTableScanDesc pbscan,
     462             :                                          BlockNumber startblock,
     463             :                                          BlockNumber numblocks)
     464             : {
     465        3148 :     BlockNumber sync_startpage = InvalidBlockNumber;
     466             :     BlockNumber scan_nblocks;
     467             : 
     468             :     /* Reset the state we use for controlling allocation size. */
     469        3148 :     memset(pbscanwork, 0, sizeof(*pbscanwork));
     470             : 
     471             :     StaticAssertStmt(MaxBlockNumber <= 0xFFFFFFFE,
     472             :                      "pg_nextpower2_32 may be too small for non-standard BlockNumber width");
     473             : 
     474        3150 : retry:
     475             :     /* Grab the spinlock. */
     476        3150 :     SpinLockAcquire(&pbscan->phs_mutex);
     477             : 
     478             :     /*
     479             :      * When the caller specified a limit on the number of blocks to scan, set
     480             :      * that in the ParallelBlockTableScanDesc, if it's not been done by
     481             :      * another worker already.
     482             :      */
     483        3150 :     if (numblocks != InvalidBlockNumber &&
     484         120 :         pbscan->phs_numblock == InvalidBlockNumber)
     485             :     {
     486          24 :         pbscan->phs_numblock = numblocks;
     487             :     }
     488             : 
     489             :     /*
     490             :      * If the scan's phs_startblock has not yet been initialized, we must do
     491             :      * so now.  If a startblock was specified, start there, otherwise if this
     492             :      * is not a synchronized scan, we just start at block 0, but if it is a
     493             :      * synchronized scan, we must get the starting position from the
     494             :      * synchronized scan machinery.  We can't hold the spinlock while doing
     495             :      * that, though, so release the spinlock, get the information we need, and
     496             :      * retry.  If nobody else has initialized the scan in the meantime, we'll
     497             :      * fill in the value we fetched on the second time through.
     498             :      */
     499        3150 :     if (pbscan->phs_startblock == InvalidBlockNumber)
     500             :     {
     501        1094 :         if (startblock != InvalidBlockNumber)
     502          24 :             pbscan->phs_startblock = startblock;
     503        1070 :         else if (!pbscan->base.phs_syncscan)
     504        1066 :             pbscan->phs_startblock = 0;
     505           4 :         else if (sync_startpage != InvalidBlockNumber)
     506           2 :             pbscan->phs_startblock = sync_startpage;
     507             :         else
     508             :         {
     509           2 :             SpinLockRelease(&pbscan->phs_mutex);
     510           2 :             sync_startpage = ss_get_location(rel, pbscan->phs_nblocks);
     511           2 :             goto retry;
     512             :         }
     513             :     }
     514        3148 :     SpinLockRelease(&pbscan->phs_mutex);
     515             : 
     516             :     /*
     517             :      * Figure out how many blocks we're going to scan; either all of them, or
     518             :      * just phs_numblock's worth, if a limit has been imposed.
     519             :      */
     520        3148 :     if (pbscan->phs_numblock == InvalidBlockNumber)
     521        3028 :         scan_nblocks = pbscan->phs_nblocks;
     522             :     else
     523         120 :         scan_nblocks = pbscan->phs_numblock;
     524             : 
     525             :     /*
     526             :      * We determine the chunk size based on scan_nblocks.  First we split
     527             :      * scan_nblocks into PARALLEL_SEQSCAN_NCHUNKS chunks, then round the result
     528             :      * up to the next power of 2.  This means we split the
     529             :      * blocks we're scanning into somewhere between PARALLEL_SEQSCAN_NCHUNKS
     530             :      * and PARALLEL_SEQSCAN_NCHUNKS / 2 chunks.
     531             :      */
     532        3148 :     pbscanwork->phsw_chunk_size = pg_nextpower2_32(Max(scan_nblocks /
     533             :                                                        PARALLEL_SEQSCAN_NCHUNKS, 1));
     534             : 
     535             :     /*
     536             :      * Ensure we don't go over the maximum chunk size with larger tables. This
     537             :      * means we may get much more than PARALLEL_SEQSCAN_NCHUNKS for larger
     538             :      * tables.  Too large a chunk size has been shown to be detrimental to
     539             :      * sequential scan performance.
     540             :      */
     541        3148 :     pbscanwork->phsw_chunk_size = Min(pbscanwork->phsw_chunk_size,
     542             :                                       PARALLEL_SEQSCAN_MAX_CHUNK_SIZE);
     543        3148 : }
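
To make the chunk-size arithmetic above concrete, here is a worked example for a hypothetical scan of 1,000,000 blocks, using the constants defined near the top of this file:

    /* Illustration only. */
    scan_nblocks / PARALLEL_SEQSCAN_NCHUNKS   = 1000000 / 2048 = 488   (integer division)
    pg_nextpower2_32(488)                     = 512
    Min(512, PARALLEL_SEQSCAN_MAX_CHUNK_SIZE) = 512 blocks per chunk   (about 1950 chunks)

table_block_parallelscan_nextpage later halves that 512 step by step (512, 256, ..., 1) once fewer than PARALLEL_SEQSCAN_RAMPDOWN_CHUNKS chunks' worth of blocks remain, so the tail of the scan is spread evenly across workers.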
     544             : 
     545             : /*
     546             :  * get the next page to scan
     547             :  *
     548             :  * Get the next page to scan.  Even if there are no pages left to scan,
     549             :  * another backend could have grabbed a page to scan and not yet finished
     550             :  * looking at it, so it doesn't follow that the scan is done when the first
     551             :  * backend gets an InvalidBlockNumber return.
     552             :  */
     553             : BlockNumber
     554      202544 : table_block_parallelscan_nextpage(Relation rel,
     555             :                                   ParallelBlockTableScanWorker pbscanwork,
     556             :                                   ParallelBlockTableScanDesc pbscan)
     557             : {
     558             :     BlockNumber scan_nblocks;
     559             :     BlockNumber page;
     560             :     uint64      nallocated;
     561             : 
     562             :     /*
     563             :      * The logic below allocates block numbers out to parallel workers in a
     564             :      * way that each worker will receive a set of consecutive block numbers to
     565             :      * scan.  Earlier versions of this would allocate the next highest block
     566             :      * number to the next worker to call this function.  This would generally
     567             :      * result in workers never receiving consecutive block numbers.  Some
     568             :      * operating systems would not detect the sequential I/O pattern due to
     569             :      * each backend being a different process which could result in poor
     570             :      * performance due to inefficient or no readahead.  To work around this
     571             :      * issue, we now allocate a range of block numbers for each worker and
     572             :      * when they come back for another block, we give them the next one in
     573             :      * that range until the range is complete.  When the worker completes the
     574             :      * range of blocks we then allocate another range for it and return the
     575             :      * first block number from that range.
     576             :      *
     577             :      * Here we name these ranges of blocks "chunks".  The initial size of
     578             :      * these chunks is determined in table_block_parallelscan_startblock_init
     579             :      * based on the number of blocks to scan.  Towards the end of the scan, we
     580             :      * start making reductions in the size of the chunks in order to attempt
     581             :      * to divide the remaining work over all the workers as evenly as
     582             :      * possible.
     583             :      *
     584             :      * Here pbscanwork is local worker memory.  phsw_chunk_remaining tracks
     585             :      * the number of blocks remaining in the chunk.  When that reaches 0 then
     586             :      * we must allocate a new chunk for the worker.
     587             :      *
     588             :      * phs_nallocated tracks how many blocks have been allocated to workers
     589             :      * already.  When phs_nallocated >= scan_nblocks, all blocks have been
     590             :      * allocated.
     591             :      *
     592             :      * Because we use an atomic fetch-and-add to fetch the current value, the
     593             :      * phs_nallocated counter will exceed scan_nblocks, because workers will
     594             :      * still increment the value, when they try to allocate the next block but
     595             :      * all blocks have been allocated already. The counter must be 64 bits
     596             :      * wide because of that, to avoid wrapping around when scan_nblocks is
     597             :      * close to 2^32.
     598             :      *
     599             :      * The actual block to return is calculated by adding the counter to the
     600             :      * starting block number, modulo phs_nblocks.
     601             :      */
     602             : 
     603             :     /* First, figure out how many blocks we're planning on scanning */
     604      202544 :     if (pbscan->phs_numblock == InvalidBlockNumber)
     605      201926 :         scan_nblocks = pbscan->phs_nblocks;
     606             :     else
     607         618 :         scan_nblocks = pbscan->phs_numblock;
     608             : 
     609             :     /*
     610             :      * Now check if we have any remaining blocks in a previous chunk for this
     611             :      * worker.  We must consume all of the blocks from that before we allocate
     612             :      * a new chunk to the worker.
     613             :      */
     614      202544 :     if (pbscanwork->phsw_chunk_remaining > 0)
     615             :     {
     616             :         /*
     617             :          * Give them the next block in the range and update the remaining
     618             :          * number of blocks.
     619             :          */
     620       13026 :         nallocated = ++pbscanwork->phsw_nallocated;
     621       13026 :         pbscanwork->phsw_chunk_remaining--;
     622             :     }
     623             :     else
     624             :     {
     625             :         /*
     626             :          * When we've only got PARALLEL_SEQSCAN_RAMPDOWN_CHUNKS chunks
     627             :          * remaining in the scan, we halve the chunk size.  Since we reduce the
     628             :          * chunk size here, we'll hit this again after doing
     629             :          * PARALLEL_SEQSCAN_RAMPDOWN_CHUNKS at the new size.  After a few
     630             :          * iterations of this, we'll end up doing the last few blocks with the
     631             :          * chunk size set to 1.
     632             :          */
     633      189518 :         if (pbscanwork->phsw_chunk_size > 1 &&
     634        4430 :             pbscanwork->phsw_nallocated > scan_nblocks -
     635        4430 :             (pbscanwork->phsw_chunk_size * PARALLEL_SEQSCAN_RAMPDOWN_CHUNKS))
     636           8 :             pbscanwork->phsw_chunk_size >>= 1;
     637             : 
     638      189518 :         nallocated = pbscanwork->phsw_nallocated =
     639      189518 :             pg_atomic_fetch_add_u64(&pbscan->phs_nallocated,
     640      189518 :                                     pbscanwork->phsw_chunk_size);
     641             : 
     642             :         /*
     643             :          * Set the remaining number of blocks in this chunk so that subsequent
     644             :          * calls from this worker continue on with this chunk until it's done.
     645             :          */
     646      189518 :         pbscanwork->phsw_chunk_remaining = pbscanwork->phsw_chunk_size - 1;
     647             :     }
     648             : 
     649             :     /* Check if we've run out of blocks to scan */
     650      202544 :     if (nallocated >= scan_nblocks)
     651        3148 :         page = InvalidBlockNumber;  /* all blocks have been allocated */
     652             :     else
     653      199396 :         page = (nallocated + pbscan->phs_startblock) % pbscan->phs_nblocks;
     654             : 
     655             :     /*
     656             :      * Report scan location.  Normally, we report the current page number.
     657             :      * When we reach the end of the scan, though, we report the starting page,
     658             :      * not the ending page, just so the starting positions for later scans
     659             :      * doesn't slew backwards.  We only report the position at the end of the
     660             :      * don't slew backwards.  We only report the position at the end of the
     661             :      */
     662      202544 :     if (pbscan->base.phs_syncscan)
     663             :     {
     664       17704 :         if (page != InvalidBlockNumber)
     665       17700 :             ss_report_location(rel, page);
     666           4 :         else if (nallocated == pbscan->phs_nblocks)
     667           2 :             ss_report_location(rel, pbscan->phs_startblock);
     668             :     }
     669             : 
     670      202544 :     return page;
     671             : }
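
A short worked example of the block-number calculation above, for a hypothetical synchronized scan; all numbers are illustrative:

    /* Illustration only. */
    phs_nblocks = scan_nblocks = 100, phs_startblock = 70   (chosen by the syncscan machinery)
    a worker's fetch-and-add returns nallocated = 40
    page = (40 + 70) % 100 = 10      -- the scan has wrapped past block 99 back to block 0
    once nallocated >= 100, every subsequent caller gets InvalidBlockNumber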
     672             : 
     673             : /* ----------------------------------------------------------------------------
     674             :  * Helper functions to implement relation sizing for block oriented AMs.
     675             :  * ----------------------------------------------------------------------------
     676             :  */
     677             : 
     678             : /*
     679             :  * table_block_relation_size
     680             :  *
     681             :  * If a table AM uses the various relation forks as the sole place where data
     682             :  * is stored, and if it uses them in the expected manner (e.g. the actual data
     683             :  * is in the main fork rather than some other), it can use this implementation
     684             :  * of the relation_size callback rather than implementing its own.
     685             :  */
     686             : uint64
     687     2916146 : table_block_relation_size(Relation rel, ForkNumber forkNumber)
     688             : {
     689     2916146 :     uint64      nblocks = 0;
     690             : 
     691             :     /* InvalidForkNumber indicates returning the size for all forks */
     692     2916146 :     if (forkNumber == InvalidForkNumber)
     693             :     {
     694           0 :         for (int i = 0; i < MAX_FORKNUM; i++)
     695           0 :             nblocks += smgrnblocks(RelationGetSmgr(rel), i);
     696             :     }
     697             :     else
     698     2916146 :         nblocks = smgrnblocks(RelationGetSmgr(rel), forkNumber);
     699             : 
     700     2916108 :     return nblocks * BLCKSZ;
     701             : }
     702             : 
     703             : /*
     704             :  * table_block_relation_estimate_size
     705             :  *
     706             :  * This function can't be directly used as the implementation of the
     707             :  * relation_estimate_size callback, because it has a few additional parameters.
     708             :  * Instead, it is intended to be used as a helper function; the caller can
     709             :  * pass through the arguments to its relation_estimate_size function plus the
     710             :  * additional values required here.
     711             :  *
     712             :  * overhead_bytes_per_tuple should contain the approximate number of bytes
     713             :  * of storage required to store a tuple above and beyond what is required for
     714             :  * the tuple data proper. Typically, this would include things like the
     715             :  * size of the tuple header and item pointer. This is only used for query
     716             :  * planning, so a table AM where the value is not constant could choose to
     717             :  * pass a "best guess".
     718             :  *
     719             :  * usable_bytes_per_page should contain the approximate number of bytes per
     720             :  * page usable for tuple data, excluding the page header and any anticipated
     721             :  * special space.
     722             :  */
     723             : void
     724      458100 : table_block_relation_estimate_size(Relation rel, int32 *attr_widths,
     725             :                                    BlockNumber *pages, double *tuples,
     726             :                                    double *allvisfrac,
     727             :                                    Size overhead_bytes_per_tuple,
     728             :                                    Size usable_bytes_per_page)
     729             : {
     730             :     BlockNumber curpages;
     731             :     BlockNumber relpages;
     732             :     double      reltuples;
     733             :     BlockNumber relallvisible;
     734             :     double      density;
     735             : 
     736             :     /* it should have storage, so we can call the smgr */
     737      458100 :     curpages = RelationGetNumberOfBlocks(rel);
     738             : 
     739             :     /* coerce values in pg_class to more desirable types */
     740      458100 :     relpages = (BlockNumber) rel->rd_rel->relpages;
     741      458100 :     reltuples = (double) rel->rd_rel->reltuples;
     742      458100 :     relallvisible = (BlockNumber) rel->rd_rel->relallvisible;
     743             : 
     744             :     /*
     745             :      * HACK: if the relation has never yet been vacuumed, use a minimum size
     746             :      * estimate of 10 pages.  The idea here is to avoid assuming a
     747             :      * newly-created table is really small, even if it currently is, because
     748             :      * that may not be true once some data gets loaded into it.  Once a vacuum
     749             :      * or analyze cycle has been done on it, it's more reasonable to believe
     750             :      * the size is somewhat stable.
     751             :      *
     752             :      * (Note that this is only an issue if the plan gets cached and used again
     753             :      * after the table has been filled.  What we're trying to avoid is using a
     754             :      * nestloop-type plan on a table that has grown substantially since the
     755             :      * plan was made.  Normally, autovacuum/autoanalyze will occur once enough
     756             :      * inserts have happened and cause cached-plan invalidation; but that
     757             :      * doesn't happen instantaneously, and it won't happen at all for cases
     758             :      * such as temporary tables.)
     759             :      *
     760             :      * We test "never vacuumed" by seeing whether reltuples < 0.
     761             :      *
     762             :      * If the table has inheritance children, we don't apply this heuristic.
     763             :      * Totally empty parent tables are quite common, so we should be willing
     764             :      * to believe that they are empty.
     765             :      */
     766      458100 :     if (curpages < 10 &&
     767      115714 :         reltuples < 0 &&
     768      115714 :         !rel->rd_rel->relhassubclass)
     769      113074 :         curpages = 10;
     770             : 
     771             :     /* report estimated # pages */
     772      458100 :     *pages = curpages;
     773             :     /* quick exit if rel is clearly empty */
     774      458100 :     if (curpages == 0)
     775             :     {
     776       21942 :         *tuples = 0;
     777       21942 :         *allvisfrac = 0;
     778       21942 :         return;
     779             :     }
     780             : 
     781             :     /* estimate number of tuples from previous tuple density */
     782      436158 :     if (reltuples >= 0 && relpages > 0)
     783      280658 :         density = reltuples / (double) relpages;
     784             :     else
     785             :     {
     786             :         /*
     787             :          * When we have no data because the relation was never yet vacuumed,
     788             :          * estimate tuple width from attribute datatypes.  We assume here that
     789             :          * the pages are completely full, which is OK for tables but is
     790             :          * probably an overestimate for indexes.  Fortunately
     791             :          * get_relation_info() can clamp the overestimate to the parent
     792             :          * table's size.
     793             :          *
     794             :          * Note: this code intentionally disregards alignment considerations,
     795             :          * because (a) that would be gilding the lily considering how crude
     796             :          * the estimate is, (b) it creates platform dependencies in the
     797             :          * default plans which are kind of a headache for regression testing,
     798             :          * and (c) different table AMs might use different padding schemes.
     799             :          */
     800             :         int32       tuple_width;
     801             :         int         fillfactor;
     802             : 
     803             :         /*
     804             :          * Without reltuples/relpages, we also need to consider fillfactor.
     805             :          * The other branch considers it implicitly by calculating density
     806             :          * from actual relpages/reltuples statistics.
     807             :          */
     808      155500 :         fillfactor = RelationGetFillFactor(rel, HEAP_DEFAULT_FILLFACTOR);
     809             : 
     810      155500 :         tuple_width = get_rel_data_width(rel, attr_widths);
     811      155500 :         tuple_width += overhead_bytes_per_tuple;
     812             :         /* note: integer division is intentional here */
     813      155500 :         density = (usable_bytes_per_page * fillfactor / 100) / tuple_width;
     814             :         /* There's at least one row on the page, even with low fillfactor. */
     815      155500 :         density = clamp_row_est(density);
     816             :     }
     817      436158 :     *tuples = rint(density * (double) curpages);
     818             : 
     819             :     /*
     820             :      * We use relallvisible as-is, rather than scaling it up like we do for
     821             :      * the pages and tuples counts, on the theory that any pages added since
     822             :      * the last VACUUM are most likely not marked all-visible.  But costsize.c
     823             :      * wants it converted to a fraction.
     824             :      */
     825      436158 :     if (relallvisible == 0 || curpages <= 0)
     826      213732 :         *allvisfrac = 0;
     827      222426 :     else if ((double) relallvisible >= curpages)
     828      124046 :         *allvisfrac = 1;
     829             :     else
     830       98380 :         *allvisfrac = (double) relallvisible / curpages;
     831             : }
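
A worked example of the never-vacuumed branch above, assuming heap-like parameters (roughly 28 overhead bytes per tuple, about 8168 usable bytes per 8kB page, default fillfactor of 100; the exact values are supplied by the calling table AM):

    /* Illustration only. */
    tuple_width = 40 bytes of data + 28 bytes of overhead = 68
    density     = (8168 * 100 / 100) / 68                 = 120 tuples/page   (integer division)
    *tuples     = rint(density * curpages); for 500 pages -> 60000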

Generated by: LCOV version 1.16