LCOV - code coverage report
Current view: top level - src/backend/access/table - tableam.c (source / functions)
Test: PostgreSQL 17devel
Date: 2024-03-29 15:11:20
                Hit    Total    Coverage
Lines:          162      182      89.0 %
Functions:       18       18     100.0 %
Legend: Lines: hit | not hit

          Line data    Source code
       1             : /*----------------------------------------------------------------------
       2             :  *
       3             :  * tableam.c
       4             :  *      Table access method routines too big to be inline functions.
       5             :  *
       6             :  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
       7             :  * Portions Copyright (c) 1994, Regents of the University of California
       8             :  *
       9             :  *
      10             :  * IDENTIFICATION
      11             :  *    src/backend/access/table/tableam.c
      12             :  *
      13             :  * NOTES
      14             :  *    Note that most functions in here are documented in tableam.h, rather than
      15             :  *    here. That's because there are a lot of inline functions in tableam.h and
      16             :  *    it'd be harder to understand if one constantly had to switch between files.
      17             :  *
      18             :  *----------------------------------------------------------------------
      19             :  */
      20             : #include "postgres.h"
      21             : 
      22             : #include <math.h>
      23             : 
      24             : #include "access/syncscan.h"
      25             : #include "access/tableam.h"
      26             : #include "access/xact.h"
      27             : #include "optimizer/plancat.h"
      28             : #include "port/pg_bitutils.h"
      29             : #include "storage/bufmgr.h"
      30             : #include "storage/shmem.h"
      31             : #include "storage/smgr.h"
      32             : 
      33             : /*
      34             :  * Constants to control the behavior of block allocation to parallel workers
      35             :  * during a parallel seqscan.  Technically these values do not need to be
       36             :  * powers of 2, but having them as powers of 2 makes the math simpler and
       37             :  * the ramp-down stepping more even.
      38             :  */
      39             : 
      40             : /* The number of I/O chunks we try to break a parallel seqscan down into */
      41             : #define PARALLEL_SEQSCAN_NCHUNKS            2048
      42             : /* Ramp down size of allocations when we've only this number of chunks left */
      43             : #define PARALLEL_SEQSCAN_RAMPDOWN_CHUNKS    64
      44             : /* Cap the size of parallel I/O chunks to this number of blocks */
      45             : #define PARALLEL_SEQSCAN_MAX_CHUNK_SIZE     8192
      46             : 
      47             : /* GUC variables */
      48             : char       *default_table_access_method = DEFAULT_TABLE_ACCESS_METHOD;
      49             : bool        synchronize_seqscans = true;
      50             : 
      51             : 
      52             : /* ----------------------------------------------------------------------------
      53             :  * Slot functions.
      54             :  * ----------------------------------------------------------------------------
      55             :  */
      56             : 
      57             : const TupleTableSlotOps *
      58    23305664 : table_slot_callbacks(Relation relation)
      59             : {
      60             :     const TupleTableSlotOps *tts_cb;
      61             : 
      62    23305664 :     if (relation->rd_tableam)
      63    23297200 :         tts_cb = relation->rd_tableam->slot_callbacks(relation);
      64        8464 :     else if (relation->rd_rel->relkind == RELKIND_FOREIGN_TABLE)
      65             :     {
      66             :         /*
      67             :          * Historically FDWs expect to store heap tuples in slots. Continue
      68             :          * handing them one, to make it less painful to adapt FDWs to new
      69             :          * versions. The cost of a heap slot over a virtual slot is pretty
      70             :          * small.
      71             :          */
      72         428 :         tts_cb = &TTSOpsHeapTuple;
      73             :     }
      74             :     else
      75             :     {
      76             :         /*
      77             :          * These need to be supported, as some parts of the code (like COPY)
      78             :          * need to create slots for such relations too. It seems better to
      79             :          * centralize the knowledge that a heap slot is the right thing in
      80             :          * that case here.
      81             :          */
      82             :         Assert(relation->rd_rel->relkind == RELKIND_VIEW ||
      83             :                relation->rd_rel->relkind == RELKIND_PARTITIONED_TABLE);
      84        8036 :         tts_cb = &TTSOpsVirtual;
      85             :     }
      86             : 
      87    23305664 :     return tts_cb;
      88             : }
      89             : 
      90             : TupleTableSlot *
      91    22915840 : table_slot_create(Relation relation, List **reglist)
      92             : {
      93             :     const TupleTableSlotOps *tts_cb;
      94             :     TupleTableSlot *slot;
      95             : 
      96    22915840 :     tts_cb = table_slot_callbacks(relation);
      97    22915840 :     slot = MakeSingleTupleTableSlot(RelationGetDescr(relation), tts_cb);
      98             : 
      99    22915840 :     if (reglist)
     100      283470 :         *reglist = lappend(*reglist, slot);
     101             : 
     102    22915840 :     return slot;
     103             : }
     104             : 
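Illustration (not part of tableam.c): a minimal sketch of the slot lifecycle
around table_slot_create().  The helper name is hypothetical, as is the
assumption that the table's first column is an int4; ExecClearTuple(),
ExecStoreVirtualTuple() and ExecDropSingleTupleTableSlot() are declared in
executor/tuptable.h.

    static void
    slot_lifecycle_sketch(Relation rel)
    {
        /* Create a slot in the layout preferred by the table's AM. */
        TupleTableSlot *slot = table_slot_create(rel, NULL);

        /* Fill it as a virtual tuple: set values/isnull, then mark it valid. */
        ExecClearTuple(slot);
        slot->tts_values[0] = Int32GetDatum(42);    /* assumes an int4 column */
        slot->tts_isnull[0] = false;
        ExecStoreVirtualTuple(slot);

        /* ... hand the slot to table_tuple_insert() or similar ... */

        ExecDropSingleTupleTableSlot(slot);
    }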
     105             : 
     106             : /* ----------------------------------------------------------------------------
     107             :  * Table scan functions.
     108             :  * ----------------------------------------------------------------------------
     109             :  */
     110             : 
     111             : TableScanDesc
     112       76632 : table_beginscan_catalog(Relation relation, int nkeys, struct ScanKeyData *key)
     113             : {
     114       76632 :     uint32      flags = SO_TYPE_SEQSCAN |
     115             :         SO_ALLOW_STRAT | SO_ALLOW_SYNC | SO_ALLOW_PAGEMODE | SO_TEMP_SNAPSHOT;
     116       76632 :     Oid         relid = RelationGetRelid(relation);
     117       76632 :     Snapshot    snapshot = RegisterSnapshot(GetCatalogSnapshot(relid));
     118             : 
     119       76632 :     return relation->rd_tableam->scan_begin(relation, snapshot, nkeys, key,
     120             :                                             NULL, flags);
     121             : }
     122             : 
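Illustration (not part of tableam.c): a hedged sketch of the usual calling
pattern for table_beginscan_catalog(), here scanning pg_class with no scan
keys (real callers normally pass ScanKeys).  It assumes access/table.h,
catalog/pg_class.h and executor/tuptable.h in addition to the headers above;
the function name is hypothetical.

    static uint64
    count_pg_class_rows(void)
    {
        Relation        rel = table_open(RelationRelationId, AccessShareLock);
        TupleTableSlot *slot = table_slot_create(rel, NULL);
        TableScanDesc   scan = table_beginscan_catalog(rel, 0, NULL);
        uint64          ntuples = 0;

        while (table_scan_getnextslot(scan, ForwardScanDirection, slot))
            ntuples++;

        /* SO_TEMP_SNAPSHOT was set above, so endscan also drops the snapshot. */
        table_endscan(scan);
        ExecDropSingleTupleTableSlot(slot);
        table_close(rel, AccessShareLock);

        return ntuples;
    }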
     123             : 
     124             : /* ----------------------------------------------------------------------------
     125             :  * Parallel table scan related functions.
     126             :  * ----------------------------------------------------------------------------
     127             :  */
     128             : 
     129             : Size
     130        1040 : table_parallelscan_estimate(Relation rel, Snapshot snapshot)
     131             : {
     132        1040 :     Size        sz = 0;
     133             : 
     134        1040 :     if (IsMVCCSnapshot(snapshot))
     135         898 :         sz = add_size(sz, EstimateSnapshotSpace(snapshot));
     136             :     else
     137             :         Assert(snapshot == SnapshotAny);
     138             : 
     139        1040 :     sz = add_size(sz, rel->rd_tableam->parallelscan_estimate(rel));
     140             : 
     141        1040 :     return sz;
     142             : }
     143             : 
     144             : void
     145        1040 : table_parallelscan_initialize(Relation rel, ParallelTableScanDesc pscan,
     146             :                               Snapshot snapshot)
     147             : {
     148        1040 :     Size        snapshot_off = rel->rd_tableam->parallelscan_initialize(rel, pscan);
     149             : 
     150        1040 :     pscan->phs_snapshot_off = snapshot_off;
     151             : 
     152        1040 :     if (IsMVCCSnapshot(snapshot))
     153             :     {
     154         898 :         SerializeSnapshot(snapshot, (char *) pscan + pscan->phs_snapshot_off);
     155         898 :         pscan->phs_snapshot_any = false;
     156             :     }
     157             :     else
     158             :     {
     159             :         Assert(snapshot == SnapshotAny);
     160         142 :         pscan->phs_snapshot_any = true;
     161             :     }
     162        1040 : }
     163             : 
     164             : TableScanDesc
     165        3848 : table_beginscan_parallel(Relation relation, ParallelTableScanDesc pscan)
     166             : {
     167             :     Snapshot    snapshot;
     168        3848 :     uint32      flags = SO_TYPE_SEQSCAN |
     169             :         SO_ALLOW_STRAT | SO_ALLOW_SYNC | SO_ALLOW_PAGEMODE;
     170             : 
     171             :     Assert(RelationGetRelid(relation) == pscan->phs_relid);
     172             : 
     173        3848 :     if (!pscan->phs_snapshot_any)
     174             :     {
     175             :         /* Snapshot was serialized -- restore it */
     176        3564 :         snapshot = RestoreSnapshot((char *) pscan + pscan->phs_snapshot_off);
     177        3564 :         RegisterSnapshot(snapshot);
     178        3564 :         flags |= SO_TEMP_SNAPSHOT;
     179             :     }
     180             :     else
     181             :     {
     182             :         /* SnapshotAny passed by caller (not serialized) */
     183         284 :         snapshot = SnapshotAny;
     184             :     }
     185             : 
     186        3848 :     return relation->rd_tableam->scan_begin(relation, snapshot, 0, NULL,
     187             :                                             pscan, flags);
     188             : }
     189             : 
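Illustration (not part of tableam.c): a hedged sketch of the
estimate/initialize/attach sequence above.  Real callers (see nodeSeqscan.c)
place the ParallelTableScanDesc in dynamic shared memory via shm_toc; plain
palloc0() stands in for that here, so this sketch only ever has a single
participant.  The function name is hypothetical.

    static TableScanDesc
    parallel_scan_setup_sketch(Relation rel, Snapshot snapshot)
    {
        Size                  sz;
        ParallelTableScanDesc pscan;

        /* Leader: work out how much shared space the AM and snapshot need. */
        sz = table_parallelscan_estimate(rel, snapshot);
        pscan = (ParallelTableScanDesc) palloc0(sz);    /* DSM in real code */

        /* Leader: fill in the shared state and serialize the snapshot. */
        table_parallelscan_initialize(rel, pscan, snapshot);

        /* Leader and each worker: attach to the shared state and scan. */
        return table_beginscan_parallel(rel, pscan);
    }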
     190             : 
     191             : /* ----------------------------------------------------------------------------
     192             :  * Index scan related functions.
     193             :  * ----------------------------------------------------------------------------
     194             :  */
     195             : 
     196             : /*
     197             :  * To perform that check simply start an index scan, create the necessary
     198             :  * slot, do the heap lookup, and shut everything down again. This could be
     199             :  * optimized, but is unlikely to matter from a performance POV. If there
     200             :  * frequently are live index pointers also matching a unique index key, the
     201             :  * CPU overhead of this routine is unlikely to matter.
     202             :  *
     203             :  * Note that *tid may be modified when we return true if the AM supports
     204             :  * storing multiple row versions reachable via a single index entry (like
     205             :  * heap's HOT).
     206             :  */
     207             : bool
     208    11410876 : table_index_fetch_tuple_check(Relation rel,
     209             :                               ItemPointer tid,
     210             :                               Snapshot snapshot,
     211             :                               bool *all_dead)
     212             : {
     213             :     IndexFetchTableData *scan;
     214             :     TupleTableSlot *slot;
     215    11410876 :     bool        call_again = false;
     216             :     bool        found;
     217             : 
     218    11410876 :     slot = table_slot_create(rel, NULL);
     219    11410876 :     scan = table_index_fetch_begin(rel);
     220    11410876 :     found = table_index_fetch_tuple(scan, tid, snapshot, slot, &call_again,
     221             :                                     all_dead);
     222    11410876 :     table_index_fetch_end(scan);
     223    11410876 :     ExecDropSingleTupleTableSlot(slot);
     224             : 
     225    11410876 :     return found;
     226             : }
     227             : 
     228             : 
     229             : /* ------------------------------------------------------------------------
     230             :  * Functions for non-modifying operations on individual tuples
     231             :  * ------------------------------------------------------------------------
     232             :  */
     233             : 
     234             : void
     235         306 : table_tuple_get_latest_tid(TableScanDesc scan, ItemPointer tid)
     236             : {
     237         306 :     Relation    rel = scan->rs_rd;
     238         306 :     const TableAmRoutine *tableam = rel->rd_tableam;
     239             : 
     240             :     /*
     241             :      * We don't expect direct calls to table_tuple_get_latest_tid with valid
     242             :      * CheckXidAlive for catalog or regular tables.  See detailed comments in
     243             :      * xact.c where these variables are declared.
     244             :      */
     245         306 :     if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan))
     246           0 :         elog(ERROR, "unexpected table_tuple_get_latest_tid call during logical decoding");
     247             : 
     248             :     /*
     249             :      * Since this can be called with user-supplied TID, don't trust the input
     250             :      * too much.
     251             :      */
     252         306 :     if (!tableam->tuple_tid_valid(scan, tid))
     253          12 :         ereport(ERROR,
     254             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     255             :                  errmsg("tid (%u, %u) is not valid for relation \"%s\"",
     256             :                         ItemPointerGetBlockNumberNoCheck(tid),
     257             :                         ItemPointerGetOffsetNumberNoCheck(tid),
     258             :                         RelationGetRelationName(rel))));
     259             : 
     260         294 :     tableam->tuple_get_latest_tid(scan, tid);
     261         294 : }
     262             : 
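Illustration (not part of tableam.c): a minimal sketch of the intended calling
pattern, along the lines of the currtid_byrelname() code path; a TID-flavored
scan is started with table_beginscan_tid() (an inline wrapper in tableam.h)
and its descriptor is handed to table_tuple_get_latest_tid().  The function
name is hypothetical.

    static void
    follow_update_chain_sketch(Relation rel, Snapshot snapshot, ItemPointer tid)
    {
        TableScanDesc scan = table_beginscan_tid(rel, snapshot);

        /* Advances *tid to the TID of the latest version of the row. */
        table_tuple_get_latest_tid(scan, tid);
        table_endscan(scan);
    }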
     263             : 
     264             : /* ----------------------------------------------------------------------------
     265             :  * Functions to make modifications a bit simpler.
     266             :  * ----------------------------------------------------------------------------
     267             :  */
     268             : 
     269             : /*
     270             :  * simple_table_tuple_insert - insert a tuple
     271             :  *
     272             :  * Currently, this routine differs from table_tuple_insert only in supplying a
     273             :  * default command ID and not allowing access to the speedup options.
     274             :  */
     275             : void
     276      153110 : simple_table_tuple_insert(Relation rel, TupleTableSlot *slot)
     277             : {
     278      153110 :     table_tuple_insert(rel, slot, GetCurrentCommandId(true), 0, NULL);
     279      153110 : }
     280             : 
     281             : /*
     282             :  * simple_table_tuple_delete - delete a tuple
     283             :  *
     284             :  * This routine may be used to delete a tuple when concurrent updates of
     285             :  * the target tuple are not expected (for example, because we have a lock
     286             :  * on the relation associated with the tuple).  Any failure is reported
     287             :  * via ereport().
     288             :  */
     289             : void
     290       80602 : simple_table_tuple_delete(Relation rel, ItemPointer tid, Snapshot snapshot,
     291             :                           TupleTableSlot *oldSlot)
     292             : {
     293             :     TM_Result   result;
     294             :     TM_FailureData tmfd;
     295       80602 :     int         options = TABLE_MODIFY_WAIT;    /* wait for commit */
     296             : 
     297             :     /* Fetch old tuple if the relevant slot is provided */
     298       80602 :     if (oldSlot)
     299           0 :         options |= TABLE_MODIFY_FETCH_OLD_TUPLE;
     300             : 
     301       80602 :     result = table_tuple_delete(rel, tid,
     302             :                                 GetCurrentCommandId(true),
     303             :                                 snapshot, InvalidSnapshot,
     304             :                                 options,
     305             :                                 &tmfd, false /* changingPart */ ,
     306             :                                 oldSlot);
     307             : 
     308       80602 :     switch (result)
     309             :     {
     310           0 :         case TM_SelfModified:
     311             :             /* Tuple was already updated in current command? */
     312           0 :             elog(ERROR, "tuple already updated by self");
     313             :             break;
     314             : 
     315       80602 :         case TM_Ok:
     316             :             /* done successfully */
     317       80602 :             break;
     318             : 
     319           0 :         case TM_Updated:
     320           0 :             elog(ERROR, "tuple concurrently updated");
     321             :             break;
     322             : 
     323           0 :         case TM_Deleted:
     324           0 :             elog(ERROR, "tuple concurrently deleted");
     325             :             break;
     326             : 
     327           0 :         default:
     328           0 :             elog(ERROR, "unrecognized table_tuple_delete status: %u", result);
     329             :             break;
     330             :     }
     331       80602 : }
     332             : 
     333             : /*
     334             :  * simple_table_tuple_update - replace a tuple
     335             :  *
     336             :  * This routine may be used to update a tuple when concurrent updates of
     337             :  * the target tuple are not expected (for example, because we have a lock
     338             :  * on the relation associated with the tuple).  Any failure is reported
     339             :  * via ereport().
     340             :  */
     341             : void
     342       63822 : simple_table_tuple_update(Relation rel, ItemPointer otid,
     343             :                           TupleTableSlot *slot,
     344             :                           Snapshot snapshot,
     345             :                           TU_UpdateIndexes *update_indexes,
     346             :                           TupleTableSlot *oldSlot)
     347             : {
     348             :     TM_Result   result;
     349             :     TM_FailureData tmfd;
     350             :     LockTupleMode lockmode;
     351       63822 :     int         options = TABLE_MODIFY_WAIT;    /* wait for commit */
     352             : 
     353             :     /* Fetch old tuple if the relevant slot is provided */
     354       63822 :     if (oldSlot)
     355          14 :         options |= TABLE_MODIFY_FETCH_OLD_TUPLE;
     356             : 
     357       63822 :     result = table_tuple_update(rel, otid, slot,
     358             :                                 GetCurrentCommandId(true),
     359             :                                 snapshot, InvalidSnapshot,
     360             :                                 options,
     361             :                                 &tmfd, &lockmode, update_indexes,
     362             :                                 oldSlot);
     363             : 
     364       63822 :     switch (result)
     365             :     {
     366           0 :         case TM_SelfModified:
     367             :             /* Tuple was already updated in current command? */
     368           0 :             elog(ERROR, "tuple already updated by self");
     369             :             break;
     370             : 
     371       63822 :         case TM_Ok:
     372             :             /* done successfully */
     373       63822 :             break;
     374             : 
     375           0 :         case TM_Updated:
     376           0 :             elog(ERROR, "tuple concurrently updated");
     377             :             break;
     378             : 
     379           0 :         case TM_Deleted:
     380           0 :             elog(ERROR, "tuple concurrently deleted");
     381             :             break;
     382             : 
     383           0 :         default:
     384           0 :             elog(ERROR, "unrecognized table_tuple_update status: %u", result);
     385             :             break;
     386             :     }
     387       63822 : }
     388             : 
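Illustration (not part of tableam.c): a hedged sketch of how the simple_*
helpers are driven, in the spirit of ExecSimpleRelationInsert() and friends in
execReplication.c.  The function name is hypothetical; the caller is assumed
to hold a lock that rules out concurrent updates, since the elog(ERROR) paths
above are otherwise reachable, and index maintenance remains the caller's job.

    static void
    simple_modify_sketch(Relation rel, TupleTableSlot *newslot,
                         ItemPointer oldtid, Snapshot snapshot)
    {
        TU_UpdateIndexes update_indexes;

        /* Insert a new row with the default command id, no speedup options. */
        simple_table_tuple_insert(rel, newslot);

        /*
         * Replace the pre-existing row at oldtid with newslot's contents;
         * update_indexes reports which indexes now need new entries.
         */
        simple_table_tuple_update(rel, oldtid, newslot, snapshot,
                                  &update_indexes, NULL);

        /* simple_table_tuple_delete(rel, oldtid, snapshot, NULL) would drop it. */
    }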
     389             : 
     390             : /* ----------------------------------------------------------------------------
     391             :  * Helper functions to implement parallel scans for block oriented AMs.
     392             :  * ----------------------------------------------------------------------------
     393             :  */
     394             : 
     395             : Size
     396        1040 : table_block_parallelscan_estimate(Relation rel)
     397             : {
     398        1040 :     return sizeof(ParallelBlockTableScanDescData);
     399             : }
     400             : 
     401             : Size
     402        1040 : table_block_parallelscan_initialize(Relation rel, ParallelTableScanDesc pscan)
     403             : {
     404        1040 :     ParallelBlockTableScanDesc bpscan = (ParallelBlockTableScanDesc) pscan;
     405             : 
     406        1040 :     bpscan->base.phs_relid = RelationGetRelid(rel);
     407        1040 :     bpscan->phs_nblocks = RelationGetNumberOfBlocks(rel);
     408             :     /* compare phs_syncscan initialization to similar logic in initscan */
     409        2780 :     bpscan->base.phs_syncscan = synchronize_seqscans &&
     410        1740 :         !RelationUsesLocalBuffers(rel) &&
     411         700 :         bpscan->phs_nblocks > NBuffers / 4;
     412        1040 :     SpinLockInit(&bpscan->phs_mutex);
     413        1040 :     bpscan->phs_startblock = InvalidBlockNumber;
     414        1040 :     pg_atomic_init_u64(&bpscan->phs_nallocated, 0);
     415             : 
     416        1040 :     return sizeof(ParallelBlockTableScanDescData);
     417             : }
     418             : 
     419             : void
     420         228 : table_block_parallelscan_reinitialize(Relation rel, ParallelTableScanDesc pscan)
     421             : {
     422         228 :     ParallelBlockTableScanDesc bpscan = (ParallelBlockTableScanDesc) pscan;
     423             : 
     424         228 :     pg_atomic_write_u64(&bpscan->phs_nallocated, 0);
     425         228 : }
     426             : 
     427             : /*
     428             :  * find and set the scan's startblock
     429             :  *
     430             :  * Determine where the parallel seq scan should start.  This function may be
     431             :  * called many times, once by each parallel worker.  We must be careful only
     432             :  * to set the startblock once.
     433             :  */
     434             : void
     435        2486 : table_block_parallelscan_startblock_init(Relation rel,
     436             :                                          ParallelBlockTableScanWorker pbscanwork,
     437             :                                          ParallelBlockTableScanDesc pbscan)
     438             : {
     439        2486 :     BlockNumber sync_startpage = InvalidBlockNumber;
     440             : 
     441             :     /* Reset the state we use for controlling allocation size. */
     442        2486 :     memset(pbscanwork, 0, sizeof(*pbscanwork));
     443             : 
     444             :     StaticAssertStmt(MaxBlockNumber <= 0xFFFFFFFE,
     445             :                      "pg_nextpower2_32 may be too small for non-standard BlockNumber width");
     446             : 
     447             :     /*
     448             :      * We determine the chunk size based on the size of the relation. First we
      449             :      * split the relation into PARALLEL_SEQSCAN_NCHUNKS chunks, but we then
      450             :      * round the chunk size up to the next highest power of 2.  This means
     451             :      * we split the relation into somewhere between PARALLEL_SEQSCAN_NCHUNKS
     452             :      * and PARALLEL_SEQSCAN_NCHUNKS / 2 chunks.
     453             :      */
     454        2486 :     pbscanwork->phsw_chunk_size = pg_nextpower2_32(Max(pbscan->phs_nblocks /
     455             :                                                        PARALLEL_SEQSCAN_NCHUNKS, 1));
     456             : 
     457             :     /*
     458             :      * Ensure we don't go over the maximum chunk size with larger tables. This
     459             :      * means we may get much more than PARALLEL_SEQSCAN_NCHUNKS for larger
     460             :      * tables.  Too large a chunk size has been shown to be detrimental to
     461             :      * synchronous scan performance.
     462             :      */
     463        2486 :     pbscanwork->phsw_chunk_size = Min(pbscanwork->phsw_chunk_size,
     464             :                                       PARALLEL_SEQSCAN_MAX_CHUNK_SIZE);
     465             : 
     466        2488 : retry:
     467             :     /* Grab the spinlock. */
     468        2488 :     SpinLockAcquire(&pbscan->phs_mutex);
     469             : 
     470             :     /*
     471             :      * If the scan's startblock has not yet been initialized, we must do so
     472             :      * now.  If this is not a synchronized scan, we just start at block 0, but
     473             :      * if it is a synchronized scan, we must get the starting position from
     474             :      * the synchronized scan machinery.  We can't hold the spinlock while
     475             :      * doing that, though, so release the spinlock, get the information we
     476             :      * need, and retry.  If nobody else has initialized the scan in the
     477             :      * meantime, we'll fill in the value we fetched on the second time
     478             :      * through.
     479             :      */
     480        2488 :     if (pbscan->phs_startblock == InvalidBlockNumber)
     481             :     {
     482         830 :         if (!pbscan->base.phs_syncscan)
     483         826 :             pbscan->phs_startblock = 0;
     484           4 :         else if (sync_startpage != InvalidBlockNumber)
     485           2 :             pbscan->phs_startblock = sync_startpage;
     486             :         else
     487             :         {
     488           2 :             SpinLockRelease(&pbscan->phs_mutex);
     489           2 :             sync_startpage = ss_get_location(rel, pbscan->phs_nblocks);
     490           2 :             goto retry;
     491             :         }
     492             :     }
     493        2486 :     SpinLockRelease(&pbscan->phs_mutex);
     494        2486 : }
     495             : 
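Worked example of the chunk-size computation above, with illustrative numbers
that are not taken from the source: for a 1,000,000-block relation,
1000000 / PARALLEL_SEQSCAN_NCHUNKS (2048) is 488, and pg_nextpower2_32(488)
is 512, so blocks are handed out in 512-block chunks (roughly 1954 of them).
For a 100,000,000-block relation the same steps give 48828 -> 65536, which the
Min() then caps at PARALLEL_SEQSCAN_MAX_CHUNK_SIZE (8192).  A hypothetical
standalone form of the same computation:

    static uint32
    chunk_size_for(BlockNumber nblocks)
    {
        uint32  size = pg_nextpower2_32(Max(nblocks / PARALLEL_SEQSCAN_NCHUNKS, 1));

        return Min(size, PARALLEL_SEQSCAN_MAX_CHUNK_SIZE);
    }
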
     496             : /*
     497             :  * get the next page to scan
     498             :  *
     499             :  * Get the next page to scan.  Even if there are no pages left to scan,
     500             :  * another backend could have grabbed a page to scan and not yet finished
     501             :  * looking at it, so it doesn't follow that the scan is done when the first
     502             :  * backend gets an InvalidBlockNumber return.
     503             :  */
     504             : BlockNumber
     505      200262 : table_block_parallelscan_nextpage(Relation rel,
     506             :                                   ParallelBlockTableScanWorker pbscanwork,
     507             :                                   ParallelBlockTableScanDesc pbscan)
     508             : {
     509             :     BlockNumber page;
     510             :     uint64      nallocated;
     511             : 
     512             :     /*
     513             :      * The logic below allocates block numbers out to parallel workers in a
     514             :      * way that each worker will receive a set of consecutive block numbers to
     515             :      * scan.  Earlier versions of this would allocate the next highest block
     516             :      * number to the next worker to call this function.  This would generally
     517             :      * result in workers never receiving consecutive block numbers.  Some
     518             :      * operating systems would not detect the sequential I/O pattern due to
      519             :      * each backend being a different process, which could result in poor
      520             :      * performance from inefficient or no readahead.  To work around this
     521             :      * issue, we now allocate a range of block numbers for each worker and
     522             :      * when they come back for another block, we give them the next one in
     523             :      * that range until the range is complete.  When the worker completes the
     524             :      * range of blocks we then allocate another range for it and return the
     525             :      * first block number from that range.
     526             :      *
     527             :      * Here we name these ranges of blocks "chunks".  The initial size of
     528             :      * these chunks is determined in table_block_parallelscan_startblock_init
     529             :      * based on the size of the relation.  Towards the end of the scan, we
     530             :      * start making reductions in the size of the chunks in order to attempt
     531             :      * to divide the remaining work over all the workers as evenly as
     532             :      * possible.
     533             :      *
     534             :      * Here pbscanwork is local worker memory.  phsw_chunk_remaining tracks
     535             :      * the number of blocks remaining in the chunk.  When that reaches 0 then
     536             :      * we must allocate a new chunk for the worker.
     537             :      *
     538             :      * phs_nallocated tracks how many blocks have been allocated to workers
     539             :      * already.  When phs_nallocated >= rs_nblocks, all blocks have been
     540             :      * allocated.
     541             :      *
     542             :      * Because we use an atomic fetch-and-add to fetch the current value, the
      543             :      * phs_nallocated counter will exceed rs_nblocks, since workers will
      544             :      * still increment the value when they try to allocate the next block
      545             :      * even though all blocks have been allocated already.  The counter must
      546             :      * be 64 bits wide because of that, to avoid wrapping around when
      547             :      * rs_nblocks is close to 2^32.
     548             :      *
     549             :      * The actual block to return is calculated by adding the counter to the
     550             :      * starting block number, modulo nblocks.
     551             :      */
     552             : 
     553             :     /*
     554             :      * First check if we have any remaining blocks in a previous chunk for
     555             :      * this worker.  We must consume all of the blocks from that before we
     556             :      * allocate a new chunk to the worker.
     557             :      */
     558      200262 :     if (pbscanwork->phsw_chunk_remaining > 0)
     559             :     {
     560             :         /*
     561             :          * Give them the next block in the range and update the remaining
     562             :          * number of blocks.
     563             :          */
     564       13026 :         nallocated = ++pbscanwork->phsw_nallocated;
     565       13026 :         pbscanwork->phsw_chunk_remaining--;
     566             :     }
     567             :     else
     568             :     {
     569             :         /*
     570             :          * When we've only got PARALLEL_SEQSCAN_RAMPDOWN_CHUNKS chunks
      571             :          * remaining in the scan, we halve the chunk size.  Since we reduce the
     572             :          * chunk size here, we'll hit this again after doing
     573             :          * PARALLEL_SEQSCAN_RAMPDOWN_CHUNKS at the new size.  After a few
     574             :          * iterations of this, we'll end up doing the last few blocks with the
     575             :          * chunk size set to 1.
     576             :          */
     577      187236 :         if (pbscanwork->phsw_chunk_size > 1 &&
     578        4430 :             pbscanwork->phsw_nallocated > pbscan->phs_nblocks -
     579        4430 :             (pbscanwork->phsw_chunk_size * PARALLEL_SEQSCAN_RAMPDOWN_CHUNKS))
     580           8 :             pbscanwork->phsw_chunk_size >>= 1;
     581             : 
     582      187236 :         nallocated = pbscanwork->phsw_nallocated =
     583      187236 :             pg_atomic_fetch_add_u64(&pbscan->phs_nallocated,
     584      187236 :                                     pbscanwork->phsw_chunk_size);
     585             : 
     586             :         /*
     587             :          * Set the remaining number of blocks in this chunk so that subsequent
     588             :          * calls from this worker continue on with this chunk until it's done.
     589             :          */
     590      187236 :         pbscanwork->phsw_chunk_remaining = pbscanwork->phsw_chunk_size - 1;
     591             :     }
     592             : 
     593      200262 :     if (nallocated >= pbscan->phs_nblocks)
     594        2486 :         page = InvalidBlockNumber;  /* all blocks have been allocated */
     595             :     else
     596      197776 :         page = (nallocated + pbscan->phs_startblock) % pbscan->phs_nblocks;
     597             : 
     598             :     /*
     599             :      * Report scan location.  Normally, we report the current page number.
     600             :      * When we reach the end of the scan, though, we report the starting page,
     601             :      * not the ending page, just so the starting positions for later scans
     602             :      * doesn't slew backwards.  We only report the position at the end of the
      603             :      * don't slew backwards.  We only report the position at the end of the
     604             :      */
     605      200262 :     if (pbscan->base.phs_syncscan)
     606             :     {
     607       17704 :         if (page != InvalidBlockNumber)
     608       17700 :             ss_report_location(rel, page);
     609           4 :         else if (nallocated == pbscan->phs_nblocks)
     610           2 :             ss_report_location(rel, pbscan->phs_startblock);
     611             :     }
     612             : 
     613      200262 :     return page;
     614             : }
     615             : 
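Illustration (not part of tableam.c): a hedged sketch of how a block-oriented
AM's scan loop consumes the two helpers above, per worker, roughly following
heapam.c.  pbscan points at the shared ParallelBlockTableScanDesc and
workerdata (a ParallelBlockTableScanWorkerData from access/relscan.h) is
backend-local; the function name is hypothetical.

    static void
    parallel_block_scan_sketch(Relation rel, ParallelBlockTableScanDesc pbscan)
    {
        ParallelBlockTableScanWorkerData workerdata;
        BlockNumber blkno;

        /* Agree on the starting block and size this worker's first chunk. */
        table_block_parallelscan_startblock_init(rel, &workerdata, pbscan);

        /* Keep asking for blocks until the shared allocator runs out. */
        while ((blkno = table_block_parallelscan_nextpage(rel, &workerdata,
                                                          pbscan)) != InvalidBlockNumber)
        {
            /* ... read and process block blkno ... */
        }
    }
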
     616             : /* ----------------------------------------------------------------------------
     617             :  * Helper functions to implement relation sizing for block oriented AMs.
     618             :  * ----------------------------------------------------------------------------
     619             :  */
     620             : 
     621             : /*
     622             :  * table_block_relation_size
     623             :  *
     624             :  * If a table AM uses the various relation forks as the sole place where data
     625             :  * is stored, and if it uses them in the expected manner (e.g. the actual data
     626             :  * is in the main fork rather than some other), it can use this implementation
     627             :  * of the relation_size callback rather than implementing its own.
     628             :  */
     629             : uint64
     630     2109496 : table_block_relation_size(Relation rel, ForkNumber forkNumber)
     631             : {
     632     2109496 :     uint64      nblocks = 0;
     633             : 
     634             :     /* InvalidForkNumber indicates returning the size for all forks */
     635     2109496 :     if (forkNumber == InvalidForkNumber)
     636             :     {
     637           0 :         for (int i = 0; i < MAX_FORKNUM; i++)
     638           0 :             nblocks += smgrnblocks(RelationGetSmgr(rel), i);
     639             :     }
     640             :     else
     641     2109496 :         nblocks = smgrnblocks(RelationGetSmgr(rel), forkNumber);
     642             : 
     643     2109458 :     return nblocks * BLCKSZ;
     644             : }
     645             : 
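Illustration (not part of tableam.c): how a block-oriented AM plugs these
helpers into its TableAmRoutine; heapam does essentially this in
heapam_handler.c.  Only a fragment is shown, with the many other required
callbacks omitted.

    static const TableAmRoutine sketch_am_methods = {
        .type = T_TableAmRoutine,
        /* ... scan, fetch, and modify callbacks ... */
        .parallelscan_estimate = table_block_parallelscan_estimate,
        .parallelscan_initialize = table_block_parallelscan_initialize,
        .parallelscan_reinitialize = table_block_parallelscan_reinitialize,
        .relation_size = table_block_relation_size,
        /* ... */
    };
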
     646             : /*
     647             :  * table_block_relation_estimate_size
     648             :  *
     649             :  * This function can't be directly used as the implementation of the
     650             :  * relation_estimate_size callback, because it has a few additional parameters.
     651             :  * Instead, it is intended to be used as a helper function; the caller can
     652             :  * pass through the arguments to its relation_estimate_size function plus the
     653             :  * additional values required here.
     654             :  *
     655             :  * overhead_bytes_per_tuple should contain the approximate number of bytes
     656             :  * of storage required to store a tuple above and beyond what is required for
     657             :  * the tuple data proper. Typically, this would include things like the
     658             :  * size of the tuple header and item pointer. This is only used for query
     659             :  * planning, so a table AM where the value is not constant could choose to
     660             :  * pass a "best guess".
     661             :  *
     662             :  * usable_bytes_per_page should contain the approximate number of bytes per
     663             :  * page usable for tuple data, excluding the page header and any anticipated
     664             :  * special space.
     665             :  */
     666             : void
     667      384000 : table_block_relation_estimate_size(Relation rel, int32 *attr_widths,
     668             :                                    BlockNumber *pages, double *tuples,
     669             :                                    double *allvisfrac,
     670             :                                    Size overhead_bytes_per_tuple,
     671             :                                    Size usable_bytes_per_page)
     672             : {
     673             :     BlockNumber curpages;
     674             :     BlockNumber relpages;
     675             :     double      reltuples;
     676             :     BlockNumber relallvisible;
     677             :     double      density;
     678             : 
     679             :     /* it should have storage, so we can call the smgr */
     680      384000 :     curpages = RelationGetNumberOfBlocks(rel);
     681             : 
     682             :     /* coerce values in pg_class to more desirable types */
     683      384000 :     relpages = (BlockNumber) rel->rd_rel->relpages;
     684      384000 :     reltuples = (double) rel->rd_rel->reltuples;
     685      384000 :     relallvisible = (BlockNumber) rel->rd_rel->relallvisible;
     686             : 
     687             :     /*
     688             :      * HACK: if the relation has never yet been vacuumed, use a minimum size
     689             :      * estimate of 10 pages.  The idea here is to avoid assuming a
     690             :      * newly-created table is really small, even if it currently is, because
     691             :      * that may not be true once some data gets loaded into it.  Once a vacuum
     692             :      * or analyze cycle has been done on it, it's more reasonable to believe
     693             :      * the size is somewhat stable.
     694             :      *
     695             :      * (Note that this is only an issue if the plan gets cached and used again
     696             :      * after the table has been filled.  What we're trying to avoid is using a
     697             :      * nestloop-type plan on a table that has grown substantially since the
     698             :      * plan was made.  Normally, autovacuum/autoanalyze will occur once enough
     699             :      * inserts have happened and cause cached-plan invalidation; but that
     700             :      * doesn't happen instantaneously, and it won't happen at all for cases
     701             :      * such as temporary tables.)
     702             :      *
     703             :      * We test "never vacuumed" by seeing whether reltuples < 0.
     704             :      *
     705             :      * If the table has inheritance children, we don't apply this heuristic.
     706             :      * Totally empty parent tables are quite common, so we should be willing
     707             :      * to believe that they are empty.
     708             :      */
     709      384000 :     if (curpages < 10 &&
     710      102160 :         reltuples < 0 &&
     711      102160 :         !rel->rd_rel->relhassubclass)
     712       99748 :         curpages = 10;
     713             : 
     714             :     /* report estimated # pages */
     715      384000 :     *pages = curpages;
     716             :     /* quick exit if rel is clearly empty */
     717      384000 :     if (curpages == 0)
     718             :     {
     719       14560 :         *tuples = 0;
     720       14560 :         *allvisfrac = 0;
     721       14560 :         return;
     722             :     }
     723             : 
     724             :     /* estimate number of tuples from previous tuple density */
     725      369440 :     if (reltuples >= 0 && relpages > 0)
     726      238652 :         density = reltuples / (double) relpages;
     727             :     else
     728             :     {
     729             :         /*
     730             :          * When we have no data because the relation was never yet vacuumed,
     731             :          * estimate tuple width from attribute datatypes.  We assume here that
     732             :          * the pages are completely full, which is OK for tables but is
     733             :          * probably an overestimate for indexes.  Fortunately
     734             :          * get_relation_info() can clamp the overestimate to the parent
     735             :          * table's size.
     736             :          *
     737             :          * Note: this code intentionally disregards alignment considerations,
     738             :          * because (a) that would be gilding the lily considering how crude
     739             :          * the estimate is, (b) it creates platform dependencies in the
     740             :          * default plans which are kind of a headache for regression testing,
     741             :          * and (c) different table AMs might use different padding schemes.
     742             :          */
     743             :         int32       tuple_width;
     744             :         int         fillfactor;
     745             : 
     746             :         /*
     747             :          * Without reltuples/relpages, we also need to consider fillfactor.
     748             :          * The other branch considers it implicitly by calculating density
     749             :          * from actual relpages/reltuples statistics.
     750             :          */
     751      130788 :         fillfactor = RelationGetFillFactor(rel, HEAP_DEFAULT_FILLFACTOR);
     752             : 
     753      130788 :         tuple_width = get_rel_data_width(rel, attr_widths);
     754      130788 :         tuple_width += overhead_bytes_per_tuple;
     755             :         /* note: integer division is intentional here */
     756      130788 :         density = (usable_bytes_per_page * fillfactor / 100) / tuple_width;
     757             :     }
     758      369440 :     *tuples = rint(density * (double) curpages);
     759             : 
     760             :     /*
     761             :      * We use relallvisible as-is, rather than scaling it up like we do for
     762             :      * the pages and tuples counts, on the theory that any pages added since
     763             :      * the last VACUUM are most likely not marked all-visible.  But costsize.c
     764             :      * wants it converted to a fraction.
     765             :      */
     766      369440 :     if (relallvisible == 0 || curpages <= 0)
     767      182576 :         *allvisfrac = 0;
     768      186864 :     else if ((double) relallvisible >= curpages)
     769       92478 :         *allvisfrac = 1;
     770             :     else
     771       94386 :         *allvisfrac = (double) relallvisible / curpages;
     772             : }
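Worked example of the fallback density estimate above, using heap-like numbers
that are assumptions of this note rather than values from the function: with
usable_bytes_per_page = 8168 (an 8 kB block minus the page header),
overhead_bytes_per_tuple = 28 (tuple header plus line pointer), fillfactor =
100 and an estimated data width of 40 bytes, density = (8168 * 100 / 100) /
(40 + 28) = 120 tuples per page, so a never-vacuumed relation clamped to
curpages = 10 is reported as *tuples = rint(120 * 10) = 1200.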

Generated by: LCOV version 1.14