LCOV - code coverage report
Current view: top level - src/backend/access/table - tableam.c (source / functions) Coverage Total Hit
Test: PostgreSQL 19devel Lines: 90.4 % 197 178
Test Date: 2026-04-07 14:16:30 Functions: 100.0 % 19 19
Legend: Lines:     hit not hit

            Line data    Source code
       1              : /*----------------------------------------------------------------------
       2              :  *
       3              :  * tableam.c
       4              :  *      Table access method routines too big to be inline functions.
       5              :  *
       6              :  * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
       7              :  * Portions Copyright (c) 1994, Regents of the University of California
       8              :  *
       9              :  *
      10              :  * IDENTIFICATION
      11              :  *    src/backend/access/table/tableam.c
      12              :  *
      13              :  * NOTES
      14              :  *    Note that most functions in here are documented in tableam.h, rather than
      15              :  *    here. That's because there's a lot of inline functions in tableam.h and
      16              :  *    it'd be harder to understand if one constantly had to switch between files.
      17              :  *
      18              :  *----------------------------------------------------------------------
      19              :  */
      20              : #include "postgres.h"
      21              : 
      22              : #include <math.h>
      23              : 
      24              : #include "access/syncscan.h"
      25              : #include "access/tableam.h"
      26              : #include "access/xact.h"
      27              : #include "optimizer/optimizer.h"
      28              : #include "optimizer/plancat.h"
      29              : #include "port/pg_bitutils.h"
      30              : #include "storage/bufmgr.h"
      31              : #include "storage/shmem.h"
      32              : #include "storage/smgr.h"
      33              : 
      34              : /*
      35              :  * Constants to control the behavior of block allocation to parallel workers
      36              :  * during a parallel seqscan.  Technically these values do not need to be
      37              :  * powers of 2, but having them as powers of 2 makes the math more optimal
      38              :  * and makes the ramp-down stepping more even.
      39              :  */
      40              : 
      41              : /* The number of I/O chunks we try to break a parallel seqscan down into */
      42              : #define PARALLEL_SEQSCAN_NCHUNKS            2048
      43              : /* Ramp down size of allocations when we've only this number of chunks left */
      44              : #define PARALLEL_SEQSCAN_RAMPDOWN_CHUNKS    64
      45              : /* Cap the size of parallel I/O chunks to this number of blocks */
      46              : #define PARALLEL_SEQSCAN_MAX_CHUNK_SIZE     8192
      47              : 
      48              : /* GUC variables */
      49              : char       *default_table_access_method = DEFAULT_TABLE_ACCESS_METHOD;
      50              : bool        synchronize_seqscans = true;
      51              : 
      52              : 
      53              : /* ----------------------------------------------------------------------------
      54              :  * Slot functions.
      55              :  * ----------------------------------------------------------------------------
      56              :  */
      57              : 
      58              : const TupleTableSlotOps *
      59     18172113 : table_slot_callbacks(Relation relation)
      60              : {
      61              :     const TupleTableSlotOps *tts_cb;
      62              : 
      63     18172113 :     if (relation->rd_tableam)
      64     18167136 :         tts_cb = relation->rd_tableam->slot_callbacks(relation);
      65         4977 :     else if (relation->rd_rel->relkind == RELKIND_FOREIGN_TABLE)
      66              :     {
      67              :         /*
      68              :          * Historically FDWs expect to store heap tuples in slots. Continue
      69              :          * handing them one, to make it less painful to adapt FDWs to new
      70              :          * versions. The cost of a heap slot over a virtual slot is pretty
      71              :          * small.
      72              :          */
      73          222 :         tts_cb = &TTSOpsHeapTuple;
      74              :     }
      75              :     else
      76              :     {
      77              :         /*
      78              :          * These need to be supported, as some parts of the code (like COPY)
      79              :          * need to create slots for such relations too. It seems better to
      80              :          * centralize the knowledge that a heap slot is the right thing in
      81              :          * that case here.
      82              :          */
      83              :         Assert(relation->rd_rel->relkind == RELKIND_VIEW ||
      84              :                relation->rd_rel->relkind == RELKIND_PARTITIONED_TABLE);
      85         4755 :         tts_cb = &TTSOpsVirtual;
      86              :     }
      87              : 
      88     18172113 :     return tts_cb;
      89              : }
      90              : 
      91              : TupleTableSlot *
      92     17855940 : table_slot_create(Relation relation, List **reglist)
      93              : {
      94              :     const TupleTableSlotOps *tts_cb;
      95              :     TupleTableSlot *slot;
      96              : 
      97     17855940 :     tts_cb = table_slot_callbacks(relation);
      98     17855940 :     slot = MakeSingleTupleTableSlot(RelationGetDescr(relation), tts_cb);
      99              : 
     100     17855940 :     if (reglist)
     101       153708 :         *reglist = lappend(*reglist, slot);
     102              : 
     103     17855940 :     return slot;
     104              : }
     105              : 
     106              : 
     107              : /* ----------------------------------------------------------------------------
     108              :  * Table scan functions.
     109              :  * ----------------------------------------------------------------------------
     110              :  */
     111              : 
     112              : TableScanDesc
     113        46357 : table_beginscan_catalog(Relation relation, int nkeys, ScanKeyData *key)
     114              : {
     115        46357 :     uint32      flags = SO_TYPE_SEQSCAN |
     116              :         SO_ALLOW_STRAT | SO_ALLOW_SYNC | SO_ALLOW_PAGEMODE | SO_TEMP_SNAPSHOT;
     117        46357 :     Oid         relid = RelationGetRelid(relation);
     118        46357 :     Snapshot    snapshot = RegisterSnapshot(GetCatalogSnapshot(relid));
     119              : 
     120        46357 :     return table_beginscan_common(relation, snapshot, nkeys, key,
     121              :                                   NULL, flags, SO_NONE);
     122              : }
     123              : 
     124              : 
     125              : /* ----------------------------------------------------------------------------
     126              :  * Parallel table scan related functions.
     127              :  * ----------------------------------------------------------------------------
     128              :  */
     129              : 
     130              : Size
     131         1337 : table_parallelscan_estimate(Relation rel, Snapshot snapshot)
     132              : {
     133         1337 :     Size        sz = 0;
     134              : 
     135         1337 :     if (IsMVCCSnapshot(snapshot))
     136         1210 :         sz = add_size(sz, EstimateSnapshotSpace(snapshot));
     137              :     else
     138              :         Assert(snapshot == SnapshotAny);
     139              : 
     140         1337 :     sz = add_size(sz, rel->rd_tableam->parallelscan_estimate(rel));
     141              : 
     142         1337 :     return sz;
     143              : }
     144              : 
     145              : void
     146         1337 : table_parallelscan_initialize(Relation rel, ParallelTableScanDesc pscan,
     147              :                               Snapshot snapshot)
     148              : {
     149         1337 :     Size        snapshot_off = rel->rd_tableam->parallelscan_initialize(rel, pscan);
     150              : 
     151         1337 :     pscan->phs_snapshot_off = snapshot_off;
     152              : 
     153         1337 :     if (IsMVCCSnapshot(snapshot))
     154              :     {
     155         1210 :         SerializeSnapshot(snapshot, (char *) pscan + pscan->phs_snapshot_off);
     156         1210 :         pscan->phs_snapshot_any = false;
     157              :     }
     158              :     else
     159              :     {
     160              :         Assert(snapshot == SnapshotAny);
     161          127 :         pscan->phs_snapshot_any = true;
     162              :     }
     163         1337 : }
     164              : 
     165              : TableScanDesc
     166         4440 : table_beginscan_parallel(Relation relation, ParallelTableScanDesc pscan,
     167              :                          uint32 flags)
     168              : {
     169              :     Snapshot    snapshot;
     170         4440 :     uint32      internal_flags = SO_TYPE_SEQSCAN |
     171              :         SO_ALLOW_STRAT | SO_ALLOW_SYNC | SO_ALLOW_PAGEMODE;
     172              : 
     173              :     Assert(RelFileLocatorEquals(relation->rd_locator, pscan->phs_locator));
     174              : 
     175         4440 :     if (!pscan->phs_snapshot_any)
     176              :     {
     177              :         /* Snapshot was serialized -- restore it */
     178         4169 :         snapshot = RestoreSnapshot((char *) pscan + pscan->phs_snapshot_off);
     179         4169 :         RegisterSnapshot(snapshot);
     180         4169 :         internal_flags |= SO_TEMP_SNAPSHOT;
     181              :     }
     182              :     else
     183              :     {
     184              :         /* SnapshotAny passed by caller (not serialized) */
     185          271 :         snapshot = SnapshotAny;
     186              :     }
     187              : 
     188         4440 :     return table_beginscan_common(relation, snapshot, 0, NULL,
     189              :                                   pscan, internal_flags, flags);
     190              : }
     191              : 
     192              : TableScanDesc
     193           80 : table_beginscan_parallel_tidrange(Relation relation,
     194              :                                   ParallelTableScanDesc pscan,
     195              :                                   uint32 flags)
     196              : {
     197              :     Snapshot    snapshot;
     198              :     TableScanDesc sscan;
     199           80 :     uint32      internal_flags = SO_TYPE_TIDRANGESCAN | SO_ALLOW_PAGEMODE;
     200              : 
     201              :     Assert(RelFileLocatorEquals(relation->rd_locator, pscan->phs_locator));
     202              : 
     203              :     /* disable syncscan in parallel tid range scan. */
     204           80 :     pscan->phs_syncscan = false;
     205              : 
     206           80 :     if (!pscan->phs_snapshot_any)
     207              :     {
     208              :         /* Snapshot was serialized -- restore it */
     209           80 :         snapshot = RestoreSnapshot((char *) pscan + pscan->phs_snapshot_off);
     210           80 :         RegisterSnapshot(snapshot);
     211           80 :         internal_flags |= SO_TEMP_SNAPSHOT;
     212              :     }
     213              :     else
     214              :     {
     215              :         /* SnapshotAny passed by caller (not serialized) */
     216            0 :         snapshot = SnapshotAny;
     217              :     }
     218              : 
     219           80 :     sscan = table_beginscan_common(relation, snapshot, 0, NULL,
     220              :                                    pscan, internal_flags, flags);
     221           80 :     return sscan;
     222              : }
     223              : 
     224              : 
     225              : /* ----------------------------------------------------------------------------
     226              :  * Index scan related functions.
     227              :  * ----------------------------------------------------------------------------
     228              :  */
     229              : 
     230              : /*
     231              :  * To perform that check simply start an index scan, create the necessary
     232              :  * slot, do the heap lookup, and shut everything down again. This could be
     233              :  * optimized, but is unlikely to matter from a performance POV. If there
     234              :  * frequently are live index pointers also matching a unique index key, the
     235              :  * CPU overhead of this routine is unlikely to matter.
     236              :  *
     237              :  * Note that *tid may be modified when we return true if the AM supports
     238              :  * storing multiple row versions reachable via a single index entry (like
     239              :  * heap's HOT).
     240              :  */
     241              : bool
     242      7586817 : table_index_fetch_tuple_check(Relation rel,
     243              :                               ItemPointer tid,
     244              :                               Snapshot snapshot,
     245              :                               bool *all_dead)
     246              : {
     247              :     IndexFetchTableData *scan;
     248              :     TupleTableSlot *slot;
     249      7586817 :     bool        call_again = false;
     250              :     bool        found;
     251              : 
     252      7586817 :     slot = table_slot_create(rel, NULL);
     253      7586817 :     scan = table_index_fetch_begin(rel, SO_NONE);
     254      7586817 :     found = table_index_fetch_tuple(scan, tid, snapshot, slot, &call_again,
     255              :                                     all_dead);
     256      7586817 :     table_index_fetch_end(scan);
     257      7586817 :     ExecDropSingleTupleTableSlot(slot);
     258              : 
     259      7586817 :     return found;
     260              : }
     261              : 
     262              : 
     263              : /* ------------------------------------------------------------------------
     264              :  * Functions for non-modifying operations on individual tuples
     265              :  * ------------------------------------------------------------------------
     266              :  */
     267              : 
     268              : void
     269          207 : table_tuple_get_latest_tid(TableScanDesc scan, ItemPointer tid)
     270              : {
     271          207 :     Relation    rel = scan->rs_rd;
     272          207 :     const TableAmRoutine *tableam = rel->rd_tableam;
     273              : 
     274              :     /*
     275              :      * Since this can be called with user-supplied TID, don't trust the input
     276              :      * too much.
     277              :      */
     278          207 :     if (!tableam->tuple_tid_valid(scan, tid))
     279            8 :         ereport(ERROR,
     280              :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     281              :                  errmsg("tid (%u, %u) is not valid for relation \"%s\"",
     282              :                         ItemPointerGetBlockNumberNoCheck(tid),
     283              :                         ItemPointerGetOffsetNumberNoCheck(tid),
     284              :                         RelationGetRelationName(rel))));
     285              : 
     286          199 :     tableam->tuple_get_latest_tid(scan, tid);
     287          199 : }
     288              : 
     289              : 
     290              : /* ----------------------------------------------------------------------------
     291              :  * Functions to make modifications a bit simpler.
     292              :  * ----------------------------------------------------------------------------
     293              :  */
     294              : 
/*
 * simple_table_tuple_insert - insert a tuple
 *
 * Currently, this routine differs from table_tuple_insert only in supplying a
 * default command ID and not allowing access to the speedup options.
 */
void
simple_table_tuple_insert(Relation rel, TupleTableSlot *slot)
{
	/* options = 0 and a NULL bistate: no speedup options requested */
	table_tuple_insert(rel, slot, GetCurrentCommandId(true), 0, NULL);
}
     306              : 
     307              : /*
     308              :  * simple_table_tuple_delete - delete a tuple
     309              :  *
     310              :  * This routine may be used to delete a tuple when concurrent updates of
     311              :  * the target tuple are not expected (for example, because we have a lock
     312              :  * on the relation associated with the tuple).  Any failure is reported
     313              :  * via ereport().
     314              :  */
     315              : void
     316        40319 : simple_table_tuple_delete(Relation rel, ItemPointer tid, Snapshot snapshot)
     317              : {
     318              :     TM_Result   result;
     319              :     TM_FailureData tmfd;
     320              : 
     321        40319 :     result = table_tuple_delete(rel, tid,
     322              :                                 GetCurrentCommandId(true),
     323              :                                 0, snapshot, InvalidSnapshot,
     324              :                                 true /* wait for commit */ ,
     325              :                                 &tmfd);
     326              : 
     327        40319 :     switch (result)
     328              :     {
     329            0 :         case TM_SelfModified:
     330              :             /* Tuple was already updated in current command? */
     331            0 :             elog(ERROR, "tuple already updated by self");
     332              :             break;
     333              : 
     334        40319 :         case TM_Ok:
     335              :             /* done successfully */
     336        40319 :             break;
     337              : 
     338            0 :         case TM_Updated:
     339            0 :             elog(ERROR, "tuple concurrently updated");
     340              :             break;
     341              : 
     342            0 :         case TM_Deleted:
     343            0 :             elog(ERROR, "tuple concurrently deleted");
     344              :             break;
     345              : 
     346            0 :         default:
     347            0 :             elog(ERROR, "unrecognized table_tuple_delete status: %u", result);
     348              :             break;
     349              :     }
     350        40319 : }
     351              : 
     352              : /*
     353              :  * simple_table_tuple_update - replace a tuple
     354              :  *
     355              :  * This routine may be used to update a tuple when concurrent updates of
     356              :  * the target tuple are not expected (for example, because we have a lock
     357              :  * on the relation associated with the tuple).  Any failure is reported
     358              :  * via ereport().
     359              :  */
     360              : void
     361        31927 : simple_table_tuple_update(Relation rel, ItemPointer otid,
     362              :                           TupleTableSlot *slot,
     363              :                           Snapshot snapshot,
     364              :                           TU_UpdateIndexes *update_indexes)
     365              : {
     366              :     TM_Result   result;
     367              :     TM_FailureData tmfd;
     368              :     LockTupleMode lockmode;
     369              : 
     370        31927 :     result = table_tuple_update(rel, otid, slot,
     371              :                                 GetCurrentCommandId(true),
     372              :                                 0, snapshot, InvalidSnapshot,
     373              :                                 true /* wait for commit */ ,
     374              :                                 &tmfd, &lockmode, update_indexes);
     375              : 
     376        31927 :     switch (result)
     377              :     {
     378            0 :         case TM_SelfModified:
     379              :             /* Tuple was already updated in current command? */
     380            0 :             elog(ERROR, "tuple already updated by self");
     381              :             break;
     382              : 
     383        31927 :         case TM_Ok:
     384              :             /* done successfully */
     385        31927 :             break;
     386              : 
     387            0 :         case TM_Updated:
     388            0 :             elog(ERROR, "tuple concurrently updated");
     389              :             break;
     390              : 
     391            0 :         case TM_Deleted:
     392            0 :             elog(ERROR, "tuple concurrently deleted");
     393              :             break;
     394              : 
     395            0 :         default:
     396            0 :             elog(ERROR, "unrecognized table_tuple_update status: %u", result);
     397              :             break;
     398              :     }
     399        31927 : }
     400              : 
     401              : 
     402              : /* ----------------------------------------------------------------------------
     403              :  * Helper functions to implement parallel scans for block oriented AMs.
     404              :  * ----------------------------------------------------------------------------
     405              :  */
     406              : 
Size
table_block_parallelscan_estimate(Relation rel)
{
	/* Shared memory needed for a block-oriented parallel scan descriptor. */
	return sizeof(ParallelBlockTableScanDescData);
}
     412              : 
     413              : Size
     414         1337 : table_block_parallelscan_initialize(Relation rel, ParallelTableScanDesc pscan)
     415              : {
     416         1337 :     ParallelBlockTableScanDesc bpscan = (ParallelBlockTableScanDesc) pscan;
     417              : 
     418         1337 :     bpscan->base.phs_locator = rel->rd_locator;
     419         1337 :     bpscan->phs_nblocks = RelationGetNumberOfBlocks(rel);
     420              :     /* compare phs_syncscan initialization to similar logic in initscan */
     421         3681 :     bpscan->base.phs_syncscan = synchronize_seqscans &&
     422         2344 :         !RelationUsesLocalBuffers(rel) &&
     423         1007 :         bpscan->phs_nblocks > NBuffers / 4;
     424         1337 :     SpinLockInit(&bpscan->phs_mutex);
     425         1337 :     bpscan->phs_startblock = InvalidBlockNumber;
     426         1337 :     bpscan->phs_numblock = InvalidBlockNumber;
     427         1337 :     pg_atomic_init_u64(&bpscan->phs_nallocated, 0);
     428              : 
     429         1337 :     return sizeof(ParallelBlockTableScanDescData);
     430              : }
     431              : 
void
table_block_parallelscan_reinitialize(Relation rel, ParallelTableScanDesc pscan)
{
	ParallelBlockTableScanDesc bpscan = (ParallelBlockTableScanDesc) pscan;

	/* Reset the shared allocation counter so a rescan starts over. */
	pg_atomic_write_u64(&bpscan->phs_nallocated, 0);
}
     439              : 
/*
 * find and set the scan's startblock
 *
 * Determine where the parallel seq scan should start.  This function may be
 * called many times, once by each parallel worker.  We must be careful only
 * to set the phs_startblock and phs_numblock fields once.
 *
 * Callers may optionally specify a non-InvalidBlockNumber value for
 * 'startblock' to force the scan to start at the given page.  Likewise,
 * 'numblocks' can be specified as a non-InvalidBlockNumber to limit the
 * number of blocks to scan to that many blocks.
 */
void
table_block_parallelscan_startblock_init(Relation rel,
										 ParallelBlockTableScanWorker pbscanwork,
										 ParallelBlockTableScanDesc pbscan,
										 BlockNumber startblock,
										 BlockNumber numblocks)
{
	/* pg_nextpower2_32 (used below) requires BlockNumber to fit in 32 bits */
	StaticAssertDecl(MaxBlockNumber <= 0xFFFFFFFE,
					 "pg_nextpower2_32 may be too small for non-standard BlockNumber width");

	BlockNumber sync_startpage = InvalidBlockNumber;
	BlockNumber scan_nblocks;

	/* Reset the state we use for controlling allocation size. */
	memset(pbscanwork, 0, sizeof(*pbscanwork));

retry:
	/* Grab the spinlock. */
	SpinLockAcquire(&pbscan->phs_mutex);

	/*
	 * When the caller specified a limit on the number of blocks to scan, set
	 * that in the ParallelBlockTableScanDesc, if it's not been done by
	 * another worker already.
	 */
	if (numblocks != InvalidBlockNumber &&
		pbscan->phs_numblock == InvalidBlockNumber)
	{
		pbscan->phs_numblock = numblocks;
	}

	/*
	 * If the scan's phs_startblock has not yet been initialized, we must do
	 * so now.  If a startblock was specified, start there, otherwise if this
	 * is not a synchronized scan, we just start at block 0, but if it is a
	 * synchronized scan, we must get the starting position from the
	 * synchronized scan machinery.  We can't hold the spinlock while doing
	 * that, though, so release the spinlock, get the information we need, and
	 * retry.  If nobody else has initialized the scan in the meantime, we'll
	 * fill in the value we fetched on the second time through.
	 */
	if (pbscan->phs_startblock == InvalidBlockNumber)
	{
		if (startblock != InvalidBlockNumber)
			pbscan->phs_startblock = startblock;
		else if (!pbscan->base.phs_syncscan)
			pbscan->phs_startblock = 0;
		else if (sync_startpage != InvalidBlockNumber)
			pbscan->phs_startblock = sync_startpage;
		else
		{
			/* must not call ss_get_location() with the spinlock held */
			SpinLockRelease(&pbscan->phs_mutex);
			sync_startpage = ss_get_location(rel, pbscan->phs_nblocks);
			goto retry;
		}
	}
	SpinLockRelease(&pbscan->phs_mutex);

	/*
	 * Figure out how many blocks we're going to scan; either all of them, or
	 * just phs_numblock's worth, if a limit has been imposed.
	 */
	if (pbscan->phs_numblock == InvalidBlockNumber)
		scan_nblocks = pbscan->phs_nblocks;
	else
		scan_nblocks = pbscan->phs_numblock;

	/*
	 * We determine the chunk size based on scan_nblocks.  First we split
	 * scan_nblocks into PARALLEL_SEQSCAN_NCHUNKS chunks then we calculate the
	 * next highest power of 2 number of the result.  This means we split the
	 * blocks we're scanning into somewhere between PARALLEL_SEQSCAN_NCHUNKS
	 * and PARALLEL_SEQSCAN_NCHUNKS / 2 chunks.
	 */
	pbscanwork->phsw_chunk_size = pg_nextpower2_32(Max(scan_nblocks /
													   PARALLEL_SEQSCAN_NCHUNKS, 1));

	/*
	 * Ensure we don't go over the maximum chunk size with larger tables. This
	 * means we may get much more than PARALLEL_SEQSCAN_NCHUNKS for larger
	 * tables.  Too large a chunk size has been shown to be detrimental to
	 * sequential scan performance.
	 */
	pbscanwork->phsw_chunk_size = Min(pbscanwork->phsw_chunk_size,
									  PARALLEL_SEQSCAN_MAX_CHUNK_SIZE);
}
     538              : 
/*
 * table_block_parallelscan_nextpage
 *		Get the next page for a worker participating in a parallel
 *		block-oriented table scan.
 *
 * Returns the next block number for this worker to scan, or
 * InvalidBlockNumber once all blocks have been handed out.  Even if there
 * are no pages left to allocate, another backend could have grabbed a page
 * to scan and not yet finished looking at it, so it doesn't follow that the
 * scan is done when the first backend gets an InvalidBlockNumber return.
 */
BlockNumber
table_block_parallelscan_nextpage(Relation rel,
								  ParallelBlockTableScanWorker pbscanwork,
								  ParallelBlockTableScanDesc pbscan)
{
	BlockNumber scan_nblocks;	/* total number of blocks this scan covers */
	BlockNumber page;			/* block number to hand back to the caller */
	uint64		nallocated;		/* scan-wide index of the block we claimed */

	/*
	 * The logic below allocates block numbers out to parallel workers in a
	 * way that each worker will receive a set of consecutive block numbers to
	 * scan.  Earlier versions of this would allocate the next highest block
	 * number to the next worker to call this function.  This would generally
	 * result in workers never receiving consecutive block numbers.  Some
	 * operating systems would not detect the sequential I/O pattern due to
	 * each backend being a different process which could result in poor
	 * performance due to inefficient or no readahead.  To work around this
	 * issue, we now allocate a range of block numbers for each worker and
	 * when they come back for another block, we give them the next one in
	 * that range until the range is complete.  When the worker completes the
	 * range of blocks we then allocate another range for it and return the
	 * first block number from that range.
	 *
	 * Here we name these ranges of blocks "chunks".  The initial size of
	 * these chunks is determined in table_block_parallelscan_startblock_init
	 * based on the number of blocks to scan.  Towards the end of the scan, we
	 * start making reductions in the size of the chunks in order to attempt
	 * to divide the remaining work over all the workers as evenly as
	 * possible.
	 *
	 * Here pbscanwork is local worker memory.  phsw_chunk_remaining tracks
	 * the number of blocks remaining in the chunk.  When that reaches 0 then
	 * we must allocate a new chunk for the worker.
	 *
	 * phs_nallocated tracks how many blocks have been allocated to workers
	 * already.  When phs_nallocated >= rs_nblocks, all blocks have been
	 * allocated.
	 *
	 * Because we use an atomic fetch-and-add to fetch the current value, the
	 * phs_nallocated counter will exceed rs_nblocks, because workers will
	 * still increment the value, when they try to allocate the next block but
	 * all blocks have been allocated already. The counter must be 64 bits
	 * wide because of that, to avoid wrapping around when scan_nblocks is
	 * close to 2^32.
	 *
	 * The actual block to return is calculated by adding the counter to the
	 * starting block number, modulo phs_nblocks.
	 */

	/*
	 * First, figure out how many blocks we're planning on scanning: the whole
	 * relation (phs_nblocks), or the caller-imposed block limit in
	 * phs_numblock when one was given.
	 */
	if (pbscan->phs_numblock == InvalidBlockNumber)
		scan_nblocks = pbscan->phs_nblocks;
	else
		scan_nblocks = pbscan->phs_numblock;

	/*
	 * Now check if we have any remaining blocks in a previous chunk for this
	 * worker.  We must consume all of the blocks from that before we allocate
	 * a new chunk to the worker.  This path touches only worker-local state,
	 * so no atomic operation is needed.
	 */
	if (pbscanwork->phsw_chunk_remaining > 0)
	{
		/*
		 * Give them the next block in the range and update the remaining
		 * number of blocks.
		 */
		nallocated = ++pbscanwork->phsw_nallocated;
		pbscanwork->phsw_chunk_remaining--;
	}
	else
	{
		/*
		 * When we've only got PARALLEL_SEQSCAN_RAMPDOWN_CHUNKS chunks
		 * remaining in the scan, we half the chunk size.  Since we reduce the
		 * chunk size here, we'll hit this again after doing
		 * PARALLEL_SEQSCAN_RAMPDOWN_CHUNKS at the new size.  After a few
		 * iterations of this, we'll end up doing the last few blocks with the
		 * chunk size set to 1.
		 */
		if (pbscanwork->phsw_chunk_size > 1 &&
			pbscanwork->phsw_nallocated > scan_nblocks -
			(pbscanwork->phsw_chunk_size * PARALLEL_SEQSCAN_RAMPDOWN_CHUNKS))
			pbscanwork->phsw_chunk_size >>= 1;

		/* Atomically claim the next chunk's worth of blocks for this worker */
		nallocated = pbscanwork->phsw_nallocated =
			pg_atomic_fetch_add_u64(&pbscan->phs_nallocated,
									pbscanwork->phsw_chunk_size);

		/*
		 * Set the remaining number of blocks in this chunk so that subsequent
		 * calls from this worker continue on with this chunk until it's done.
		 */
		pbscanwork->phsw_chunk_remaining = pbscanwork->phsw_chunk_size - 1;
	}

	/* Check if we've run out of blocks to scan */
	if (nallocated >= scan_nblocks)
		page = InvalidBlockNumber;	/* all blocks have been allocated */
	else
		page = (nallocated + pbscan->phs_startblock) % pbscan->phs_nblocks;

	/*
	 * Report scan location.  Normally, we report the current page number.
	 * When we reach the end of the scan, though, we report the starting page,
	 * not the ending page, just so the starting positions for later scans
	 * doesn't slew backwards.  We only report the position at the end of the
	 * scan once, though: subsequent callers will report nothing.  (The
	 * fetch-and-add hands each worker a distinct nallocated value, so at most
	 * one worker can see nallocated == phs_nblocks exactly.)
	 */
	if (pbscan->base.phs_syncscan)
	{
		if (page != InvalidBlockNumber)
			ss_report_location(rel, page);
		else if (nallocated == pbscan->phs_nblocks)
			ss_report_location(rel, pbscan->phs_startblock);
	}

	return page;
}
     666              : 
     667              : /* ----------------------------------------------------------------------------
     668              :  * Helper functions to implement relation sizing for block oriented AMs.
     669              :  * ----------------------------------------------------------------------------
     670              :  */
     671              : 
     672              : /*
     673              :  * table_block_relation_size
     674              :  *
     675              :  * If a table AM uses the various relation forks as the sole place where data
     676              :  * is stored, and if it uses them in the expected manner (e.g. the actual data
     677              :  * is in the main fork rather than some other), it can use this implementation
     678              :  * of the relation_size callback rather than implementing its own.
     679              :  */
     680              : uint64
     681      1926848 : table_block_relation_size(Relation rel, ForkNumber forkNumber)
     682              : {
     683      1926848 :     uint64      nblocks = 0;
     684              : 
     685              :     /* InvalidForkNumber indicates returning the size for all forks */
     686      1926848 :     if (forkNumber == InvalidForkNumber)
     687              :     {
     688            0 :         for (int i = 0; i < MAX_FORKNUM; i++)
     689            0 :             nblocks += smgrnblocks(RelationGetSmgr(rel), i);
     690              :     }
     691              :     else
     692      1926848 :         nblocks = smgrnblocks(RelationGetSmgr(rel), forkNumber);
     693              : 
     694      1926829 :     return nblocks * BLCKSZ;
     695              : }
     696              : 
     697              : /*
     698              :  * table_block_relation_estimate_size
     699              :  *
     700              :  * This function can't be directly used as the implementation of the
     701              :  * relation_estimate_size callback, because it has a few additional parameters.
     702              :  * Instead, it is intended to be used as a helper function; the caller can
     703              :  * pass through the arguments to its relation_estimate_size function plus the
     704              :  * additional values required here.
     705              :  *
     706              :  * overhead_bytes_per_tuple should contain the approximate number of bytes
     707              :  * of storage required to store a tuple above and beyond what is required for
     708              :  * the tuple data proper. Typically, this would include things like the
     709              :  * size of the tuple header and item pointer. This is only used for query
     710              :  * planning, so a table AM where the value is not constant could choose to
     711              :  * pass a "best guess".
     712              :  *
     713              :  * usable_bytes_per_page should contain the approximate number of bytes per
     714              :  * page usable for tuple data, excluding the page header and any anticipated
     715              :  * special space.
     716              :  */
     717              : void
     718       352627 : table_block_relation_estimate_size(Relation rel, int32 *attr_widths,
     719              :                                    BlockNumber *pages, double *tuples,
     720              :                                    double *allvisfrac,
     721              :                                    Size overhead_bytes_per_tuple,
     722              :                                    Size usable_bytes_per_page)
     723              : {
     724              :     BlockNumber curpages;
     725              :     BlockNumber relpages;
     726              :     double      reltuples;
     727              :     BlockNumber relallvisible;
     728              :     double      density;
     729              : 
     730              :     /* it should have storage, so we can call the smgr */
     731       352627 :     curpages = RelationGetNumberOfBlocks(rel);
     732              : 
     733              :     /* coerce values in pg_class to more desirable types */
     734       352627 :     relpages = (BlockNumber) rel->rd_rel->relpages;
     735       352627 :     reltuples = (double) rel->rd_rel->reltuples;
     736       352627 :     relallvisible = (BlockNumber) rel->rd_rel->relallvisible;
     737              : 
     738              :     /*
     739              :      * HACK: if the relation has never yet been vacuumed, use a minimum size
     740              :      * estimate of 10 pages.  The idea here is to avoid assuming a
     741              :      * newly-created table is really small, even if it currently is, because
     742              :      * that may not be true once some data gets loaded into it.  Once a vacuum
     743              :      * or analyze cycle has been done on it, it's more reasonable to believe
     744              :      * the size is somewhat stable.
     745              :      *
     746              :      * (Note that this is only an issue if the plan gets cached and used again
     747              :      * after the table has been filled.  What we're trying to avoid is using a
     748              :      * nestloop-type plan on a table that has grown substantially since the
     749              :      * plan was made.  Normally, autovacuum/autoanalyze will occur once enough
     750              :      * inserts have happened and cause cached-plan invalidation; but that
     751              :      * doesn't happen instantaneously, and it won't happen at all for cases
     752              :      * such as temporary tables.)
     753              :      *
     754              :      * We test "never vacuumed" by seeing whether reltuples < 0.
     755              :      *
     756              :      * If the table has inheritance children, we don't apply this heuristic.
     757              :      * Totally empty parent tables are quite common, so we should be willing
     758              :      * to believe that they are empty.
     759              :      */
     760       352627 :     if (curpages < 10 &&
     761        89367 :         reltuples < 0 &&
     762        89367 :         !rel->rd_rel->relhassubclass)
     763        87193 :         curpages = 10;
     764              : 
     765              :     /* report estimated # pages */
     766       352627 :     *pages = curpages;
     767              :     /* quick exit if rel is clearly empty */
     768       352627 :     if (curpages == 0)
     769              :     {
     770        19834 :         *tuples = 0;
     771        19834 :         *allvisfrac = 0;
     772        19834 :         return;
     773              :     }
     774              : 
     775              :     /* estimate number of tuples from previous tuple density */
     776       332793 :     if (reltuples >= 0 && relpages > 0)
     777       205304 :         density = reltuples / (double) relpages;
     778              :     else
     779              :     {
     780              :         /*
     781              :          * When we have no data because the relation was never yet vacuumed,
     782              :          * estimate tuple width from attribute datatypes.  We assume here that
     783              :          * the pages are completely full, which is OK for tables but is
     784              :          * probably an overestimate for indexes.  Fortunately
     785              :          * get_relation_info() can clamp the overestimate to the parent
     786              :          * table's size.
     787              :          *
     788              :          * Note: this code intentionally disregards alignment considerations,
     789              :          * because (a) that would be gilding the lily considering how crude
     790              :          * the estimate is, (b) it creates platform dependencies in the
     791              :          * default plans which are kind of a headache for regression testing,
     792              :          * and (c) different table AMs might use different padding schemes.
     793              :          */
     794              :         int32       tuple_width;
     795              :         int         fillfactor;
     796              : 
     797              :         /*
     798              :          * Without reltuples/relpages, we also need to consider fillfactor.
     799              :          * The other branch considers it implicitly by calculating density
     800              :          * from actual relpages/reltuples statistics.
     801              :          */
     802       127489 :         fillfactor = RelationGetFillFactor(rel, HEAP_DEFAULT_FILLFACTOR);
     803              : 
     804       127489 :         tuple_width = get_rel_data_width(rel, attr_widths);
     805       127489 :         tuple_width += overhead_bytes_per_tuple;
     806              :         /* note: integer division is intentional here */
     807       127489 :         density = (usable_bytes_per_page * fillfactor / 100) / tuple_width;
     808              :         /* There's at least one row on the page, even with low fillfactor. */
     809       127489 :         density = clamp_row_est(density);
     810              :     }
     811       332793 :     *tuples = rint(density * (double) curpages);
     812              : 
     813              :     /*
     814              :      * We use relallvisible as-is, rather than scaling it up like we do for
     815              :      * the pages and tuples counts, on the theory that any pages added since
     816              :      * the last VACUUM are most likely not marked all-visible.  But costsize.c
     817              :      * wants it converted to a fraction.
     818              :      */
     819       332793 :     if (relallvisible == 0 || curpages <= 0)
     820       167398 :         *allvisfrac = 0;
     821       165395 :     else if ((double) relallvisible >= curpages)
     822        89728 :         *allvisfrac = 1;
     823              :     else
     824        75667 :         *allvisfrac = (double) relallvisible / curpages;
     825              : }
        

Generated by: LCOV version 2.0-1