LCOV - code coverage report
Current view: top level - src/backend/access/table - tableam.c (source / functions)
Test:      PostgreSQL 19devel          Lines:     90.4 % (178 of 197 hit)
Test Date: 2026-03-10 01:14:48         Functions: 100.0 % (19 of 19 hit)

            Line data    Source code
       1              : /*----------------------------------------------------------------------
       2              :  *
       3              :  * tableam.c
       4              :  *      Table access method routines too big to be inline functions.
       5              :  *
       6              :  * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
       7              :  * Portions Copyright (c) 1994, Regents of the University of California
       8              :  *
       9              :  *
      10              :  * IDENTIFICATION
      11              :  *    src/backend/access/table/tableam.c
      12              :  *
      13              :  * NOTES
      14              :  *    Note that most functions in here are documented in tableam.h, rather than
       15              :  *    here. That's because there are a lot of inline functions in tableam.h and
      16              :  *    it'd be harder to understand if one constantly had to switch between files.
      17              :  *
      18              :  *----------------------------------------------------------------------
      19              :  */
      20              : #include "postgres.h"
      21              : 
      22              : #include <math.h>
      23              : 
      24              : #include "access/syncscan.h"
      25              : #include "access/tableam.h"
      26              : #include "access/xact.h"
      27              : #include "optimizer/optimizer.h"
      28              : #include "optimizer/plancat.h"
      29              : #include "port/pg_bitutils.h"
      30              : #include "storage/bufmgr.h"
      31              : #include "storage/shmem.h"
      32              : #include "storage/smgr.h"
      33              : 
      34              : /*
      35              :  * Constants to control the behavior of block allocation to parallel workers
      36              :  * during a parallel seqscan.  Technically these values do not need to be
       37              :  * powers of 2, but having them as powers of 2 makes the math cheaper
      38              :  * and makes the ramp-down stepping more even.
      39              :  */
      40              : 
      41              : /* The number of I/O chunks we try to break a parallel seqscan down into */
      42              : #define PARALLEL_SEQSCAN_NCHUNKS            2048
       43              : /* Ramp down the allocation size once only this many chunks remain */
      44              : #define PARALLEL_SEQSCAN_RAMPDOWN_CHUNKS    64
      45              : /* Cap the size of parallel I/O chunks to this number of blocks */
      46              : #define PARALLEL_SEQSCAN_MAX_CHUNK_SIZE     8192
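                       : 
                       : /*
                       :  * To illustrate how these constants interact (a worked example, not part
                       :  * of the algorithm): a 128GB table holds 16777216 8kB blocks, and
                       :  * 16777216 / PARALLEL_SEQSCAN_NCHUNKS = 8192 blocks, exactly the
                       :  * chunk-size cap; larger tables therefore get more than 2048 chunks of
                       :  * 8192 blocks each.  Near the end of a scan, once about
                       :  * PARALLEL_SEQSCAN_RAMPDOWN_CHUNKS chunks of work remain, the chunk size
                       :  * is halved repeatedly, so the final blocks are handed out nearly one at
                       :  * a time.
                       :  */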
      47              : 
      48              : /* GUC variables */
      49              : char       *default_table_access_method = DEFAULT_TABLE_ACCESS_METHOD;
      50              : bool        synchronize_seqscans = true;
      51              : 
      52              : 
      53              : /* ----------------------------------------------------------------------------
      54              :  * Slot functions.
      55              :  * ----------------------------------------------------------------------------
      56              :  */
      57              : 
      58              : const TupleTableSlotOps *
      59     14633084 : table_slot_callbacks(Relation relation)
      60              : {
      61              :     const TupleTableSlotOps *tts_cb;
      62              : 
      63     14633084 :     if (relation->rd_tableam)
      64     14628999 :         tts_cb = relation->rd_tableam->slot_callbacks(relation);
      65         4085 :     else if (relation->rd_rel->relkind == RELKIND_FOREIGN_TABLE)
      66              :     {
      67              :         /*
       68              :          * Historically, FDWs have expected to store heap tuples in slots.
       69              :          * Continue handing them one, to make it less painful to adapt FDWs
       70              :          * to new versions. The cost of a heap slot over a virtual slot is
      71              :          * small.
      72              :          */
      73          222 :         tts_cb = &TTSOpsHeapTuple;
      74              :     }
      75              :     else
      76              :     {
      77              :         /*
      78              :          * These need to be supported, as some parts of the code (like COPY)
      79              :          * need to create slots for such relations too. It seems better to
       80              :          * centralize the knowledge that a virtual slot is the right thing in
      81              :          * that case here.
      82              :          */
      83              :         Assert(relation->rd_rel->relkind == RELKIND_VIEW ||
      84              :                relation->rd_rel->relkind == RELKIND_PARTITIONED_TABLE);
      85         3863 :         tts_cb = &TTSOpsVirtual;
      86              :     }
      87              : 
      88     14633084 :     return tts_cb;
      89              : }
      90              : 
      91              : TupleTableSlot *
      92     14375714 : table_slot_create(Relation relation, List **reglist)
      93              : {
      94              :     const TupleTableSlotOps *tts_cb;
      95              :     TupleTableSlot *slot;
      96              : 
      97     14375714 :     tts_cb = table_slot_callbacks(relation);
      98     14375714 :     slot = MakeSingleTupleTableSlot(RelationGetDescr(relation), tts_cb);
      99              : 
     100     14375714 :     if (reglist)
     101       142500 :         *reglist = lappend(*reglist, slot);
     102              : 
     103     14375714 :     return slot;
     104              : }
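                       : 
                       : /*
                       :  * A minimal usage sketch (illustrative only; error handling and snapshot
                       :  * setup are omitted):
                       :  *
                       :  *     TupleTableSlot *slot = table_slot_create(rel, NULL);
                       :  *     TableScanDesc scan = table_beginscan_catalog(rel, 0, NULL);
                       :  *
                       :  *     while (table_scan_getnextslot(scan, ForwardScanDirection, slot))
                       :  *     {
                       :  *         ... examine the tuple stored in the slot ...
                       :  *     }
                       :  *
                       :  *     table_endscan(scan);
                       :  *     ExecDropSingleTupleTableSlot(slot);
                       :  */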
     105              : 
     106              : 
     107              : /* ----------------------------------------------------------------------------
     108              :  * Table scan functions.
     109              :  * ----------------------------------------------------------------------------
     110              :  */
     111              : 
     112              : TableScanDesc
     113        39367 : table_beginscan_catalog(Relation relation, int nkeys, ScanKeyData *key)
     114              : {
     115        39367 :     uint32      flags = SO_TYPE_SEQSCAN |
     116              :         SO_ALLOW_STRAT | SO_ALLOW_SYNC | SO_ALLOW_PAGEMODE | SO_TEMP_SNAPSHOT;
     117        39367 :     Oid         relid = RelationGetRelid(relation);
     118        39367 :     Snapshot    snapshot = RegisterSnapshot(GetCatalogSnapshot(relid));
     119              : 
     120        39367 :     return table_beginscan_common(relation, snapshot, nkeys, key,
     121              :                                   NULL, flags);
     122              : }
     123              : 
     124              : 
     125              : /* ----------------------------------------------------------------------------
     126              :  * Parallel table scan related functions.
     127              :  * ----------------------------------------------------------------------------
     128              :  */
     129              : 
     130              : Size
     131          578 : table_parallelscan_estimate(Relation rel, Snapshot snapshot)
     132              : {
     133          578 :     Size        sz = 0;
     134              : 
     135          578 :     if (IsMVCCSnapshot(snapshot))
     136          483 :         sz = add_size(sz, EstimateSnapshotSpace(snapshot));
     137              :     else
     138              :         Assert(snapshot == SnapshotAny);
     139              : 
     140          578 :     sz = add_size(sz, rel->rd_tableam->parallelscan_estimate(rel));
     141              : 
     142          578 :     return sz;
     143              : }
     144              : 
     145              : void
     146          578 : table_parallelscan_initialize(Relation rel, ParallelTableScanDesc pscan,
     147              :                               Snapshot snapshot)
     148              : {
     149          578 :     Size        snapshot_off = rel->rd_tableam->parallelscan_initialize(rel, pscan);
     150              : 
     151          578 :     pscan->phs_snapshot_off = snapshot_off;
     152              : 
     153          578 :     if (IsMVCCSnapshot(snapshot))
     154              :     {
     155          483 :         SerializeSnapshot(snapshot, (char *) pscan + pscan->phs_snapshot_off);
     156          483 :         pscan->phs_snapshot_any = false;
     157              :     }
     158              :     else
     159              :     {
     160              :         Assert(snapshot == SnapshotAny);
     161           95 :         pscan->phs_snapshot_any = true;
     162              :     }
     163          578 : }
     164              : 
     165              : TableScanDesc
     166         2059 : table_beginscan_parallel(Relation relation, ParallelTableScanDesc pscan)
     167              : {
     168              :     Snapshot    snapshot;
     169         2059 :     uint32      flags = SO_TYPE_SEQSCAN |
     170              :         SO_ALLOW_STRAT | SO_ALLOW_SYNC | SO_ALLOW_PAGEMODE;
     171              : 
     172              :     Assert(RelFileLocatorEquals(relation->rd_locator, pscan->phs_locator));
     173              : 
     174         2059 :     if (!pscan->phs_snapshot_any)
     175              :     {
     176              :         /* Snapshot was serialized -- restore it */
     177         1856 :         snapshot = RestoreSnapshot((char *) pscan + pscan->phs_snapshot_off);
     178         1856 :         RegisterSnapshot(snapshot);
     179         1856 :         flags |= SO_TEMP_SNAPSHOT;
     180              :     }
     181              :     else
     182              :     {
     183              :         /* SnapshotAny passed by caller (not serialized) */
     184          203 :         snapshot = SnapshotAny;
     185              :     }
     186              : 
     187         2059 :     return table_beginscan_common(relation, snapshot, 0, NULL,
     188              :                                   pscan, flags);
     189              : }
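                       : 
                       : /*
                       :  * The intended call sequence, as a sketch: the leader sizes and fills the
                       :  * shared-memory state, after which every participant (the leader
                       :  * included) starts its own scan on top of it:
                       :  *
                       :  *     leader:       sz = table_parallelscan_estimate(rel, snapshot);
                       :  *                   pscan = shm_toc_allocate(toc, sz);
                       :  *                   table_parallelscan_initialize(rel, pscan, snapshot);
                       :  *     each worker:  scan = table_beginscan_parallel(rel, pscan);
                       :  */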
     190              : 
     191              : TableScanDesc
     192           60 : table_beginscan_parallel_tidrange(Relation relation,
     193              :                                   ParallelTableScanDesc pscan)
     194              : {
     195              :     Snapshot    snapshot;
     196           60 :     uint32      flags = SO_TYPE_TIDRANGESCAN | SO_ALLOW_PAGEMODE;
     197              :     TableScanDesc sscan;
     198              : 
     199              :     Assert(RelFileLocatorEquals(relation->rd_locator, pscan->phs_locator));
     200              : 
     201              :     /* disable syncscan in parallel tid range scan. */
     202           60 :     pscan->phs_syncscan = false;
     203              : 
     204           60 :     if (!pscan->phs_snapshot_any)
     205              :     {
     206              :         /* Snapshot was serialized -- restore it */
     207           60 :         snapshot = RestoreSnapshot((char *) pscan + pscan->phs_snapshot_off);
     208           60 :         RegisterSnapshot(snapshot);
     209           60 :         flags |= SO_TEMP_SNAPSHOT;
     210              :     }
     211              :     else
     212              :     {
     213              :         /* SnapshotAny passed by caller (not serialized) */
     214            0 :         snapshot = SnapshotAny;
     215              :     }
     216              : 
     217           60 :     sscan = table_beginscan_common(relation, snapshot, 0, NULL,
     218              :                                    pscan, flags);
     219           60 :     return sscan;
     220              : }
     221              : 
     222              : 
     223              : /* ----------------------------------------------------------------------------
     224              :  * Index scan related functions.
     225              :  * ----------------------------------------------------------------------------
     226              :  */
     227              : 
     228              : /*
      229              :  * To check whether any visible tuple matches a given TID, simply start an
      230              :  * index scan, create the necessary slot, do the heap lookup, and shut
      231              :  * everything down again. This could be optimized, but is unlikely to matter
      232              :  * from a performance POV. If there frequently are live index pointers also
      233              :  * matching a unique index key, the CPU overhead of this routine is unlikely to matter.
     234              :  *
     235              :  * Note that *tid may be modified when we return true if the AM supports
     236              :  * storing multiple row versions reachable via a single index entry (like
     237              :  * heap's HOT).
     238              :  */
     239              : bool
     240      5721442 : table_index_fetch_tuple_check(Relation rel,
     241              :                               ItemPointer tid,
     242              :                               Snapshot snapshot,
     243              :                               bool *all_dead)
     244              : {
     245              :     IndexFetchTableData *scan;
     246              :     TupleTableSlot *slot;
     247      5721442 :     bool        call_again = false;
     248              :     bool        found;
     249              : 
     250      5721442 :     slot = table_slot_create(rel, NULL);
     251      5721442 :     scan = table_index_fetch_begin(rel);
     252      5721442 :     found = table_index_fetch_tuple(scan, tid, snapshot, slot, &call_again,
     253              :                                     all_dead);
     254      5721442 :     table_index_fetch_end(scan);
     255      5721442 :     ExecDropSingleTupleTableSlot(slot);
     256              : 
     257      5721442 :     return found;
     258              : }
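                       : 
                       : /*
                       :  * A typical caller is nbtree's _bt_check_unique(), which uses this routine
                       :  * to decide whether an index entry matching a proposed unique key still
                       :  * points at a live tuple.
                       :  */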
     259              : 
     260              : 
     261              : /* ------------------------------------------------------------------------
     262              :  * Functions for non-modifying operations on individual tuples
     263              :  * ------------------------------------------------------------------------
     264              :  */
     265              : 
     266              : void
     267          156 : table_tuple_get_latest_tid(TableScanDesc scan, ItemPointer tid)
     268              : {
     269          156 :     Relation    rel = scan->rs_rd;
     270          156 :     const TableAmRoutine *tableam = rel->rd_tableam;
     271              : 
     272              :     /*
     273              :      * Since this can be called with user-supplied TID, don't trust the input
     274              :      * too much.
     275              :      */
     276          156 :     if (!tableam->tuple_tid_valid(scan, tid))
     277            6 :         ereport(ERROR,
     278              :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     279              :                  errmsg("tid (%u, %u) is not valid for relation \"%s\"",
     280              :                         ItemPointerGetBlockNumberNoCheck(tid),
     281              :                         ItemPointerGetOffsetNumberNoCheck(tid),
     282              :                         RelationGetRelationName(rel))));
     283              : 
     284          150 :     tableam->tuple_get_latest_tid(scan, tid);
     285          150 : }
     286              : 
     287              : 
     288              : /* ----------------------------------------------------------------------------
     289              :  * Functions to make modifications a bit simpler.
     290              :  * ----------------------------------------------------------------------------
     291              :  */
     292              : 
     293              : /*
     294              :  * simple_table_tuple_insert - insert a tuple
     295              :  *
     296              :  * Currently, this routine differs from table_tuple_insert only in supplying a
     297              :  * default command ID and not allowing access to the speedup options.
     298              :  */
     299              : void
     300        76110 : simple_table_tuple_insert(Relation rel, TupleTableSlot *slot)
     301              : {
     302        76110 :     table_tuple_insert(rel, slot, GetCurrentCommandId(true), 0, NULL);
     303        76110 : }
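                       : 
                       : /*
                       :  * Caller-side sketch (illustrative; assumes a hypothetical
                       :  * single-int4-column table, a slot whose descriptor matches the relation,
                       :  * and an appropriate lock on rel):
                       :  *
                       :  *     TupleTableSlot *slot = table_slot_create(rel, NULL);
                       :  *
                       :  *     ExecClearTuple(slot);
                       :  *     slot->tts_values[0] = Int32GetDatum(42);
                       :  *     slot->tts_isnull[0] = false;
                       :  *     ExecStoreVirtualTuple(slot);
                       :  *
                       :  *     simple_table_tuple_insert(rel, slot);
                       :  */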
     304              : 
     305              : /*
     306              :  * simple_table_tuple_delete - delete a tuple
     307              :  *
     308              :  * This routine may be used to delete a tuple when concurrent updates of
     309              :  * the target tuple are not expected (for example, because we have a lock
     310              :  * on the relation associated with the tuple).  Any failure is reported
     311              :  * via ereport().
     312              :  */
     313              : void
     314        40313 : simple_table_tuple_delete(Relation rel, ItemPointer tid, Snapshot snapshot)
     315              : {
     316              :     TM_Result   result;
     317              :     TM_FailureData tmfd;
     318              : 
     319        40313 :     result = table_tuple_delete(rel, tid,
     320              :                                 GetCurrentCommandId(true),
     321              :                                 snapshot, InvalidSnapshot,
     322              :                                 true /* wait for commit */ ,
     323              :                                 &tmfd, false /* changingPart */ );
     324              : 
     325        40313 :     switch (result)
     326              :     {
     327            0 :         case TM_SelfModified:
     328              :             /* Tuple was already updated in current command? */
     329            0 :             elog(ERROR, "tuple already updated by self");
     330              :             break;
     331              : 
     332        40313 :         case TM_Ok:
     333              :             /* done successfully */
     334        40313 :             break;
     335              : 
     336            0 :         case TM_Updated:
     337            0 :             elog(ERROR, "tuple concurrently updated");
     338              :             break;
     339              : 
     340            0 :         case TM_Deleted:
     341            0 :             elog(ERROR, "tuple concurrently deleted");
     342              :             break;
     343              : 
     344            0 :         default:
     345            0 :             elog(ERROR, "unrecognized table_tuple_delete status: %u", result);
     346              :             break;
     347              :     }
     348        40313 : }
     349              : 
     350              : /*
     351              :  * simple_table_tuple_update - replace a tuple
     352              :  *
     353              :  * This routine may be used to update a tuple when concurrent updates of
     354              :  * the target tuple are not expected (for example, because we have a lock
     355              :  * on the relation associated with the tuple).  Any failure is reported
     356              :  * via ereport().
     357              :  */
     358              : void
     359        31921 : simple_table_tuple_update(Relation rel, ItemPointer otid,
     360              :                           TupleTableSlot *slot,
     361              :                           Snapshot snapshot,
     362              :                           TU_UpdateIndexes *update_indexes)
     363              : {
     364              :     TM_Result   result;
     365              :     TM_FailureData tmfd;
     366              :     LockTupleMode lockmode;
     367              : 
     368        31921 :     result = table_tuple_update(rel, otid, slot,
     369              :                                 GetCurrentCommandId(true),
     370              :                                 snapshot, InvalidSnapshot,
     371              :                                 true /* wait for commit */ ,
     372              :                                 &tmfd, &lockmode, update_indexes);
     373              : 
     374        31921 :     switch (result)
     375              :     {
     376            0 :         case TM_SelfModified:
     377              :             /* Tuple was already updated in current command? */
     378            0 :             elog(ERROR, "tuple already updated by self");
     379              :             break;
     380              : 
     381        31921 :         case TM_Ok:
     382              :             /* done successfully */
     383        31921 :             break;
     384              : 
     385            0 :         case TM_Updated:
     386            0 :             elog(ERROR, "tuple concurrently updated");
     387              :             break;
     388              : 
     389            0 :         case TM_Deleted:
     390            0 :             elog(ERROR, "tuple concurrently deleted");
     391              :             break;
     392              : 
     393            0 :         default:
     394            0 :             elog(ERROR, "unrecognized table_tuple_update status: %u", result);
     395              :             break;
     396              :     }
     397        31921 : }
     398              : 
     399              : 
     400              : /* ----------------------------------------------------------------------------
     401              :  * Helper functions to implement parallel scans for block oriented AMs.
     402              :  * ----------------------------------------------------------------------------
     403              :  */
     404              : 
     405              : Size
     406          578 : table_block_parallelscan_estimate(Relation rel)
     407              : {
     408          578 :     return sizeof(ParallelBlockTableScanDescData);
     409              : }
     410              : 
     411              : Size
     412          578 : table_block_parallelscan_initialize(Relation rel, ParallelTableScanDesc pscan)
     413              : {
     414          578 :     ParallelBlockTableScanDesc bpscan = (ParallelBlockTableScanDesc) pscan;
     415              : 
     416          578 :     bpscan->base.phs_locator = rel->rd_locator;
     417          578 :     bpscan->phs_nblocks = RelationGetNumberOfBlocks(rel);
     418              :     /* compare phs_syncscan initialization to similar logic in initscan */
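                       :     /*
                       :      * For instance, with the default shared_buffers of 128MB (NBuffers =
                       :      * 16384), the size test below passes only for relations larger than
                       :      * 4096 blocks, i.e. 32MB.
                       :      */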
     419         1546 :     bpscan->base.phs_syncscan = synchronize_seqscans &&
     420          968 :         !RelationUsesLocalBuffers(rel) &&
     421          390 :         bpscan->phs_nblocks > NBuffers / 4;
     422          578 :     SpinLockInit(&bpscan->phs_mutex);
     423          578 :     bpscan->phs_startblock = InvalidBlockNumber;
     424          578 :     bpscan->phs_numblock = InvalidBlockNumber;
     425          578 :     pg_atomic_init_u64(&bpscan->phs_nallocated, 0);
     426              : 
     427          578 :     return sizeof(ParallelBlockTableScanDescData);
     428              : }
     429              : 
     430              : void
     431          114 : table_block_parallelscan_reinitialize(Relation rel, ParallelTableScanDesc pscan)
     432              : {
     433          114 :     ParallelBlockTableScanDesc bpscan = (ParallelBlockTableScanDesc) pscan;
     434              : 
     435          114 :     pg_atomic_write_u64(&bpscan->phs_nallocated, 0);
     436          114 : }
     437              : 
     438              : /*
     439              :  * find and set the scan's startblock
     440              :  *
     441              :  * Determine where the parallel seq scan should start.  This function may be
     442              :  * called many times, once by each parallel worker.  We must be careful only
     443              :  * to set the phs_startblock and phs_numblock fields once.
     444              :  *
     445              :  * Callers may optionally specify a non-InvalidBlockNumber value for
     446              :  * 'startblock' to force the scan to start at the given page.  Likewise,
     447              :  * 'numblocks' can be specified as a non-InvalidBlockNumber to limit the
     448              :  * number of blocks to scan to that many blocks.
     449              :  */
     450              : void
     451         1657 : table_block_parallelscan_startblock_init(Relation rel,
     452              :                                          ParallelBlockTableScanWorker pbscanwork,
     453              :                                          ParallelBlockTableScanDesc pbscan,
     454              :                                          BlockNumber startblock,
     455              :                                          BlockNumber numblocks)
     456              : {
     457              :     StaticAssertDecl(MaxBlockNumber <= 0xFFFFFFFE,
     458              :                      "pg_nextpower2_32 may be too small for non-standard BlockNumber width");
     459              : 
     460         1657 :     BlockNumber sync_startpage = InvalidBlockNumber;
     461              :     BlockNumber scan_nblocks;
     462              : 
     463              :     /* Reset the state we use for controlling allocation size. */
     464         1657 :     memset(pbscanwork, 0, sizeof(*pbscanwork));
     465              : 
     466         1658 : retry:
     467              :     /* Grab the spinlock. */
     468         1658 :     SpinLockAcquire(&pbscan->phs_mutex);
     469              : 
     470              :     /*
     471              :      * When the caller specified a limit on the number of blocks to scan, set
     472              :      * that in the ParallelBlockTableScanDesc, if it's not been done by
     473              :      * another worker already.
     474              :      */
     475         1658 :     if (numblocks != InvalidBlockNumber &&
     476           60 :         pbscan->phs_numblock == InvalidBlockNumber)
     477              :     {
     478           12 :         pbscan->phs_numblock = numblocks;
     479              :     }
     480              : 
     481              :     /*
     482              :      * If the scan's phs_startblock has not yet been initialized, we must do
      483              :      * so now.  If a startblock was specified, start there.  Otherwise, if
      484              :      * this is not a synchronized scan, we just start at block 0; if it is a
      485              :      * synchronized scan, we must get the starting position from the
     486              :      * synchronized scan machinery.  We can't hold the spinlock while doing
     487              :      * that, though, so release the spinlock, get the information we need, and
     488              :      * retry.  If nobody else has initialized the scan in the meantime, we'll
     489              :      * fill in the value we fetched on the second time through.
     490              :      */
     491         1658 :     if (pbscan->phs_startblock == InvalidBlockNumber)
     492              :     {
     493          567 :         if (startblock != InvalidBlockNumber)
     494           12 :             pbscan->phs_startblock = startblock;
     495          555 :         else if (!pbscan->base.phs_syncscan)
     496          553 :             pbscan->phs_startblock = 0;
     497            2 :         else if (sync_startpage != InvalidBlockNumber)
     498            1 :             pbscan->phs_startblock = sync_startpage;
     499              :         else
     500              :         {
     501            1 :             SpinLockRelease(&pbscan->phs_mutex);
     502            1 :             sync_startpage = ss_get_location(rel, pbscan->phs_nblocks);
     503            1 :             goto retry;
     504              :         }
     505              :     }
     506         1657 :     SpinLockRelease(&pbscan->phs_mutex);
     507              : 
     508              :     /*
     509              :      * Figure out how many blocks we're going to scan; either all of them, or
     510              :      * just phs_numblock's worth, if a limit has been imposed.
     511              :      */
     512         1657 :     if (pbscan->phs_numblock == InvalidBlockNumber)
     513         1597 :         scan_nblocks = pbscan->phs_nblocks;
     514              :     else
     515           60 :         scan_nblocks = pbscan->phs_numblock;
     516              : 
     517              :     /*
     518              :      * We determine the chunk size based on scan_nblocks.  First we split
     519              :      * scan_nblocks into PARALLEL_SEQSCAN_NCHUNKS chunks then we calculate the
     520              :      * next highest power of 2 number of the result.  This means we split the
     521              :      * blocks we're scanning into somewhere between PARALLEL_SEQSCAN_NCHUNKS
     522              :      * and PARALLEL_SEQSCAN_NCHUNKS / 2 chunks.
     523              :      */
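                       :     /*
                       :      * For example: with scan_nblocks = 1000000, the integer division gives
                       :      * 1000000 / 2048 = 488, and pg_nextpower2_32(488) = 512, so the scan
                       :      * is split into roughly 1950 chunks of 512 blocks.  For a relation of
                       :      * fewer than 2048 blocks, the Max() clamp keeps the chunk size at 1.
                       :      */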
     524         1657 :     pbscanwork->phsw_chunk_size = pg_nextpower2_32(Max(scan_nblocks /
     525              :                                                        PARALLEL_SEQSCAN_NCHUNKS, 1));
     526              : 
     527              :     /*
     528              :      * Ensure we don't go over the maximum chunk size with larger tables. This
      529              :      * means we may get many more chunks than PARALLEL_SEQSCAN_NCHUNKS for larger
     530              :      * tables.  Too large a chunk size has been shown to be detrimental to
     531              :      * sequential scan performance.
     532              :      */
     533         1657 :     pbscanwork->phsw_chunk_size = Min(pbscanwork->phsw_chunk_size,
     534              :                                       PARALLEL_SEQSCAN_MAX_CHUNK_SIZE);
     535         1657 : }
     536              : 
     537              : /*
     538              :  * get the next page to scan
     539              :  *
     540              :  * Get the next page to scan.  Even if there are no pages left to scan,
     541              :  * another backend could have grabbed a page to scan and not yet finished
     542              :  * looking at it, so it doesn't follow that the scan is done when the first
     543              :  * backend gets an InvalidBlockNumber return.
     544              :  */
     545              : BlockNumber
     546       101718 : table_block_parallelscan_nextpage(Relation rel,
     547              :                                   ParallelBlockTableScanWorker pbscanwork,
     548              :                                   ParallelBlockTableScanDesc pbscan)
     549              : {
     550              :     BlockNumber scan_nblocks;
     551              :     BlockNumber page;
     552              :     uint64      nallocated;
     553              : 
     554              :     /*
     555              :      * The logic below allocates block numbers out to parallel workers in a
     556              :      * way that each worker will receive a set of consecutive block numbers to
     557              :      * scan.  Earlier versions of this would allocate the next highest block
     558              :      * number to the next worker to call this function.  This would generally
     559              :      * result in workers never receiving consecutive block numbers.  Some
     560              :      * operating systems would not detect the sequential I/O pattern due to
     561              :      * each backend being a different process which could result in poor
     562              :      * performance due to inefficient or no readahead.  To work around this
     563              :      * issue, we now allocate a range of block numbers for each worker and
     564              :      * when they come back for another block, we give them the next one in
     565              :      * that range until the range is complete.  When the worker completes the
     566              :      * range of blocks we then allocate another range for it and return the
     567              :      * first block number from that range.
     568              :      *
     569              :      * Here we name these ranges of blocks "chunks".  The initial size of
     570              :      * these chunks is determined in table_block_parallelscan_startblock_init
     571              :      * based on the number of blocks to scan.  Towards the end of the scan, we
     572              :      * start making reductions in the size of the chunks in order to attempt
     573              :      * to divide the remaining work over all the workers as evenly as
     574              :      * possible.
     575              :      *
     576              :      * Here pbscanwork is local worker memory.  phsw_chunk_remaining tracks
     577              :      * the number of blocks remaining in the chunk.  When that reaches 0 then
     578              :      * we must allocate a new chunk for the worker.
     579              :      *
     580              :      * phs_nallocated tracks how many blocks have been allocated to workers
      581              :      * already.  When phs_nallocated >= scan_nblocks, all blocks have been
     582              :      * allocated.
     583              :      *
     584              :      * Because we use an atomic fetch-and-add to fetch the current value, the
      585              :      * phs_nallocated counter will exceed scan_nblocks, because workers will
      586              :      * still increment the value when they try to allocate the next block but
     587              :      * all blocks have been allocated already. The counter must be 64 bits
     588              :      * wide because of that, to avoid wrapping around when scan_nblocks is
     589              :      * close to 2^32.
     590              :      *
     591              :      * The actual block to return is calculated by adding the counter to the
     592              :      * starting block number, modulo phs_nblocks.
     593              :      */
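                       : 
                       :     /*
                       :      * A concrete illustration: with phs_nblocks = 20, phs_startblock = 10
                       :      * and a chunk size of 4, the first worker's fetch-and-add returns
                       :      * nallocated = 0 and it scans blocks 10..13, the next worker gets
                       :      * nallocated = 4 and scans 14..17, and at nallocated = 12 the modulo
                       :      * arithmetic wraps around: (12 + 10) % 20 = 2.
                       :      */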
     594              : 
     595              :     /* First, figure out how many blocks we're planning on scanning */
     596       101718 :     if (pbscan->phs_numblock == InvalidBlockNumber)
     597       101409 :         scan_nblocks = pbscan->phs_nblocks;
     598              :     else
     599          309 :         scan_nblocks = pbscan->phs_numblock;
     600              : 
     601              :     /*
     602              :      * Now check if we have any remaining blocks in a previous chunk for this
     603              :      * worker.  We must consume all of the blocks from that before we allocate
     604              :      * a new chunk to the worker.
     605              :      */
     606       101718 :     if (pbscanwork->phsw_chunk_remaining > 0)
     607              :     {
     608              :         /*
     609              :          * Give them the next block in the range and update the remaining
     610              :          * number of blocks.
     611              :          */
     612         6513 :         nallocated = ++pbscanwork->phsw_nallocated;
     613         6513 :         pbscanwork->phsw_chunk_remaining--;
     614              :     }
     615              :     else
     616              :     {
     617              :         /*
     618              :          * When we've only got PARALLEL_SEQSCAN_RAMPDOWN_CHUNKS chunks
      619              :          * remaining in the scan, we halve the chunk size.  Since we reduce the
     620              :          * chunk size here, we'll hit this again after doing
     621              :          * PARALLEL_SEQSCAN_RAMPDOWN_CHUNKS at the new size.  After a few
     622              :          * iterations of this, we'll end up doing the last few blocks with the
     623              :          * chunk size set to 1.
     624              :          */
     625        95205 :         if (pbscanwork->phsw_chunk_size > 1 &&
     626         2215 :             pbscanwork->phsw_nallocated > scan_nblocks -
     627         2215 :             (pbscanwork->phsw_chunk_size * PARALLEL_SEQSCAN_RAMPDOWN_CHUNKS))
     628            4 :             pbscanwork->phsw_chunk_size >>= 1;
     629              : 
     630        95205 :         nallocated = pbscanwork->phsw_nallocated =
     631        95205 :             pg_atomic_fetch_add_u64(&pbscan->phs_nallocated,
     632        95205 :                                     pbscanwork->phsw_chunk_size);
     633              : 
     634              :         /*
     635              :          * Set the remaining number of blocks in this chunk so that subsequent
     636              :          * calls from this worker continue on with this chunk until it's done.
     637              :          */
     638        95205 :         pbscanwork->phsw_chunk_remaining = pbscanwork->phsw_chunk_size - 1;
     639              :     }
     640              : 
     641              :     /* Check if we've run out of blocks to scan */
     642       101718 :     if (nallocated >= scan_nblocks)
     643         1657 :         page = InvalidBlockNumber;  /* all blocks have been allocated */
     644              :     else
     645       100061 :         page = (nallocated + pbscan->phs_startblock) % pbscan->phs_nblocks;
     646              : 
     647              :     /*
     648              :      * Report scan location.  Normally, we report the current page number.
     649              :      * When we reach the end of the scan, though, we report the starting page,
     650              :      * not the ending page, just so the starting positions for later scans
     651              :      * doesn't slew backwards.  We only report the position at the end of the
      652              :      * don't slew backwards.  We only report the position at the end of the
     653              :      */
     654       101718 :     if (pbscan->base.phs_syncscan)
     655              :     {
     656         8852 :         if (page != InvalidBlockNumber)
     657         8850 :             ss_report_location(rel, page);
     658            2 :         else if (nallocated == pbscan->phs_nblocks)
     659            1 :             ss_report_location(rel, pbscan->phs_startblock);
     660              :     }
     661              : 
     662       101718 :     return page;
     663              : }
     664              : 
     665              : /* ----------------------------------------------------------------------------
     666              :  * Helper functions to implement relation sizing for block oriented AMs.
     667              :  * ----------------------------------------------------------------------------
     668              :  */
     669              : 
     670              : /*
     671              :  * table_block_relation_size
     672              :  *
     673              :  * If a table AM uses the various relation forks as the sole place where data
     674              :  * is stored, and if it uses them in the expected manner (e.g. the actual data
     675              :  * is in the main fork rather than some other), it can use this implementation
     676              :  * of the relation_size callback rather than implementing its own.
     677              :  */
     678              : uint64
     679      1715606 : table_block_relation_size(Relation rel, ForkNumber forkNumber)
     680              : {
     681      1715606 :     uint64      nblocks = 0;
     682              : 
     683              :     /* InvalidForkNumber indicates returning the size for all forks */
     684      1715606 :     if (forkNumber == InvalidForkNumber)
     685              :     {
     686            0 :         for (int i = 0; i < MAX_FORKNUM; i++)
     687            0 :             nblocks += smgrnblocks(RelationGetSmgr(rel), i);
     688              :     }
     689              :     else
     690      1715606 :         nblocks = smgrnblocks(RelationGetSmgr(rel), forkNumber);
     691              : 
     692      1715587 :     return nblocks * BLCKSZ;
     693              : }
     694              : 
     695              : /*
     696              :  * table_block_relation_estimate_size
     697              :  *
     698              :  * This function can't be directly used as the implementation of the
     699              :  * relation_estimate_size callback, because it has a few additional parameters.
     700              :  * Instead, it is intended to be used as a helper function; the caller can
     701              :  * pass through the arguments to its relation_estimate_size function plus the
     702              :  * additional values required here.
     703              :  *
     704              :  * overhead_bytes_per_tuple should contain the approximate number of bytes
     705              :  * of storage required to store a tuple above and beyond what is required for
     706              :  * the tuple data proper. Typically, this would include things like the
     707              :  * size of the tuple header and item pointer. This is only used for query
     708              :  * planning, so a table AM where the value is not constant could choose to
     709              :  * pass a "best guess".
     710              :  *
     711              :  * usable_bytes_per_page should contain the approximate number of bytes per
     712              :  * page usable for tuple data, excluding the page header and any anticipated
     713              :  * special space.
     714              :  */
     715              : void
     716       251361 : table_block_relation_estimate_size(Relation rel, int32 *attr_widths,
     717              :                                    BlockNumber *pages, double *tuples,
     718              :                                    double *allvisfrac,
     719              :                                    Size overhead_bytes_per_tuple,
     720              :                                    Size usable_bytes_per_page)
     721              : {
     722              :     BlockNumber curpages;
     723              :     BlockNumber relpages;
     724              :     double      reltuples;
     725              :     BlockNumber relallvisible;
     726              :     double      density;
     727              : 
     728              :     /* it should have storage, so we can call the smgr */
     729       251361 :     curpages = RelationGetNumberOfBlocks(rel);
     730              : 
     731              :     /* coerce values in pg_class to more desirable types */
     732       251361 :     relpages = (BlockNumber) rel->rd_rel->relpages;
     733       251361 :     reltuples = (double) rel->rd_rel->reltuples;
     734       251361 :     relallvisible = (BlockNumber) rel->rd_rel->relallvisible;
     735              : 
     736              :     /*
     737              :      * HACK: if the relation has never yet been vacuumed, use a minimum size
     738              :      * estimate of 10 pages.  The idea here is to avoid assuming a
     739              :      * newly-created table is really small, even if it currently is, because
     740              :      * that may not be true once some data gets loaded into it.  Once a vacuum
     741              :      * or analyze cycle has been done on it, it's more reasonable to believe
     742              :      * the size is somewhat stable.
     743              :      *
     744              :      * (Note that this is only an issue if the plan gets cached and used again
     745              :      * after the table has been filled.  What we're trying to avoid is using a
     746              :      * nestloop-type plan on a table that has grown substantially since the
     747              :      * plan was made.  Normally, autovacuum/autoanalyze will occur once enough
     748              :      * inserts have happened and cause cached-plan invalidation; but that
     749              :      * doesn't happen instantaneously, and it won't happen at all for cases
     750              :      * such as temporary tables.)
     751              :      *
     752              :      * We test "never vacuumed" by seeing whether reltuples < 0.
     753              :      *
     754              :      * If the table has inheritance children, we don't apply this heuristic.
     755              :      * Totally empty parent tables are quite common, so we should be willing
     756              :      * to believe that they are empty.
     757              :      */
     758       251361 :     if (curpages < 10 &&
     759        59252 :         reltuples < 0 &&
     760        59252 :         !rel->rd_rel->relhassubclass)
     761        57930 :         curpages = 10;
     762              : 
     763              :     /* report estimated # pages */
     764       251361 :     *pages = curpages;
     765              :     /* quick exit if rel is clearly empty */
     766       251361 :     if (curpages == 0)
     767              :     {
     768        16071 :         *tuples = 0;
     769        16071 :         *allvisfrac = 0;
     770        16071 :         return;
     771              :     }
     772              : 
     773              :     /* estimate number of tuples from previous tuple density */
     774       235290 :     if (reltuples >= 0 && relpages > 0)
     775       150893 :         density = reltuples / (double) relpages;
     776              :     else
     777              :     {
     778              :         /*
     779              :          * When we have no data because the relation was never yet vacuumed,
     780              :          * estimate tuple width from attribute datatypes.  We assume here that
     781              :          * the pages are completely full, which is OK for tables but is
     782              :          * probably an overestimate for indexes.  Fortunately
     783              :          * get_relation_info() can clamp the overestimate to the parent
     784              :          * table's size.
     785              :          *
     786              :          * Note: this code intentionally disregards alignment considerations,
     787              :          * because (a) that would be gilding the lily considering how crude
     788              :          * the estimate is, (b) it creates platform dependencies in the
     789              :          * default plans which are kind of a headache for regression testing,
     790              :          * and (c) different table AMs might use different padding schemes.
     791              :          */
     792              :         int32       tuple_width;
     793              :         int         fillfactor;
     794              : 
     795              :         /*
     796              :          * Without reltuples/relpages, we also need to consider fillfactor.
     797              :          * The other branch considers it implicitly by calculating density
     798              :          * from actual relpages/reltuples statistics.
     799              :          */
     800        84397 :         fillfactor = RelationGetFillFactor(rel, HEAP_DEFAULT_FILLFACTOR);
     801              : 
     802        84397 :         tuple_width = get_rel_data_width(rel, attr_widths);
     803        84397 :         tuple_width += overhead_bytes_per_tuple;
     804              :         /* note: integer division is intentional here */
     805        84397 :         density = (usable_bytes_per_page * fillfactor / 100) / tuple_width;
     806              :         /* There's at least one row on the page, even with low fillfactor. */
     807        84397 :         density = clamp_row_est(density);
     808              :     }
     809       235290 :     *tuples = rint(density * (double) curpages);
     810              : 
     811              :     /*
     812              :      * We use relallvisible as-is, rather than scaling it up like we do for
     813              :      * the pages and tuples counts, on the theory that any pages added since
     814              :      * the last VACUUM are most likely not marked all-visible.  But costsize.c
     815              :      * wants it converted to a fraction.
     816              :      */
     817       235290 :     if (relallvisible == 0 || curpages <= 0)
     818       112892 :         *allvisfrac = 0;
     819       122398 :     else if ((double) relallvisible >= curpages)
     820        70192 :         *allvisfrac = 1;
     821              :     else
     822        52206 :         *allvisfrac = (double) relallvisible / curpages;
     823              : }
        

Generated by: LCOV version 2.0-1