LCOV - code coverage report
Current view: top level - src/backend/access/common - syncscan.c (source / functions) Coverage Total Hit
Test: PostgreSQL 19devel Lines: 96.2 % 52 50
Test Date: 2026-04-07 14:16:30 Functions: 100.0 % 5 5
Legend: Lines:     hit not hit

            Line data    Source code
       1              : /*-------------------------------------------------------------------------
       2              :  *
       3              :  * syncscan.c
       4              :  *    scan synchronization support
       5              :  *
       6              :  * When multiple backends run a sequential scan on the same table, we try
       7              :  * to keep them synchronized to reduce the overall I/O needed.  The goal is
       8              :  * to read each page into shared buffer cache only once, and let all backends
       9              :  * that take part in the shared scan process the page before it falls out of
      10              :  * the cache.
      11              :  *
      12              :  * Since the "leader" in a pack of backends doing a seqscan will have to wait
      13              :  * for I/O, while the "followers" don't, there is a strong self-synchronizing
      14              :  * effect once we can get the backends examining approximately the same part
      15              :  * of the table at the same time.  Hence all that is really needed is to get
      16              :  * a new backend beginning a seqscan to begin it close to where other backends
      17              :  * are reading.  We can scan the table circularly, from block X up to the
      18              :  * end and then from block 0 to X-1, to ensure we visit all rows while still
      19              :  * participating in the common scan.
      20              :  *
      21              :  * To accomplish that, we keep track of the scan position of each table, and
      22              :  * start new scans close to where the previous scan(s) are.  We don't try to
      23              :  * do any extra synchronization to keep the scans together afterwards; some
      24              :  * scans might progress much more slowly than others, for example if the
      25              :  * results need to be transferred to the client over a slow network, and we
      26              :  * don't want such queries to slow down others.
      27              :  *
      28              :  * There can realistically only be a few large sequential scans on different
      29              :  * tables in progress at any time.  Therefore we just keep the scan positions
      30              :  * in a small LRU list which we scan every time we need to look up or update a
      31              :  * scan position.  The whole mechanism is only applied for tables exceeding
      32              :  * a threshold size (but that is not the concern of this module).
      33              :  *
      34              :  * INTERFACE ROUTINES
      35              :  *      ss_get_location     - return current scan location of a relation
      36              :  *      ss_report_location  - update current scan location
      37              :  *
      38              :  *
      39              :  * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
      40              :  * Portions Copyright (c) 1994, Regents of the University of California
      41              :  *
      42              :  * IDENTIFICATION
      43              :  *    src/backend/access/common/syncscan.c
      44              :  *
      45              :  *-------------------------------------------------------------------------
      46              :  */
      47              : #include "postgres.h"
      48              : 
      49              : #include "access/syncscan.h"
      50              : #include "miscadmin.h"
      51              : #include "storage/lwlock.h"
      52              : #include "storage/shmem.h"
      53              : #include "storage/subsystems.h"
      54              : #include "utils/rel.h"
      55              : 
      56              : 
      57              : /* GUC variables */
      58              : #ifdef TRACE_SYNCSCAN
      59              : bool        trace_syncscan = false;
      60              : #endif
      61              : 
      62              : 
      63              : /*
      64              :  * Size of the LRU list.
      65              :  *
      66              :  * Note: the code assumes that SYNC_SCAN_NELEM > 1.
      67              :  *
      68              :  * XXX: What's a good value? It should be large enough to hold the
      69              :  * maximum number of large tables scanned simultaneously.  But a larger value
      70              :  * means more traversing of the LRU list when starting a new scan.
      71              :  */
      72              : #define SYNC_SCAN_NELEM 20
      73              : 
      74              : /*
      75              :  * Interval between reports of the location of the current scan, in pages.
      76              :  *
      77              :  * Note: This should be smaller than the ring size (see buffer/freelist.c)
      78              :  * we use for bulk reads.  Otherwise a scan joining other scans might start
      79              :  * from a page that's no longer in the buffer cache.  This is a bit fuzzy;
      80              :  * there's no guarantee that the new scan will read the page before it leaves
      81              :  * the buffer cache anyway, and on the other hand the page is most likely
      82              :  * still in the OS cache.
      83              :  */
      84              : #define SYNC_SCAN_REPORT_INTERVAL (128 * 1024 / BLCKSZ)
      85              : 
      86              : 
      87              : /*
      88              :  * The scan locations structure is essentially a doubly-linked LRU with head
      89              :  * and tail pointer, but designed to hold a fixed maximum number of elements in
      90              :  * fixed-size shared memory.
      91              :  */
      92              : typedef struct ss_scan_location_t
      93              : {
      94              :     RelFileLocator relfilelocator;  /* identity of a relation */
      95              :     BlockNumber location;       /* last-reported location in the relation */
      96              : } ss_scan_location_t;
      97              : 
      98              : typedef struct ss_lru_item_t
      99              : {
     100              :     struct ss_lru_item_t *prev;
     101              :     struct ss_lru_item_t *next;
     102              :     ss_scan_location_t location;
     103              : } ss_lru_item_t;
     104              : 
     105              : typedef struct ss_scan_locations_t
     106              : {
     107              :     ss_lru_item_t *head;
     108              :     ss_lru_item_t *tail;
     109              :     ss_lru_item_t items[FLEXIBLE_ARRAY_MEMBER]; /* SYNC_SCAN_NELEM items */
     110              : } ss_scan_locations_t;
     111              : 
     112              : #define SizeOfScanLocations(N) \
     113              :     (offsetof(ss_scan_locations_t, items) + (N) * sizeof(ss_lru_item_t))
     114              : 
     115              : static void SyncScanShmemRequest(void *arg);
     116              : static void SyncScanShmemInit(void *arg);
     117              : 
     118              : const ShmemCallbacks SyncScanShmemCallbacks = {
     119              :     .request_fn = SyncScanShmemRequest,
     120              :     .init_fn = SyncScanShmemInit,
     121              : };
     122              : 
     123              : /* Pointer to struct in shared memory */
     124              : static ss_scan_locations_t *scan_locations;
     125              : 
     126              : /* prototypes for internal functions */
     127              : static BlockNumber ss_search(RelFileLocator relfilelocator,
     128              :                              BlockNumber location, bool set);
     129              : 
     130              : 
     131              : /*
     132              :  * SyncScanShmemRequest --- register this module's shared memory
     133              :  */
     134              : static void
     135         1234 : SyncScanShmemRequest(void *arg)
     136              : {
     137         1234 :     ShmemRequestStruct(.name = "Sync Scan Locations List",
     138              :                        .size = SizeOfScanLocations(SYNC_SCAN_NELEM),
     139              :                        .ptr = (void **) &scan_locations,
     140              :         );
     141         1234 : }
     142              : 
     143              : /*
     144              :  * SyncScanShmemInit --- initialize this module's shared memory
     145              :  */
     146              : static void
     147         1231 : SyncScanShmemInit(void *arg)
     148              : {
     149              :     int         i;
     150              : 
     151         1231 :     scan_locations->head = &scan_locations->items[0];
     152         1231 :     scan_locations->tail = &scan_locations->items[SYNC_SCAN_NELEM - 1];
     153              : 
     154        25851 :     for (i = 0; i < SYNC_SCAN_NELEM; i++)
     155              :     {
     156        24620 :         ss_lru_item_t *item = &scan_locations->items[i];
     157              : 
     158              :         /*
     159              :          * Initialize all slots with invalid values. As scans are started,
     160              :          * these invalid entries will fall off the LRU list and get replaced
     161              :          * with real entries.
     162              :          */
     163        24620 :         item->location.relfilelocator.spcOid = InvalidOid;
     164        24620 :         item->location.relfilelocator.dbOid = InvalidOid;
     165        24620 :         item->location.relfilelocator.relNumber = InvalidRelFileNumber;
     166        24620 :         item->location.location = InvalidBlockNumber;
     167              : 
     168        24620 :         item->prev = (i > 0) ?
     169        24620 :             (&scan_locations->items[i - 1]) : NULL;
     170        24620 :         item->next = (i < SYNC_SCAN_NELEM - 1) ?
     171        24620 :             (&scan_locations->items[i + 1]) : NULL;
     172              :     }
     173         1231 : }
     174              : 
     175              : /*
     176              :  * ss_search --- search the scan_locations structure for an entry with the
     177              :  *      given relfilelocator.
     178              :  *
     179              :  * If "set" is true, the location is updated to the given location.  If no
     180              :  * entry for the given relfilelocator is found, it will be created at the head
     181              :  * of the list with the given location, even if "set" is false.
     182              :  *
     183              :  * In any case, the location after possible update is returned.
     184              :  *
     185              :  * Caller is responsible for having acquired suitable lock on the shared
     186              :  * data structure.
     187              :  */
     188              : static BlockNumber
     189         3905 : ss_search(RelFileLocator relfilelocator, BlockNumber location, bool set)
     190              : {
     191              :     ss_lru_item_t *item;
     192              : 
     193         3905 :     item = scan_locations->head;
     194              :     for (;;)
     195          665 :     {
     196              :         bool        match;
     197              : 
     198         4570 :         match = RelFileLocatorEquals(item->location.relfilelocator,
     199              :                                      relfilelocator);
     200              : 
     201         4570 :         if (match || item->next == NULL)
     202              :         {
     203              :             /*
     204              :              * If we reached the end of list and no match was found, take over
     205              :              * the last entry
     206              :              */
     207         3905 :             if (!match)
     208              :             {
     209           35 :                 item->location.relfilelocator = relfilelocator;
     210           35 :                 item->location.location = location;
     211              :             }
     212         3870 :             else if (set)
     213         3825 :                 item->location.location = location;
     214              : 
     215              :             /* Move the entry to the front of the LRU list */
     216         3905 :             if (item != scan_locations->head)
     217              :             {
     218              :                 /* unlink */
     219           35 :                 if (item == scan_locations->tail)
     220           35 :                     scan_locations->tail = item->prev;
     221           35 :                 item->prev->next = item->next;
     222           35 :                 if (item->next)
     223            0 :                     item->next->prev = item->prev;
     224              : 
     225              :                 /* link */
     226           35 :                 item->prev = NULL;
     227           35 :                 item->next = scan_locations->head;
     228           35 :                 scan_locations->head->prev = item;
     229           35 :                 scan_locations->head = item;
     230              :             }
     231              : 
     232         3905 :             return item->location.location;
     233              :         }
     234              : 
     235          665 :         item = item->next;
     236              :     }
     237              : 
     238              :     /* not reached */
     239              : }
     240              : 
     241              : /*
     242              :  * ss_get_location --- get the optimal starting location for scan
     243              :  *
     244              :  * Returns the last-reported location of a sequential scan on the
     245              :  * relation, or 0 if no valid location is found.
     246              :  *
     247              :  * We expect the caller has just done RelationGetNumberOfBlocks(), and
     248              :  * so that number is passed in rather than computing it again.  The result
     249              :  * is guaranteed less than relnblocks (assuming that's > 0).
     250              :  */
     251              : BlockNumber
     252           80 : ss_get_location(Relation rel, BlockNumber relnblocks)
     253              : {
     254              :     BlockNumber startloc;
     255              : 
     256           80 :     LWLockAcquire(SyncScanLock, LW_EXCLUSIVE);
     257           80 :     startloc = ss_search(rel->rd_locator, 0, false);
     258           80 :     LWLockRelease(SyncScanLock);
     259              : 
     260              :     /*
     261              :      * If the location is not a valid block number for this scan, start at 0.
     262              :      *
     263              :      * This can happen if for instance a VACUUM truncated the table since the
     264              :      * location was saved.
     265              :      */
     266           80 :     if (startloc >= relnblocks)
     267            0 :         startloc = 0;
     268              : 
     269              : #ifdef TRACE_SYNCSCAN
     270              :     if (trace_syncscan)
     271              :         elog(LOG,
     272              :              "SYNC_SCAN: start \"%s\" (size %u) at %u",
     273              :              RelationGetRelationName(rel), relnblocks, startloc);
     274              : #endif
     275              : 
     276           80 :     return startloc;
     277              : }
     278              : 
     279              : /*
     280              :  * ss_report_location --- update the current scan location
     281              :  *
     282              :  * Writes an entry into the shared Sync Scan state of the form
     283              :  * (relfilelocator, blocknumber), overwriting any existing entry for the
     284              :  * same relfilelocator.
     285              :  */
     286              : void
     287        60491 : ss_report_location(Relation rel, BlockNumber location)
     288              : {
     289              : #ifdef TRACE_SYNCSCAN
     290              :     if (trace_syncscan)
     291              :     {
     292              :         if ((location % 1024) == 0)
     293              :             elog(LOG,
     294              :                  "SYNC_SCAN: scanning \"%s\" at %u",
     295              :                  RelationGetRelationName(rel), location);
     296              :     }
     297              : #endif
     298              : 
     299              :     /*
     300              :      * To reduce lock contention, only report scan progress every N pages. For
     301              :      * the same reason, don't block if the lock isn't immediately available.
     302              :      * Missing a few updates isn't critical, it just means that a new scan
     303              :      * that wants to join the pack will start a little bit behind the head of
     304              :      * the scan.  Hopefully the pages are still in OS cache and the scan
     305              :      * catches up quickly.
     306              :      */
     307        60491 :     if ((location % SYNC_SCAN_REPORT_INTERVAL) == 0)
     308              :     {
     309         3825 :         if (LWLockConditionalAcquire(SyncScanLock, LW_EXCLUSIVE))
     310              :         {
     311         3825 :             (void) ss_search(rel->rd_locator, location, true);
     312         3825 :             LWLockRelease(SyncScanLock);
     313              :         }
     314              : #ifdef TRACE_SYNCSCAN
     315              :         else if (trace_syncscan)
     316              :             elog(LOG,
     317              :                  "SYNC_SCAN: missed update for \"%s\" at %u",
     318              :                  RelationGetRelationName(rel), location);
     319              : #endif
     320              :     }
     321        60491 : }
        

Generated by: LCOV version 2.0-1