LCOV - code coverage report
Current view: top level - src/backend/access/common - syncscan.c (source / functions) Hit Total Coverage
Test: PostgreSQL 18devel Lines: 52 54 96.3 %
Date: 2025-01-18 04:15:08 Functions: 5 5 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * syncscan.c
       4             :  *    scan synchronization support
       5             :  *
       6             :  * When multiple backends run a sequential scan on the same table, we try
       7             :  * to keep them synchronized to reduce the overall I/O needed.  The goal is
       8             :  * to read each page into shared buffer cache only once, and let all backends
       9             :  * that take part in the shared scan process the page before it falls out of
      10             :  * the cache.
      11             :  *
      12             :  * Since the "leader" in a pack of backends doing a seqscan will have to wait
      13             :  * for I/O, while the "followers" don't, there is a strong self-synchronizing
      14             :  * effect once we can get the backends examining approximately the same part
      15             :  * of the table at the same time.  Hence all that is really needed is to get
      16             :  * a new backend beginning a seqscan to begin it close to where other backends
      17             :  * are reading.  We can scan the table circularly, from block X up to the
      18             :  * end and then from block 0 to X-1, to ensure we visit all rows while still
      19             :  * participating in the common scan.
      20             :  *
      21             :  * To accomplish that, we keep track of the scan position of each table, and
      22             :  * start new scans close to where the previous scan(s) are.  We don't try to
      23             :  * do any extra synchronization to keep the scans together afterwards; some
      24             :  * scans might progress much more slowly than others, for example if the
      25             :  * results need to be transferred to the client over a slow network, and we
      26             :  * don't want such queries to slow down others.
      27             :  *
      28             :  * There can realistically only be a few large sequential scans on different
      29             :  * tables in progress at any time.  Therefore we just keep the scan positions
      30             :  * in a small LRU list which we scan every time we need to look up or update a
      31             :  * scan position.  The whole mechanism is only applied for tables exceeding
      32             :  * a threshold size (but that is not the concern of this module).
      33             :  *
      34             :  * INTERFACE ROUTINES
      35             :  *      ss_get_location     - return current scan location of a relation
      36             :  *      ss_report_location  - update current scan location
      37             :  *
      38             :  *
      39             :  * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
      40             :  * Portions Copyright (c) 1994, Regents of the University of California
      41             :  *
      42             :  * IDENTIFICATION
      43             :  *    src/backend/access/common/syncscan.c
      44             :  *
      45             :  *-------------------------------------------------------------------------
      46             :  */
      47             : #include "postgres.h"
      48             : 
      49             : #include "access/syncscan.h"
      50             : #include "miscadmin.h"
      51             : #include "storage/lwlock.h"
      52             : #include "storage/shmem.h"
      53             : #include "utils/rel.h"
      54             : 
      55             : 
      56             : /* GUC variables */
      57             : #ifdef TRACE_SYNCSCAN
      58             : bool        trace_syncscan = false;
      59             : #endif
      60             : 
      61             : 
      62             : /*
      63             :  * Size of the LRU list.
      64             :  *
      65             :  * Note: the code assumes that SYNC_SCAN_NELEM > 1.
      66             :  *
      67             :  * XXX: What's a good value? It should be large enough to hold the
      68             :  * maximum number of large tables scanned simultaneously.  But a larger value
      69             :  * means more traversing of the LRU list when starting a new scan.
      70             :  */
      71             : #define SYNC_SCAN_NELEM 20
      72             : 
      73             : /*
      74             :  * Interval between reports of the location of the current scan, in pages.
      75             :  *
      76             :  * Note: This should be smaller than the ring size (see buffer/freelist.c)
      77             :  * we use for bulk reads.  Otherwise a scan joining other scans might start
      78             :  * from a page that's no longer in the buffer cache.  This is a bit fuzzy;
      79             :  * there's no guarantee that the new scan will read the page before it leaves
      80             :  * the buffer cache anyway, and on the other hand the page is most likely
      81             :  * still in the OS cache.
      82             :  */
      83             : #define SYNC_SCAN_REPORT_INTERVAL (128 * 1024 / BLCKSZ)
      84             : 
      85             : 
      86             : /*
      87             :  * The scan locations structure is essentially a doubly-linked LRU with head
      88             :  * and tail pointer, but designed to hold a fixed maximum number of elements in
      89             :  * fixed-size shared memory.
      90             :  */
      91             : typedef struct ss_scan_location_t
      92             : {
      93             :     RelFileLocator relfilelocator;  /* identity of a relation */
      94             :     BlockNumber location;       /* last-reported location in the relation */
      95             : } ss_scan_location_t;
      96             : 
      97             : typedef struct ss_lru_item_t
      98             : {
      99             :     struct ss_lru_item_t *prev;
     100             :     struct ss_lru_item_t *next;
     101             :     ss_scan_location_t location;
     102             : } ss_lru_item_t;
     103             : 
     104             : typedef struct ss_scan_locations_t
     105             : {
     106             :     ss_lru_item_t *head;
     107             :     ss_lru_item_t *tail;
     108             :     ss_lru_item_t items[FLEXIBLE_ARRAY_MEMBER]; /* SYNC_SCAN_NELEM items */
     109             : } ss_scan_locations_t;
     110             : 
     111             : #define SizeOfScanLocations(N) \
     112             :     (offsetof(ss_scan_locations_t, items) + (N) * sizeof(ss_lru_item_t))
     113             : 
     114             : /* Pointer to struct in shared memory */
     115             : static ss_scan_locations_t *scan_locations;
     116             : 
     117             : /* prototypes for internal functions */
     118             : static BlockNumber ss_search(RelFileLocator relfilelocator,
     119             :                              BlockNumber location, bool set);
     120             : 
     121             : 
     122             : /*
     123             :  * SyncScanShmemSize --- report amount of shared memory space needed
     124             :  */
     125             : Size
     126        3566 : SyncScanShmemSize(void)
     127             : {
     128        3566 :     return SizeOfScanLocations(SYNC_SCAN_NELEM);
     129             : }
     130             : 
     131             : /*
     132             :  * SyncScanShmemInit --- initialize this module's shared memory
     133             :  */
     134             : void
     135        1918 : SyncScanShmemInit(void)
     136             : {
     137             :     int         i;
     138             :     bool        found;
     139             : 
     140        1918 :     scan_locations = (ss_scan_locations_t *)
     141        1918 :         ShmemInitStruct("Sync Scan Locations List",
     142             :                         SizeOfScanLocations(SYNC_SCAN_NELEM),
     143             :                         &found);
     144             : 
     145        1918 :     if (!IsUnderPostmaster)
     146             :     {
     147             :         /* Initialize shared memory area */
     148             :         Assert(!found);
     149             : 
     150        1918 :         scan_locations->head = &scan_locations->items[0];
     151        1918 :         scan_locations->tail = &scan_locations->items[SYNC_SCAN_NELEM - 1];
     152             : 
     153       40278 :         for (i = 0; i < SYNC_SCAN_NELEM; i++)
     154             :         {
     155       38360 :             ss_lru_item_t *item = &scan_locations->items[i];
     156             : 
     157             :             /*
     158             :              * Initialize all slots with invalid values. As scans are started,
     159             :              * these invalid entries will fall off the LRU list and get
     160             :              * replaced with real entries.
     161             :              */
     162       38360 :             item->location.relfilelocator.spcOid = InvalidOid;
     163       38360 :             item->location.relfilelocator.dbOid = InvalidOid;
     164       38360 :             item->location.relfilelocator.relNumber = InvalidRelFileNumber;
     165       38360 :             item->location.location = InvalidBlockNumber;
     166             : 
     167       38360 :             item->prev = (i > 0) ?
     168       38360 :                 (&scan_locations->items[i - 1]) : NULL;
     169       38360 :             item->next = (i < SYNC_SCAN_NELEM - 1) ?
     170       38360 :                 (&scan_locations->items[i + 1]) : NULL;
     171             :         }
     172             :     }
     173             :     else
     174             :         Assert(found);
     175        1918 : }
     176             : 
     177             : /*
     178             :  * ss_search --- search the scan_locations structure for an entry with the
     179             :  *      given relfilelocator.
     180             :  *
     181             :  * If "set" is true, the location is updated to the given location.  If no
     182             :  * entry for the given relfilelocator is found, it will be created at the head
     183             :  * of the list with the given location, even if "set" is false.
     184             :  *
     185             :  * In any case, the location after possible update is returned.
     186             :  *
     187             :  * Caller is responsible for having acquired suitable lock on the shared
     188             :  * data structure.
     189             :  */
     190             : static BlockNumber
     191        2404 : ss_search(RelFileLocator relfilelocator, BlockNumber location, bool set)
     192             : {
     193             :     ss_lru_item_t *item;
     194             : 
     195        2404 :     item = scan_locations->head;
     196             :     for (;;)
     197         912 :     {
     198             :         bool        match;
     199             : 
     200        3316 :         match = RelFileLocatorEquals(item->location.relfilelocator,
     201             :                                      relfilelocator);
     202             : 
     203        3316 :         if (match || item->next == NULL)
     204             :         {
     205             :             /*
     206             :              * If we reached the end of list and no match was found, take over
     207             :              * the last entry
     208             :              */
     209        2404 :             if (!match)
     210             :             {
     211          48 :                 item->location.relfilelocator = relfilelocator;
     212          48 :                 item->location.location = location;
     213             :             }
     214        2356 :             else if (set)
     215        2284 :                 item->location.location = location;
     216             : 
     217             :             /* Move the entry to the front of the LRU list */
     218        2404 :             if (item != scan_locations->head)
     219             :             {
     220             :                 /* unlink */
     221          48 :                 if (item == scan_locations->tail)
     222          48 :                     scan_locations->tail = item->prev;
     223          48 :                 item->prev->next = item->next;
     224          48 :                 if (item->next)
     225           0 :                     item->next->prev = item->prev;
     226             : 
     227             :                 /* link */
     228          48 :                 item->prev = NULL;
     229          48 :                 item->next = scan_locations->head;
     230          48 :                 scan_locations->head->prev = item;
     231          48 :                 scan_locations->head = item;
     232             :             }
     233             : 
     234        2404 :             return item->location.location;
     235             :         }
     236             : 
     237         912 :         item = item->next;
     238             :     }
     239             : 
     240             :     /* not reached */
     241             : }
     242             : 
     243             : /*
     244             :  * ss_get_location --- get the optimal starting location for scan
     245             :  *
     246             :  * Returns the last-reported location of a sequential scan on the
     247             :  * relation, or 0 if no valid location is found.
     248             :  *
     249             :  * We expect the caller has just done RelationGetNumberOfBlocks(), and
     250             :  * so that number is passed in rather than computing it again.  The result
     251             :  * is guaranteed less than relnblocks (assuming that's > 0).
     252             :  */
     253             : BlockNumber
     254         120 : ss_get_location(Relation rel, BlockNumber relnblocks)
     255             : {
     256             :     BlockNumber startloc;
     257             : 
     258         120 :     LWLockAcquire(SyncScanLock, LW_EXCLUSIVE);
     259         120 :     startloc = ss_search(rel->rd_locator, 0, false);
     260         120 :     LWLockRelease(SyncScanLock);
     261             : 
     262             :     /*
     263             :      * If the location is not a valid block number for this scan, start at 0.
     264             :      *
     265             :      * This can happen if for instance a VACUUM truncated the table since the
     266             :      * location was saved.
     267             :      */
     268         120 :     if (startloc >= relnblocks)
     269           0 :         startloc = 0;
     270             : 
     271             : #ifdef TRACE_SYNCSCAN
     272             :     if (trace_syncscan)
     273             :         elog(LOG,
     274             :              "SYNC_SCAN: start \"%s\" (size %u) at %u",
     275             :              RelationGetRelationName(rel), relnblocks, startloc);
     276             : #endif
     277             : 
     278         120 :     return startloc;
     279             : }
     280             : 
     281             : /*
     282             :  * ss_report_location --- update the current scan location
     283             :  *
     284             :  * Writes an entry into the shared Sync Scan state of the form
     285             :  * (relfilelocator, blocknumber), overwriting any existing entry for the
     286             :  * same relfilelocator.
     287             :  */
     288             : void
     289       35276 : ss_report_location(Relation rel, BlockNumber location)
     290             : {
     291             : #ifdef TRACE_SYNCSCAN
     292             :     if (trace_syncscan)
     293             :     {
     294             :         if ((location % 1024) == 0)
     295             :             elog(LOG,
     296             :                  "SYNC_SCAN: scanning \"%s\" at %u",
     297             :                  RelationGetRelationName(rel), location);
     298             :     }
     299             : #endif
     300             : 
     301             :     /*
     302             :      * To reduce lock contention, only report scan progress every N pages. For
     303             :      * the same reason, don't block if the lock isn't immediately available.
     304             :      * Missing a few updates isn't critical, it just means that a new scan
     305             :      * that wants to join the pack will start a little bit behind the head of
     306             :      * the scan.  Hopefully the pages are still in OS cache and the scan
     307             :      * catches up quickly.
     308             :      */
     309       35276 :     if ((location % SYNC_SCAN_REPORT_INTERVAL) == 0)
     310             :     {
     311        2284 :         if (LWLockConditionalAcquire(SyncScanLock, LW_EXCLUSIVE))
     312             :         {
     313        2284 :             (void) ss_search(rel->rd_locator, location, true);
     314        2284 :             LWLockRelease(SyncScanLock);
     315             :         }
     316             : #ifdef TRACE_SYNCSCAN
     317             :         else if (trace_syncscan)
     318             :             elog(LOG,
     319             :                  "SYNC_SCAN: missed update for \"%s\" at %u",
     320             :                  RelationGetRelationName(rel), location);
     321             : #endif
     322             :     }
     323       35276 : }

Generated by: LCOV version 1.14