LCOV - code coverage report
Current view: top level - src/backend/access/heap - syncscan.c (source / functions) Hit Total Coverage
Test: PostgreSQL 13devel Lines: 52 54 96.3 %
Date: 2019-11-21 12:06:29 Functions: 5 5 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * syncscan.c
       4             :  *    heap scan synchronization support
       5             :  *
       6             :  * When multiple backends run a sequential scan on the same table, we try
       7             :  * to keep them synchronized to reduce the overall I/O needed.  The goal is
       8             :  * to read each page into shared buffer cache only once, and let all backends
       9             :  * that take part in the shared scan process the page before it falls out of
      10             :  * the cache.
      11             :  *
      12             :  * Since the "leader" in a pack of backends doing a seqscan will have to wait
      13             :  * for I/O, while the "followers" don't, there is a strong self-synchronizing
      14             :  * effect once we can get the backends examining approximately the same part
      15             :  * of the table at the same time.  Hence all that is really needed is to get
      16             :  * a new backend beginning a seqscan to begin it close to where other backends
      17             :  * are reading.  We can scan the table circularly, from block X up to the
      18             :  * end and then from block 0 to X-1, to ensure we visit all rows while still
      19             :  * participating in the common scan.
      20             :  *
      21             :  * To accomplish that, we keep track of the scan position of each table, and
      22             :  * start new scans close to where the previous scan(s) are.  We don't try to
      23             :  * do any extra synchronization to keep the scans together afterwards; some
      24             :  * scans might progress much more slowly than others, for example if the
      25             :  * results need to be transferred to the client over a slow network, and we
      26             :  * don't want such queries to slow down others.
      27             :  *
      28             :  * There can realistically only be a few large sequential scans on different
      29             :  * tables in progress at any time.  Therefore we just keep the scan positions
      30             :  * in a small LRU list which we scan every time we need to look up or update a
      31             :  * scan position.  The whole mechanism is only applied for tables exceeding
      32             :  * a threshold size (but that is not the concern of this module).
      33             :  *
      34             :  * INTERFACE ROUTINES
      35             :  *      ss_get_location     - return current scan location of a relation
      36             :  *      ss_report_location  - update current scan location
      37             :  *
      38             :  *
      39             :  * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
      40             :  * Portions Copyright (c) 1994, Regents of the University of California
      41             :  *
      42             :  * IDENTIFICATION
      43             :  *    src/backend/access/heap/syncscan.c
      44             :  *
      45             :  *-------------------------------------------------------------------------
      46             :  */
      47             : #include "postgres.h"
      48             : 
      49             : #include "access/heapam.h"
      50             : #include "miscadmin.h"
      51             : #include "storage/lwlock.h"
      52             : #include "storage/shmem.h"
      53             : #include "utils/rel.h"
      54             : 
      55             : 
      56             : /* GUC variables */
      57             : #ifdef TRACE_SYNCSCAN
      58             : bool        trace_syncscan = false;
      59             : #endif
      60             : 
      61             : 
      62             : /*
      63             :  * Size of the LRU list.
      64             :  *
      65             :  * Note: the code assumes that SYNC_SCAN_NELEM > 1.
      66             :  *
      67             :  * XXX: What's a good value? It should be large enough to hold the
      68             :  * maximum number of large tables scanned simultaneously.  But a larger value
      69             :  * means more traversing of the LRU list when starting a new scan.
      70             :  */
      71             : #define SYNC_SCAN_NELEM 20
      72             : 
      73             : /*
      74             :  * Interval between reports of the location of the current scan, in pages.
      75             :  *
      76             :  * Note: This should be smaller than the ring size (see buffer/freelist.c)
      77             :  * we use for bulk reads.  Otherwise a scan joining other scans might start
      78             :  * from a page that's no longer in the buffer cache.  This is a bit fuzzy;
      79             :  * there's no guarantee that the new scan will read the page before it leaves
      80             :  * the buffer cache anyway, and on the other hand the page is most likely
      81             :  * still in the OS cache.
      82             :  */
      83             : #define SYNC_SCAN_REPORT_INTERVAL (128 * 1024 / BLCKSZ)
      84             : 
      85             : 
      86             : /*
      87             :  * The scan locations structure is essentially a doubly-linked LRU with head
      88             :  * and tail pointer, but designed to hold a fixed maximum number of elements in
      89             :  * fixed-size shared memory.
      90             :  */
      91             : typedef struct ss_scan_location_t
      92             : {
      93             :     RelFileNode relfilenode;    /* identity of a relation */
      94             :     BlockNumber location;       /* last-reported location in the relation */
      95             : } ss_scan_location_t;
      96             : 
      97             : typedef struct ss_lru_item_t
      98             : {
      99             :     struct ss_lru_item_t *prev;
     100             :     struct ss_lru_item_t *next;
     101             :     ss_scan_location_t location;
     102             : } ss_lru_item_t;
     103             : 
     104             : typedef struct ss_scan_locations_t
     105             : {
     106             :     ss_lru_item_t *head;
     107             :     ss_lru_item_t *tail;
     108             :     ss_lru_item_t items[FLEXIBLE_ARRAY_MEMBER]; /* SYNC_SCAN_NELEM items */
     109             : } ss_scan_locations_t;
     110             : 
     111             : #define SizeOfScanLocations(N) \
     112             :     (offsetof(ss_scan_locations_t, items) + (N) * sizeof(ss_lru_item_t))
     113             : 
     114             : /* Pointer to struct in shared memory */
     115             : static ss_scan_locations_t *scan_locations;
     116             : 
     117             : /* prototypes for internal functions */
     118             : static BlockNumber ss_search(RelFileNode relfilenode,
     119             :                              BlockNumber location, bool set);
     120             : 
     121             : 
     122             : /*
     123             :  * SyncScanShmemSize --- report amount of shared memory space needed
     124             :  */
     125             : Size
     126        1894 : SyncScanShmemSize(void)
     127             : {
     128        1894 :     return SizeOfScanLocations(SYNC_SCAN_NELEM);
     129             : }
     130             : 
     131             : /*
     132             :  * SyncScanShmemInit --- initialize this module's shared memory
     133             :  */
     134             : void
     135        1890 : SyncScanShmemInit(void)
     136             : {
     137             :     int         i;
     138             :     bool        found;
     139             : 
     140        1890 :     scan_locations = (ss_scan_locations_t *)
     141        1890 :         ShmemInitStruct("Sync Scan Locations List",
     142             :                         SizeOfScanLocations(SYNC_SCAN_NELEM),
     143             :                         &found);
     144             : 
     145        1890 :     if (!IsUnderPostmaster)
     146             :     {
     147             :         /* Initialize shared memory area */
     148             :         Assert(!found);
     149             : 
     150        1890 :         scan_locations->head = &scan_locations->items[0];
     151        1890 :         scan_locations->tail = &scan_locations->items[SYNC_SCAN_NELEM - 1];
     152             : 
     153       39690 :         for (i = 0; i < SYNC_SCAN_NELEM; i++)
     154             :         {
     155       37800 :             ss_lru_item_t *item = &scan_locations->items[i];
     156             : 
     157             :             /*
     158             :              * Initialize all slots with invalid values. As scans are started,
     159             :              * these invalid entries will fall off the LRU list and get
     160             :              * replaced with real entries.
     161             :              */
     162       37800 :             item->location.relfilenode.spcNode = InvalidOid;
     163       37800 :             item->location.relfilenode.dbNode = InvalidOid;
     164       37800 :             item->location.relfilenode.relNode = InvalidOid;
     165       37800 :             item->location.location = InvalidBlockNumber;
     166             : 
     167       37800 :             item->prev = (i > 0) ?
     168       37800 :                 (&scan_locations->items[i - 1]) : NULL;
     169       37800 :             item->next = (i < SYNC_SCAN_NELEM - 1) ?
     170       37800 :                 (&scan_locations->items[i + 1]) : NULL;
     171             :         }
     172             :     }
     173             :     else
     174             :         Assert(found);
     175        1890 : }
     176             : 
     177             : /*
     178             :  * ss_search --- search the scan_locations structure for an entry with the
     179             :  *      given relfilenode.
     180             :  *
     181             :  * If "set" is true, the location is updated to the given location.  If no
     182             :  * entry for the given relfilenode is found, it will be created at the head
     183             :  * of the list with the given location, even if "set" is false.
     184             :  *
     185             :  * In any case, the location after possible update is returned.
     186             :  *
     187             :  * Caller is responsible for having acquired suitable lock on the shared
     188             :  * data structure.
     189             :  */
     190             : static BlockNumber
     191        1350 : ss_search(RelFileNode relfilenode, BlockNumber location, bool set)
     192             : {
     193             :     ss_lru_item_t *item;
     194             : 
     195        1350 :     item = scan_locations->head;
     196             :     for (;;)
     197         266 :     {
     198             :         bool        match;
     199             : 
     200        1616 :         match = RelFileNodeEquals(item->location.relfilenode, relfilenode);
     201             : 
     202        1616 :         if (match || item->next == NULL)
     203             :         {
     204             :             /*
     205             :              * If we reached the end of list and no match was found, take over
     206             :              * the last entry
     207             :              */
     208        1350 :             if (!match)
     209             :             {
     210          14 :                 item->location.relfilenode = relfilenode;
     211          14 :                 item->location.location = location;
     212             :             }
     213        1336 :             else if (set)
     214        1332 :                 item->location.location = location;
     215             : 
     216             :             /* Move the entry to the front of the LRU list */
     217        1350 :             if (item != scan_locations->head)
     218             :             {
     219             :                 /* unlink */
     220          14 :                 if (item == scan_locations->tail)
     221          14 :                     scan_locations->tail = item->prev;
     222          14 :                 item->prev->next = item->next;
     223          14 :                 if (item->next)
     224           0 :                     item->next->prev = item->prev;
     225             : 
     226             :                 /* link */
     227          14 :                 item->prev = NULL;
     228          14 :                 item->next = scan_locations->head;
     229          14 :                 scan_locations->head->prev = item;
     230          14 :                 scan_locations->head = item;
     231             :             }
     232             : 
     233        2700 :             return item->location.location;
     234             :         }
     235             : 
     236         266 :         item = item->next;
     237             :     }
     238             : 
     239             :     /* not reached */
     240             : }
     241             : 
     242             : /*
     243             :  * ss_get_location --- get the optimal starting location for scan
     244             :  *
     245             :  * Returns the last-reported location of a sequential scan on the
     246             :  * relation, or 0 if no valid location is found.
     247             :  *
     248             :  * We expect the caller has just done RelationGetNumberOfBlocks(), and
     249             :  * so that number is passed in rather than computing it again.  The result
     250             :  * is guaranteed less than relnblocks (assuming that's > 0).
     251             :  */
     252             : BlockNumber
     253          18 : ss_get_location(Relation rel, BlockNumber relnblocks)
     254             : {
     255             :     BlockNumber startloc;
     256             : 
     257          18 :     LWLockAcquire(SyncScanLock, LW_EXCLUSIVE);
     258          18 :     startloc = ss_search(rel->rd_node, 0, false);
     259          18 :     LWLockRelease(SyncScanLock);
     260             : 
     261             :     /*
     262             :      * If the location is not a valid block number for this scan, start at 0.
     263             :      *
     264             :      * This can happen if for instance a VACUUM truncated the table since the
     265             :      * location was saved.
     266             :      */
     267          18 :     if (startloc >= relnblocks)
     268           0 :         startloc = 0;
     269             : 
     270             : #ifdef TRACE_SYNCSCAN
     271             :     if (trace_syncscan)
     272             :         elog(LOG,
     273             :              "SYNC_SCAN: start \"%s\" (size %u) at %u",
     274             :              RelationGetRelationName(rel), relnblocks, startloc);
     275             : #endif
     276             : 
     277          18 :     return startloc;
     278             : }
     279             : 
     280             : /*
     281             :  * ss_report_location --- update the current scan location
     282             :  *
     283             :  * Writes an entry into the shared Sync Scan state of the form
     284             :  * (relfilenode, blocknumber), overwriting any existing entry for the
     285             :  * same relfilenode.
     286             :  */
     287             : void
     288       21076 : ss_report_location(Relation rel, BlockNumber location)
     289             : {
     290             : #ifdef TRACE_SYNCSCAN
     291             :     if (trace_syncscan)
     292             :     {
     293             :         if ((location % 1024) == 0)
     294             :             elog(LOG,
     295             :                  "SYNC_SCAN: scanning \"%s\" at %u",
     296             :                  RelationGetRelationName(rel), location);
     297             :     }
     298             : #endif
     299             : 
     300             :     /*
     301             :      * To reduce lock contention, only report scan progress every N pages. For
     302             :      * the same reason, don't block if the lock isn't immediately available.
     303             :      * Missing a few updates isn't critical, it just means that a new scan
     304             :      * that wants to join the pack will start a little bit behind the head of
     305             :      * the scan.  Hopefully the pages are still in OS cache and the scan
     306             :      * catches up quickly.
     307             :      */
     308       21076 :     if ((location % SYNC_SCAN_REPORT_INTERVAL) == 0)
     309             :     {
     310        1332 :         if (LWLockConditionalAcquire(SyncScanLock, LW_EXCLUSIVE))
     311             :         {
     312        1332 :             (void) ss_search(rel->rd_node, location, true);
     313        1332 :             LWLockRelease(SyncScanLock);
     314             :         }
     315             : #ifdef TRACE_SYNCSCAN
     316             :         else if (trace_syncscan)
     317             :             elog(LOG,
     318             :                  "SYNC_SCAN: missed update for \"%s\" at %u",
     319             :                  RelationGetRelationName(rel), location);
     320             : #endif
     321             :     }
     322       21076 : }

Generated by: LCOV version 1.13