LCOV - code coverage report
Current view:  top level - src/backend/storage/buffer - freelist.c (source / functions)
Test:          PostgreSQL 18devel
Date:          2025-04-24 12:15:10

                 Hit    Total   Coverage
Lines:           178      190     93.7 %
Functions:        17       17    100.0 %

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * freelist.c
       4             :  *    routines for managing the buffer pool's replacement strategy.
       5             :  *
       6             :  *
       7             :  * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
       8             :  * Portions Copyright (c) 1994, Regents of the University of California
       9             :  *
      10             :  *
      11             :  * IDENTIFICATION
      12             :  *    src/backend/storage/buffer/freelist.c
      13             :  *
      14             :  *-------------------------------------------------------------------------
      15             :  */
      16             : #include "postgres.h"
      17             : 
      18             : #include "pgstat.h"
      19             : #include "port/atomics.h"
      20             : #include "storage/buf_internals.h"
      21             : #include "storage/bufmgr.h"
      22             : #include "storage/proc.h"
      23             : 
      24             : #define INT_ACCESS_ONCE(var)    ((int)(*((volatile int *)&(var))))
      25             : 
      26             : 
      27             : /*
      28             :  * The shared freelist control information.
      29             :  */
      30             : typedef struct
      31             : {
      32             :     /* Spinlock: protects the values below */
      33             :     slock_t     buffer_strategy_lock;
      34             : 
      35             :     /*
      36             :      * Clock sweep hand: index of next buffer to consider grabbing. Note that
      37             :      * this isn't a concrete buffer - we only ever increase the value. So, to
      38             :      * get an actual buffer, it needs to be used modulo NBuffers.
      39             :      */
      40             :     pg_atomic_uint32 nextVictimBuffer;
      41             : 
      42             :     int         firstFreeBuffer;    /* Head of list of unused buffers */
      43             :     int         lastFreeBuffer; /* Tail of list of unused buffers */
      44             : 
      45             :     /*
      46             :      * NOTE: lastFreeBuffer is undefined when firstFreeBuffer is -1 (that is,
      47             :      * when the list is empty)
      48             :      */
      49             : 
      50             :     /*
      51             :      * Statistics.  These counters should be wide enough that they can't
      52             :      * overflow during a single bgwriter cycle.
      53             :      */
      54             :     uint32      completePasses; /* Complete cycles of the clock sweep */
      55             :     pg_atomic_uint32 numBufferAllocs;   /* Buffers allocated since last reset */
      56             : 
      57             :     /*
      58             :      * Bgwriter process to be notified upon activity, or -1 if none. See
      59             :      * StrategyNotifyBgWriter.
      60             :      */
      61             :     int         bgwprocno;
      62             : } BufferStrategyControl;
      63             : 
      64             : /* Pointers to shared state */
      65             : static BufferStrategyControl *StrategyControl = NULL;
      66             : 
      67             : /*
      68             :  * Private (non-shared) state for managing a ring of shared buffers to re-use.
      69             :  * This is currently the only kind of BufferAccessStrategy object, but someday
      70             :  * we might have more kinds.
      71             :  */
      72             : typedef struct BufferAccessStrategyData
      73             : {
      74             :     /* Overall strategy type */
      75             :     BufferAccessStrategyType btype;
      76             :     /* Number of elements in buffers[] array */
      77             :     int         nbuffers;
      78             : 
      79             :     /*
      80             :      * Index of the "current" slot in the ring, ie, the one most recently
      81             :      * returned by GetBufferFromRing.
      82             :      */
      83             :     int         current;
      84             : 
      85             :     /*
      86             :      * Array of buffer numbers.  InvalidBuffer (that is, zero) indicates we
      87             :      * have not yet selected a buffer for this ring slot.  For allocation
      88             :      * simplicity this is palloc'd together with the fixed fields of the
      89             :      * struct.
      90             :      */
      91             :     Buffer      buffers[FLEXIBLE_ARRAY_MEMBER];
      92             : }           BufferAccessStrategyData;
      93             : 
      94             : 
      95             : /* Prototypes for internal functions */
      96             : static BufferDesc *GetBufferFromRing(BufferAccessStrategy strategy,
      97             :                                      uint32 *buf_state);
      98             : static void AddBufferToRing(BufferAccessStrategy strategy,
      99             :                             BufferDesc *buf);
     100             : 
     101             : /*
     102             :  * ClockSweepTick - Helper routine for StrategyGetBuffer()
     103             :  *
     104             :  * Move the clock hand one buffer ahead of its current position and return the
     105             :  * id of the buffer now under the hand.
     106             :  */
     107             : static inline uint32
     108     7381634 : ClockSweepTick(void)
     109             : {
     110             :     uint32      victim;
     111             : 
     112             :     /*
     113             :      * Atomically move the hand ahead one buffer - if several processes do
     114             :      * this concurrently, buffers can be returned slightly out of apparent
     115             :      * order.
     116             :      */
     117             :     victim =
     118     7381634 :         pg_atomic_fetch_add_u32(&StrategyControl->nextVictimBuffer, 1);
     119             : 
     120     7381634 :     if (victim >= NBuffers)
     121             :     {
     122       59514 :         uint32      originalVictim = victim;
     123             : 
     124             :         /* always wrap what we look up in BufferDescriptors */
     125       59514 :         victim = victim % NBuffers;
     126             : 
     127             :         /*
     128             :          * If we're the one that just caused a wraparound, force
     129             :          * completePasses to be incremented while holding the spinlock. We
     130             :          * need the spinlock so StrategySyncStart() can return a consistent
     131             :          * value consisting of nextVictimBuffer and completePasses.
     132             :          */
     133       59514 :         if (victim == 0)
     134             :         {
     135             :             uint32      expected;
     136             :             uint32      wrapped;
     137       59368 :             bool        success = false;
     138             : 
     139       59368 :             expected = originalVictim + 1;
     140             : 
     141      118872 :             while (!success)
     142             :             {
     143             :                 /*
     144             :                  * Acquire the spinlock while increasing completePasses. That
     145             :                  * allows other readers to read nextVictimBuffer and
     146             :                  * completePasses in a consistent manner which is required for
     147             :                  * StrategySyncStart().  In theory delaying the increment
     148             :                  * could lead to an overflow of nextVictimBuffer, but that's
     149             :                  * highly unlikely and wouldn't be particularly harmful.
     150             :                  */
     151       59504 :                 SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
     152             : 
     153       59504 :                 wrapped = expected % NBuffers;
     154             : 
     155       59504 :                 success = pg_atomic_compare_exchange_u32(&StrategyControl->nextVictimBuffer,
     156             :                                                          &expected, wrapped);
     157       59504 :                 if (success)
     158       59368 :                     StrategyControl->completePasses++;
     159       59504 :                 SpinLockRelease(&StrategyControl->buffer_strategy_lock);
     160             :             }
     161             :         }
     162             :     }
     163     7381634 :     return victim;
     164             : }
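
A minimal single-threaded model of the same arithmetic (hypothetical code, not part of freelist.c): because the hand only ever increases, the victim slot and the completed-pass count are just the remainder and quotient of division by the pool size.

    #include <stdint.h>
    #include <stdio.h>

    static uint64_t hand;       /* stands in for nextVictimBuffer */

    static uint32_t
    clock_sweep_tick_model(uint32_t nbuffers, uint32_t *complete_passes)
    {
        uint64_t    victim = hand++;    /* models the fetch-and-add */

        *complete_passes = (uint32_t) (victim / nbuffers);  /* "high bits" */
        return (uint32_t) (victim % nbuffers);  /* slot under the hand */
    }

    int
    main(void)
    {
        uint32_t    passes;

        for (int i = 0; i < 10; i++)
            printf("victim %u, passes %u\n",
                   clock_sweep_tick_model(4, &passes), passes);
        /* victims cycle 0,1,2,3,0,1,2,3,... while passes goes 0,0,0,0,1,... */
        return 0;
    }

The real code cannot simply divide forever, since the 32-bit counter would eventually overflow; instead it wraps nextVictimBuffer back into range under the spinlock so that StrategySyncStart() sees nextVictimBuffer and completePasses change together.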
     165             : 
     166             : /*
     167             :  * have_free_buffer -- a lockless check to see if there is a free buffer in
     168             :  *                     the buffer pool.
     169             :  *
     170             :  * A true result can become stale as soon as other operations take buffers
     171             :  * off the freelist, so callers that strictly need to use a free buffer
     172             :  * should not rely on this check.
     173             :  */
     174             : bool
     175         778 : have_free_buffer(void)
     176             : {
     177         778 :     if (StrategyControl->firstFreeBuffer >= 0)
     178         778 :         return true;
     179             :     else
     180           0 :         return false;
     181             : }
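
A hedged sketch of the intended calling pattern (the loop and helper are hypothetical, loosely in the spirit of a prewarm worker): treat the result as an advisory early-exit hint, never as a reservation.

    /* Stop early once the freelist looks empty.  Another backend may
     * drain the list between the check and our next allocation; for this
     * kind of caller that is acceptable. */
    for (int i = 0; i < num_blocks_to_load; i++)
    {
        if (!have_free_buffer())
            break;              /* pool looks full; stop prewarming */
        load_one_block(i);      /* hypothetical helper */
    }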
     182             : 
     183             : /*
     184             :  * StrategyGetBuffer
     185             :  *
     186             :  *  Called by the bufmgr to get the next candidate buffer to use in
     187             :  *  BufferAlloc(). The only hard requirement BufferAlloc() has is that
     188             :  *  the selected buffer must not currently be pinned by anyone.
     189             :  *
     190             :  *  strategy is a BufferAccessStrategy object, or NULL for default strategy.
     191             :  *
     192             :  *  To ensure that no one else can pin the buffer before we do, we must
     193             :  *  return the buffer with the buffer header spinlock still held.
     194             :  */
     195             : BufferDesc *
     196     3803878 : StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_ring)
     197             : {
     198             :     BufferDesc *buf;
     199             :     int         bgwprocno;
     200             :     int         trycounter;
     201             :     uint32      local_buf_state;    /* to avoid repeated (de-)referencing */
     202             : 
     203     3803878 :     *from_ring = false;
     204             : 
     205             :     /*
     206             :      * If given a strategy object, see whether it can select a buffer. We
     207             :      * assume strategy objects don't need buffer_strategy_lock.
     208             :      */
     209     3803878 :     if (strategy != NULL)
     210             :     {
     211     1686838 :         buf = GetBufferFromRing(strategy, buf_state);
     212     1686838 :         if (buf != NULL)
     213             :         {
     214      709822 :             *from_ring = true;
     215      709822 :             return buf;
     216             :         }
     217             :     }
     218             : 
     219             :     /*
     220             :      * If asked, we need to wake the bgwriter. Since we don't want to rely on
     221             :      * a spinlock for this we force a read from shared memory once, and then
     222             :      * set the latch based on that value. We need to go to this length
     223             :      * because otherwise bgwprocno might be reset while/after we check, since
     224             :      * the compiler might otherwise just reread the value from memory.
     225             :      *
     226             :      * This can possibly set the latch of the wrong process if the bgwriter
     227             :      * dies in the wrong moment. But since PGPROC->procLatch is never
     228             :      * deallocated the worst consequence of that is that we set the latch of
     229             :      * some arbitrary process.
     230             :      */
     231     3094056 :     bgwprocno = INT_ACCESS_ONCE(StrategyControl->bgwprocno);
     232     3094056 :     if (bgwprocno != -1)
     233             :     {
     234             :         /* reset bgwprocno first, before setting the latch */
     235         518 :         StrategyControl->bgwprocno = -1;
     236             : 
     237             :         /*
     238             :          * Not acquiring ProcArrayLock here which is slightly icky. It's
     239             :          * actually fine because procLatch isn't ever freed, so we just can
     240             :          * potentially set the wrong process' (or no process') latch.
     241             :          */
     242         518 :         SetLatch(&ProcGlobal->allProcs[bgwprocno].procLatch);
     243             :     }
     244             : 
     245             :     /*
     246             :      * We count buffer allocation requests so that the bgwriter can estimate
     247             :      * the rate of buffer consumption.  Note that buffers recycled by a
     248             :      * strategy object are intentionally not counted here.
     249             :      */
     250     3094056 :     pg_atomic_fetch_add_u32(&StrategyControl->numBufferAllocs, 1);
     251             : 
     252             :     /*
     253             :      * First check, without acquiring the lock, whether there are buffers on
     254             :      * the freelist. Since we otherwise don't require the spinlock in every
     255             :      * StrategyGetBuffer() invocation, it would be a shame to acquire it here -
     256             :      * uselessly in most cases. That obviously leaves a race where a buffer is
     257             :      * put on the freelist but we don't see the store yet - but that's pretty
     258             :      * harmless; the buffer will just get used during the next acquisition.
     259             :      *
     260             :      * If there are buffers on the freelist, acquire the spinlock to pop one
     261             :      * buffer off the freelist. Then check whether that buffer is usable and
     262             :      * repeat if not.
     263             :      *
     264             :      * Note that the freeNext fields are considered to be protected by the
     265             :      * buffer_strategy_lock, not the individual buffer spinlocks, so it's OK
     266             :      * to manipulate them without holding a buffer header spinlock.
     267             :      */
     268     3094056 :     if (StrategyControl->firstFreeBuffer >= 0)
     269             :     {
     270             :         while (true)
     271             :         {
     272             :             /* Acquire the spinlock to remove element from the freelist */
     273     1443472 :             SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
     274             : 
     275     1443472 :             if (StrategyControl->firstFreeBuffer < 0)
     276             :             {
     277          24 :                 SpinLockRelease(&StrategyControl->buffer_strategy_lock);
     278          24 :                 break;
     279             :             }
     280             : 
     281     1443448 :             buf = GetBufferDescriptor(StrategyControl->firstFreeBuffer);
     282             :             Assert(buf->freeNext != FREENEXT_NOT_IN_LIST);
     283             : 
     284             :             /* Unconditionally remove buffer from freelist */
     285     1443448 :             StrategyControl->firstFreeBuffer = buf->freeNext;
     286     1443448 :             buf->freeNext = FREENEXT_NOT_IN_LIST;
     287             : 
     288             :             /*
     289             :              * Release the lock so someone else can access the freelist while
     290             :              * we check out this buffer.
     291             :              */
     292     1443448 :             SpinLockRelease(&StrategyControl->buffer_strategy_lock);
     293             : 
     294             :             /*
     295             :              * If the buffer is pinned or has a nonzero usage_count, we cannot
     296             :              * use it; discard it and retry.  (This can only happen if VACUUM
     297             :              * put a valid buffer in the freelist and then someone else used
     298             :              * it before we got to it.  It's probably impossible altogether as
     299             :              * of 8.3, but we'd better check anyway.)
     300             :              */
     301     1443448 :             local_buf_state = LockBufHdr(buf);
     302     1443448 :             if (BUF_STATE_GET_REFCOUNT(local_buf_state) == 0
     303     1443442 :                 && BUF_STATE_GET_USAGECOUNT(local_buf_state) == 0)
     304             :             {
     305     1443414 :                 if (strategy != NULL)
     306      723656 :                     AddBufferToRing(strategy, buf);
     307     1443414 :                 *buf_state = local_buf_state;
     308     1443414 :                 return buf;
     309             :             }
     310          34 :             UnlockBufHdr(buf, local_buf_state);
     311             :         }
     312             :     }
     313             : 
     314             :     /* Nothing on the freelist, so run the "clock sweep" algorithm */
     315     1650642 :     trycounter = NBuffers;
     316             :     for (;;)
     317             :     {
     318     7381634 :         buf = GetBufferDescriptor(ClockSweepTick());
     319             : 
     320             :         /*
     321             :          * If the buffer is pinned or has a nonzero usage_count, we cannot use
     322             :          * it; decrement the usage_count (unless pinned) and keep scanning.
     323             :          */
     324     7381634 :         local_buf_state = LockBufHdr(buf);
     325             : 
     326     7381634 :         if (BUF_STATE_GET_REFCOUNT(local_buf_state) == 0)
     327             :         {
     328     7241212 :             if (BUF_STATE_GET_USAGECOUNT(local_buf_state) != 0)
     329             :             {
     330     5590570 :                 local_buf_state -= BUF_USAGECOUNT_ONE;
     331             : 
     332     5590570 :                 trycounter = NBuffers;
     333             :             }
     334             :             else
     335             :             {
     336             :                 /* Found a usable buffer */
     337     1650642 :                 if (strategy != NULL)
     338      253360 :                     AddBufferToRing(strategy, buf);
     339     1650642 :                 *buf_state = local_buf_state;
     340     1650642 :                 return buf;
     341             :             }
     342             :         }
     343      140422 :         else if (--trycounter == 0)
     344             :         {
     345             :             /*
     346             :              * We've scanned all the buffers without making any state changes,
     347             :              * so all the buffers are pinned (or were when we looked at them).
     348             :              * We could hope that someone will free one eventually, but it's
     349             :              * probably better to fail than to risk getting stuck in an
     350             :              * infinite loop.
     351             :              */
     352           0 :             UnlockBufHdr(buf, local_buf_state);
     353           0 :             elog(ERROR, "no unpinned buffers available");
     354             :         }
     355     5730992 :         UnlockBufHdr(buf, local_buf_state);
     356             :     }
     357             : }
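
A hedged sketch of the caller's side of the contract (helper shapes approximate bufmgr.c; this is not the verbatim call site): the buffer comes back with its header spinlock held, so the caller must take its pin and drop the spinlock before doing anything slow.

    uint32      buf_state;
    bool        from_ring;
    BufferDesc *buf;

    buf = StrategyGetBuffer(strategy, &buf_state, &from_ring);

    /* Header spinlock is held: nobody else can pin the buffer right now. */
    buf_state += BUF_REFCOUNT_ONE;      /* take our pin while locked */
    UnlockBufHdr(buf, buf_state);       /* release before any slow work */

    /* ... evict the old page and install the new one (omitted) ... */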
     358             : 
     359             : /*
     360             :  * StrategyFreeBuffer: put a buffer on the freelist
     361             :  */
     362             : void
     363      215342 : StrategyFreeBuffer(BufferDesc *buf)
     364             : {
     365      215342 :     SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
     366             : 
     367             :     /*
     368             :      * It is possible that we are told to put something in the freelist that
     369             :      * is already in it; don't screw up the list if so.
     370             :      */
     371      215342 :     if (buf->freeNext == FREENEXT_NOT_IN_LIST)
     372             :     {
     373      215342 :         buf->freeNext = StrategyControl->firstFreeBuffer;
     374      215342 :         if (buf->freeNext < 0)
     375        4978 :             StrategyControl->lastFreeBuffer = buf->buf_id;
     376      215342 :         StrategyControl->firstFreeBuffer = buf->buf_id;
     377             :     }
     378             : 
     379      215342 :     SpinLockRelease(&StrategyControl->buffer_strategy_lock);
     380      215342 : }
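
A worked illustration with made-up buffer ids (freelist shown head first):

    /*
     *   before:  firstFreeBuffer = 3,   [3] -> [7] -> end of list
     *   call:    StrategyFreeBuffer(buffer 5)
     *   after:   firstFreeBuffer = 5,   [5] -> [3] -> [7] -> end of list
     *
     * lastFreeBuffer is only updated when the list was empty (head == -1),
     * and a buffer already on the list is left alone thanks to the
     * FREENEXT_NOT_IN_LIST guard.
     */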
     381             : 
     382             : /*
     383             :  * StrategySyncStart -- tell BgBufferSync where to start syncing
     384             :  *
     385             :  * The result is the buffer index of the best buffer to sync first.
     386             :  * BgBufferSync() will proceed circularly around the buffer array from there.
     387             :  *
     388             :  * In addition, we return the completed-pass count (which is effectively
     389             :  * the higher-order bits of nextVictimBuffer) and the count of recent buffer
     390             :  * allocs if non-NULL pointers are passed.  The alloc count is reset after
     391             :  * being read.
     392             :  */
     393             : int
     394       20588 : StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc)
     395             : {
     396             :     uint32      nextVictimBuffer;
     397             :     int         result;
     398             : 
     399       20588 :     SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
     400       20588 :     nextVictimBuffer = pg_atomic_read_u32(&StrategyControl->nextVictimBuffer);
     401       20588 :     result = nextVictimBuffer % NBuffers;
     402             : 
     403       20588 :     if (complete_passes)
     404             :     {
     405       20588 :         *complete_passes = StrategyControl->completePasses;
     406             : 
     407             :         /*
     408             :          * Additionally add the number of wraparounds that happened before
     409             :          * completePasses could be incremented. Cf. ClockSweepTick().
     410             :          */
     411       20588 :         *complete_passes += nextVictimBuffer / NBuffers;
     412             :     }
     413             : 
     414       20588 :     if (num_buf_alloc)
     415             :     {
     416       20588 :         *num_buf_alloc = pg_atomic_exchange_u32(&StrategyControl->numBufferAllocs, 0);
     417             :     }
     418       20588 :     SpinLockRelease(&StrategyControl->buffer_strategy_lock);
     419       20588 :     return result;
     420             : }
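
A worked instance of the arithmetic (all numbers made up): suppose NBuffers = 16384, nextVictimBuffer is read as 40000, and completePasses as 10.

    result           = 40000 % 16384      = 7232  /* where to start syncing */
    *complete_passes = 10 + 40000 / 16384 = 12    /* two unfolded wraps */

The division recovers wraparounds that ClockSweepTick() has performed but has not yet folded into completePasses, which is why both values must be read under the same spinlock acquisition.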
     421             : 
     422             : /*
     423             :  * StrategyNotifyBgWriter -- set or clear allocation notification latch
     424             :  *
     425             :  * If bgwprocno isn't -1, the next invocation of StrategyGetBuffer will
     426             :  * set that latch.  Pass -1 to clear the pending notification before it
     427             :  * happens.  This feature is used by the bgwriter process to wake itself up
     428             :  * from hibernation, and is not meant for anybody else to use.
     429             :  */
     430             : void
     431         808 : StrategyNotifyBgWriter(int bgwprocno)
     432             : {
     433             :     /*
     434             :      * We acquire buffer_strategy_lock just to ensure that the store appears
     435             :      * atomic to StrategyGetBuffer.  The bgwriter should call this rather
     436             :      * infrequently, so there's no performance penalty from being safe.
     437             :      */
     438         808 :     SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
     439         808 :     StrategyControl->bgwprocno = bgwprocno;
     440         808 :     SpinLockRelease(&StrategyControl->buffer_strategy_lock);
     441         808 : }
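
A hedged sketch of the hibernation handshake (simplified; the timeout constant is hypothetical and the real bgwriter loop's details are omitted):

    StrategyNotifyBgWriter(MyProcNumber);   /* arm the wakeup */
    (void) WaitLatch(MyLatch,
                     WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
                     BGWRITER_HIBERNATE_MS, /* hypothetical timeout */
                     WAIT_EVENT_BGWRITER_HIBERNATE);
    ResetLatch(MyLatch);
    StrategyNotifyBgWriter(-1);             /* disarm once awake */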
     442             : 
     443             : 
     444             : /*
     445             :  * StrategyShmemSize
     446             :  *
     447             :  * estimate the size of shared memory used by the freelist-related structures.
     448             :  *
     449             :  * Note: for somewhat historical reasons, the buffer lookup hashtable size
     450             :  * is also determined here.
     451             :  */
     452             : Size
     453        3906 : StrategyShmemSize(void)
     454             : {
     455        3906 :     Size        size = 0;
     456             : 
     457             :     /* size of lookup hash table ... see comment in StrategyInitialize */
     458        3906 :     size = add_size(size, BufTableShmemSize(NBuffers + NUM_BUFFER_PARTITIONS));
     459             : 
     460             :     /* size of the shared replacement strategy control block */
     461        3906 :     size = add_size(size, MAXALIGN(sizeof(BufferStrategyControl)));
     462             : 
     463        3906 :     return size;
     464             : }
     465             : 
     466             : /*
     467             :  * StrategyInitialize -- initialize the buffer cache replacement
     468             :  *      strategy.
     469             :  *
     470             :  * Assumes: All of the buffers are already built into a linked list.
     471             :  *      Only called by postmaster and only during initialization.
     472             :  */
     473             : void
     474        2100 : StrategyInitialize(bool init)
     475             : {
     476             :     bool        found;
     477             : 
     478             :     /*
     479             :      * Initialize the shared buffer lookup hashtable.
     480             :      *
     481             :      * Since we can't tolerate running out of lookup table entries, we must be
     482             :      * sure to specify an adequate table size here.  The maximum steady-state
     483             :      * usage is of course NBuffers entries, but BufferAlloc() tries to insert
     484             :      * a new entry before deleting the old.  In principle this could be
     485             :      * happening in each partition concurrently, so we could need as many as
     486             :      * NBuffers + NUM_BUFFER_PARTITIONS entries.
     487             :      */
     488        2100 :     InitBufTable(NBuffers + NUM_BUFFER_PARTITIONS);
     489             : 
     490             :     /*
     491             :      * Get or create the shared strategy control block
     492             :      */
     493        2100 :     StrategyControl = (BufferStrategyControl *)
     494        2100 :         ShmemInitStruct("Buffer Strategy Status",
     495             :                         sizeof(BufferStrategyControl),
     496             :                         &found);
     497             : 
     498        2100 :     if (!found)
     499             :     {
     500             :         /*
     501             :          * Only done once, usually in postmaster
     502             :          */
     503             :         Assert(init);
     504             : 
     505        2100 :         SpinLockInit(&StrategyControl->buffer_strategy_lock);
     506             : 
     507             :         /*
     508             :          * Grab the whole linked list of free buffers for our strategy. We
     509             :          * assume it was previously set up by BufferManagerShmemInit().
     510             :          */
     511        2100 :         StrategyControl->firstFreeBuffer = 0;
     512        2100 :         StrategyControl->lastFreeBuffer = NBuffers - 1;
     513             : 
     514             :         /* Initialize the clock sweep pointer */
     515        2100 :         pg_atomic_init_u32(&StrategyControl->nextVictimBuffer, 0);
     516             : 
     517             :         /* Clear statistics */
     518        2100 :         StrategyControl->completePasses = 0;
     519        2100 :         pg_atomic_init_u32(&StrategyControl->numBufferAllocs, 0);
     520             : 
     521             :         /* No pending notification */
     522        2100 :         StrategyControl->bgwprocno = -1;
     523             :     }
     524             :     else
     525             :         Assert(!init);
     526        2100 : }
     527             : 
     528             : 
     529             : /* ----------------------------------------------------------------
     530             :  *              Backend-private buffer ring management
     531             :  * ----------------------------------------------------------------
     532             :  */
     533             : 
     534             : 
     535             : /*
     536             :  * GetAccessStrategy -- create a BufferAccessStrategy object
     537             :  *
     538             :  * The object is allocated in the current memory context.
     539             :  */
     540             : BufferAccessStrategy
     541      279314 : GetAccessStrategy(BufferAccessStrategyType btype)
     542             : {
     543             :     int         ring_size_kb;
     544             : 
     545             :     /*
     546             :      * Select ring size to use.  See buffer/README for rationales.
     547             :      *
     548             :      * Note: if you change the ring size for BAS_BULKREAD, see also
     549             :      * SYNC_SCAN_REPORT_INTERVAL in access/heap/syncscan.c.
     550             :      */
     551      279314 :     switch (btype)
     552             :     {
     553           0 :         case BAS_NORMAL:
     554             :             /* if someone asks for NORMAL, just give 'em a "default" object */
     555           0 :             return NULL;
     556             : 
     557      153926 :         case BAS_BULKREAD:
     558             :             {
     559             :                 int         ring_max_kb;
     560             : 
     561             :                 /*
     562             :                  * The ring always needs to be large enough to allow some
     563             :                  * separation in time between providing a buffer to the user
     564             :                  * of the strategy and that buffer being reused. Otherwise the
     565             :                  * user's pin will prevent reuse of the buffer, even without
     566             :                  * concurrent activity.
     567             :                  *
     568             :                  * We also need to ensure the ring always is large enough for
     569             :                  * SYNC_SCAN_REPORT_INTERVAL, as noted above.
     570             :                  *
     571             :                  * Thus we start out at a minimal size and increase the size
     572             :                  * further if appropriate.
     573             :                  */
     574      153926 :                 ring_size_kb = 256;
     575             : 
     576             :                 /*
     577             :                  * There's no point in a larger ring if we won't be allowed to
     578             :                  * pin sufficiently many buffers.  But we never limit to less
     579             :                  * than the minimal size above.
     580             :                  */
     581      153926 :                 ring_max_kb = GetPinLimit() * (BLCKSZ / 1024);
     582      153926 :                 ring_max_kb = Max(ring_size_kb, ring_max_kb);
     583             : 
     584             :                 /*
     585             :                  * We would like the ring to additionally have space for the
     586             :                  * configured degree of IO concurrency. While being read in,
     587             :                  * buffers can obviously not yet be reused.
     588             :                  *
     589             :                  * Each IO can be up to io_combine_limit blocks large, and we
     590             :                  * want to start up to effective_io_concurrency IOs.
     591             :                  *
     592             :                  * Note that effective_io_concurrency may be 0, which disables
     593             :                  * AIO.
     594             :                  */
     595      153926 :                 ring_size_kb += (BLCKSZ / 1024) *
     596      153926 :                     io_combine_limit * effective_io_concurrency;
     597             : 
     598      153926 :                 if (ring_size_kb > ring_max_kb)
     599      153926 :                     ring_size_kb = ring_max_kb;
     600      153926 :                 break;
     601             :             }
     602      125388 :         case BAS_BULKWRITE:
     603      125388 :             ring_size_kb = 16 * 1024;
     604      125388 :             break;
     605           0 :         case BAS_VACUUM:
     606           0 :             ring_size_kb = 2048;
     607           0 :             break;
     608             : 
     609           0 :         default:
     610           0 :             elog(ERROR, "unrecognized buffer access strategy: %d",
     611             :                  (int) btype);
     612             :             return NULL;        /* keep compiler quiet */
     613             :     }
     614             : 
     615      279314 :     return GetAccessStrategyWithSize(btype, ring_size_kb);
     616             : }
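
Illustrative arithmetic for the ring sizes above (the GUC values are assumptions chosen for the example, not necessarily shipped defaults): with BLCKSZ = 8192, io_combine_limit = 16, and effective_io_concurrency = 16,

    BAS_BULKREAD:   256 + 8 * 16 * 16 = 2304 kB   (288 buffers)
    BAS_BULKWRITE:  16 * 1024         = 16384 kB  (2048 buffers)
    BAS_VACUUM:     2048 kB                       (256 buffers)

The bulkread figure is still clamped to ring_max_kb here, and every ring is further capped to NBuffers / 8 in GetAccessStrategyWithSize().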
     617             : 
     618             : /*
     619             :  * GetAccessStrategyWithSize -- create a BufferAccessStrategy object with a
     620             :  *      number of buffers equivalent to the passed in size.
     621             :  *
     622             :  * If the given ring size is 0, no BufferAccessStrategy will be created and
     623             :  * the function will return NULL.  ring_size_kb must not be negative.
     624             :  */
     625             : BufferAccessStrategy
     626      294850 : GetAccessStrategyWithSize(BufferAccessStrategyType btype, int ring_size_kb)
     627             : {
     628             :     int         ring_buffers;
     629             :     BufferAccessStrategy strategy;
     630             : 
     631             :     Assert(ring_size_kb >= 0);
     632             : 
     633             :     /* Figure out how many buffers ring_size_kb is */
     634      294850 :     ring_buffers = ring_size_kb / (BLCKSZ / 1024);
     635             : 
     636             :     /* 0 means unlimited, so no BufferAccessStrategy required */
     637      294850 :     if (ring_buffers == 0)
     638          12 :         return NULL;
     639             : 
     640             :     /* Cap to 1/8th of shared_buffers */
     641      294838 :     ring_buffers = Min(NBuffers / 8, ring_buffers);
     642             : 
     643             :     /* NBuffers should never be less than 16, so this shouldn't happen */
     644             :     Assert(ring_buffers > 0);
     645             : 
     646             :     /* Allocate the object and initialize all elements to zeroes */
     647             :     strategy = (BufferAccessStrategy)
     648      294838 :         palloc0(offsetof(BufferAccessStrategyData, buffers) +
     649             :                 ring_buffers * sizeof(Buffer));
     650             : 
     651             :     /* Set fields that don't start out zero */
     652      294838 :     strategy->btype = btype;
     653      294838 :     strategy->nbuffers = ring_buffers;
     654             : 
     655      294838 :     return strategy;
     656             : }
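
A hedged usage sketch (the size is arbitrary): callers wanting a custom ring, such as vacuum honoring a user-set buffer budget, convert kilobytes to buffers through this function.

    /* Request a ~1 MB ring; with 8 kB blocks that is 128 buffers, unless
     * NBuffers / 8 is smaller.  Passing 0 would return NULL, meaning the
     * default (unlimited) strategy. */
    BufferAccessStrategy strat = GetAccessStrategyWithSize(BAS_VACUUM, 1024);

    /* ... use strat with ReadBufferExtended() etc. ... */

    FreeAccessStrategy(strat);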
     657             : 
     658             : /*
     659             :  * GetAccessStrategyBufferCount -- an accessor for the number of buffers in
     660             :  *      the ring
     661             :  *
     662             :  * Returns 0 on NULL input to match behavior of GetAccessStrategyWithSize()
     663             :  * returning NULL with 0 size.
     664             :  */
     665             : int
     666          34 : GetAccessStrategyBufferCount(BufferAccessStrategy strategy)
     667             : {
     668          34 :     if (strategy == NULL)
     669           0 :         return 0;
     670             : 
     671          34 :     return strategy->nbuffers;
     672             : }
     673             : 
     674             : /*
     675             :  * GetAccessStrategyPinLimit -- get cap of number of buffers that should be pinned
     676             :  *
     677             :  * When pinning extra buffers to look ahead, users of a ring-based strategy
     678             :  * are in danger of pinning too much of the ring at once.
     679             :  * For some strategies, that means "escaping" from the ring, and in others it
     680             :  * means forcing dirty data to disk very frequently with associated WAL
     681             :  * flushing.  Since external code has no insight into any of that, allow
     682             :  * individual strategy types to expose a clamp that should be applied when
     683             :  * deciding on a maximum number of buffers to pin at once.
     684             :  *
     685             :  * Callers should combine this number with other relevant limits and take the
     686             :  * minimum.
     687             :  */
     688             : int
     689     1008696 : GetAccessStrategyPinLimit(BufferAccessStrategy strategy)
     690             : {
     691     1008696 :     if (strategy == NULL)
     692      735850 :         return NBuffers;
     693             : 
     694      272846 :     switch (strategy->btype)
     695             :     {
     696      144532 :         case BAS_BULKREAD:
     697             : 
     698             :             /*
     699             :              * Since BAS_BULKREAD uses StrategyRejectBuffer(), dirty buffers
     700             :              * shouldn't be a problem and the caller is free to pin up to the
     701             :              * entire ring at once.
     702             :              */
     703      144532 :             return strategy->nbuffers;
     704             : 
     705      128314 :         default:
     706             : 
     707             :             /*
     708             :              * Tell caller not to pin more than half the buffers in the ring.
     709             :              * This is a trade-off between look ahead distance and deferring
     710             :              * writeback and associated WAL traffic.
     711             :              */
     712      128314 :             return strategy->nbuffers / 2;
     713             :     }
     714             : }
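
A hedged sketch of the suggested combination (the look-ahead caller is hypothetical):

    /* Obey both the global pin budget and the strategy's clamp. */
    int     max_pins = Min(GetPinLimit(), GetAccessStrategyPinLimit(strategy));

    if (max_pins < 1)
        max_pins = 1;           /* always allow at least one pinned page */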
     715             : 
     716             : /*
     717             :  * FreeAccessStrategy -- release a BufferAccessStrategy object
     718             :  *
     719             :  * A simple pfree would do at the moment, but we would prefer that callers
     720             :  * don't assume that much about the representation of BufferAccessStrategy.
     721             :  */
     722             : void
     723      267454 : FreeAccessStrategy(BufferAccessStrategy strategy)
     724             : {
     725             :     /* don't crash if called on a "default" strategy */
     726      267454 :     if (strategy != NULL)
     727      267454 :         pfree(strategy);
     728      267454 : }
     729             : 
     730             : /*
     731             :  * GetBufferFromRing -- returns a buffer from the ring, or NULL if the
     732             :  *      ring is empty / not usable.
     733             :  *
     734             :  * The bufhdr spin lock is held on the returned buffer.
     735             :  */
     736             : static BufferDesc *
     737     1686838 : GetBufferFromRing(BufferAccessStrategy strategy, uint32 *buf_state)
     738             : {
     739             :     BufferDesc *buf;
     740             :     Buffer      bufnum;
     741             :     uint32      local_buf_state;    /* to avoid repeated (de-)referencing */
     742             : 
     743             : 
     744             :     /* Advance to next ring slot */
     745     1686838 :     if (++strategy->current >= strategy->nbuffers)
     746       52496 :         strategy->current = 0;
     747             : 
     748             :     /*
     749             :      * If the slot hasn't been filled yet, tell the caller to allocate a new
     750             :      * buffer with the normal allocation strategy.  He will then fill this
     751             :      * slot by calling AddBufferToRing with the new buffer.
     752             :      */
     753     1686838 :     bufnum = strategy->buffers[strategy->current];
     754     1686838 :     if (bufnum == InvalidBuffer)
     755      961756 :         return NULL;
     756             : 
     757             :     /*
     758             :      * If the buffer is pinned we cannot use it under any circumstances.
     759             :      *
     760             :      * If usage_count is 0 or 1 then the buffer is fair game (we expect 1,
     761             :      * since our own previous usage of the ring element would have left it
     762             :      * there, but it might've been decremented by clock sweep since then). A
     763             :      * higher usage_count indicates someone else has touched the buffer, so we
     764             :      * shouldn't re-use it.
     765             :      */
     766      725082 :     buf = GetBufferDescriptor(bufnum - 1);
     767      725082 :     local_buf_state = LockBufHdr(buf);
     768      725082 :     if (BUF_STATE_GET_REFCOUNT(local_buf_state) == 0
     769      716968 :         && BUF_STATE_GET_USAGECOUNT(local_buf_state) <= 1)
     770             :     {
     771      709822 :         *buf_state = local_buf_state;
     772      709822 :         return buf;
     773             :     }
     774       15260 :     UnlockBufHdr(buf, local_buf_state);
     775             : 
     776             :     /*
     777             :      * Tell caller to allocate a new buffer with the normal allocation
     778             :      * strategy.  He'll then replace this ring element via AddBufferToRing.
     779             :      */
     780       15260 :     return NULL;
     781             : }
     782             : 
     783             : /*
     784             :  * AddBufferToRing -- add a buffer to the buffer ring
     785             :  *
     786             :  * Caller must hold the buffer header spinlock on the buffer.  Since this
     787             :  * is called with the spinlock held, it had better be quite cheap.
     788             :  */
     789             : static void
     790      977016 : AddBufferToRing(BufferAccessStrategy strategy, BufferDesc *buf)
     791             : {
     792      977016 :     strategy->buffers[strategy->current] = BufferDescriptorGetBuffer(buf);
     793      977016 : }
     794             : 
     795             : /*
     796             :  * Utility function returning the IOContext of a given BufferAccessStrategy's
     797             :  * strategy ring.
     798             :  */
     799             : IOContext
     800   122965112 : IOContextForStrategy(BufferAccessStrategy strategy)
     801             : {
     802   122965112 :     if (!strategy)
     803   118028980 :         return IOCONTEXT_NORMAL;
     804             : 
     805     4936132 :     switch (strategy->btype)
     806             :     {
     807             :         case BAS_NORMAL:
     808             : 
     809             :             /*
     810             :              * Currently, GetAccessStrategy() returns NULL for
     811             :              * BufferAccessStrategyType BAS_NORMAL, so this case is
     812             :              * unreachable.
     813             :              */
     814             :             pg_unreachable();
     815             :             return IOCONTEXT_NORMAL;
     816     3444094 :         case BAS_BULKREAD:
     817     3444094 :             return IOCONTEXT_BULKREAD;
     818      556546 :         case BAS_BULKWRITE:
     819      556546 :             return IOCONTEXT_BULKWRITE;
     820      935492 :         case BAS_VACUUM:
     821      935492 :             return IOCONTEXT_VACUUM;
     822             :     }
     823             : 
     824           0 :     elog(ERROR, "unrecognized BufferAccessStrategyType: %d", strategy->btype);
     825             :     pg_unreachable();
     826             : }
     827             : 
     828             : /*
     829             :  * StrategyRejectBuffer -- consider rejecting a dirty buffer
     830             :  *
     831             :  * When a nondefault strategy is used, the buffer manager calls this function
     832             :  * when it turns out that the buffer selected by StrategyGetBuffer needs to
     833             :  * be written out and doing so would require flushing WAL too.  This gives us
     834             :  * a chance to choose a different victim.
     835             :  *
     836             :  * Returns true if buffer manager should ask for a new victim, and false
     837             :  * if this buffer should be written and re-used.
     838             :  */
     839             : bool
     840       16914 : StrategyRejectBuffer(BufferAccessStrategy strategy, BufferDesc *buf, bool from_ring)
     841             : {
     842             :     /* We only do this in bulkread mode */
     843       16914 :     if (strategy->btype != BAS_BULKREAD)
     844        3480 :         return false;
     845             : 
     846             :     /* Don't muck with behavior of normal buffer-replacement strategy */
     847       25778 :     if (!from_ring ||
     848       12344 :         strategy->buffers[strategy->current] != BufferDescriptorGetBuffer(buf))
     849        1090 :         return false;
     850             : 
     851             :     /*
     852             :      * Remove the dirty buffer from the ring; necessary to prevent infinite
     853             :      * loop if all ring members are dirty.
     854             :      */
     855       12344 :     strategy->buffers[strategy->current] = InvalidBuffer;
     856             : 
     857       12344 :     return true;
     858             : }
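
A hedged sketch of the call site's shape (names approximate the buffer manager's victim loop; this is not the verbatim code):

    if (strategy != NULL &&
        XLogNeedsFlush(lsn) &&              /* writing buf would flush WAL */
        StrategyRejectBuffer(strategy, buf, from_ring))
    {
        /* victim rejected: drop our pin and restart the victim search */
        UnpinBuffer(buf);
        goto retry;
    }
    /* otherwise: flush the buffer and reuse it */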

Generated by: LCOV version 1.14