LCOV - code coverage report
Current view: top level - src/backend/storage/buffer - freelist.c (source / functions) Coverage Total Hit
Test: PostgreSQL 19devel Lines: 92.5 % 173 160
Test Date: 2026-03-12 01:15:13 Functions: 100.0 % 15 15
Legend: Lines:     hit not hit

            Line data    Source code
       1              : /*-------------------------------------------------------------------------
       2              :  *
       3              :  * freelist.c
       4              :  *    routines for managing the buffer pool's replacement strategy.
       5              :  *
       6              :  *
       7              :  * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
       8              :  * Portions Copyright (c) 1994, Regents of the University of California
       9              :  *
      10              :  *
      11              :  * IDENTIFICATION
      12              :  *    src/backend/storage/buffer/freelist.c
      13              :  *
      14              :  *-------------------------------------------------------------------------
      15              :  */
      16              : #include "postgres.h"
      17              : 
      18              : #include "pgstat.h"
      19              : #include "port/atomics.h"
      20              : #include "storage/buf_internals.h"
      21              : #include "storage/bufmgr.h"
      22              : #include "storage/proc.h"
      23              : 
      24              : #define INT_ACCESS_ONCE(var)    ((int)(*((volatile int *)&(var))))
      25              : 
      26              : 
      27              : /*
      28              :  * The shared freelist control information.
      29              :  */
      30              : typedef struct
      31              : {
      32              :     /* Spinlock: protects the values below */
      33              :     slock_t     buffer_strategy_lock;
      34              : 
      35              :     /*
      36              :      * clock-sweep hand: index of next buffer to consider grabbing. Note that
      37              :      * this isn't a concrete buffer - we only ever increase the value. So, to
      38              :      * get an actual buffer, it needs to be used modulo NBuffers.
      39              :      */
      40              :     pg_atomic_uint32 nextVictimBuffer;
      41              : 
      42              :     /*
      43              :      * Statistics.  These counters should be wide enough that they can't
      44              :      * overflow during a single bgwriter cycle.
      45              :      */
      46              :     uint32      completePasses; /* Complete cycles of the clock-sweep */
      47              :     pg_atomic_uint32 numBufferAllocs;   /* Buffers allocated since last reset */
      48              : 
      49              :     /*
      50              :      * Bgwriter process to be notified upon activity or -1 if none. See
      51              :      * StrategyNotifyBgWriter.
      52              :      */
      53              :     int         bgwprocno;
      54              : } BufferStrategyControl;
      55              : 
      56              : /* Pointers to shared state */
      57              : static BufferStrategyControl *StrategyControl = NULL;
      58              : 
      59              : /*
      60              :  * Private (non-shared) state for managing a ring of shared buffers to re-use.
      61              :  * This is currently the only kind of BufferAccessStrategy object, but someday
      62              :  * we might have more kinds.
      63              :  */
      64              : typedef struct BufferAccessStrategyData
      65              : {
      66              :     /* Overall strategy type */
      67              :     BufferAccessStrategyType btype;
      68              :     /* Number of elements in buffers[] array */
      69              :     int         nbuffers;
      70              : 
      71              :     /*
      72              :      * Index of the "current" slot in the ring, ie, the one most recently
      73              :      * returned by GetBufferFromRing.
      74              :      */
      75              :     int         current;
      76              : 
      77              :     /*
      78              :      * Array of buffer numbers.  InvalidBuffer (that is, zero) indicates we
      79              :      * have not yet selected a buffer for this ring slot.  For allocation
      80              :      * simplicity this is palloc'd together with the fixed fields of the
      81              :      * struct.
      82              :      */
      83              :     Buffer      buffers[FLEXIBLE_ARRAY_MEMBER];
      84              : }           BufferAccessStrategyData;
      85              : 
      86              : 
      87              : /* Prototypes for internal functions */
      88              : static BufferDesc *GetBufferFromRing(BufferAccessStrategy strategy,
      89              :                                      uint64 *buf_state);
      90              : static void AddBufferToRing(BufferAccessStrategy strategy,
      91              :                             BufferDesc *buf);
      92              : 
      93              : /*
      94              :  * ClockSweepTick - Helper routine for StrategyGetBuffer()
      95              :  *
      96              :  * Move the clock hand one buffer ahead of its current position and return the
      97              :  * id of the buffer now under the hand.
      98              :  */
      99              : static inline uint32
     100      4913264 : ClockSweepTick(void)
     101              : {
     102              :     uint32      victim;
     103              : 
     104              :     /*
     105              :      * Atomically move hand ahead one buffer - if there are several processes
     106              :      * doing this, this can lead to buffers being returned slightly out of
     107              :      * apparent order.
     108              :      */
     109              :     victim =
     110      4913264 :         pg_atomic_fetch_add_u32(&StrategyControl->nextVictimBuffer, 1);
     111              : 
     112      4913264 :     if (victim >= NBuffers)
     113              :     {
     114        33556 :         uint32      originalVictim = victim;
     115              : 
     116              :         /* always wrap what we look up in BufferDescriptors */
     117        33556 :         victim = victim % NBuffers;
     118              : 
     119              :         /*
     120              :          * If we're the one that just caused a wraparound, force
     121              :          * completePasses to be incremented while holding the spinlock. We
     122              :          * need the spinlock so StrategySyncStart() can return a consistent
     123              :          * value consisting of nextVictimBuffer and completePasses.
     124              :          */
     125        33556 :         if (victim == 0)
     126              :         {
     127              :             uint32      expected;
     128              :             uint32      wrapped;
     129        33458 :             bool        success = false;
     130              : 
     131        33458 :             expected = originalVictim + 1;
     132              : 
     133        67006 :             while (!success)
     134              :             {
     135              :                 /*
     136              :                  * Acquire the spinlock while increasing completePasses. That
     137              :                  * allows other readers to read nextVictimBuffer and
     138              :                  * completePasses in a consistent manner which is required for
     139              :                  * StrategySyncStart().  In theory delaying the increment
     140              :                  * could lead to an overflow of nextVictimBuffer, but that's
     141              :                  * highly unlikely and wouldn't be particularly harmful.
     142              :                  */
     143        33548 :                 SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
     144              : 
     145        33548 :                 wrapped = expected % NBuffers;
     146              : 
     147        33548 :                 success = pg_atomic_compare_exchange_u32(&StrategyControl->nextVictimBuffer,
     148              :                                                          &expected, wrapped);
     149        33548 :                 if (success)
     150        33458 :                     StrategyControl->completePasses++;
     151        33548 :                 SpinLockRelease(&StrategyControl->buffer_strategy_lock);
     152              :             }
     153              :         }
     154              :     }
     155      4913264 :     return victim;
     156              : }
     157              : 
     158              : /*
     159              :  * StrategyGetBuffer
     160              :  *
     161              :  *  Called by the bufmgr to get the next candidate buffer to use in
     162              :  *  GetVictimBuffer(). The only hard requirement GetVictimBuffer() has is that
     163              :  *  the selected buffer must not currently be pinned by anyone.
     164              :  *
     165              :  *  strategy is a BufferAccessStrategy object, or NULL for default strategy.
     166              :  *
     167              :  *  It is the caller's responsibility to ensure the buffer ownership can be
     168              :  *  tracked via TrackNewBufferPin().
     169              :  *
     170              :  *  The buffer is pinned and marked as owned, using TrackNewBufferPin(),
     171              :  *  before returning.
     172              :  */
     173              : BufferDesc *
     174      2000698 : StrategyGetBuffer(BufferAccessStrategy strategy, uint64 *buf_state, bool *from_ring)
     175              : {
     176              :     BufferDesc *buf;
     177              :     int         bgwprocno;
     178              :     int         trycounter;
     179              : 
     180      2000698 :     *from_ring = false;
     181              : 
     182              :     /*
     183              :      * If given a strategy object, see whether it can select a buffer. We
     184              :      * assume strategy objects don't need buffer_strategy_lock.
     185              :      */
     186      2000698 :     if (strategy != NULL)
     187              :     {
     188       836518 :         buf = GetBufferFromRing(strategy, buf_state);
     189       836518 :         if (buf != NULL)
     190              :         {
     191       324531 :             *from_ring = true;
     192       324531 :             return buf;
     193              :         }
     194              :     }
     195              : 
     196              :     /*
     197              :      * If asked, we need to waken the bgwriter. Since we don't want to rely on
     198              :      * a spinlock for this we force a read from shared memory once, and then
     199              :      * set the latch based on that value. We need to go to that length
     200              :      * because otherwise bgwprocno might be reset while/after we check because
     201              :      * the compiler might just reread from memory.
     202              :      *
     203              :      * This can possibly set the latch of the wrong process if the bgwriter
     204              :      * dies in the wrong moment. But since PGPROC->procLatch is never
     205              :      * deallocated the worst consequence of that is that we set the latch of
     206              :      * some arbitrary process.
     207              :      */
     208      1676167 :     bgwprocno = INT_ACCESS_ONCE(StrategyControl->bgwprocno);
     209      1676167 :     if (bgwprocno != -1)
     210              :     {
     211              :         /* reset bgwprocno first, before setting the latch */
     212          441 :         StrategyControl->bgwprocno = -1;
     213              : 
     214              :         /*
     215              :          * Not acquiring ProcArrayLock here which is slightly icky. It's
     216              :          * actually fine because procLatch isn't ever freed, so we just can
     217              :          * potentially set the wrong process' (or no process') latch.
     218              :          */
     219          441 :         SetLatch(&GetPGProcByNumber(bgwprocno)->procLatch);
     220              :     }
     221              : 
     222              :     /*
     223              :      * We count buffer allocation requests so that the bgwriter can estimate
     224              :      * the rate of buffer consumption.  Note that buffers recycled by a
     225              :      * strategy object are intentionally not counted here.
     226              :      */
     227      1676167 :     pg_atomic_fetch_add_u32(&StrategyControl->numBufferAllocs, 1);
     228              : 
     229              :     /* Use the "clock sweep" algorithm to find a free buffer */
     230      1676167 :     trycounter = NBuffers;
     231              :     for (;;)
     232      3237097 :     {
     233              :         uint64      old_buf_state;
     234              :         uint64      local_buf_state;
     235              : 
     236      4913264 :         buf = GetBufferDescriptor(ClockSweepTick());
     237              : 
     238              :         /*
     239              :          * Check whether the buffer can be used and pin it if so. Do this
     240              :          * using a CAS loop, to avoid having to lock the buffer header.
     241              :          */
     242      4913264 :         old_buf_state = pg_atomic_read_u64(&buf->state);
     243              :         for (;;)
     244              :         {
     245      4913294 :             local_buf_state = old_buf_state;
     246              : 
     247              :             /*
     248              :              * If the buffer is pinned or has a nonzero usage_count, we cannot
     249              :              * use it; decrement the usage_count (unless pinned) and keep
     250              :              * scanning.
     251              :              */
     252              : 
     253      4913294 :             if (BUF_STATE_GET_REFCOUNT(local_buf_state) != 0)
     254              :             {
     255        77980 :                 if (--trycounter == 0)
     256              :                 {
     257              :                     /*
     258              :                      * We've scanned all the buffers without making any state
     259              :                      * changes, so all the buffers are pinned (or were when we
     260              :                      * looked at them). We could hope that someone will free
     261              :                      * one eventually, but it's probably better to fail than
     262              :                      * to risk getting stuck in an infinite loop.
     263              :                      */
     264            0 :                     elog(ERROR, "no unpinned buffers available");
     265              :                 }
     266        77980 :                 break;
     267              :             }
     268              : 
     269              :             /* See equivalent code in PinBuffer() */
     270      4835314 :             if (unlikely(local_buf_state & BM_LOCKED))
     271              :             {
     272            1 :                 old_buf_state = WaitBufHdrUnlocked(buf);
     273            1 :                 continue;
     274              :             }
     275              : 
     276      4835313 :             if (BUF_STATE_GET_USAGECOUNT(local_buf_state) != 0)
     277              :             {
     278      3159145 :                 local_buf_state -= BUF_USAGECOUNT_ONE;
     279              : 
     280      3159145 :                 if (pg_atomic_compare_exchange_u64(&buf->state, &old_buf_state,
     281              :                                                    local_buf_state))
     282              :                 {
     283      3159117 :                     trycounter = NBuffers;
     284      3159117 :                     break;
     285              :                 }
     286              :             }
     287              :             else
     288              :             {
     289              :                 /* pin the buffer if the CAS succeeds */
     290      1676168 :                 local_buf_state += BUF_REFCOUNT_ONE;
     291              : 
     292      1676168 :                 if (pg_atomic_compare_exchange_u64(&buf->state, &old_buf_state,
     293              :                                                    local_buf_state))
     294              :                 {
     295              :                     /* Found a usable buffer */
     296      1676167 :                     if (strategy != NULL)
     297       511987 :                         AddBufferToRing(strategy, buf);
     298      1676167 :                     *buf_state = local_buf_state;
     299              : 
     300      1676167 :                     TrackNewBufferPin(BufferDescriptorGetBuffer(buf));
     301              : 
     302      1676167 :                     return buf;
     303              :                 }
     304              :             }
     305              :         }
     306              :     }
     307              : }
     308              : 
     309              : /*
     310              :  * StrategySyncStart -- tell BgBufferSync where to start syncing
     311              :  *
     312              :  * The result is the buffer index of the best buffer to sync first.
     313              :  * BgBufferSync() will proceed circularly around the buffer array from there.
     314              :  *
     315              :  * In addition, we return the completed-pass count (which is effectively
     316              :  * the higher-order bits of nextVictimBuffer) and the count of recent buffer
     317              :  * allocs if non-NULL pointers are passed.  The alloc count is reset after
     318              :  * being read.
     319              :  */
     320              : int
     321        12734 : StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc)
     322              : {
     323              :     uint32      nextVictimBuffer;
     324              :     int         result;
     325              : 
     326        12734 :     SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
     327        12734 :     nextVictimBuffer = pg_atomic_read_u32(&StrategyControl->nextVictimBuffer);
     328        12734 :     result = nextVictimBuffer % NBuffers;
     329              : 
     330        12734 :     if (complete_passes)
     331              :     {
     332        12734 :         *complete_passes = StrategyControl->completePasses;
     333              : 
     334              :         /*
     335              :          * Additionally add the number of wraparounds that happened before
     336              :          * completePasses could be incremented. C.f. ClockSweepTick().
     337              :          */
     338        12734 :         *complete_passes += nextVictimBuffer / NBuffers;
     339              :     }
     340              : 
     341        12734 :     if (num_buf_alloc)
     342              :     {
     343        12734 :         *num_buf_alloc = pg_atomic_exchange_u32(&StrategyControl->numBufferAllocs, 0);
     344              :     }
     345        12734 :     SpinLockRelease(&StrategyControl->buffer_strategy_lock);
     346        12734 :     return result;
     347              : }
     348              : 
     349              : /*
     350              :  * StrategyNotifyBgWriter -- set or clear allocation notification latch
     351              :  *
     352              :  * If bgwprocno isn't -1, the next invocation of StrategyGetBuffer will
     353              :  * set that latch.  Pass -1 to clear the pending notification before it
     354              :  * happens.  This feature is used by the bgwriter process to wake itself up
     355              :  * from hibernation, and is not meant for anybody else to use.
     356              :  */
     357              : void
     358          872 : StrategyNotifyBgWriter(int bgwprocno)
     359              : {
     360              :     /*
     361              :      * We acquire buffer_strategy_lock just to ensure that the store appears
     362              :      * atomic to StrategyGetBuffer.  The bgwriter should call this rather
     363              :      * infrequently, so there's no performance penalty from being safe.
     364              :      */
     365          872 :     SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
     366          872 :     StrategyControl->bgwprocno = bgwprocno;
     367          872 :     SpinLockRelease(&StrategyControl->buffer_strategy_lock);
     368          872 : }
     369              : 
     370              : 
     371              : /*
     372              :  * StrategyShmemSize
     373              :  *
     374              :  * estimate the size of shared memory used by the freelist-related structures.
     375              :  *
     376              :  * Note: for somewhat historical reasons, the buffer lookup hashtable size
     377              :  * is also determined here.
     378              :  */
     379              : Size
     380         2163 : StrategyShmemSize(void)
     381              : {
     382         2163 :     Size        size = 0;
     383              : 
     384              :     /* size of lookup hash table ... see comment in StrategyInitialize */
     385         2163 :     size = add_size(size, BufTableShmemSize(NBuffers + NUM_BUFFER_PARTITIONS));
     386              : 
     387              :     /* size of the shared replacement strategy control block */
     388         2163 :     size = add_size(size, MAXALIGN(sizeof(BufferStrategyControl)));
     389              : 
     390         2163 :     return size;
     391              : }
     392              : 
     393              : /*
     394              :  * StrategyInitialize -- initialize the buffer cache replacement
     395              :  *      strategy.
     396              :  *
     397              :  * Assumes: All of the buffers are already built into a linked list.
     398              :  *      Only called by postmaster and only during initialization.
     399              :  */
     400              : void
     401         1158 : StrategyInitialize(bool init)
     402              : {
     403              :     bool        found;
     404              : 
     405              :     /*
     406              :      * Initialize the shared buffer lookup hashtable.
     407              :      *
     408              :      * Since we can't tolerate running out of lookup table entries, we must be
     409              :      * sure to specify an adequate table size here.  The maximum steady-state
     410              :      * usage is of course NBuffers entries, but BufferAlloc() tries to insert
     411              :      * a new entry before deleting the old.  In principle this could be
     412              :      * happening in each partition concurrently, so we could need as many as
     413              :      * NBuffers + NUM_BUFFER_PARTITIONS entries.
     414              :      */
     415         1158 :     InitBufTable(NBuffers + NUM_BUFFER_PARTITIONS);
     416              : 
     417              :     /*
     418              :      * Get or create the shared strategy control block
     419              :      */
     420         1158 :     StrategyControl = (BufferStrategyControl *)
     421         1158 :         ShmemInitStruct("Buffer Strategy Status",
     422              :                         sizeof(BufferStrategyControl),
     423              :                         &found);
     424              : 
     425         1158 :     if (!found)
     426              :     {
     427              :         /*
     428              :          * Only done once, usually in postmaster
     429              :          */
     430              :         Assert(init);
     431              : 
     432         1158 :         SpinLockInit(&StrategyControl->buffer_strategy_lock);
     433              : 
     434              :         /* Initialize the clock-sweep pointer */
     435         1158 :         pg_atomic_init_u32(&StrategyControl->nextVictimBuffer, 0);
     436              : 
     437              :         /* Clear statistics */
     438         1158 :         StrategyControl->completePasses = 0;
     439         1158 :         pg_atomic_init_u32(&StrategyControl->numBufferAllocs, 0);
     440              : 
     441              :         /* No pending notification */
     442         1158 :         StrategyControl->bgwprocno = -1;
     443              :     }
     444              :     else
     445              :         Assert(!init);
     446         1158 : }
     447              : 
     448              : 
     449              : /* ----------------------------------------------------------------
     450              :  *              Backend-private buffer ring management
     451              :  * ----------------------------------------------------------------
     452              :  */
     453              : 
     454              : 
     455              : /*
     456              :  * GetAccessStrategy -- create a BufferAccessStrategy object
     457              :  *
     458              :  * The object is allocated in the current memory context.
     459              :  */
     460              : BufferAccessStrategy
     461       149355 : GetAccessStrategy(BufferAccessStrategyType btype)
     462              : {
     463              :     int         ring_size_kb;
     464              : 
     465              :     /*
     466              :      * Select ring size to use.  See buffer/README for rationales.
     467              :      *
     468              :      * Note: if you change the ring size for BAS_BULKREAD, see also
     469              :      * SYNC_SCAN_REPORT_INTERVAL in access/heap/syncscan.c.
     470              :      */
     471       149355 :     switch (btype)
     472              :     {
     473            0 :         case BAS_NORMAL:
     474              :             /* if someone asks for NORMAL, just give 'em a "default" object */
     475            0 :             return NULL;
     476              : 
     477        82842 :         case BAS_BULKREAD:
     478              :             {
     479              :                 int         ring_max_kb;
     480              : 
     481              :                 /*
     482              :                  * The ring always needs to be large enough to allow some
     483              :                  * separation in time between providing a buffer to the user
     484              :                  * of the strategy and that buffer being reused. Otherwise the
     485              :                  * user's pin will prevent reuse of the buffer, even without
     486              :                  * concurrent activity.
     487              :                  *
     488              :                  * We also need to ensure the ring always is large enough for
     489              :                  * SYNC_SCAN_REPORT_INTERVAL, as noted above.
     490              :                  *
     491              :                  * Thus we start out at a minimal size and increase the size
     492              :                  * further if appropriate.
     493              :                  */
     494        82842 :                 ring_size_kb = 256;
     495              : 
     496              :                 /*
     497              :                  * There's no point in a larger ring if we won't be allowed to
     498              :                  * pin sufficiently many buffers.  But we never limit to less
     499              :                  * than the minimal size above.
     500              :                  */
     501        82842 :                 ring_max_kb = GetPinLimit() * (BLCKSZ / 1024);
     502        82842 :                 ring_max_kb = Max(ring_size_kb, ring_max_kb);
     503              : 
     504              :                 /*
     505              :                  * We would like the ring to additionally have space for the
     506              :                  * configured degree of IO concurrency. While being read in,
     507              :                  * buffers can obviously not yet be reused.
     508              :                  *
     509              :                  * Each IO can be up to io_combine_limit blocks large, and we
     510              :                  * want to start up to effective_io_concurrency IOs.
     511              :                  *
     512              :                  * Note that effective_io_concurrency may be 0, which disables
     513              :                  * AIO.
     514              :                  */
     515        82842 :                 ring_size_kb += (BLCKSZ / 1024) *
     516        82842 :                     io_combine_limit * effective_io_concurrency;
     517              : 
     518        82842 :                 if (ring_size_kb > ring_max_kb)
     519        82842 :                     ring_size_kb = ring_max_kb;
     520        82842 :                 break;
     521              :             }
     522        66513 :         case BAS_BULKWRITE:
     523        66513 :             ring_size_kb = 16 * 1024;
     524        66513 :             break;
     525            0 :         case BAS_VACUUM:
     526            0 :             ring_size_kb = 2048;
     527            0 :             break;
     528              : 
     529            0 :         default:
     530            0 :             elog(ERROR, "unrecognized buffer access strategy: %d",
     531              :                  (int) btype);
     532              :             return NULL;        /* keep compiler quiet */
     533              :     }
     534              : 
     535       149355 :     return GetAccessStrategyWithSize(btype, ring_size_kb);
     536              : }
     537              : 
     538              : /*
     539              :  * GetAccessStrategyWithSize -- create a BufferAccessStrategy object with a
     540              :  *      number of buffers equivalent to the passed in size.
     541              :  *
     542              :  * If the given ring size is 0, no BufferAccessStrategy will be created and
     543              :  * the function will return NULL.  ring_size_kb must not be negative.
     544              :  */
     545              : BufferAccessStrategy
     546       158444 : GetAccessStrategyWithSize(BufferAccessStrategyType btype, int ring_size_kb)
     547              : {
     548              :     int         ring_buffers;
     549              :     BufferAccessStrategy strategy;
     550              : 
     551              :     Assert(ring_size_kb >= 0);
     552              : 
     553              :     /* Figure out how many buffers ring_size_kb is */
     554       158444 :     ring_buffers = ring_size_kb / (BLCKSZ / 1024);
     555              : 
     556              :     /* 0 means unlimited, so no BufferAccessStrategy required */
     557       158444 :     if (ring_buffers == 0)
     558            6 :         return NULL;
     559              : 
     560              :     /* Cap to 1/8th of shared_buffers */
     561       158438 :     ring_buffers = Min(NBuffers / 8, ring_buffers);
     562              : 
     563              :     /* NBuffers should never be less than 16, so this shouldn't happen */
     564              :     Assert(ring_buffers > 0);
     565              : 
     566              :     /* Allocate the object and initialize all elements to zeroes */
     567              :     strategy = (BufferAccessStrategy)
     568       158438 :         palloc0(offsetof(BufferAccessStrategyData, buffers) +
     569              :                 ring_buffers * sizeof(Buffer));
     570              : 
     571              :     /* Set fields that don't start out zero */
     572       158438 :     strategy->btype = btype;
     573       158438 :     strategy->nbuffers = ring_buffers;
     574              : 
     575       158438 :     return strategy;
     576              : }
     577              : 
     578              : /*
     579              :  * GetAccessStrategyBufferCount -- an accessor for the number of buffers in
     580              :  *      the ring
     581              :  *
     582              :  * Returns 0 on NULL input to match behavior of GetAccessStrategyWithSize()
     583              :  * returning NULL with 0 size.
     584              :  */
     585              : int
     586           17 : GetAccessStrategyBufferCount(BufferAccessStrategy strategy)
     587              : {
     588           17 :     if (strategy == NULL)
     589            0 :         return 0;
     590              : 
     591           17 :     return strategy->nbuffers;
     592              : }
     593              : 
     594              : /*
     595              :  * GetAccessStrategyPinLimit -- get the cap on the number of buffers to pin at once
     596              :  *
     597              :  * When pinning extra buffers to look ahead, users of a ring-based strategy are
     598              :  * in danger of pinning too much of the ring at once while performing look-ahead.
     599              :  * For some strategies, that means "escaping" from the ring, and in others it
     600              :  * means forcing dirty data to disk very frequently with associated WAL
     601              :  * flushing.  Since external code has no insight into any of that, allow
     602              :  * individual strategy types to expose a clamp that should be applied when
     603              :  * deciding on a maximum number of buffers to pin at once.
     604              :  *
     605              :  * Callers should combine this number with other relevant limits and take the
     606              :  * minimum.
     607              :  */
     608              : int
     609       600908 : GetAccessStrategyPinLimit(BufferAccessStrategy strategy)
     610              : {
     611       600908 :     if (strategy == NULL)
     612       414196 :         return NBuffers;
     613              : 
     614       186712 :     switch (strategy->btype)
     615              :     {
     616        78362 :         case BAS_BULKREAD:
     617              : 
     618              :             /*
     619              :              * Since BAS_BULKREAD uses StrategyRejectBuffer(), dirty buffers
     620              :              * shouldn't be a problem and the caller is free to pin up to the
     621              :              * entire ring at once.
     622              :              */
     623        78362 :             return strategy->nbuffers;
     624              : 
     625       108350 :         default:
     626              : 
     627              :             /*
     628              :              * Tell caller not to pin more than half the buffers in the ring.
     629              :              * This is a trade-off between look ahead distance and deferring
     630              :              * writeback and associated WAL traffic.
     631              :              */
     632       108350 :             return strategy->nbuffers / 2;
     633              :     }
     634              : }
     635              : 
     636              : /*
     637              :  * FreeAccessStrategy -- release a BufferAccessStrategy object
     638              :  *
     639              :  * A simple pfree would do at the moment, but we would prefer that callers
     640              :  * don't assume that much about the representation of BufferAccessStrategy.
     641              :  */
     642              : void
     643       143198 : FreeAccessStrategy(BufferAccessStrategy strategy)
     644              : {
     645              :     /* don't crash if called on a "default" strategy */
     646       143198 :     if (strategy != NULL)
     647       143198 :         pfree(strategy);
     648       143198 : }
     649              : 
     650              : /*
     651              :  * GetBufferFromRing -- returns a buffer from the ring, or NULL if the
     652              :  *      ring is empty / not usable.
     653              :  *
     654              :  * The buffer is pinned and marked as owned, using TrackNewBufferPin(), before
     655              :  * returning.
     656              :  */
     657              : static BufferDesc *
     658       836518 : GetBufferFromRing(BufferAccessStrategy strategy, uint64 *buf_state)
     659              : {
     660              :     BufferDesc *buf;
     661              :     Buffer      bufnum;
     662              :     uint64      old_buf_state;
     663              :     uint64      local_buf_state;    /* to avoid repeated (de-)referencing */
     664              : 
     665              : 
     666              :     /* Advance to next ring slot */
     667       836518 :     if (++strategy->current >= strategy->nbuffers)
     668        24573 :         strategy->current = 0;
     669              : 
     670              :     /*
     671              :      * If the slot hasn't been filled yet, tell the caller to allocate a new
     672              :      * buffer with the normal allocation strategy.  The caller will then fill
     673              :      * this slot by calling AddBufferToRing with the new buffer.
     674              :      */
     675       836518 :     bufnum = strategy->buffers[strategy->current];
     676       836518 :     if (bufnum == InvalidBuffer)
     677       502593 :         return NULL;
     678              : 
     679       333925 :     buf = GetBufferDescriptor(bufnum - 1);
     680              : 
     681              :     /*
     682              :      * Check whether the buffer can be used and pin it if so. Do this using a
     683              :      * CAS loop, to avoid having to lock the buffer header.
     684              :      */
     685       333925 :     old_buf_state = pg_atomic_read_u64(&buf->state);
     686              :     for (;;)
     687              :     {
     688       333927 :         local_buf_state = old_buf_state;
     689              : 
     690              :         /*
     691              :          * If the buffer is pinned we cannot use it under any circumstances.
     692              :          *
     693              :          * If usage_count is 0 or 1 then the buffer is fair game (we expect 1,
     694              :          * since our own previous usage of the ring element would have left it
     695              :          * there, but it might've been decremented by clock-sweep since then).
     696              :          * A higher usage_count indicates someone else has touched the buffer,
     697              :          * so we shouldn't re-use it.
     698              :          */
     699       333927 :         if (BUF_STATE_GET_REFCOUNT(local_buf_state) != 0
     700       329445 :             || BUF_STATE_GET_USAGECOUNT(local_buf_state) > 1)
     701              :             break;
     702              : 
     703              :         /* See equivalent code in PinBuffer() */
     704       324533 :         if (unlikely(local_buf_state & BM_LOCKED))
     705              :         {
     706            0 :             old_buf_state = WaitBufHdrUnlocked(buf);
     707            0 :             continue;
     708              :         }
     709              : 
     710              :         /* pin the buffer if the CAS succeeds */
     711       324533 :         local_buf_state += BUF_REFCOUNT_ONE;
     712              : 
     713       324533 :         if (pg_atomic_compare_exchange_u64(&buf->state, &old_buf_state,
     714              :                                            local_buf_state))
     715              :         {
     716       324531 :             *buf_state = local_buf_state;
     717              : 
     718       324531 :             TrackNewBufferPin(BufferDescriptorGetBuffer(buf));
     719       324531 :             return buf;
     720              :         }
     721              :     }
     722              : 
     723              :     /*
     724              :      * Tell the caller to allocate a new buffer with the normal allocation
     725              :      * strategy.  It will then replace this ring element via AddBufferToRing.
     726              :      */
     727         9394 :     return NULL;
     728              : }
     729              : 
     730              : /*
     731              :  * AddBufferToRing -- add a buffer to the buffer ring
     732              :  *
     733              :  * Caller must hold the buffer header spinlock on the buffer.  Since this
     734              :  * is called with the spinlock held, it had better be quite cheap.
     735              :  */
     736              : static void
     737       511987 : AddBufferToRing(BufferAccessStrategy strategy, BufferDesc *buf)
     738              : {
     739       511987 :     strategy->buffers[strategy->current] = BufferDescriptorGetBuffer(buf);
     740       511987 : }
     741              : 
     742              : /*
     743              :  * Utility function returning the IOContext of a given BufferAccessStrategy's
     744              :  * strategy ring.
     745              :  */
     746              : IOContext
     747     66052903 : IOContextForStrategy(BufferAccessStrategy strategy)
     748              : {
     749     66052903 :     if (!strategy)
     750     63442505 :         return IOCONTEXT_NORMAL;
     751              : 
     752      2610398 :     switch (strategy->btype)
     753              :     {
     754              :         case BAS_NORMAL:
     755              : 
     756              :             /*
     757              :              * Currently, GetAccessStrategy() returns NULL for
     758              :              * BufferAccessStrategyType BAS_NORMAL, so this case is
     759              :              * unreachable.
     760              :              */
     761              :             pg_unreachable();
     762              :             return IOCONTEXT_NORMAL;
     763      1675719 :         case BAS_BULKREAD:
     764      1675719 :             return IOCONTEXT_BULKREAD;
     765       288134 :         case BAS_BULKWRITE:
     766       288134 :             return IOCONTEXT_BULKWRITE;
     767       646545 :         case BAS_VACUUM:
     768       646545 :             return IOCONTEXT_VACUUM;
     769              :     }
     770              : 
     771            0 :     elog(ERROR, "unrecognized BufferAccessStrategyType: %d", strategy->btype);
     772              :     pg_unreachable();
     773              : }
     774              : 
     775              : /*
     776              :  * StrategyRejectBuffer -- consider rejecting a dirty buffer
     777              :  *
     778              :  * When a nondefault strategy is used, the buffer manager calls this function
     779              :  * when it turns out that the buffer selected by StrategyGetBuffer needs to
     780              :  * be written out and doing so would require flushing WAL too.  This gives us
     781              :  * a chance to choose a different victim.
     782              :  *
     783              :  * Returns true if buffer manager should ask for a new victim, and false
     784              :  * if this buffer should be written and re-used.
     785              :  */
     786              : bool
     787         7972 : StrategyRejectBuffer(BufferAccessStrategy strategy, BufferDesc *buf, bool from_ring)
     788              : {
     789              :     /* We only do this in bulkread mode */
     790         7972 :     if (strategy->btype != BAS_BULKREAD)
     791         2269 :         return false;
     792              : 
     793              :     /* Don't muck with behavior of normal buffer-replacement strategy */
     794        11406 :     if (!from_ring ||
     795         5703 :         strategy->buffers[strategy->current] != BufferDescriptorGetBuffer(buf))
     796            0 :         return false;
     797              : 
     798              :     /*
     799              :      * Remove the dirty buffer from the ring; necessary to prevent infinite
     800              :      * loop if all ring members are dirty.
     801              :      */
     802         5703 :     strategy->buffers[strategy->current] = InvalidBuffer;
     803              : 
     804         5703 :     return true;
     805              : }
        

Generated by: LCOV version 2.0-1