Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * freelist.c
4 : * routines for managing the buffer pool's replacement strategy.
5 : *
6 : *
7 : * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
8 : * Portions Copyright (c) 1994, Regents of the University of California
9 : *
10 : *
11 : * IDENTIFICATION
12 : * src/backend/storage/buffer/freelist.c
13 : *
14 : *-------------------------------------------------------------------------
15 : */
16 : #include "postgres.h"
17 :
18 : #include "pgstat.h"
19 : #include "port/atomics.h"
20 : #include "storage/buf_internals.h"
21 : #include "storage/bufmgr.h"
22 : #include "storage/proc.h"
23 : #include "storage/shmem.h"
24 : #include "storage/subsystems.h"
25 :
26 : #define INT_ACCESS_ONCE(var) ((int)(*((volatile int *)&(var))))
27 :
28 :
29 : /*
30 : * The shared freelist control information.
31 : */
32 : typedef struct
33 : {
34 : /* Spinlock: protects the values below */
35 : slock_t buffer_strategy_lock;
36 :
37 : /*
38 : * clock-sweep hand: index of next buffer to consider grabbing. Note that
39 : * this isn't a concrete buffer - we only ever increase the value. So, to
40 : * get an actual buffer, it needs to be used modulo NBuffers.
41 : */
42 : pg_atomic_uint32 nextVictimBuffer;
43 :
44 : /*
45 : * Statistics. These counters should be wide enough that they can't
46 : * overflow during a single bgwriter cycle.
47 : */
48 : uint32 completePasses; /* Complete cycles of the clock-sweep */
49 : pg_atomic_uint32 numBufferAllocs; /* Buffers allocated since last reset */
50 :
51 : /*
52 : * Bgworker process to be notified upon activity or -1 if none. See
53 : * StrategyNotifyBgWriter.
54 : */
55 : int bgwprocno;
56 : } BufferStrategyControl;
57 :
58 : /* Pointers to shared state */
59 : static BufferStrategyControl *StrategyControl = NULL;
60 :
61 : static void StrategyCtlShmemRequest(void *arg);
62 : static void StrategyCtlShmemInit(void *arg);
63 :
64 : const ShmemCallbacks StrategyCtlShmemCallbacks = {
65 : .request_fn = StrategyCtlShmemRequest,
66 : .init_fn = StrategyCtlShmemInit,
67 : };
68 :
69 : /*
70 : * Private (non-shared) state for managing a ring of shared buffers to re-use.
71 : * This is currently the only kind of BufferAccessStrategy object, but someday
72 : * we might have more kinds.
73 : */
74 : typedef struct BufferAccessStrategyData
75 : {
76 : /* Overall strategy type */
77 : BufferAccessStrategyType btype;
78 : /* Number of elements in buffers[] array */
79 : int nbuffers;
80 :
81 : /*
82 : * Index of the "current" slot in the ring, ie, the one most recently
83 : * returned by GetBufferFromRing.
84 : */
85 : int current;
86 :
87 : /*
88 : * Array of buffer numbers. InvalidBuffer (that is, zero) indicates we
89 : * have not yet selected a buffer for this ring slot. For allocation
90 : * simplicity this is palloc'd together with the fixed fields of the
91 : * struct.
92 : */
93 : Buffer buffers[FLEXIBLE_ARRAY_MEMBER];
94 : } BufferAccessStrategyData;
95 :
96 :
97 : /* Prototypes for internal functions */
98 : static BufferDesc *GetBufferFromRing(BufferAccessStrategy strategy,
99 : uint64 *buf_state);
100 : static void AddBufferToRing(BufferAccessStrategy strategy,
101 : BufferDesc *buf);
102 :
/*
 * ClockSweepTick - Helper routine for StrategyGetBuffer()
 *
 * Move the clock hand one buffer ahead of its current position and return the
 * id of the buffer now under the hand.
 *
 * The returned value is already reduced modulo NBuffers, so it can be passed
 * directly to GetBufferDescriptor().
 */
static inline uint32
ClockSweepTick(void)
{
	uint32		victim;

	/*
	 * Atomically move hand ahead one buffer - if there's several processes
	 * doing this, this can lead to buffers being returned slightly out of
	 * apparent order.
	 */
	victim =
		pg_atomic_fetch_add_u32(&StrategyControl->nextVictimBuffer, 1);

	if (victim >= NBuffers)
	{
		uint32		originalVictim = victim;

		/* always wrap what we look up in BufferDescriptors */
		victim = victim % NBuffers;

		/*
		 * If we're the one that just caused a wraparound, force
		 * completePasses to be incremented while holding the spinlock. We
		 * need the spinlock so StrategySyncStart() can return a consistent
		 * value consisting of nextVictimBuffer and completePasses.
		 */
		if (victim == 0)
		{
			uint32		expected;
			uint32		wrapped;
			bool		success = false;

			expected = originalVictim + 1;

			while (!success)
			{
				/*
				 * Acquire the spinlock while increasing completePasses. That
				 * allows other readers to read nextVictimBuffer and
				 * completePasses in a consistent manner which is required for
				 * StrategySyncStart().  In theory delaying the increment
				 * could lead to an overflow of nextVictimBuffers, but that's
				 * highly unlikely and wouldn't be particularly harmful.
				 */
				SpinLockAcquire(&StrategyControl->buffer_strategy_lock);

				wrapped = expected % NBuffers;

				/*
				 * CAS may fail if other backends advanced the hand
				 * concurrently; in that case re-read and retry with the
				 * updated 'expected' value.
				 */
				success = pg_atomic_compare_exchange_u32(&StrategyControl->nextVictimBuffer,
														 &expected, wrapped);
				if (success)
					StrategyControl->completePasses++;
				SpinLockRelease(&StrategyControl->buffer_strategy_lock);
			}
		}
	}
	return victim;
}
167 :
/*
 * StrategyGetBuffer
 *
 *	Called by the bufmgr to get the next candidate buffer to use in
 *	GetVictimBuffer(). The only hard requirement GetVictimBuffer() has is that
 *	the selected buffer must not currently be pinned by anyone.
 *
 *	strategy is a BufferAccessStrategy object, or NULL for default strategy.
 *
 *	It is the callers responsibility to ensure the buffer ownership can be
 *	tracked via TrackNewBufferPin().
 *
 *	The buffer is pinned and marked as owned, using TrackNewBufferPin(),
 *	before returning.
 *
 *	*from_ring is set to true iff the buffer was supplied by the strategy
 *	ring; *buf_state receives the buffer's state word as of the successful
 *	pin.
 */
BufferDesc *
StrategyGetBuffer(BufferAccessStrategy strategy, uint64 *buf_state, bool *from_ring)
{
	BufferDesc *buf;
	int			bgwprocno;
	int			trycounter;

	*from_ring = false;

	/*
	 * If given a strategy object, see whether it can select a buffer. We
	 * assume strategy objects don't need buffer_strategy_lock.
	 */
	if (strategy != NULL)
	{
		buf = GetBufferFromRing(strategy, buf_state);
		if (buf != NULL)
		{
			*from_ring = true;
			return buf;
		}
	}

	/*
	 * If asked, we need to waken the bgwriter. Since we don't want to rely on
	 * a spinlock for this we force a read from shared memory once, and then
	 * set the latch based on that value. We need to go through that length
	 * because otherwise bgwprocno might be reset while/after we check because
	 * the compiler might just reread from memory.
	 *
	 * This can possibly set the latch of the wrong process if the bgwriter
	 * dies in the wrong moment. But since PGPROC->procLatch is never
	 * deallocated the worst consequence of that is that we set the latch of
	 * some arbitrary process.
	 */
	bgwprocno = INT_ACCESS_ONCE(StrategyControl->bgwprocno);
	if (bgwprocno != -1)
	{
		/* reset bgwprocno first, before setting the latch */
		StrategyControl->bgwprocno = -1;

		/*
		 * Not acquiring ProcArrayLock here which is slightly icky. It's
		 * actually fine because procLatch isn't ever freed, so we just can
		 * potentially set the wrong process' (or no process') latch.
		 */
		SetLatch(&GetPGProcByNumber(bgwprocno)->procLatch);
	}

	/*
	 * We count buffer allocation requests so that the bgwriter can estimate
	 * the rate of buffer consumption.  Note that buffers recycled by a
	 * strategy object are intentionally not counted here.
	 */
	pg_atomic_fetch_add_u32(&StrategyControl->numBufferAllocs, 1);

	/* Use the "clock sweep" algorithm to find a free buffer */
	trycounter = NBuffers;
	for (;;)
	{
		uint64		old_buf_state;
		uint64		local_buf_state;

		buf = GetBufferDescriptor(ClockSweepTick());

		/*
		 * Check whether the buffer can be used and pin it if so.  Do this
		 * using a CAS loop, to avoid having to lock the buffer header.
		 */
		old_buf_state = pg_atomic_read_u64(&buf->state);
		for (;;)
		{
			local_buf_state = old_buf_state;

			/*
			 * If the buffer is pinned or has a nonzero usage_count, we cannot
			 * use it; decrement the usage_count (unless pinned) and keep
			 * scanning.
			 */

			if (BUF_STATE_GET_REFCOUNT(local_buf_state) != 0)
			{
				if (--trycounter == 0)
				{
					/*
					 * We've scanned all the buffers without making any state
					 * changes, so all the buffers are pinned (or were when we
					 * looked at them).  We could hope that someone will free
					 * one eventually, but it's probably better to fail than
					 * to risk getting stuck in an infinite loop.
					 */
					elog(ERROR, "no unpinned buffers available");
				}
				/* pinned: move on to the next clock-sweep candidate */
				break;
			}

			/* See equivalent code in PinBuffer() */
			if (unlikely(local_buf_state & BM_LOCKED))
			{
				old_buf_state = WaitBufHdrUnlocked(buf);
				continue;
			}

			if (BUF_STATE_GET_USAGECOUNT(local_buf_state) != 0)
			{
				local_buf_state -= BUF_USAGECOUNT_ONE;

				if (pg_atomic_compare_exchange_u64(&buf->state, &old_buf_state,
												   local_buf_state))
				{
					/* made progress (decremented a usage count): reset limit */
					trycounter = NBuffers;
					break;
				}
				/* CAS failed: old_buf_state was refreshed, loop and retry */
			}
			else
			{
				/* pin the buffer if the CAS succeeds */
				local_buf_state += BUF_REFCOUNT_ONE;

				if (pg_atomic_compare_exchange_u64(&buf->state, &old_buf_state,
												   local_buf_state))
				{
					/* Found a usable buffer */
					if (strategy != NULL)
						AddBufferToRing(strategy, buf);
					*buf_state = local_buf_state;

					TrackNewBufferPin(BufferDescriptorGetBuffer(buf));

					return buf;
				}
				/* CAS failed: old_buf_state was refreshed, loop and retry */
			}
		}
	}
}
318 :
319 : /*
320 : * StrategySyncStart -- tell BgBufferSync where to start syncing
321 : *
322 : * The result is the buffer index of the best buffer to sync first.
323 : * BgBufferSync() will proceed circularly around the buffer array from there.
324 : *
325 : * In addition, we return the completed-pass count (which is effectively
326 : * the higher-order bits of nextVictimBuffer) and the count of recent buffer
327 : * allocs if non-NULL pointers are passed. The alloc count is reset after
328 : * being read.
329 : */
330 : int
331 14504 : StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc)
332 : {
333 : uint32 nextVictimBuffer;
334 : int result;
335 :
336 14504 : SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
337 14504 : nextVictimBuffer = pg_atomic_read_u32(&StrategyControl->nextVictimBuffer);
338 14504 : result = nextVictimBuffer % NBuffers;
339 :
340 14504 : if (complete_passes)
341 : {
342 14504 : *complete_passes = StrategyControl->completePasses;
343 :
344 : /*
345 : * Additionally add the number of wraparounds that happened before
346 : * completePasses could be incremented. C.f. ClockSweepTick().
347 : */
348 14504 : *complete_passes += nextVictimBuffer / NBuffers;
349 : }
350 :
351 14504 : if (num_buf_alloc)
352 : {
353 14504 : *num_buf_alloc = pg_atomic_exchange_u32(&StrategyControl->numBufferAllocs, 0);
354 : }
355 14504 : SpinLockRelease(&StrategyControl->buffer_strategy_lock);
356 14504 : return result;
357 : }
358 :
/*
 * StrategyNotifyBgWriter -- set or clear allocation notification latch
 *
 * If bgwprocno isn't -1, the next invocation of StrategyGetBuffer will
 * set that latch.  Pass -1 to clear the pending notification before it
 * happens.  This feature is used by the bgwriter process to wake itself up
 * from hibernation, and is not meant for anybody else to use.
 */
void
StrategyNotifyBgWriter(int bgwprocno)
{
	/*
	 * We acquire buffer_strategy_lock just to ensure that the store appears
	 * atomic to StrategyGetBuffer.  The bgwriter should call this rather
	 * infrequently, so there's no performance penalty from being safe.
	 */
	SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
	StrategyControl->bgwprocno = bgwprocno;
	SpinLockRelease(&StrategyControl->buffer_strategy_lock);
}
379 :
380 :
/*
 * StrategyCtlShmemRequest -- request shared memory for the buffer
 *		cache replacement strategy.
 *
 * Reserves a single BufferStrategyControl in shared memory and arranges for
 * StrategyControl to point at it once shmem is created.
 */
static void
StrategyCtlShmemRequest(void *arg)
{
	ShmemRequestStruct(.name = "Buffer Strategy Status",
					   .size = sizeof(BufferStrategyControl),
					   .ptr = (void **) &StrategyControl
		);
}
393 :
/*
 * StrategyCtlShmemInit -- initialize the buffer cache replacement strategy.
 *
 * Runs once at shmem creation; StrategyControl already points at the
 * structure reserved by StrategyCtlShmemRequest().
 */
static void
StrategyCtlShmemInit(void *arg)
{
	SpinLockInit(&StrategyControl->buffer_strategy_lock);

	/* Initialize the clock-sweep pointer */
	pg_atomic_init_u32(&StrategyControl->nextVictimBuffer, 0);

	/* Clear statistics */
	StrategyControl->completePasses = 0;
	pg_atomic_init_u32(&StrategyControl->numBufferAllocs, 0);

	/* No pending notification */
	StrategyControl->bgwprocno = -1;
}
412 :
413 :
414 : /* ----------------------------------------------------------------
415 : * Backend-private buffer ring management
416 : * ----------------------------------------------------------------
417 : */
418 :
/*
 * GetAccessStrategy -- create a BufferAccessStrategy object
 *
 * The object is allocated in the current memory context.
 *
 * Returns NULL for BAS_NORMAL (meaning "use the default strategy").
 */
BufferAccessStrategy
GetAccessStrategy(BufferAccessStrategyType btype)
{
	int			ring_size_kb;

	/*
	 * Select ring size to use.  See buffer/README for rationales.
	 *
	 * Note: if you change the ring size for BAS_BULKREAD, see also
	 * SYNC_SCAN_REPORT_INTERVAL in access/heap/syncscan.c.
	 */
	switch (btype)
	{
		case BAS_NORMAL:
			/* if someone asks for NORMAL, just give 'em a "default" object */
			return NULL;

		case BAS_BULKREAD:
			{
				int			ring_max_kb;

				/*
				 * The ring always needs to be large enough to allow some
				 * separation in time between providing a buffer to the user
				 * of the strategy and that buffer being reused. Otherwise the
				 * user's pin will prevent reuse of the buffer, even without
				 * concurrent activity.
				 *
				 * We also need to ensure the ring always is large enough for
				 * SYNC_SCAN_REPORT_INTERVAL, as noted above.
				 *
				 * Thus we start out a minimal size and increase the size
				 * further if appropriate.
				 */
				ring_size_kb = 256;

				/*
				 * There's no point in a larger ring if we won't be allowed to
				 * pin sufficiently many buffers.  But we never limit to less
				 * than the minimal size above.
				 */
				ring_max_kb = GetPinLimit() * (BLCKSZ / 1024);
				ring_max_kb = Max(ring_size_kb, ring_max_kb);

				/*
				 * We would like the ring to additionally have space for the
				 * configured degree of IO concurrency. While being read in,
				 * buffers can obviously not yet be reused.
				 *
				 * Each IO can be up to io_combine_limit blocks large, and we
				 * want to start up to effective_io_concurrency IOs.
				 *
				 * Note that effective_io_concurrency may be 0, which disables
				 * AIO.
				 */
				ring_size_kb += (BLCKSZ / 1024) *
					io_combine_limit * effective_io_concurrency;

				/* finally clamp to the pin-limit-derived maximum */
				if (ring_size_kb > ring_max_kb)
					ring_size_kb = ring_max_kb;
				break;
			}
		case BAS_BULKWRITE:
			ring_size_kb = 16 * 1024;
			break;
		case BAS_VACUUM:
			ring_size_kb = 2048;
			break;

		default:
			elog(ERROR, "unrecognized buffer access strategy: %d",
				 (int) btype);
			return NULL;		/* keep compiler quiet */
	}

	return GetAccessStrategyWithSize(btype, ring_size_kb);
}
502 :
503 : /*
504 : * GetAccessStrategyWithSize -- create a BufferAccessStrategy object with a
505 : * number of buffers equivalent to the passed in size.
506 : *
507 : * If the given ring size is 0, no BufferAccessStrategy will be created and
508 : * the function will return NULL. ring_size_kb must not be negative.
509 : */
510 : BufferAccessStrategy
511 177166 : GetAccessStrategyWithSize(BufferAccessStrategyType btype, int ring_size_kb)
512 : {
513 : int ring_buffers;
514 : BufferAccessStrategy strategy;
515 :
516 : Assert(ring_size_kb >= 0);
517 :
518 : /* Figure out how many buffers ring_size_kb is */
519 177166 : ring_buffers = ring_size_kb / (BLCKSZ / 1024);
520 :
521 : /* 0 means unlimited, so no BufferAccessStrategy required */
522 177166 : if (ring_buffers == 0)
523 8 : return NULL;
524 :
525 : /* Cap to 1/8th of shared_buffers */
526 177158 : ring_buffers = Min(NBuffers / 8, ring_buffers);
527 :
528 : /* NBuffers should never be less than 16, so this shouldn't happen */
529 : Assert(ring_buffers > 0);
530 :
531 : /* Allocate the object and initialize all elements to zeroes */
532 : strategy = (BufferAccessStrategy)
533 177158 : palloc0(offsetof(BufferAccessStrategyData, buffers) +
534 : ring_buffers * sizeof(Buffer));
535 :
536 : /* Set fields that don't start out zero */
537 177158 : strategy->btype = btype;
538 177158 : strategy->nbuffers = ring_buffers;
539 :
540 177158 : return strategy;
541 : }
542 :
543 : /*
544 : * GetAccessStrategyBufferCount -- an accessor for the number of buffers in
545 : * the ring
546 : *
547 : * Returns 0 on NULL input to match behavior of GetAccessStrategyWithSize()
548 : * returning NULL with 0 size.
549 : */
550 : int
551 27 : GetAccessStrategyBufferCount(BufferAccessStrategy strategy)
552 : {
553 27 : if (strategy == NULL)
554 0 : return 0;
555 :
556 27 : return strategy->nbuffers;
557 : }
558 :
559 : /*
560 : * GetAccessStrategyPinLimit -- get cap of number of buffers that should be pinned
561 : *
562 : * When pinning extra buffers to look ahead, users of a ring-based strategy are
563 : * in danger of pinning too much of the ring at once while performing look-ahead.
564 : * For some strategies, that means "escaping" from the ring, and in others it
565 : * means forcing dirty data to disk very frequently with associated WAL
566 : * flushing. Since external code has no insight into any of that, allow
567 : * individual strategy types to expose a clamp that should be applied when
568 : * deciding on a maximum number of buffers to pin at once.
569 : *
570 : * Callers should combine this number with other relevant limits and take the
571 : * minimum.
572 : */
573 : int
574 684233 : GetAccessStrategyPinLimit(BufferAccessStrategy strategy)
575 : {
576 684233 : if (strategy == NULL)
577 490926 : return NBuffers;
578 :
579 193307 : switch (strategy->btype)
580 : {
581 87194 : case BAS_BULKREAD:
582 :
583 : /*
584 : * Since BAS_BULKREAD uses StrategyRejectBuffer(), dirty buffers
585 : * shouldn't be a problem and the caller is free to pin up to the
586 : * entire ring at once.
587 : */
588 87194 : return strategy->nbuffers;
589 :
590 106113 : default:
591 :
592 : /*
593 : * Tell caller not to pin more than half the buffers in the ring.
594 : * This is a trade-off between look ahead distance and deferring
595 : * writeback and associated WAL traffic.
596 : */
597 106113 : return strategy->nbuffers / 2;
598 : }
599 : }
600 :
/*
 * FreeAccessStrategy -- release a BufferAccessStrategy object
 *
 * A simple pfree would do at the moment, but we would prefer that callers
 * don't assume that much about the representation of BufferAccessStrategy.
 */
void
FreeAccessStrategy(BufferAccessStrategy strategy)
{
	/* don't crash if called on a "default" (NULL) strategy */
	if (strategy != NULL)
		pfree(strategy);
}
614 :
/*
 * GetBufferFromRing -- returns a buffer from the ring, or NULL if the
 *		ring is empty / not usable.
 *
 * The buffer is pinned and marked as owned, using TrackNewBufferPin(), before
 * returning.  On success *buf_state receives the buffer's state word as of
 * the pin.
 */
static BufferDesc *
GetBufferFromRing(BufferAccessStrategy strategy, uint64 *buf_state)
{
	BufferDesc *buf;
	Buffer		bufnum;
	uint64		old_buf_state;
	uint64		local_buf_state;	/* to avoid repeated (de-)referencing */


	/* Advance to next ring slot */
	if (++strategy->current >= strategy->nbuffers)
		strategy->current = 0;

	/*
	 * If the slot hasn't been filled yet, tell the caller to allocate a new
	 * buffer with the normal allocation strategy.  He will then fill this
	 * slot by calling AddBufferToRing with the new buffer.
	 */
	bufnum = strategy->buffers[strategy->current];
	if (bufnum == InvalidBuffer)
		return NULL;

	/* Buffer numbers are 1-based; descriptor array is 0-based */
	buf = GetBufferDescriptor(bufnum - 1);

	/*
	 * Check whether the buffer can be used and pin it if so.  Do this using a
	 * CAS loop, to avoid having to lock the buffer header.
	 */
	old_buf_state = pg_atomic_read_u64(&buf->state);
	for (;;)
	{
		local_buf_state = old_buf_state;

		/*
		 * If the buffer is pinned we cannot use it under any circumstances.
		 *
		 * If usage_count is 0 or 1 then the buffer is fair game (we expect 1,
		 * since our own previous usage of the ring element would have left it
		 * there, but it might've been decremented by clock-sweep since then).
		 * A higher usage_count indicates someone else has touched the buffer,
		 * so we shouldn't re-use it.
		 */
		if (BUF_STATE_GET_REFCOUNT(local_buf_state) != 0
			|| BUF_STATE_GET_USAGECOUNT(local_buf_state) > 1)
			break;

		/* See equivalent code in PinBuffer() */
		if (unlikely(local_buf_state & BM_LOCKED))
		{
			old_buf_state = WaitBufHdrUnlocked(buf);
			continue;
		}

		/* pin the buffer if the CAS succeeds */
		local_buf_state += BUF_REFCOUNT_ONE;

		if (pg_atomic_compare_exchange_u64(&buf->state, &old_buf_state,
										   local_buf_state))
		{
			*buf_state = local_buf_state;

			TrackNewBufferPin(BufferDescriptorGetBuffer(buf));
			return buf;
		}
		/* CAS failed: old_buf_state was refreshed, loop and retry */
	}

	/*
	 * Tell caller to allocate a new buffer with the normal allocation
	 * strategy.  He'll then replace this ring element via AddBufferToRing.
	 */
	return NULL;
}
694 :
/*
 * AddBufferToRing -- add a buffer to the buffer ring
 *
 * Caller must hold the buffer header spinlock on the buffer.  Since this
 * is called with the spinlock held, it had better be quite cheap.
 *
 * NOTE(review): in the code path visible here (StrategyGetBuffer), this is
 * invoked right after a successful CAS pin, with no buffer header spinlock
 * held -- the comment above may be stale; confirm against other callers.
 */
static void
AddBufferToRing(BufferAccessStrategy strategy, BufferDesc *buf)
{
	/* overwrite the current ring slot with this buffer's 1-based number */
	strategy->buffers[strategy->current] = BufferDescriptorGetBuffer(buf);
}
706 :
/*
 * Utility function returning the IOContext of a given BufferAccessStrategy's
 * strategy ring.
 *
 * A NULL strategy maps to IOCONTEXT_NORMAL.
 */
IOContext
IOContextForStrategy(BufferAccessStrategy strategy)
{
	if (!strategy)
		return IOCONTEXT_NORMAL;

	switch (strategy->btype)
	{
		case BAS_NORMAL:

			/*
			 * Currently, GetAccessStrategy() returns NULL for
			 * BufferAccessStrategyType BAS_NORMAL, so this case is
			 * unreachable.
			 */
			pg_unreachable();
			return IOCONTEXT_NORMAL;
		case BAS_BULKREAD:
			return IOCONTEXT_BULKREAD;
		case BAS_BULKWRITE:
			return IOCONTEXT_BULKWRITE;
		case BAS_VACUUM:
			return IOCONTEXT_VACUUM;
	}

	/* not reachable for any valid enum value; keeps the compiler quiet */
	elog(ERROR, "unrecognized BufferAccessStrategyType: %d", strategy->btype);
	pg_unreachable();
}
739 :
740 : /*
741 : * StrategyRejectBuffer -- consider rejecting a dirty buffer
742 : *
743 : * When a nondefault strategy is used, the buffer manager calls this function
744 : * when it turns out that the buffer selected by StrategyGetBuffer needs to
745 : * be written out and doing so would require flushing WAL too. This gives us
746 : * a chance to choose a different victim.
747 : *
748 : * Returns true if buffer manager should ask for a new victim, and false
749 : * if this buffer should be written and re-used.
750 : */
751 : bool
752 26704 : StrategyRejectBuffer(BufferAccessStrategy strategy, BufferDesc *buf, bool from_ring)
753 : {
754 : /* We only do this in bulkread mode */
755 26704 : if (strategy->btype != BAS_BULKREAD)
756 4319 : return false;
757 :
758 : /* Don't muck with behavior of normal buffer-replacement strategy */
759 44770 : if (!from_ring ||
760 22385 : strategy->buffers[strategy->current] != BufferDescriptorGetBuffer(buf))
761 0 : return false;
762 :
763 : /*
764 : * Remove the dirty buffer from the ring; necessary to prevent infinite
765 : * loop if all ring members are dirty.
766 : */
767 22385 : strategy->buffers[strategy->current] = InvalidBuffer;
768 :
769 22385 : return true;
770 : }
|