Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * freelist.c
4 : * routines for managing the buffer pool's replacement strategy.
5 : *
6 : *
7 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
8 : * Portions Copyright (c) 1994, Regents of the University of California
9 : *
10 : *
11 : * IDENTIFICATION
12 : * src/backend/storage/buffer/freelist.c
13 : *
14 : *-------------------------------------------------------------------------
15 : */
16 : #include "postgres.h"
17 :
18 : #include "pgstat.h"
19 : #include "port/atomics.h"
20 : #include "storage/buf_internals.h"
21 : #include "storage/bufmgr.h"
22 : #include "storage/proc.h"
23 :
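/*
 * Read an int-sized variable from memory exactly once: the cast through
 * "volatile int *" keeps the compiler from caching or re-reading the value.
 * StrategyGetBuffer() relies on this to take a stable snapshot of
 * StrategyControl->bgwprocno.
 */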
24 : #define INT_ACCESS_ONCE(var) ((int)(*((volatile int *)&(var))))
25 :
26 :
27 : /*
28 : * The shared freelist control information.
29 : */
30 : typedef struct
31 : {
32 : /* Spinlock: protects the values below */
33 : slock_t buffer_strategy_lock;
34 :
35 : /*
36 : * clock-sweep hand: index of next buffer to consider grabbing. Note that
37 : * this isn't a concrete buffer - we only ever increase the value. So, to
38 : * get an actual buffer, it needs to be used modulo NBuffers.
39 : */
40 : pg_atomic_uint32 nextVictimBuffer;
41 :
42 : /*
43 : * Statistics. These counters should be wide enough that they can't
44 : * overflow during a single bgwriter cycle.
45 : */
46 : uint32 completePasses; /* Complete cycles of the clock-sweep */
47 : pg_atomic_uint32 numBufferAllocs; /* Buffers allocated since last reset */
48 :
49 : /*
50 : * Bgworker process to be notified upon activity or -1 if none. See
51 : * StrategyNotifyBgWriter.
52 : */
53 : int bgwprocno;
54 : } BufferStrategyControl;
55 :
56 : /* Pointers to shared state */
57 : static BufferStrategyControl *StrategyControl = NULL;
58 :
59 : /*
60 : * Private (non-shared) state for managing a ring of shared buffers to re-use.
61 : * This is currently the only kind of BufferAccessStrategy object, but someday
62 : * we might have more kinds.
63 : */
64 : typedef struct BufferAccessStrategyData
65 : {
66 : /* Overall strategy type */
67 : BufferAccessStrategyType btype;
68 : /* Number of elements in buffers[] array */
69 : int nbuffers;
70 :
71 : /*
72 : * Index of the "current" slot in the ring, ie, the one most recently
73 : * returned by GetBufferFromRing.
74 : */
75 : int current;
76 :
77 : /*
78 : * Array of buffer numbers. InvalidBuffer (that is, zero) indicates we
79 : * have not yet selected a buffer for this ring slot. For allocation
80 : * simplicity this is palloc'd together with the fixed fields of the
81 : * struct.
82 : */
83 : Buffer buffers[FLEXIBLE_ARRAY_MEMBER];
84 : } BufferAccessStrategyData;
85 :
86 :
87 : /* Prototypes for internal functions */
88 : static BufferDesc *GetBufferFromRing(BufferAccessStrategy strategy,
89 : uint32 *buf_state);
90 : static void AddBufferToRing(BufferAccessStrategy strategy,
91 : BufferDesc *buf);
92 :
93 : /*
94 : * ClockSweepTick - Helper routine for StrategyGetBuffer()
95 : *
96 : * Move the clock hand one buffer ahead of its current position and return the
97 : * id of the buffer now under the hand.
98 : */
99 : static inline uint32
100 9543640 : ClockSweepTick(void)
101 : {
102 : uint32 victim;
103 :
104 : /*
105 : * Atomically move the hand ahead one buffer - if several processes do
106 : * this concurrently, buffers can be returned slightly out of apparent
107 : * order.
108 : */
109 : victim =
110 9543640 : pg_atomic_fetch_add_u32(&StrategyControl->nextVictimBuffer, 1);
111 :
112 9543640 : if (victim >= NBuffers)
113 : {
114 65392 : uint32 originalVictim = victim;
115 :
116 : /* always wrap what we look up in BufferDescriptors */
117 65392 : victim = victim % NBuffers;
118 :
119 : /*
120 : * If we're the one that just caused a wraparound, force
121 : * completePasses to be incremented while holding the spinlock. We
122 : * need the spinlock so StrategySyncStart() can return a consistent
123 : * value consisting of nextVictimBuffer and completePasses.
124 : */
125 65392 : if (victim == 0)
126 : {
127 : uint32 expected;
128 : uint32 wrapped;
129 65092 : bool success = false;
130 :
131 65092 : expected = originalVictim + 1;
132 :
133 130408 : while (!success)
134 : {
135 : /*
136 : * Acquire the spinlock while increasing completePasses. That
137 : * allows other readers to read nextVictimBuffer and
138 : * completePasses in a consistent manner which is required for
139 : * StrategySyncStart(). In theory delaying the increment
140 : * could lead to an overflow of nextVictimBuffer, but that's
141 : * highly unlikely and wouldn't be particularly harmful.
142 : */
143 65316 : SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
144 :
145 65316 : wrapped = expected % NBuffers;
146 :
147 65316 : success = pg_atomic_compare_exchange_u32(&StrategyControl->nextVictimBuffer,
148 : &expected, wrapped);
149 65316 : if (success)
150 65092 : StrategyControl->completePasses++;
151 65316 : SpinLockRelease(&StrategyControl->buffer_strategy_lock);
152 : }
153 : }
154 : }
155 9543640 : return victim;
156 : }
157 :
158 : /*
159 : * StrategyGetBuffer
160 : *
161 : * Called by the bufmgr to get the next candidate buffer to use in
162 : * GetVictimBuffer(). The only hard requirement GetVictimBuffer() has is that
163 : * the selected buffer must not currently be pinned by anyone.
164 : *
165 : * strategy is a BufferAccessStrategy object, or NULL for default strategy.
166 : *
167 : * It is the caller's responsibility to ensure the buffer ownership can be
168 : * tracked via TrackNewBufferPin().
169 : *
170 : * The buffer is pinned and marked as owned, using TrackNewBufferPin(),
171 : * before returning.
172 : */
173 : BufferDesc *
174 3844722 : StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_ring)
175 : {
176 : BufferDesc *buf;
177 : int bgwprocno;
178 : int trycounter;
179 :
180 3844722 : *from_ring = false;
181 :
182 : /*
183 : * If given a strategy object, see whether it can select a buffer. We
184 : * assume strategy objects don't need buffer_strategy_lock.
185 : */
186 3844722 : if (strategy != NULL)
187 : {
188 1607858 : buf = GetBufferFromRing(strategy, buf_state);
189 1607858 : if (buf != NULL)
190 : {
191 610464 : *from_ring = true;
192 610464 : return buf;
193 : }
194 : }
195 :
196 : /*
197 : * If asked, we need to wake the bgwriter. Since we don't want to rely on
198 : * a spinlock for this, we force a single read from shared memory and then
199 : * set the latch based on that value. We need to go to this length
200 : * because otherwise the compiler might reread bgwprocno from memory,
201 : * allowing it to be reset while/after we check it.
202 : *
203 : * This can possibly set the latch of the wrong process if the bgwriter
204 : * dies at the wrong moment. But since PGPROC->procLatch is never
205 : * deallocated, the worst consequence is that we set the latch of some
206 : * arbitrary process.
207 : */
207 : */
208 3234258 : bgwprocno = INT_ACCESS_ONCE(StrategyControl->bgwprocno);
209 3234258 : if (bgwprocno != -1)
210 : {
211 : /* reset bgwprocno first, before setting the latch */
212 1270 : StrategyControl->bgwprocno = -1;
213 :
214 : /*
215 : * Not acquiring ProcArrayLock here is slightly icky, but it's
216 : * actually fine: procLatch isn't ever freed, so at worst we set the
217 : * wrong process's (or no process's) latch.
218 : */
219 1270 : SetLatch(&ProcGlobal->allProcs[bgwprocno].procLatch);
220 : }
221 :
222 : /*
223 : * We count buffer allocation requests so that the bgwriter can estimate
224 : * the rate of buffer consumption. Note that buffers recycled by a
225 : * strategy object are intentionally not counted here.
226 : */
227 3234258 : pg_atomic_fetch_add_u32(&StrategyControl->numBufferAllocs, 1);
228 :
229 : /* Use the "clock sweep" algorithm to find a free buffer */
230 3234258 : trycounter = NBuffers;
231 : for (;;)
232 6309382 : {
233 : uint32 old_buf_state;
234 : uint32 local_buf_state;
235 :
236 9543640 : buf = GetBufferDescriptor(ClockSweepTick());
237 :
238 : /*
239 : * Check whether the buffer can be used and pin it if so. Do this
240 : * using a CAS loop, to avoid having to lock the buffer header.
241 : */
242 9543640 : old_buf_state = pg_atomic_read_u32(&buf->state);
243 : for (;;)
244 : {
245 9543768 : local_buf_state = old_buf_state;
246 :
247 : /*
248 : * If the buffer is pinned or has a nonzero usage_count, we cannot
249 : * use it; decrement the usage_count (unless pinned) and keep
250 : * scanning.
251 : */
252 :
253 9543768 : if (BUF_STATE_GET_REFCOUNT(local_buf_state) != 0)
254 : {
255 182314 : if (--trycounter == 0)
256 : {
257 : /*
258 : * We've scanned all the buffers without making any state
259 : * changes, so all the buffers are pinned (or were when we
260 : * looked at them). We could hope that someone will free
261 : * one eventually, but it's probably better to fail than
262 : * to risk getting stuck in an infinite loop.
263 : */
264 0 : elog(ERROR, "no unpinned buffers available");
265 : }
266 182314 : break;
267 : }
268 :
269 : /* See equivalent code in PinBuffer() */
270 9361454 : if (unlikely(local_buf_state & BM_LOCKED))
271 : {
272 22 : old_buf_state = WaitBufHdrUnlocked(buf);
273 22 : continue;
274 : }
275 :
276 9361432 : if (BUF_STATE_GET_USAGECOUNT(local_buf_state) != 0)
277 : {
278 6127162 : local_buf_state -= BUF_USAGECOUNT_ONE;
279 :
280 6127162 : if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
281 : local_buf_state))
282 : {
283 6127068 : trycounter = NBuffers;
284 6127068 : break;
285 : }
286 : }
287 : else
288 : {
289 : /* pin the buffer if the CAS succeeds */
290 3234270 : local_buf_state += BUF_REFCOUNT_ONE;
291 :
292 3234270 : if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
293 : local_buf_state))
294 : {
295 : /* Found a usable buffer */
296 3234258 : if (strategy != NULL)
297 997394 : AddBufferToRing(strategy, buf);
298 3234258 : *buf_state = local_buf_state;
299 :
300 3234258 : TrackNewBufferPin(BufferDescriptorGetBuffer(buf));
301 :
302 3234258 : return buf;
303 : }
304 : }
305 : }
306 : }
307 : }
308 :
309 : /*
310 : * StrategySyncStart -- tell BgBufferSync where to start syncing
311 : *
312 : * The result is the buffer index of the best buffer to sync first.
313 : * BgBufferSync() will proceed circularly around the buffer array from there.
314 : *
315 : * In addition, we return the completed-pass count (which is effectively
316 : * the higher-order bits of nextVictimBuffer) and the count of recent buffer
317 : * allocs if non-NULL pointers are passed. The alloc count is reset after
318 : * being read.
319 : */
320 : int
321 25688 : StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc)
322 : {
323 : uint32 nextVictimBuffer;
324 : int result;
325 :
326 25688 : SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
327 25688 : nextVictimBuffer = pg_atomic_read_u32(&StrategyControl->nextVictimBuffer);
328 25688 : result = nextVictimBuffer % NBuffers;
329 :
330 25688 : if (complete_passes)
331 : {
332 25688 : *complete_passes = StrategyControl->completePasses;
333 :
334 : /*
335 : * Additionally add the number of wraparounds that happened before
336 : * completePasses could be incremented. C.f. ClockSweepTick().
337 : */
338 25688 : *complete_passes += nextVictimBuffer / NBuffers;
339 : }
340 :
341 25688 : if (num_buf_alloc)
342 : {
343 25688 : *num_buf_alloc = pg_atomic_exchange_u32(&StrategyControl->numBufferAllocs, 0);
344 : }
345 25688 : SpinLockRelease(&StrategyControl->buffer_strategy_lock);
346 25688 : return result;
347 : }
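/*
 * A worked example of the arithmetic above, with illustrative numbers only:
 * assuming NBuffers = 16384, completePasses = 7 and nextVictimBuffer = 16400
 * (i.e. one wraparound whose completePasses increment is still pending in
 * ClockSweepTick()), the returned sync start is 16400 % 16384 = 16 and
 * *complete_passes is reported as 7 + 16400 / 16384 = 8.
 */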
348 :
349 : /*
350 : * StrategyNotifyBgWriter -- set or clear allocation notification latch
351 : *
352 : * If bgwprocno isn't -1, the next invocation of StrategyGetBuffer will
353 : * set that latch. Pass -1 to clear the pending notification before it
354 : * happens. This feature is used by the bgwriter process to wake itself up
355 : * from hibernation, and is not meant for anybody else to use.
356 : */
357 : void
358 2556 : StrategyNotifyBgWriter(int bgwprocno)
359 : {
360 : /*
361 : * We acquire buffer_strategy_lock just to ensure that the store appears
362 : * atomic to StrategyGetBuffer. The bgwriter should call this rather
363 : * infrequently, so there's no performance penalty from being safe.
364 : */
365 2556 : SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
366 2556 : StrategyControl->bgwprocno = bgwprocno;
367 2556 : SpinLockRelease(&StrategyControl->buffer_strategy_lock);
368 2556 : }
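/*
 * A simplified sketch of how the bgwriter's hibernation path uses this; the
 * authoritative code lives in src/backend/postmaster/bgwriter.c and may
 * differ in detail:
 *
 *		StrategyNotifyBgWriter(MyProcNumber);	-- arm the wakeup
 *		(void) WaitLatch(MyLatch,
 *						 WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
 *						 BgWriterDelay * HIBERNATE_FACTOR,
 *						 WAIT_EVENT_BGWRITER_HIBERNATE);
 *		StrategyNotifyBgWriter(-1);				-- cancel if we timed out
 *
 * A backend that allocates a buffer in the meantime sets the latch via the
 * bgwprocno check in StrategyGetBuffer(), ending the hibernation early.
 */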
369 :
370 :
371 : /*
372 : * StrategyShmemSize
373 : *
374 : * estimate the size of shared memory used by the freelist-related structures.
375 : *
376 : * Note: for somewhat historical reasons, the buffer lookup hashtable size
377 : * is also determined here.
378 : */
379 : Size
380 4096 : StrategyShmemSize(void)
381 : {
382 4096 : Size size = 0;
383 :
384 : /* size of lookup hash table ... see comment in StrategyInitialize */
385 4096 : size = add_size(size, BufTableShmemSize(NBuffers + NUM_BUFFER_PARTITIONS));
386 :
387 : /* size of the shared replacement strategy control block */
388 4096 : size = add_size(size, MAXALIGN(sizeof(BufferStrategyControl)));
389 :
390 4096 : return size;
391 : }
392 :
393 : /*
394 : * StrategyInitialize -- initialize the buffer cache replacement
395 : * strategy.
396 : *
397 : * Assumes: All of the buffer headers are already initialized.
398 : * Only called by postmaster and only during initialization.
399 : */
400 : void
401 2198 : StrategyInitialize(bool init)
402 : {
403 : bool found;
404 :
405 : /*
406 : * Initialize the shared buffer lookup hashtable.
407 : *
408 : * Since we can't tolerate running out of lookup table entries, we must be
409 : * sure to specify an adequate table size here. The maximum steady-state
410 : * usage is of course NBuffers entries, but BufferAlloc() tries to insert
411 : * a new entry before deleting the old. In principle this could be
412 : * happening in each partition concurrently, so we could need as many as
413 : * NBuffers + NUM_BUFFER_PARTITIONS entries.
414 : */
415 2198 : InitBufTable(NBuffers + NUM_BUFFER_PARTITIONS);
416 :
417 : /*
418 : * Get or create the shared strategy control block
419 : */
420 2198 : StrategyControl = (BufferStrategyControl *)
421 2198 : ShmemInitStruct("Buffer Strategy Status",
422 : sizeof(BufferStrategyControl),
423 : &found);
424 :
425 2198 : if (!found)
426 : {
427 : /*
428 : * Only done once, usually in postmaster
429 : */
430 : Assert(init);
431 :
432 2198 : SpinLockInit(&StrategyControl->buffer_strategy_lock);
433 :
434 : /* Initialize the clock-sweep pointer */
435 2198 : pg_atomic_init_u32(&StrategyControl->nextVictimBuffer, 0);
436 :
437 : /* Clear statistics */
438 2198 : StrategyControl->completePasses = 0;
439 2198 : pg_atomic_init_u32(&StrategyControl->numBufferAllocs, 0);
440 :
441 : /* No pending notification */
442 2198 : StrategyControl->bgwprocno = -1;
443 : }
444 : else
445 : Assert(!init);
446 2198 : }
447 :
448 :
449 : /* ----------------------------------------------------------------
450 : * Backend-private buffer ring management
451 : * ----------------------------------------------------------------
452 : */
453 :
454 :
455 : /*
456 : * GetAccessStrategy -- create a BufferAccessStrategy object
457 : *
458 : * The object is allocated in the current memory context.
459 : */
460 : BufferAccessStrategy
461 287422 : GetAccessStrategy(BufferAccessStrategyType btype)
462 : {
463 : int ring_size_kb;
464 :
465 : /*
466 : * Select ring size to use. See buffer/README for rationales.
467 : *
468 : * Note: if you change the ring size for BAS_BULKREAD, see also
469 : * SYNC_SCAN_REPORT_INTERVAL in access/heap/syncscan.c.
470 : */
471 287422 : switch (btype)
472 : {
473 0 : case BAS_NORMAL:
474 : /* if someone asks for NORMAL, just give 'em a "default" object */
475 0 : return NULL;
476 :
477 159584 : case BAS_BULKREAD:
478 : {
479 : int ring_max_kb;
480 :
481 : /*
482 : * The ring always needs to be large enough to allow some
483 : * separation in time between providing a buffer to the user
484 : * of the strategy and that buffer being reused. Otherwise the
485 : * user's pin will prevent reuse of the buffer, even without
486 : * concurrent activity.
487 : *
488 : * We also need to ensure the ring is always large enough for
489 : * SYNC_SCAN_REPORT_INTERVAL, as noted above.
490 : *
491 : * Thus we start out at a minimal size and increase it
492 : * further if appropriate.
493 : */
494 159584 : ring_size_kb = 256;
495 :
496 : /*
497 : * There's no point in a larger ring if we won't be allowed to
498 : * pin sufficiently many buffers. But we never limit to less
499 : * than the minimal size above.
500 : */
501 159584 : ring_max_kb = GetPinLimit() * (BLCKSZ / 1024);
502 159584 : ring_max_kb = Max(ring_size_kb, ring_max_kb);
503 :
504 : /*
505 : * We would like the ring to additionally have space for the
506 : * configured degree of IO concurrency. While being read in,
507 : * buffers can obviously not yet be reused.
508 : *
509 : * Each IO can be up to io_combine_limit blocks large, and we
510 : * want to start up to effective_io_concurrency IOs.
511 : *
512 : * Note that effective_io_concurrency may be 0, which disables
513 : * AIO.
514 : */
515 159584 : ring_size_kb += (BLCKSZ / 1024) *
516 159584 : io_combine_limit * effective_io_concurrency;
517 :
518 159584 : if (ring_size_kb > ring_max_kb)
519 159584 : ring_size_kb = ring_max_kb;
520 159584 : break;
521 : }
522 127838 : case BAS_BULKWRITE:
523 127838 : ring_size_kb = 16 * 1024;
524 127838 : break;
525 0 : case BAS_VACUUM:
526 0 : ring_size_kb = 2048;
527 0 : break;
528 :
529 0 : default:
530 0 : elog(ERROR, "unrecognized buffer access strategy: %d",
531 : (int) btype);
532 : return NULL; /* keep compiler quiet */
533 : }
534 :
535 287422 : return GetAccessStrategyWithSize(btype, ring_size_kb);
536 : }
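/*
 * A minimal usage sketch of the strategy API, assuming a caller that already
 * has an open relation "rel" and a block count "nblocks" (both hypothetical
 * here): reading a table sequentially through a bulk-read ring cycles the
 * pages through a small set of buffers instead of flooding shared_buffers.
 *
 *		BufferAccessStrategy bstrategy = GetAccessStrategy(BAS_BULKREAD);
 *
 *		for (BlockNumber blkno = 0; blkno < nblocks; blkno++)
 *		{
 *			Buffer		buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno,
 *												 RBM_NORMAL, bstrategy);
 *
 *			-- inspect the page, then release the pin
 *			ReleaseBuffer(buf);
 *		}
 *
 *		FreeAccessStrategy(bstrategy);
 */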
537 :
538 : /*
539 : * GetAccessStrategyWithSize -- create a BufferAccessStrategy object with a
540 : * number of buffers equivalent to the passed in size.
541 : *
542 : * If the given ring size is 0, no BufferAccessStrategy will be created and
543 : * the function will return NULL. ring_size_kb must not be negative.
544 : */
545 : BufferAccessStrategy
546 304518 : GetAccessStrategyWithSize(BufferAccessStrategyType btype, int ring_size_kb)
547 : {
548 : int ring_buffers;
549 : BufferAccessStrategy strategy;
550 :
551 : Assert(ring_size_kb >= 0);
552 :
553 : /* Figure out how many buffers ring_size_kb is */
554 304518 : ring_buffers = ring_size_kb / (BLCKSZ / 1024);
555 :
556 : /* 0 means unlimited, so no BufferAccessStrategy required */
557 304518 : if (ring_buffers == 0)
558 12 : return NULL;
559 :
560 : /* Cap to 1/8th of shared_buffers */
561 304506 : ring_buffers = Min(NBuffers / 8, ring_buffers);
562 :
563 : /* NBuffers should never be less than 16, so this shouldn't happen */
564 : Assert(ring_buffers > 0);
565 :
566 : /* Allocate the object and initialize all elements to zeroes */
567 : strategy = (BufferAccessStrategy)
568 304506 : palloc0(offsetof(BufferAccessStrategyData, buffers) +
569 : ring_buffers * sizeof(Buffer));
570 :
571 : /* Set fields that don't start out zero */
572 304506 : strategy->btype = btype;
573 304506 : strategy->nbuffers = ring_buffers;
574 :
575 304506 : return strategy;
576 : }
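/*
 * A sizing example with illustrative numbers, assuming the default BLCKSZ of
 * 8192 and NBuffers = 16384: GetAccessStrategyWithSize(BAS_VACUUM, 2048)
 * yields 2048 / (8192 / 1024) = 256 ring buffers, well under the
 * NBuffers / 8 = 2048 cap, whereas a request for 131072 kB would be clamped
 * to 2048 buffers.
 */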
577 :
578 : /*
579 : * GetAccessStrategyBufferCount -- an accessor for the number of buffers in
580 : * the ring
581 : *
582 : * Returns 0 on NULL input to match behavior of GetAccessStrategyWithSize()
583 : * returning NULL with 0 size.
584 : */
585 : int
586 34 : GetAccessStrategyBufferCount(BufferAccessStrategy strategy)
587 : {
588 34 : if (strategy == NULL)
589 0 : return 0;
590 :
591 34 : return strategy->nbuffers;
592 : }
593 :
594 : /*
595 : * GetAccessStrategyPinLimit -- get the cap on the number of buffers that should be pinned
596 : *
597 : * When pinning extra buffers to look ahead, users of a ring-based strategy are
598 : * in danger of pinning too much of the ring at once.
599 : * For some strategies, that means "escaping" from the ring, and in others it
600 : * means forcing dirty data to disk very frequently with associated WAL
601 : * flushing. Since external code has no insight into any of that, allow
602 : * individual strategy types to expose a clamp that should be applied when
603 : * deciding on a maximum number of buffers to pin at once.
604 : *
605 : * Callers should combine this number with other relevant limits and take the
606 : * minimum.
607 : */
608 : int
609 1102472 : GetAccessStrategyPinLimit(BufferAccessStrategy strategy)
610 : {
611 1102472 : if (strategy == NULL)
612 815754 : return NBuffers;
613 :
614 286718 : switch (strategy->btype)
615 : {
616 150240 : case BAS_BULKREAD:
617 :
618 : /*
619 : * Since BAS_BULKREAD uses StrategyRejectBuffer(), dirty buffers
620 : * shouldn't be a problem and the caller is free to pin up to the
621 : * entire ring at once.
622 : */
623 150240 : return strategy->nbuffers;
624 :
625 136478 : default:
626 :
627 : /*
628 : * Tell caller not to pin more than half the buffers in the ring.
629 : * This is a trade-off between look ahead distance and deferring
630 : * writeback and associated WAL traffic.
631 : */
632 136478 : return strategy->nbuffers / 2;
633 : }
634 : }
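/*
 * For example (a sketch; "pin_limit" is a hypothetical local variable), a
 * caller sizing a read-ahead window might combine this clamp with the
 * backend-wide pin limit:
 *
 *		pin_limit = Min(GetPinLimit(), GetAccessStrategyPinLimit(strategy));
 */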
635 :
636 : /*
637 : * FreeAccessStrategy -- release a BufferAccessStrategy object
638 : *
639 : * A simple pfree would do at the moment, but we would prefer that callers
640 : * don't assume that much about the representation of BufferAccessStrategy.
641 : */
642 : void
643 275622 : FreeAccessStrategy(BufferAccessStrategy strategy)
644 : {
645 : /* don't crash if called on a "default" strategy */
646 275622 : if (strategy != NULL)
647 275622 : pfree(strategy);
648 275622 : }
649 :
650 : /*
651 : * GetBufferFromRing -- returns a buffer from the ring, or NULL if the
652 : * ring is empty / not usable.
653 : *
654 : * The buffer is pinned and marked as owned, using TrackNewBufferPin(), before
655 : * returning.
656 : */
657 : static BufferDesc *
658 1607858 : GetBufferFromRing(BufferAccessStrategy strategy, uint32 *buf_state)
659 : {
660 : BufferDesc *buf;
661 : Buffer bufnum;
662 : uint32 old_buf_state;
663 : uint32 local_buf_state; /* to avoid repeated (de-)referencing */
664 :
665 :
666 : /* Advance to next ring slot */
667 1607858 : if (++strategy->current >= strategy->nbuffers)
668 46442 : strategy->current = 0;
669 :
670 : /*
671 : * If the slot hasn't been filled yet, tell the caller to allocate a new
672 : * buffer with the normal allocation strategy. The caller will then fill
673 : * this slot by calling AddBufferToRing with the new buffer.
674 : */
675 1607858 : bufnum = strategy->buffers[strategy->current];
676 1607858 : if (bufnum == InvalidBuffer)
677 977030 : return NULL;
678 :
679 630828 : buf = GetBufferDescriptor(bufnum - 1);
680 :
681 : /*
682 : * Check whether the buffer can be used and pin it if so. Do this using a
683 : * CAS loop, to avoid having to lock the buffer header.
684 : */
685 630828 : old_buf_state = pg_atomic_read_u32(&buf->state);
686 : for (;;)
687 : {
688 630828 : local_buf_state = old_buf_state;
689 :
690 : /*
691 : * If the buffer is pinned we cannot use it under any circumstances.
692 : *
693 : * If usage_count is 0 or 1 then the buffer is fair game (we expect 1,
694 : * since our own previous usage of the ring element would have left it
695 : * there, but it might've been decremented by clock-sweep since then).
696 : * A higher usage_count indicates someone else has touched the buffer,
697 : * so we shouldn't re-use it.
698 : */
699 630828 : if (BUF_STATE_GET_REFCOUNT(local_buf_state) != 0
700 620866 : || BUF_STATE_GET_USAGECOUNT(local_buf_state) > 1)
701 : break;
702 :
703 : /* See equivalent code in PinBuffer() */
704 610464 : if (unlikely(local_buf_state & BM_LOCKED))
705 : {
706 0 : old_buf_state = WaitBufHdrUnlocked(buf);
707 0 : continue;
708 : }
709 :
710 : /* pin the buffer if the CAS succeeds */
711 610464 : local_buf_state += BUF_REFCOUNT_ONE;
712 :
713 610464 : if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
714 : local_buf_state))
715 : {
716 610464 : *buf_state = local_buf_state;
717 :
718 610464 : TrackNewBufferPin(BufferDescriptorGetBuffer(buf));
719 610464 : return buf;
720 : }
721 : }
722 :
723 : /*
724 : * Tell the caller to allocate a new buffer with the normal allocation
725 : * strategy; it will then replace this ring element via AddBufferToRing.
726 : */
727 20364 : return NULL;
728 : }
729 :
730 : /*
731 : * AddBufferToRing -- add a buffer to the buffer ring
732 : *
733 : * Caller must hold the buffer header spinlock on the buffer. Since this
734 : * is called with the spinlock held, it had better be quite cheap.
735 : */
736 : static void
737 997394 : AddBufferToRing(BufferAccessStrategy strategy, BufferDesc *buf)
738 : {
739 997394 : strategy->buffers[strategy->current] = BufferDescriptorGetBuffer(buf);
740 997394 : }
741 :
742 : /*
743 : * Utility function returning the IOContext of a given BufferAccessStrategy's
744 : * strategy ring.
745 : */
746 : IOContext
747 125038200 : IOContextForStrategy(BufferAccessStrategy strategy)
748 : {
749 125038200 : if (!strategy)
750 120430192 : return IOCONTEXT_NORMAL;
751 :
752 4608008 : switch (strategy->btype)
753 : {
754 : case BAS_NORMAL:
755 :
756 : /*
757 : * Currently, GetAccessStrategy() returns NULL for
758 : * BufferAccessStrategyType BAS_NORMAL, so this case is
759 : * unreachable.
760 : */
761 : pg_unreachable();
762 : return IOCONTEXT_NORMAL;
763 3138630 : case BAS_BULKREAD:
764 3138630 : return IOCONTEXT_BULKREAD;
765 562512 : case BAS_BULKWRITE:
766 562512 : return IOCONTEXT_BULKWRITE;
767 906866 : case BAS_VACUUM:
768 906866 : return IOCONTEXT_VACUUM;
769 : }
770 :
771 0 : elog(ERROR, "unrecognized BufferAccessStrategyType: %d", strategy->btype);
772 : pg_unreachable();
773 : }
774 :
775 : /*
776 : * StrategyRejectBuffer -- consider rejecting a dirty buffer
777 : *
778 : * When a nondefault strategy is used, the buffer manager calls this function
779 : * when it turns out that the buffer selected by StrategyGetBuffer needs to
780 : * be written out and doing so would require flushing WAL too. This gives us
781 : * a chance to choose a different victim.
782 : *
783 : * Returns true if buffer manager should ask for a new victim, and false
784 : * if this buffer should be written and re-used.
785 : */
786 : bool
787 17724 : StrategyRejectBuffer(BufferAccessStrategy strategy, BufferDesc *buf, bool from_ring)
788 : {
789 : /* We only do this in bulkread mode */
790 17724 : if (strategy->btype != BAS_BULKREAD)
791 4732 : return false;
792 :
793 : /* Don't muck with behavior of normal buffer-replacement strategy */
794 24580 : if (!from_ring ||
795 11588 : strategy->buffers[strategy->current] != BufferDescriptorGetBuffer(buf))
796 1404 : return false;
797 :
798 : /*
799 : * Remove the dirty buffer from the ring; necessary to prevent infinite
800 : * loop if all ring members are dirty.
801 : */
802 11588 : strategy->buffers[strategy->current] = InvalidBuffer;
803 :
804 11588 : return true;
805 : }