/*-------------------------------------------------------------------------
 *
 * buf_internals.h
 *	  Internal definitions for buffer manager and the buffer replacement
 *	  strategy.
 *
 *
 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * src/include/storage/buf_internals.h
 *
 *-------------------------------------------------------------------------
 */
#ifndef BUFMGR_INTERNALS_H
#define BUFMGR_INTERNALS_H

#include "pgstat.h"
#include "port/atomics.h"
#include "storage/aio_types.h"
#include "storage/buf.h"
#include "storage/bufmgr.h"
#include "storage/condition_variable.h"
#include "storage/lwlock.h"
#include "storage/procnumber.h"
#include "storage/shmem.h"
#include "storage/smgr.h"
#include "storage/spin.h"
#include "utils/relcache.h"
#include "utils/resowner.h"

/*
 * Buffer state is a single 32-bit variable that combines the following
 * data:
 *
 * - 18 bits refcount
 * - 4 bits usage count
 * - 10 bits of flags
 *
 * Combining these values allows us to perform some operations without
 * locking the buffer header, by modifying them together in a single CAS
 * loop.
 *
 * The definition of buffer state components is below.
 */
#define BUF_REFCOUNT_BITS 18
#define BUF_USAGECOUNT_BITS 4
#define BUF_FLAG_BITS 10

StaticAssertDecl(BUF_REFCOUNT_BITS + BUF_USAGECOUNT_BITS + BUF_FLAG_BITS == 32,
				 "parts of buffer state space need to equal 32");

#define BUF_REFCOUNT_ONE 1
#define BUF_REFCOUNT_MASK ((1U << BUF_REFCOUNT_BITS) - 1)
#define BUF_USAGECOUNT_MASK (((1U << BUF_USAGECOUNT_BITS) - 1) << (BUF_REFCOUNT_BITS))
#define BUF_USAGECOUNT_ONE (1U << BUF_REFCOUNT_BITS)
#define BUF_USAGECOUNT_SHIFT BUF_REFCOUNT_BITS
#define BUF_FLAG_MASK (((1U << BUF_FLAG_BITS) - 1) << (BUF_REFCOUNT_BITS + BUF_USAGECOUNT_BITS))

/* Get refcount and usagecount from buffer state */
#define BUF_STATE_GET_REFCOUNT(state) ((state) & BUF_REFCOUNT_MASK)
#define BUF_STATE_GET_USAGECOUNT(state) (((state) & BUF_USAGECOUNT_MASK) >> BUF_USAGECOUNT_SHIFT)
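
/*
 * Illustrative sketch (not part of this header's API): because refcount,
 * usage count and flags live in one atomic word, a buffer can be pinned
 * without taking the header spinlock, by retrying a compare-and-swap until
 * the word is observed unlocked.  bufmgr.c's PinBuffer() proceeds roughly
 * along these lines:
 *
 *	uint32		old_state = pg_atomic_read_u32(&buf->state);
 *
 *	for (;;)
 *	{
 *		uint32		new_state;
 *
 *		if (old_state & BM_LOCKED)
 *		{
 *			old_state = pg_atomic_read_u32(&buf->state);	(wait out the lock)
 *			continue;
 *		}
 *		new_state = old_state + BUF_REFCOUNT_ONE;
 *		if (pg_atomic_compare_exchange_u32(&buf->state, &old_state, new_state))
 *			break;		(on failure, old_state was refreshed; just retry)
 *	}
 */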

/*
 * Flags for buffer descriptors
 *
 * Note: BM_TAG_VALID essentially means that there is a buffer hashtable
 * entry associated with the buffer's tag.
 */
#define BM_LOCKED				(1U << 22)	/* buffer header is locked */
#define BM_DIRTY				(1U << 23)	/* data needs writing */
#define BM_VALID				(1U << 24)	/* data is valid */
#define BM_TAG_VALID			(1U << 25)	/* tag is assigned */
#define BM_IO_IN_PROGRESS		(1U << 26)	/* read or write in progress */
#define BM_IO_ERROR				(1U << 27)	/* previous I/O failed */
#define BM_JUST_DIRTIED			(1U << 28)	/* dirtied since write started */
#define BM_PIN_COUNT_WAITER		(1U << 29)	/* have waiter for sole pin */
#define BM_CHECKPOINT_NEEDED	(1U << 30)	/* must write for checkpoint */
#define BM_PERMANENT			(1U << 31)	/* permanent buffer (not unlogged,
											 * or init fork) */
/*
 * The maximum allowed value of usage_count represents a tradeoff between
 * accuracy and speed of the clock-sweep buffer management algorithm. A
 * large value (comparable to NBuffers) would approximate LRU semantics.
 * But it can take as many as BM_MAX_USAGE_COUNT+1 complete cycles of
 * clock sweeps to find a free buffer, so in practice we don't want the
 * value to be very large.
 */
#define BM_MAX_USAGE_COUNT	5

StaticAssertDecl(BM_MAX_USAGE_COUNT < (1 << BUF_USAGECOUNT_BITS),
				 "BM_MAX_USAGE_COUNT doesn't fit in BUF_USAGECOUNT_BITS bits");
StaticAssertDecl(MAX_BACKENDS_BITS <= BUF_REFCOUNT_BITS,
				 "MAX_BACKENDS_BITS needs to be <= BUF_REFCOUNT_BITS");
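
/*
 * Illustrative sketch (not part of this header's API): the clock sweep in
 * freelist.c consumes usage counts one tick per pass, so a buffer whose
 * count reached BM_MAX_USAGE_COUNT can survive that many complete sweep
 * cycles after its last access before becoming a victim.  Roughly:
 *
 *	local_buf_state = LockBufHdr(buf);
 *	if (BUF_STATE_GET_REFCOUNT(local_buf_state) == 0)
 *	{
 *		if (BUF_STATE_GET_USAGECOUNT(local_buf_state) != 0)
 *		{
 *			local_buf_state -= BUF_USAGECOUNT_ONE;
 *			UnlockBufHdr(buf, local_buf_state);		(keep sweeping)
 *		}
 *		else
 *			return buf;		(victim found, header lock still held)
 *	}
 */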

/*
 * Buffer tag identifies which disk block the buffer contains.
 *
 * Note: the BufferTag data must be sufficient to determine where to write the
 * block, without reference to pg_class or pg_tablespace entries.  It's
 * possible that the backend flushing the buffer doesn't even believe the
 * relation is visible yet (its xact may have started before the xact that
 * created the rel).  The storage manager must be able to cope anyway.
 *
 * Note: if there are any pad bytes in the struct, InitBufferTag will have
 * to be fixed to zero them, since this struct is used as a hash key.
 */
typedef struct buftag
{
	Oid			spcOid;			/* tablespace oid */
	Oid			dbOid;			/* database oid */
	RelFileNumber relNumber;	/* relation file number */
	ForkNumber	forkNum;		/* fork number */
	BlockNumber blockNum;		/* blknum relative to begin of reln */
} BufferTag;

static inline RelFileNumber
BufTagGetRelNumber(const BufferTag *tag)
{
	return tag->relNumber;
}

static inline ForkNumber
BufTagGetForkNum(const BufferTag *tag)
{
	return tag->forkNum;
}

static inline void
BufTagSetRelForkDetails(BufferTag *tag, RelFileNumber relnumber,
						ForkNumber forknum)
{
	tag->relNumber = relnumber;
	tag->forkNum = forknum;
}

static inline RelFileLocator
BufTagGetRelFileLocator(const BufferTag *tag)
{
	RelFileLocator rlocator;

	rlocator.spcOid = tag->spcOid;
	rlocator.dbOid = tag->dbOid;
	rlocator.relNumber = BufTagGetRelNumber(tag);

	return rlocator;
}

static inline void
ClearBufferTag(BufferTag *tag)
{
	tag->spcOid = InvalidOid;
	tag->dbOid = InvalidOid;
	BufTagSetRelForkDetails(tag, InvalidRelFileNumber, InvalidForkNumber);
	tag->blockNum = InvalidBlockNumber;
}

static inline void
InitBufferTag(BufferTag *tag, const RelFileLocator *rlocator,
			  ForkNumber forkNum, BlockNumber blockNum)
{
	tag->spcOid = rlocator->spcOid;
	tag->dbOid = rlocator->dbOid;
	BufTagSetRelForkDetails(tag, rlocator->relNumber, forkNum);
	tag->blockNum = blockNum;
}

static inline bool
BufferTagsEqual(const BufferTag *tag1, const BufferTag *tag2)
{
	return (tag1->spcOid == tag2->spcOid) &&
		(tag1->dbOid == tag2->dbOid) &&
		(tag1->relNumber == tag2->relNumber) &&
		(tag1->blockNum == tag2->blockNum) &&
		(tag1->forkNum == tag2->forkNum);
}

static inline bool
BufTagMatchesRelFileLocator(const BufferTag *tag,
							const RelFileLocator *rlocator)
{
	return (tag->spcOid == rlocator->spcOid) &&
		(tag->dbOid == rlocator->dbOid) &&
		(BufTagGetRelNumber(tag) == rlocator->relNumber);
}
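
/*
 * Illustrative sketch (not part of this header's API): building a tag for a
 * block of a relation and testing a buffer against it.  rel, blockNum and
 * bufHdr are hypothetical; rd_locator is the relation's RelFileLocator
 * field.
 *
 *	BufferTag	tag;
 *
 *	InitBufferTag(&tag, &rel->rd_locator, MAIN_FORKNUM, blockNum);
 *	if (BufferTagsEqual(&bufHdr->tag, &tag))
 *		... the buffer already holds the desired block ...
 */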


/*
 * The shared buffer mapping table is partitioned to reduce contention.
 * To determine which partition lock a given tag requires, compute the tag's
 * hash code with BufTableHashCode(), then apply BufMappingPartitionLock().
 * NB: NUM_BUFFER_PARTITIONS must be a power of 2!
 */
static inline uint32
BufTableHashPartition(uint32 hashcode)
{
	return hashcode % NUM_BUFFER_PARTITIONS;
}

static inline LWLock *
BufMappingPartitionLock(uint32 hashcode)
{
	return &MainLWLockArray[BUFFER_MAPPING_LWLOCK_OFFSET +
							BufTableHashPartition(hashcode)].lock;
}

static inline LWLock *
BufMappingPartitionLockByIndex(uint32 index)
{
	return &MainLWLockArray[BUFFER_MAPPING_LWLOCK_OFFSET + index].lock;
}
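
/*
 * Illustrative sketch (not part of this header's API): the canonical mapping
 * table lookup, along the lines of what bufmgr.c does when reading a block.
 * Only the partition covering the tag's hash code needs to be locked:
 *
 *	newHash = BufTableHashCode(&newTag);
 *	newPartitionLock = BufMappingPartitionLock(newHash);
 *
 *	LWLockAcquire(newPartitionLock, LW_SHARED);
 *	existing_buf_id = BufTableLookup(&newTag, newHash);
 *	LWLockRelease(newPartitionLock);
 */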

/*
 * BufferDesc -- shared descriptor/state data for a single shared buffer.
 *
 * Note: Buffer header lock (BM_LOCKED flag) must be held to examine or change
 * the tag, state or wait_backend_pgprocno fields.  In general, the buffer
 * header lock is a spinlock which is combined with flags, refcount and
 * usagecount into a single atomic variable.  This layout allows us to do some
 * operations in a single atomic operation, without actually acquiring and
 * releasing the spinlock; for instance, increasing or decreasing the
 * refcount.  The buf_id field never changes after initialization, so it does
 * not need locking.  freeNext is protected by the buffer_strategy_lock, not
 * the buffer header lock.  The LWLock can take care of itself.  The buffer
 * header lock is *not* used to control access to the data in the buffer!
 *
 * It's assumed that nobody changes the state field while the buffer header
 * lock is held.  Thus the buffer header lock holder can do complex updates of
 * the state variable in a single write, simultaneously with lock release
 * (cleaning the BM_LOCKED flag).  On the other hand, updating the state
 * without holding the buffer header lock is restricted to CAS, which ensures
 * that the BM_LOCKED flag is not set.  Atomic increment/decrement, OR/AND
 * etc. are not allowed.
 *
 * An exception is that if we have the buffer pinned, its tag can't change
 * underneath us, so we can examine the tag without locking the buffer header.
 * Also, in places we do one-time reads of the flags without bothering to
 * lock the buffer header; this is generally for situations where we don't
 * expect the flag bit being tested to be changing.
 *
 * We can't physically remove items from a disk page if another backend has
 * the buffer pinned.  Hence, a backend may need to wait for all other pins
 * to go away.  This is signaled by storing its own pgprocno into
 * wait_backend_pgprocno and setting the flag bit BM_PIN_COUNT_WAITER.  At
 * present, there can be only one such waiter per buffer.
 *
 * We use this same struct for local buffer headers, but the locks are not
 * used and not all of the flag bits are useful either.  To avoid unnecessary
 * overhead, manipulations of the state field should be done without actual
 * atomic operations (i.e. only pg_atomic_read_u32() and
 * pg_atomic_unlocked_write_u32()).
 *
 * Be careful to avoid increasing the size of the struct when adding or
 * reordering members.  Keeping it below 64 bytes (the most common CPU
 * cache line size) is fairly important for performance.
 *
 * Per-buffer I/O condition variables are currently kept outside this struct
 * in a separate array.  They could be moved in here and still fit within that
 * limit on common systems, but for now that is not done.
 */
typedef struct BufferDesc
{
	BufferTag	tag;			/* ID of page contained in buffer */
	int			buf_id;			/* buffer's index number (from 0) */

	/* state of the tag, containing flags, refcount and usagecount */
	pg_atomic_uint32 state;

	int			wait_backend_pgprocno;	/* backend of pin-count waiter */
	int			freeNext;		/* link in freelist chain */

	PgAioWaitRef io_wref;		/* set iff AIO is in progress */
	LWLock		content_lock;	/* to lock access to buffer contents */
} BufferDesc;

/*
 * Concurrent access to buffer headers has proven to be more efficient if
 * they're cache line aligned.  So we force the start of the BufferDescriptors
 * array to be on a cache line boundary and force the elements to be cache
 * line sized.
 *
 * XXX: As this primarily matters in highly concurrent workloads, which are
 * probably all 64-bit these days, and the space wastage would be a bit
 * more noticeable on 32-bit systems, we don't force the stride to be cache
 * line sized on those.  If somebody does actual performance testing, we can
 * reevaluate.
 *
 * Note that local buffer descriptors aren't forced to be aligned - as there's
 * no concurrent access to those it's unlikely to be beneficial.
 *
 * We use a 64-byte cache line size here, because that's the most common
 * size.  Making it bigger would be a waste of memory.  Even if running on a
 * platform with either 32 or 128 byte line sizes, it's good to align to
 * boundaries and avoid false sharing.
 */
#define BUFFERDESC_PAD_TO_SIZE	(SIZEOF_VOID_P == 8 ? 64 : 1)

typedef union BufferDescPadded
{
	BufferDesc	bufferdesc;
	char		pad[BUFFERDESC_PAD_TO_SIZE];
} BufferDescPadded;

/*
 * The PendingWriteback & WritebackContext structures are used to keep
 * information about pending flush requests to be issued to the OS.
 */
typedef struct PendingWriteback
{
	/* could store different types of pending flushes here */
	BufferTag	tag;
} PendingWriteback;

/* struct forward declared in bufmgr.h */
typedef struct WritebackContext
{
	/* pointer to the max number of writeback requests to coalesce */
	int		   *max_pending;

	/* current number of pending writeback requests */
	int			nr_pending;

	/* pending requests */
	PendingWriteback pending_writebacks[WRITEBACK_MAX_PENDING_FLUSHES];
} WritebackContext;

/* in buf_init.c */
extern PGDLLIMPORT BufferDescPadded *BufferDescriptors;
extern PGDLLIMPORT ConditionVariableMinimallyPadded *BufferIOCVArray;
extern PGDLLIMPORT WritebackContext BackendWritebackContext;

/* in localbuf.c */
extern PGDLLIMPORT BufferDesc *LocalBufferDescriptors;


static inline BufferDesc *
GetBufferDescriptor(uint32 id)
{
	return &(BufferDescriptors[id]).bufferdesc;
}

static inline BufferDesc *
GetLocalBufferDescriptor(uint32 id)
{
	return &LocalBufferDescriptors[id];
}

static inline Buffer
BufferDescriptorGetBuffer(const BufferDesc *bdesc)
{
	return (Buffer) (bdesc->buf_id + 1);
}

static inline ConditionVariable *
BufferDescriptorGetIOCV(const BufferDesc *bdesc)
{
	return &(BufferIOCVArray[bdesc->buf_id]).cv;
}

static inline LWLock *
BufferDescriptorGetContentLock(const BufferDesc *bdesc)
{
	return (LWLock *) (&bdesc->content_lock);
}

/*
 * The freeNext field is either the index of the next freelist entry,
 * or one of these special values:
 */
#define FREENEXT_END_OF_LIST	(-1)
#define FREENEXT_NOT_IN_LIST	(-2)

/*
 * Functions for acquiring/releasing a shared buffer header's spinlock.  Do
 * not apply these to local buffers!
 */
extern uint32 LockBufHdr(BufferDesc *desc);

static inline void
UnlockBufHdr(BufferDesc *desc, uint32 buf_state)
{
	pg_write_barrier();
	pg_atomic_write_u32(&desc->state, buf_state & (~BM_LOCKED));
}

/* in bufmgr.c */

/*
 * Structure to sort buffers per file on checkpoints.
 *
 * This structure is allocated per buffer in shared memory, so it should be
 * kept as small as possible.
 */
typedef struct CkptSortItem
{
	Oid			tsId;
	RelFileNumber relNumber;
	ForkNumber	forkNum;
	BlockNumber blockNum;
	int			buf_id;
} CkptSortItem;

extern PGDLLIMPORT CkptSortItem *CkptBufferIds;

/* ResourceOwner callbacks to hold buffer I/Os and pins */
extern PGDLLIMPORT const ResourceOwnerDesc buffer_io_resowner_desc;
extern PGDLLIMPORT const ResourceOwnerDesc buffer_pin_resowner_desc;

/* Convenience wrappers over ResourceOwnerRemember/Forget */
static inline void
ResourceOwnerRememberBuffer(ResourceOwner owner, Buffer buffer)
{
	ResourceOwnerRemember(owner, Int32GetDatum(buffer), &buffer_pin_resowner_desc);
}
static inline void
ResourceOwnerForgetBuffer(ResourceOwner owner, Buffer buffer)
{
	ResourceOwnerForget(owner, Int32GetDatum(buffer), &buffer_pin_resowner_desc);
}
static inline void
ResourceOwnerRememberBufferIO(ResourceOwner owner, Buffer buffer)
{
	ResourceOwnerRemember(owner, Int32GetDatum(buffer), &buffer_io_resowner_desc);
}
static inline void
ResourceOwnerForgetBufferIO(ResourceOwner owner, Buffer buffer)
{
	ResourceOwnerForget(owner, Int32GetDatum(buffer), &buffer_io_resowner_desc);
}

/*
 * Internal buffer management routines
 */
/* bufmgr.c */
extern void WritebackContextInit(WritebackContext *context, int *max_pending);
extern void IssuePendingWritebacks(WritebackContext *wb_context, IOContext io_context);
extern void ScheduleBufferTagForWriteback(WritebackContext *wb_context,
										  IOContext io_context, BufferTag *tag);
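
/*
 * Illustrative sketch (not part of this header's API): a write loop using a
 * writeback context, roughly as the checkpointer does with the
 * checkpoint_flush_after GUC as the coalesce limit:
 *
 *	WritebackContext wb_context;
 *
 *	WritebackContextInit(&wb_context, &checkpoint_flush_after);
 *	... after each buffer write:
 *	ScheduleBufferTagForWriteback(&wb_context, IOCONTEXT_NORMAL,
 *								  &bufHdr->tag);
 *	... and once the loop is done:
 *	IssuePendingWritebacks(&wb_context, IOCONTEXT_NORMAL);
 */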

/* freelist.c */
extern IOContext IOContextForStrategy(BufferAccessStrategy strategy);
extern BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy,
									 uint32 *buf_state, bool *from_ring);
extern void StrategyFreeBuffer(BufferDesc *buf);
extern bool StrategyRejectBuffer(BufferAccessStrategy strategy,
								 BufferDesc *buf, bool from_ring);

extern int	StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc);
extern void StrategyNotifyBgWriter(int bgwprocno);

extern Size StrategyShmemSize(void);
extern void StrategyInitialize(bool init);
extern bool have_free_buffer(void);

/* buf_table.c */
extern Size BufTableShmemSize(int size);
extern void InitBufTable(int size);
extern uint32 BufTableHashCode(BufferTag *tagPtr);
extern int	BufTableLookup(BufferTag *tagPtr, uint32 hashcode);
extern int	BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id);
extern void BufTableDelete(BufferTag *tagPtr, uint32 hashcode);

/* localbuf.c */
extern bool PinLocalBuffer(BufferDesc *buf_hdr, bool adjust_usagecount);
extern void UnpinLocalBuffer(Buffer buffer);
extern void UnpinLocalBufferNoOwner(Buffer buffer);
extern PrefetchBufferResult PrefetchLocalBuffer(SMgrRelation smgr,
												ForkNumber forkNum,
												BlockNumber blockNum);
extern BufferDesc *LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum,
									BlockNumber blockNum, bool *foundPtr);
extern BlockNumber ExtendBufferedRelLocal(BufferManagerRelation bmr,
										  ForkNumber fork,
										  uint32 flags,
										  uint32 extend_by,
										  BlockNumber extend_upto,
										  Buffer *buffers,
										  uint32 *extended_by);
extern void MarkLocalBufferDirty(Buffer buffer);
extern void TerminateLocalBufferIO(BufferDesc *bufHdr, bool clear_dirty,
								   uint32 set_flag_bits, bool release_aio);
extern bool StartLocalBufferIO(BufferDesc *bufHdr, bool forInput, bool nowait);
extern void FlushLocalBuffer(BufferDesc *bufHdr, SMgrRelation reln);
extern void DropRelationLocalBuffers(RelFileLocator rlocator,
									 ForkNumber forkNum,
									 BlockNumber firstDelBlock);
extern void DropRelationAllLocalBuffers(RelFileLocator rlocator);
extern void AtEOXact_LocalBuffers(bool isCommit);

#endif							/* BUFMGR_INTERNALS_H */