Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * buf_internals.h
4 : * Internal definitions for buffer manager and the buffer replacement
5 : * strategy.
6 : *
7 : *
8 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
9 : * Portions Copyright (c) 1994, Regents of the University of California
10 : *
11 : * src/include/storage/buf_internals.h
12 : *
13 : *-------------------------------------------------------------------------
14 : */
15 : #ifndef BUFMGR_INTERNALS_H
16 : #define BUFMGR_INTERNALS_H
17 :
18 : #include "pgstat.h"
19 : #include "port/atomics.h"
20 : #include "storage/buf.h"
21 : #include "storage/bufmgr.h"
22 : #include "storage/condition_variable.h"
23 : #include "storage/lwlock.h"
24 : #include "storage/shmem.h"
25 : #include "storage/smgr.h"
26 : #include "storage/spin.h"
27 : #include "utils/relcache.h"
28 : #include "utils/resowner.h"
29 :
30 : /*
31 : * Buffer state is a single 32-bit variable where following data is combined.
32 : *
33 : * - 18 bits refcount
34 : * - 4 bits usage count
35 : * - 10 bits of flags
36 : *
37 : * Combining these values allows some operations to be performed without
38 : * locking the buffer header, by modifying them together with a CAS loop.
39 : *
40 : * The definition of buffer state components is below.
41 : */
42 : #define BUF_REFCOUNT_ONE 1 /* adding this bumps the refcount field by one */
43 : #define BUF_REFCOUNT_MASK ((1U << 18) - 1) /* bits 0-17: refcount */
44 : #define BUF_USAGECOUNT_MASK 0x003C0000U /* bits 18-21: usage count */
45 : #define BUF_USAGECOUNT_ONE (1U << 18) /* adding this bumps the usage count field by one */
46 : #define BUF_USAGECOUNT_SHIFT 18 /* bit position of the usage count's lowest bit */
47 : #define BUF_FLAG_MASK 0xFFC00000U /* bits 22-31: the BM_* flag bits */
48 :
49 : /* Extract the refcount / usagecount fields from a 32-bit buffer state value */
50 : #define BUF_STATE_GET_REFCOUNT(state) ((state) & BUF_REFCOUNT_MASK) /* low 18 bits */
51 : #define BUF_STATE_GET_USAGECOUNT(state) (((state) & BUF_USAGECOUNT_MASK) >> BUF_USAGECOUNT_SHIFT) /* bits 18-21, shifted down to bit 0 */
52 :
53 : /*
54 : * Flags for buffer descriptors
55 : *
56 : * Note: BM_TAG_VALID essentially means that there is a buffer hashtable
57 : * entry associated with the buffer's tag.
58 : */
59 : #define BM_LOCKED (1U << 22) /* buffer header is locked; cleared by UnlockBufHdr() */
60 : #define BM_DIRTY (1U << 23) /* data needs writing */
61 : #define BM_VALID (1U << 24) /* data is valid */
62 : #define BM_TAG_VALID (1U << 25) /* tag is assigned */
63 : #define BM_IO_IN_PROGRESS (1U << 26) /* read or write in progress */
64 : #define BM_IO_ERROR (1U << 27) /* previous I/O failed */
65 : #define BM_JUST_DIRTIED (1U << 28) /* dirtied since write started */
66 : #define BM_PIN_COUNT_WAITER (1U << 29) /* have waiter for sole pin; see wait_backend_pgprocno */
67 : #define BM_CHECKPOINT_NEEDED (1U << 30) /* must write for checkpoint */
68 : #define BM_PERMANENT (1U << 31) /* permanent buffer (not unlogged,
69 : * or init fork); highest of the flag bits 22-31 */
70 : /*
71 : * The maximum allowed value of usage_count represents a tradeoff between
72 : * accuracy and speed of the clock-sweep buffer management algorithm. A
73 : * large value (comparable to NBuffers) would approximate LRU semantics.
74 : * But it can take as many as BM_MAX_USAGE_COUNT+1 complete cycles of
75 : * clock sweeps to find a free buffer, so in practice we don't want the
76 : * value to be very large.
77 : */
78 : #define BM_MAX_USAGE_COUNT 5 /* fits in the 4-bit usage count field (max representable value 15) */
79 :
80 : /*
81 : * Buffer tag identifies which disk block the buffer contains.
82 : *
83 : * Note: the BufferTag data must be sufficient to determine where to write the
84 : * block, without reference to pg_class or pg_tablespace entries. It's
85 : * possible that the backend flushing the buffer doesn't even believe the
86 : * relation is visible yet (its xact may have started before the xact that
87 : * created the rel). The storage manager must be able to cope anyway.
88 : *
89 : * Note: if there are any pad bytes in the struct, InitBufferTag will have
90 : * to be fixed to zero them, since this struct is used as a hash key.
91 : */
92 : typedef struct buftag /* hashed by BufTableHashCode() for the buffer mapping table */
93 : {
94 : Oid spcOid; /* tablespace oid */
95 : Oid dbOid; /* database oid */
96 : RelFileNumber relNumber; /* relation file number */
97 : ForkNumber forkNum; /* fork number */
98 : BlockNumber blockNum; /* block number, relative to the start of the relation */
99 : } BufferTag;
100 : /* Return the relation file number stored in *tag. */
101 : static inline RelFileNumber
102 290394196 : BufTagGetRelNumber(const BufferTag *tag)
103 : {
104 290394196 : return tag->relNumber;
105 : }
106 : /* Return the fork number stored in *tag. */
107 : static inline ForkNumber
108 40295676 : BufTagGetForkNum(const BufferTag *tag)
109 : {
110 40295676 : return tag->forkNum;
111 : }
112 : /* Store relnumber and forknum into *tag; spcOid, dbOid and blockNum are left untouched. */
113 : static inline void
114 131269204 : BufTagSetRelForkDetails(BufferTag *tag, RelFileNumber relnumber,
115 : ForkNumber forknum)
116 : {
117 131269204 : tag->relNumber = relnumber;
118 131269204 : tag->forkNum = forknum;
119 131269204 : }
120 : /* Build a RelFileLocator (spcOid, dbOid, relNumber) from *tag; fork and block number are not part of the result. */
121 : static inline RelFileLocator
122 33054310 : BufTagGetRelFileLocator(const BufferTag *tag)
123 : {
124 : RelFileLocator rlocator;
125 : /* all three members assigned below are copied straight from the tag */
126 33054310 : rlocator.spcOid = tag->spcOid;
127 33054310 : rlocator.dbOid = tag->dbOid;
128 33054310 : rlocator.relNumber = BufTagGetRelNumber(tag);
129 :
130 33054310 : return rlocator; /* returned by value */
131 : }
132 : /* Reset every field of *tag to its corresponding Invalid* value. */
133 : static inline void
134 20246200 : ClearBufferTag(BufferTag *tag)
135 : {
136 20246200 : tag->spcOid = InvalidOid;
137 20246200 : tag->dbOid = InvalidOid;
138 20246200 : BufTagSetRelForkDetails(tag, InvalidRelFileNumber, InvalidForkNumber);
139 20246200 : tag->blockNum = InvalidBlockNumber;
140 20246200 : }
141 : /* Fill in all five fields of *tag from a relation locator, fork number and block number. */
142 : static inline void
143 111023004 : InitBufferTag(BufferTag *tag, const RelFileLocator *rlocator,
144 : ForkNumber forkNum, BlockNumber blockNum)
145 : {
146 111023004 : tag->spcOid = rlocator->spcOid;
147 111023004 : tag->dbOid = rlocator->dbOid;
148 111023004 : BufTagSetRelForkDetails(tag, rlocator->relNumber, forkNum);
149 111023004 : tag->blockNum = blockNum;
150 111023004 : }
151 : /* Return true iff all five fields of *tag1 and *tag2 are equal (compared field by field, short-circuiting). */
152 : static inline bool
153 204936 : BufferTagsEqual(const BufferTag *tag1, const BufferTag *tag2)
154 : {
155 409872 : return (tag1->spcOid == tag2->spcOid) &&
156 204936 : (tag1->dbOid == tag2->dbOid) &&
157 204936 : (tag1->relNumber == tag2->relNumber) &&
158 614738 : (tag1->blockNum == tag2->blockNum) &&
159 204866 : (tag1->forkNum == tag2->forkNum);
160 : }
161 : /* Return true iff the tag's tablespace, database and relation number match *rlocator; fork and block number are ignored. */
162 : static inline bool
163 776846272 : BufTagMatchesRelFileLocator(const BufferTag *tag,
164 : const RelFileLocator *rlocator)
165 : {
166 1116476536 : return (tag->spcOid == rlocator->spcOid) &&
167 1033608136 : (tag->dbOid == rlocator->dbOid) &&
168 256761864 : (BufTagGetRelNumber(tag) == rlocator->relNumber);
169 : }
170 :
171 :
172 : /*
173 : * The shared buffer mapping table is partitioned to reduce contention.
174 : * To determine which partition lock a given tag requires, compute the tag's
175 : * hash code with BufTableHashCode(), then apply BufMappingPartitionLock().
176 : * NB: NUM_BUFFER_PARTITIONS must be a power of 2!
177 : */
178 : static inline uint32 /* maps a tag hash code to its partition number */
179 111290980 : BufTableHashPartition(uint32 hashcode)
180 : {
181 111290980 : return hashcode % NUM_BUFFER_PARTITIONS; /* result is in [0, NUM_BUFFER_PARTITIONS) */
182 : }
183 : /* Return the buffer-mapping partition LWLock that covers the given tag hash code. */
184 : static inline LWLock *
185 111290980 : BufMappingPartitionLock(uint32 hashcode)
186 : {
187 111290980 : return &MainLWLockArray[BUFFER_MAPPING_LWLOCK_OFFSET +
188 111290980 : BufTableHashPartition(hashcode)].lock;
189 : }
190 : /* Return the buffer-mapping partition LWLock for a partition index; no hashing or range check is done here. */
191 : static inline LWLock *
192 : BufMappingPartitionLockByIndex(uint32 index)
193 : {
194 : return &MainLWLockArray[BUFFER_MAPPING_LWLOCK_OFFSET + index].lock; /* NOTE(review): presumably index < NUM_BUFFER_PARTITIONS is the caller's job -- confirm */
195 : }
196 :
197 : /*
198 : * BufferDesc -- shared descriptor/state data for a single shared buffer.
199 : *
200 : * Note: Buffer header lock (BM_LOCKED flag) must be held to examine or change
201 : * tag, state or wait_backend_pgprocno fields. In general, buffer header lock
202 : * is a spinlock which is combined with flags, refcount and usagecount into
203 : * single atomic variable. This layout allows us to do some operations in a
204 : * single atomic operation, without actually acquiring and releasing spinlock;
205 : * for instance, increase or decrease refcount. buf_id field never changes
206 : * after initialization, so does not need locking. freeNext is protected by
207 : * the buffer_strategy_lock not buffer header lock. The LWLock can take care
208 : * of itself. The buffer header lock is *not* used to control access to the
209 : * data in the buffer!
210 : *
211 : * It's assumed that nobody changes the state field while buffer header lock
212 : * is held. Thus buffer header lock holder can do complex updates of the
213 : * state variable in single write, simultaneously with lock release (cleaning
214 : * BM_LOCKED flag). On the other hand, updating of state without holding
215 : * buffer header lock is restricted to CAS, which ensures that BM_LOCKED flag
216 : * is not set. Atomic increment/decrement, OR/AND etc. are not allowed.
217 : *
218 : * An exception is that if we have the buffer pinned, its tag can't change
219 : * underneath us, so we can examine the tag without locking the buffer header.
220 : * Also, in places we do one-time reads of the flags without bothering to
221 : * lock the buffer header; this is generally for situations where we don't
222 : * expect the flag bit being tested to be changing.
223 : *
224 : * We can't physically remove items from a disk page if another backend has
225 : * the buffer pinned. Hence, a backend may need to wait for all other pins
226 : * to go away. This is signaled by storing its own pgprocno into
227 : * wait_backend_pgprocno and setting flag bit BM_PIN_COUNT_WAITER. At present,
228 : * there can be only one such waiter per buffer.
229 : *
230 : * We use this same struct for local buffer headers, but the locks are not
231 : * used and not all of the flag bits are useful either. To avoid unnecessary
232 : * overhead, manipulations of the state field should be done without actual
233 : * atomic operations (i.e. only pg_atomic_read_u32() and
234 : * pg_atomic_unlocked_write_u32()).
235 : *
236 : * Be careful to avoid increasing the size of the struct when adding or
237 : * reordering members. Keeping it below 64 bytes (the most common CPU
238 : * cache line size) is fairly important for performance.
239 : *
240 : * Per-buffer I/O condition variables are currently kept outside this struct in
241 : * a separate array. They could be moved in here and still fit within that
242 : * limit on common systems, but for now that is not done.
243 : */
244 : typedef struct BufferDesc
245 : {
246 : BufferTag tag; /* ID of page contained in buffer */
247 : int buf_id; /* buffer's index number (from 0); never changes after initialization */
248 :
249 : /* packed flags, refcount and usagecount (see BUF_* and BM_* definitions) */
250 : pg_atomic_uint32 state;
251 :
252 : int wait_backend_pgprocno; /* backend of pin-count waiter; used with BM_PIN_COUNT_WAITER */
253 : int freeNext; /* link in freelist chain: next index, or a FREENEXT_* special value */
254 : LWLock content_lock; /* to lock access to buffer contents */
255 : } BufferDesc;
256 :
257 : /*
258 : * Concurrent access to buffer headers has proven to be more efficient if
259 : * they're cache line aligned. So we force the start of the BufferDescriptors
260 : * array to be on a cache line boundary and force the elements to be cache
261 : * line sized.
262 : *
263 : * XXX: As this primarily matters in highly concurrent workloads which
264 : * probably all are 64bit these days, and the space wastage would be a bit
265 : * more noticeable on 32bit systems, we don't force the stride to be cache
266 : * line sized on those. If somebody does actual performance testing, we can
267 : * reevaluate.
268 : *
269 : * Note that local buffer descriptors aren't forced to be aligned - as there's
270 : * no concurrent access to those it's unlikely to be beneficial.
271 : *
272 : * We use a 64-byte cache line size here, because that's the most common
273 : * size. Making it bigger would be a waste of memory. Even if running on a
274 : * platform with either 32 or 128 byte line sizes, it's good to align to
275 : * boundaries and avoid false sharing.
276 : */
277 : #define BUFFERDESC_PAD_TO_SIZE (SIZEOF_VOID_P == 8 ? 64 : 1) /* 64 bytes with 8-byte pointers; otherwise 1, i.e. no extra padding */
278 : /* Union that makes each array element at least BUFFERDESC_PAD_TO_SIZE bytes large. */
279 : typedef union BufferDescPadded
280 : {
281 : BufferDesc bufferdesc; /* the actual buffer header */
282 : char pad[BUFFERDESC_PAD_TO_SIZE]; /* never accessed in this header; only sets the minimum size */
283 : } BufferDescPadded;
284 :
285 : /*
286 : * The PendingWriteback & WritebackContext structure are used to keep
287 : * information about pending flush requests to be issued to the OS.
288 : */
289 : typedef struct PendingWriteback /* one pending flush request */
290 : {
291 : /* could store different types of pending flushes here */
292 : BufferTag tag; /* identifies the block the flush request is for */
293 : } PendingWriteback;
294 :
295 : /* Batch of pending writeback requests; struct forward declared in bufmgr.h */
296 : typedef struct WritebackContext
297 : {
298 : /* pointer to the max number of writeback requests to coalesce */
299 : int *max_pending;
300 :
301 : /* current number of pending writeback requests */
302 : int nr_pending;
303 :
304 : /* pending requests (fixed capacity; presumably only the first nr_pending entries are in use -- confirm) */
305 : PendingWriteback pending_writebacks[WRITEBACK_MAX_PENDING_FLUSHES];
306 : } WritebackContext;
307 :
308 : /* in buf_init.c */
309 : extern PGDLLIMPORT BufferDescPadded *BufferDescriptors; /* padded shared buffer headers; see GetBufferDescriptor() */
310 : extern PGDLLIMPORT ConditionVariableMinimallyPadded *BufferIOCVArray; /* per-buffer I/O condition variables, indexed by buf_id */
311 : extern PGDLLIMPORT WritebackContext BackendWritebackContext;
312 :
313 : /* in localbuf.c */
314 : extern PGDLLIMPORT BufferDesc *LocalBufferDescriptors; /* unpadded local buffer headers; see GetLocalBufferDescriptor() */
315 :
316 : /* Return the shared buffer header at position id of BufferDescriptors; no range check is done here. */
317 : static inline BufferDesc *
318 1063597314 : GetBufferDescriptor(uint32 id)
319 : {
320 1063597314 : return &(BufferDescriptors[id]).bufferdesc; /* step over the padding union to the header itself */
321 : }
322 : /* Return the local buffer header at position id of LocalBufferDescriptors; no range check is done here. */
323 : static inline BufferDesc *
324 15890626 : GetLocalBufferDescriptor(uint32 id)
325 : {
326 15890626 : return &LocalBufferDescriptors[id];
327 : }
328 : /* Convert a buffer header to its Buffer number, computed as buf_id + 1. */
329 : static inline Buffer
330 492880320 : BufferDescriptorGetBuffer(const BufferDesc *bdesc)
331 : {
332 492880320 : return (Buffer) (bdesc->buf_id + 1);
333 : }
334 : /* Return this buffer's I/O condition variable, looked up in BufferIOCVArray by buf_id. */
335 : static inline ConditionVariable *
336 22169036 : BufferDescriptorGetIOCV(const BufferDesc *bdesc)
337 : {
338 22169036 : return &(BufferIOCVArray[bdesc->buf_id]).cv;
339 : }
340 : /* Return a pointer to the content LWLock embedded in the buffer header. */
341 : static inline LWLock *
342 317595586 : BufferDescriptorGetContentLock(const BufferDesc *bdesc)
343 : {
344 317595586 : return (LWLock *) (&bdesc->content_lock); /* cast drops the const inherited from bdesc */
345 : }
346 :
347 : /*
348 : * The freeNext field is either the index of the next freelist entry,
349 : * or one of these special values:
350 : */
351 : #define FREENEXT_END_OF_LIST (-1) /* negative, so it cannot collide with a valid next index */
352 : #define FREENEXT_NOT_IN_LIST (-2) /* NOTE(review): presumably marks a buffer that is not on the freelist -- confirm in freelist.c */
353 :
354 : /*
355 : * Functions for acquiring/releasing a shared buffer header's spinlock. Do
356 : * not apply these to local buffers!
357 : */
358 : extern uint32 LockBufHdr(BufferDesc *desc); /* NOTE(review): presumably returns the state value observed with BM_LOCKED set -- confirm at the definition */
359 : /* Release the buffer header lock by storing buf_state, with BM_LOCKED cleared, as the new state. */
360 : static inline void
361 64760266 : UnlockBufHdr(BufferDesc *desc, uint32 buf_state)
362 : {
363 64760266 : pg_write_barrier(); /* issued before the store that clears BM_LOCKED */
364 64760266 : pg_atomic_write_u32(&desc->state, buf_state & (~BM_LOCKED)); /* whole state word is overwritten in one write */
365 64760266 : }
366 :
367 : /* in bufmgr.c */
368 :
369 : /*
370 : * Structure to sort buffers per file on checkpoints.
371 : *
372 : * This structure is allocated per buffer in shared memory, so it should be
373 : * kept as small as possible.
374 : */
375 : typedef struct CkptSortItem
376 : {
377 : Oid tsId; /* NOTE(review): presumably the tablespace OID (tag's spcOid) -- confirm in bufmgr.c */
378 : RelFileNumber relNumber; /* relation file number */
379 : ForkNumber forkNum; /* fork number */
380 : BlockNumber blockNum; /* block number */
381 : int buf_id; /* NOTE(review): presumably the buffer's index, as in BufferDesc.buf_id -- confirm */
382 : } CkptSortItem;
383 :
384 : extern PGDLLIMPORT CkptSortItem *CkptBufferIds; /* array of checkpoint sort items, allocated per buffer in shared memory */
385 :
386 : /* ResourceOwner callbacks to hold buffer I/Os and pins */
387 : extern PGDLLIMPORT const ResourceOwnerDesc buffer_io_resowner_desc; /* used by the *BufferIO wrappers in this header */
388 : extern PGDLLIMPORT const ResourceOwnerDesc buffer_pin_resowner_desc; /* used by the *Buffer wrappers in this header */
389 :
390 : /* Convenience wrappers over ResourceOwnerRemember/Forget */
391 : static inline void /* register 'buffer' with 'owner' under buffer_pin_resowner_desc */
392 130992088 : ResourceOwnerRememberBuffer(ResourceOwner owner, Buffer buffer)
393 : {
394 130992088 : ResourceOwnerRemember(owner, Int32GetDatum(buffer), &buffer_pin_resowner_desc);
395 130992088 : }
396 : static inline void /* drop the buffer_pin_resowner_desc registration of 'buffer' from 'owner' */
397 130983068 : ResourceOwnerForgetBuffer(ResourceOwner owner, Buffer buffer)
398 : {
399 130983068 : ResourceOwnerForget(owner, Int32GetDatum(buffer), &buffer_pin_resowner_desc);
400 130983068 : }
401 : static inline void /* register 'buffer' with 'owner' under buffer_io_resowner_desc */
402 4383784 : ResourceOwnerRememberBufferIO(ResourceOwner owner, Buffer buffer)
403 : {
404 4383784 : ResourceOwnerRemember(owner, Int32GetDatum(buffer), &buffer_io_resowner_desc);
405 4383784 : }
406 : static inline void /* drop the buffer_io_resowner_desc registration of 'buffer' from 'owner' */
407 4383754 : ResourceOwnerForgetBufferIO(ResourceOwner owner, Buffer buffer)
408 : {
409 4383754 : ResourceOwnerForget(owner, Int32GetDatum(buffer), &buffer_io_resowner_desc);
410 4383754 : }
411 :
412 : /*
413 : * Internal buffer management routines
414 : */
415 : /* bufmgr.c: routines operating on a WritebackContext (pending flush requests) */
416 : extern void WritebackContextInit(WritebackContext *context, int *max_pending);
417 : extern void IssuePendingWritebacks(WritebackContext *wb_context, IOContext io_context);
418 : extern void ScheduleBufferTagForWriteback(WritebackContext *wb_context,
419 : IOContext io_context, BufferTag *tag);
420 :
421 : /* freelist.c -- NOTE(review): presumably the buffer replacement strategy routines; confirm in freelist.c */
422 : extern IOContext IOContextForStrategy(BufferAccessStrategy strategy);
423 : extern BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy,
424 : uint32 *buf_state, bool *from_ring);
425 : extern void StrategyFreeBuffer(BufferDesc *buf);
426 : extern bool StrategyRejectBuffer(BufferAccessStrategy strategy,
427 : BufferDesc *buf, bool from_ring);
428 :
429 : extern int StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc);
430 : extern void StrategyNotifyBgWriter(int bgwprocno);
431 :
432 : extern Size StrategyShmemSize(void);
433 : extern void StrategyInitialize(bool init);
434 : extern bool have_free_buffer(void);
435 :
436 : /* buf_table.c: buffer mapping table keyed by BufferTag; hashcode arguments come from BufTableHashCode() */
437 : extern Size BufTableShmemSize(int size);
438 : extern void InitBufTable(int size);
439 : extern uint32 BufTableHashCode(BufferTag *tagPtr);
440 : extern int BufTableLookup(BufferTag *tagPtr, uint32 hashcode); /* NOTE(review): int result is presumably a buf_id or a not-found marker -- confirm */
441 : extern int BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id);
442 : extern void BufTableDelete(BufferTag *tagPtr, uint32 hashcode);
443 :
444 : /* localbuf.c: routines for local buffers (headers live in LocalBufferDescriptors) */
445 : extern bool PinLocalBuffer(BufferDesc *buf_hdr, bool adjust_usagecount);
446 : extern void UnpinLocalBuffer(Buffer buffer);
447 : extern void UnpinLocalBufferNoOwner(Buffer buffer);
448 : extern PrefetchBufferResult PrefetchLocalBuffer(SMgrRelation smgr,
449 : ForkNumber forkNum,
450 : BlockNumber blockNum);
451 : extern BufferDesc *LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum,
452 : BlockNumber blockNum, bool *foundPtr);
453 : extern BlockNumber ExtendBufferedRelLocal(BufferManagerRelation bmr,
454 : ForkNumber fork,
455 : uint32 flags,
456 : uint32 extend_by,
457 : BlockNumber extend_upto,
458 : Buffer *buffers,
459 : uint32 *extended_by);
460 : extern void MarkLocalBufferDirty(Buffer buffer);
461 : extern void DropRelationLocalBuffers(RelFileLocator rlocator,
462 : ForkNumber forkNum,
463 : BlockNumber firstDelBlock);
464 : extern void DropRelationAllLocalBuffers(RelFileLocator rlocator);
465 : extern void AtEOXact_LocalBuffers(bool isCommit);
466 :
467 : #endif /* BUFMGR_INTERNALS_H */
|