Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * buf_internals.h
4 : * Internal definitions for buffer manager and the buffer replacement
5 : * strategy.
6 : *
7 : *
8 : * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
9 : * Portions Copyright (c) 1994, Regents of the University of California
10 : *
11 : * src/include/storage/buf_internals.h
12 : *
13 : *-------------------------------------------------------------------------
14 : */
15 : #ifndef BUFMGR_INTERNALS_H
16 : #define BUFMGR_INTERNALS_H
17 :
18 : #include "pgstat.h"
19 : #include "port/atomics.h"
20 : #include "storage/buf.h"
21 : #include "storage/bufmgr.h"
22 : #include "storage/condition_variable.h"
23 : #include "storage/latch.h"
24 : #include "storage/lwlock.h"
25 : #include "storage/shmem.h"
26 : #include "storage/smgr.h"
27 : #include "storage/spin.h"
28 : #include "utils/relcache.h"
29 : #include "utils/resowner.h"
30 :
31 : /*
32 : * Buffer state is a single 32-bit variable where following data is combined.
33 : *
34 : * - 18 bits refcount
35 : * - 4 bits usage count
36 : * - 10 bits of flags
37 : *
 * Combining these values allows us to perform some operations without locking
39 : * the buffer header, by modifying them together with a CAS loop.
40 : *
41 : * The definition of buffer state components is below.
42 : */
43 : #define BUF_REFCOUNT_ONE 1
44 : #define BUF_REFCOUNT_MASK ((1U << 18) - 1)
45 : #define BUF_USAGECOUNT_MASK 0x003C0000U
46 : #define BUF_USAGECOUNT_ONE (1U << 18)
47 : #define BUF_USAGECOUNT_SHIFT 18
48 : #define BUF_FLAG_MASK 0xFFC00000U
49 :
50 : /* Get refcount and usagecount from buffer state */
51 : #define BUF_STATE_GET_REFCOUNT(state) ((state) & BUF_REFCOUNT_MASK)
52 : #define BUF_STATE_GET_USAGECOUNT(state) (((state) & BUF_USAGECOUNT_MASK) >> BUF_USAGECOUNT_SHIFT)
53 :
54 : /*
55 : * Flags for buffer descriptors
56 : *
57 : * Note: BM_TAG_VALID essentially means that there is a buffer hashtable
58 : * entry associated with the buffer's tag.
59 : */
60 : #define BM_LOCKED (1U << 22) /* buffer header is locked */
61 : #define BM_DIRTY (1U << 23) /* data needs writing */
62 : #define BM_VALID (1U << 24) /* data is valid */
63 : #define BM_TAG_VALID (1U << 25) /* tag is assigned */
64 : #define BM_IO_IN_PROGRESS (1U << 26) /* read or write in progress */
65 : #define BM_IO_ERROR (1U << 27) /* previous I/O failed */
66 : #define BM_JUST_DIRTIED (1U << 28) /* dirtied since write started */
67 : #define BM_PIN_COUNT_WAITER (1U << 29) /* have waiter for sole pin */
68 : #define BM_CHECKPOINT_NEEDED (1U << 30) /* must write for checkpoint */
69 : #define BM_PERMANENT (1U << 31) /* permanent buffer (not unlogged,
70 : * or init fork) */
71 : /*
72 : * The maximum allowed value of usage_count represents a tradeoff between
73 : * accuracy and speed of the clock-sweep buffer management algorithm. A
74 : * large value (comparable to NBuffers) would approximate LRU semantics.
75 : * But it can take as many as BM_MAX_USAGE_COUNT+1 complete cycles of
76 : * clock sweeps to find a free buffer, so in practice we don't want the
77 : * value to be very large.
78 : */
79 : #define BM_MAX_USAGE_COUNT 5
80 :
81 : /*
82 : * Buffer tag identifies which disk block the buffer contains.
83 : *
84 : * Note: the BufferTag data must be sufficient to determine where to write the
85 : * block, without reference to pg_class or pg_tablespace entries. It's
86 : * possible that the backend flushing the buffer doesn't even believe the
87 : * relation is visible yet (its xact may have started before the xact that
88 : * created the rel). The storage manager must be able to cope anyway.
89 : *
90 : * Note: if there's any pad bytes in the struct, InitBufferTag will have
91 : * to be fixed to zero them, since this struct is used as a hash key.
92 : */
93 : typedef struct buftag
94 : {
95 : Oid spcOid; /* tablespace oid */
96 : Oid dbOid; /* database oid */
97 : RelFileNumber relNumber; /* relation file number */
98 : ForkNumber forkNum; /* fork number */
99 : BlockNumber blockNum; /* blknum relative to begin of reln */
100 : } BufferTag;
101 :
102 : static inline RelFileNumber
103 288192166 : BufTagGetRelNumber(const BufferTag *tag)
104 : {
105 288192166 : return tag->relNumber;
106 : }
107 :
108 : static inline ForkNumber
109 41641614 : BufTagGetForkNum(const BufferTag *tag)
110 : {
111 41641614 : return tag->forkNum;
112 : }
113 :
114 : static inline void
115 116101710 : BufTagSetRelForkDetails(BufferTag *tag, RelFileNumber relnumber,
116 : ForkNumber forknum)
117 : {
118 116101710 : tag->relNumber = relnumber;
119 116101710 : tag->forkNum = forknum;
120 116101710 : }
121 :
122 : static inline RelFileLocator
123 35681626 : BufTagGetRelFileLocator(const BufferTag *tag)
124 : {
125 : RelFileLocator rlocator;
126 :
127 35681626 : rlocator.spcOid = tag->spcOid;
128 35681626 : rlocator.dbOid = tag->dbOid;
129 35681626 : rlocator.relNumber = BufTagGetRelNumber(tag);
130 :
131 35681626 : return rlocator;
132 : }
133 :
134 : static inline void
135 18310882 : ClearBufferTag(BufferTag *tag)
136 : {
137 18310882 : tag->spcOid = InvalidOid;
138 18310882 : tag->dbOid = InvalidOid;
139 18310882 : BufTagSetRelForkDetails(tag, InvalidRelFileNumber, InvalidForkNumber);
140 18310882 : tag->blockNum = InvalidBlockNumber;
141 18310882 : }
142 :
143 : static inline void
144 97790828 : InitBufferTag(BufferTag *tag, const RelFileLocator *rlocator,
145 : ForkNumber forkNum, BlockNumber blockNum)
146 : {
147 97790828 : tag->spcOid = rlocator->spcOid;
148 97790828 : tag->dbOid = rlocator->dbOid;
149 97790828 : BufTagSetRelForkDetails(tag, rlocator->relNumber, forkNum);
150 97790828 : tag->blockNum = blockNum;
151 97790828 : }
152 :
153 : static inline bool
154 1105846 : BufferTagsEqual(const BufferTag *tag1, const BufferTag *tag2)
155 : {
156 2211594 : return (tag1->spcOid == tag2->spcOid) &&
157 1105748 : (tag1->dbOid == tag2->dbOid) &&
158 1105746 : (tag1->relNumber == tag2->relNumber) &&
159 3314018 : (tag1->blockNum == tag2->blockNum) &&
160 1102424 : (tag1->forkNum == tag2->forkNum);
161 : }
162 :
163 : static inline bool
164 762675202 : BufTagMatchesRelFileLocator(const BufferTag *tag,
165 : const RelFileLocator *rlocator)
166 : {
167 1091587934 : return (tag->spcOid == rlocator->spcOid) &&
168 1014688160 : (tag->dbOid == rlocator->dbOid) &&
169 252012958 : (BufTagGetRelNumber(tag) == rlocator->relNumber);
170 : }
171 :
172 :
173 : /*
174 : * The shared buffer mapping table is partitioned to reduce contention.
175 : * To determine which partition lock a given tag requires, compute the tag's
176 : * hash code with BufTableHashCode(), then apply BufMappingPartitionLock().
177 : * NB: NUM_BUFFER_PARTITIONS must be a power of 2!
178 : */
179 : static inline uint32
180 96855262 : BufTableHashPartition(uint32 hashcode)
181 : {
182 96855262 : return hashcode % NUM_BUFFER_PARTITIONS;
183 : }
184 :
185 : static inline LWLock *
186 96855262 : BufMappingPartitionLock(uint32 hashcode)
187 : {
188 96855262 : return &MainLWLockArray[BUFFER_MAPPING_LWLOCK_OFFSET +
189 96855262 : BufTableHashPartition(hashcode)].lock;
190 : }
191 :
192 : static inline LWLock *
193 : BufMappingPartitionLockByIndex(uint32 index)
194 : {
195 : return &MainLWLockArray[BUFFER_MAPPING_LWLOCK_OFFSET + index].lock;
196 : }
197 :
198 : /*
199 : * BufferDesc -- shared descriptor/state data for a single shared buffer.
200 : *
201 : * Note: Buffer header lock (BM_LOCKED flag) must be held to examine or change
202 : * tag, state or wait_backend_pgprocno fields. In general, buffer header lock
203 : * is a spinlock which is combined with flags, refcount and usagecount into
 * single atomic variable. This layout allows us to do some operations in a
205 : * single atomic operation, without actually acquiring and releasing spinlock;
206 : * for instance, increase or decrease refcount. buf_id field never changes
207 : * after initialization, so does not need locking. freeNext is protected by
208 : * the buffer_strategy_lock not buffer header lock. The LWLock can take care
209 : * of itself. The buffer header lock is *not* used to control access to the
210 : * data in the buffer!
211 : *
212 : * It's assumed that nobody changes the state field while buffer header lock
213 : * is held. Thus buffer header lock holder can do complex updates of the
214 : * state variable in single write, simultaneously with lock release (cleaning
215 : * BM_LOCKED flag). On the other hand, updating of state without holding
216 : * buffer header lock is restricted to CAS, which ensures that BM_LOCKED flag
217 : * is not set. Atomic increment/decrement, OR/AND etc. are not allowed.
218 : *
219 : * An exception is that if we have the buffer pinned, its tag can't change
220 : * underneath us, so we can examine the tag without locking the buffer header.
221 : * Also, in places we do one-time reads of the flags without bothering to
222 : * lock the buffer header; this is generally for situations where we don't
223 : * expect the flag bit being tested to be changing.
224 : *
225 : * We can't physically remove items from a disk page if another backend has
226 : * the buffer pinned. Hence, a backend may need to wait for all other pins
227 : * to go away. This is signaled by storing its own pgprocno into
228 : * wait_backend_pgprocno and setting flag bit BM_PIN_COUNT_WAITER. At present,
229 : * there can be only one such waiter per buffer.
230 : *
231 : * We use this same struct for local buffer headers, but the locks are not
232 : * used and not all of the flag bits are useful either. To avoid unnecessary
233 : * overhead, manipulations of the state field should be done without actual
234 : * atomic operations (i.e. only pg_atomic_read_u32() and
235 : * pg_atomic_unlocked_write_u32()).
236 : *
237 : * Be careful to avoid increasing the size of the struct when adding or
238 : * reordering members. Keeping it below 64 bytes (the most common CPU
239 : * cache line size) is fairly important for performance.
240 : *
241 : * Per-buffer I/O condition variables are currently kept outside this struct in
242 : * a separate array. They could be moved in here and still fit within that
243 : * limit on common systems, but for now that is not done.
244 : */
245 : typedef struct BufferDesc
246 : {
247 : BufferTag tag; /* ID of page contained in buffer */
248 : int buf_id; /* buffer's index number (from 0) */
249 :
250 : /* state of the tag, containing flags, refcount and usagecount */
251 : pg_atomic_uint32 state;
252 :
253 : int wait_backend_pgprocno; /* backend of pin-count waiter */
254 : int freeNext; /* link in freelist chain */
255 : LWLock content_lock; /* to lock access to buffer contents */
256 : } BufferDesc;
257 :
258 : /*
259 : * Concurrent access to buffer headers has proven to be more efficient if
260 : * they're cache line aligned. So we force the start of the BufferDescriptors
261 : * array to be on a cache line boundary and force the elements to be cache
262 : * line sized.
263 : *
264 : * XXX: As this is primarily matters in highly concurrent workloads which
265 : * probably all are 64bit these days, and the space wastage would be a bit
266 : * more noticeable on 32bit systems, we don't force the stride to be cache
267 : * line sized on those. If somebody does actual performance testing, we can
268 : * reevaluate.
269 : *
270 : * Note that local buffer descriptors aren't forced to be aligned - as there's
271 : * no concurrent access to those it's unlikely to be beneficial.
272 : *
273 : * We use a 64-byte cache line size here, because that's the most common
274 : * size. Making it bigger would be a waste of memory. Even if running on a
275 : * platform with either 32 or 128 byte line sizes, it's good to align to
276 : * boundaries and avoid false sharing.
277 : */
278 : #define BUFFERDESC_PAD_TO_SIZE (SIZEOF_VOID_P == 8 ? 64 : 1)
279 :
280 : typedef union BufferDescPadded
281 : {
282 : BufferDesc bufferdesc;
283 : char pad[BUFFERDESC_PAD_TO_SIZE];
284 : } BufferDescPadded;
285 :
286 : /*
287 : * The PendingWriteback & WritebackContext structure are used to keep
288 : * information about pending flush requests to be issued to the OS.
289 : */
290 : typedef struct PendingWriteback
291 : {
292 : /* could store different types of pending flushes here */
293 : BufferTag tag;
294 : } PendingWriteback;
295 :
296 : /* struct forward declared in bufmgr.h */
297 : typedef struct WritebackContext
298 : {
299 : /* pointer to the max number of writeback requests to coalesce */
300 : int *max_pending;
301 :
302 : /* current number of pending writeback requests */
303 : int nr_pending;
304 :
305 : /* pending requests */
306 : PendingWriteback pending_writebacks[WRITEBACK_MAX_PENDING_FLUSHES];
307 : } WritebackContext;
308 :
309 : /* in buf_init.c */
310 : extern PGDLLIMPORT BufferDescPadded *BufferDescriptors;
311 : extern PGDLLIMPORT ConditionVariableMinimallyPadded *BufferIOCVArray;
312 : extern PGDLLIMPORT WritebackContext BackendWritebackContext;
313 :
314 : /* in localbuf.c */
315 : extern PGDLLIMPORT BufferDesc *LocalBufferDescriptors;
316 :
317 :
318 : static inline BufferDesc *
319 979669306 : GetBufferDescriptor(uint32 id)
320 : {
321 979669306 : return &(BufferDescriptors[id]).bufferdesc;
322 : }
323 :
324 : static inline BufferDesc *
325 15547100 : GetLocalBufferDescriptor(uint32 id)
326 : {
327 15547100 : return &LocalBufferDescriptors[id];
328 : }
329 :
330 : static inline Buffer
331 427966374 : BufferDescriptorGetBuffer(const BufferDesc *bdesc)
332 : {
333 427966374 : return (Buffer) (bdesc->buf_id + 1);
334 : }
335 :
336 : static inline ConditionVariable *
337 19669242 : BufferDescriptorGetIOCV(const BufferDesc *bdesc)
338 : {
339 19669242 : return &(BufferIOCVArray[bdesc->buf_id]).cv;
340 : }
341 :
342 : static inline LWLock *
343 284054772 : BufferDescriptorGetContentLock(const BufferDesc *bdesc)
344 : {
345 284054772 : return (LWLock *) (&bdesc->content_lock);
346 : }
347 :
348 : /*
349 : * The freeNext field is either the index of the next freelist entry,
350 : * or one of these special values:
351 : */
352 : #define FREENEXT_END_OF_LIST (-1)
353 : #define FREENEXT_NOT_IN_LIST (-2)
354 :
355 : /*
356 : * Functions for acquiring/releasing a shared buffer header's spinlock. Do
357 : * not apply these to local buffers!
358 : */
359 : extern uint32 LockBufHdr(BufferDesc *desc);
360 :
361 : static inline void
362 47544122 : UnlockBufHdr(BufferDesc *desc, uint32 buf_state)
363 : {
364 47544122 : pg_write_barrier();
365 47544122 : pg_atomic_write_u32(&desc->state, buf_state & (~BM_LOCKED));
366 47544122 : }
367 :
368 : /* in bufmgr.c */
369 :
370 : /*
371 : * Structure to sort buffers per file on checkpoints.
372 : *
373 : * This structure is allocated per buffer in shared memory, so it should be
374 : * kept as small as possible.
375 : */
376 : typedef struct CkptSortItem
377 : {
378 : Oid tsId;
379 : RelFileNumber relNumber;
380 : ForkNumber forkNum;
381 : BlockNumber blockNum;
382 : int buf_id;
383 : } CkptSortItem;
384 :
385 : extern PGDLLIMPORT CkptSortItem *CkptBufferIds;
386 :
387 : /* ResourceOwner callbacks to hold buffer I/Os and pins */
388 : extern PGDLLIMPORT const ResourceOwnerDesc buffer_io_resowner_desc;
389 : extern PGDLLIMPORT const ResourceOwnerDesc buffer_pin_resowner_desc;
390 :
391 : /* Convenience wrappers over ResourceOwnerRemember/Forget */
392 : static inline void
393 114300814 : ResourceOwnerRememberBuffer(ResourceOwner owner, Buffer buffer)
394 : {
395 114300814 : ResourceOwnerRemember(owner, Int32GetDatum(buffer), &buffer_pin_resowner_desc);
396 114300814 : }
397 : static inline void
398 114292540 : ResourceOwnerForgetBuffer(ResourceOwner owner, Buffer buffer)
399 : {
400 114292540 : ResourceOwnerForget(owner, Int32GetDatum(buffer), &buffer_pin_resowner_desc);
401 114292540 : }
402 : static inline void
403 3507626 : ResourceOwnerRememberBufferIO(ResourceOwner owner, Buffer buffer)
404 : {
405 3507626 : ResourceOwnerRemember(owner, Int32GetDatum(buffer), &buffer_io_resowner_desc);
406 3507626 : }
407 : static inline void
408 3507596 : ResourceOwnerForgetBufferIO(ResourceOwner owner, Buffer buffer)
409 : {
410 3507596 : ResourceOwnerForget(owner, Int32GetDatum(buffer), &buffer_io_resowner_desc);
411 3507596 : }
412 :
413 : /*
414 : * Internal buffer management routines
415 : */
416 : /* bufmgr.c */
417 : extern void WritebackContextInit(WritebackContext *context, int *max_pending);
418 : extern void IssuePendingWritebacks(WritebackContext *wb_context, IOContext io_context);
419 : extern void ScheduleBufferTagForWriteback(WritebackContext *wb_context,
420 : IOContext io_context, BufferTag *tag);
421 :
422 : /* freelist.c */
423 : extern IOContext IOContextForStrategy(BufferAccessStrategy strategy);
424 : extern BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy,
425 : uint32 *buf_state, bool *from_ring);
426 : extern void StrategyFreeBuffer(BufferDesc *buf);
427 : extern bool StrategyRejectBuffer(BufferAccessStrategy strategy,
428 : BufferDesc *buf, bool from_ring);
429 :
430 : extern int StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc);
431 : extern void StrategyNotifyBgWriter(int bgwprocno);
432 :
433 : extern Size StrategyShmemSize(void);
434 : extern void StrategyInitialize(bool init);
435 : extern bool have_free_buffer(void);
436 :
437 : /* buf_table.c */
438 : extern Size BufTableShmemSize(int size);
439 : extern void InitBufTable(int size);
440 : extern uint32 BufTableHashCode(BufferTag *tagPtr);
441 : extern int BufTableLookup(BufferTag *tagPtr, uint32 hashcode);
442 : extern int BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id);
443 : extern void BufTableDelete(BufferTag *tagPtr, uint32 hashcode);
444 :
445 : /* localbuf.c */
446 : extern bool PinLocalBuffer(BufferDesc *buf_hdr, bool adjust_usagecount);
447 : extern void UnpinLocalBuffer(Buffer buffer);
448 : extern void UnpinLocalBufferNoOwner(Buffer buffer);
449 : extern PrefetchBufferResult PrefetchLocalBuffer(SMgrRelation smgr,
450 : ForkNumber forkNum,
451 : BlockNumber blockNum);
452 : extern BufferDesc *LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum,
453 : BlockNumber blockNum, bool *foundPtr);
454 : extern BlockNumber ExtendBufferedRelLocal(BufferManagerRelation bmr,
455 : ForkNumber fork,
456 : uint32 flags,
457 : uint32 extend_by,
458 : BlockNumber extend_upto,
459 : Buffer *buffers,
460 : uint32 *extended_by);
461 : extern void MarkLocalBufferDirty(Buffer buffer);
462 : extern void DropRelationLocalBuffers(RelFileLocator rlocator,
463 : ForkNumber forkNum,
464 : BlockNumber firstDelBlock);
465 : extern void DropRelationAllLocalBuffers(RelFileLocator rlocator);
466 : extern void AtEOXact_LocalBuffers(bool isCommit);
467 :
468 : #endif /* BUFMGR_INTERNALS_H */
|