LCOV - code coverage report
Current view: top level - src/include/storage - buf_internals.h (source / functions) Hit Total Coverage
Test: PostgreSQL 18devel Lines: 66 66 100.0 %
Date: 2025-01-18 04:15:08 Functions: 20 20 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * buf_internals.h
       4             :  *    Internal definitions for buffer manager and the buffer replacement
       5             :  *    strategy.
       6             :  *
       7             :  *
       8             :  * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
       9             :  * Portions Copyright (c) 1994, Regents of the University of California
      10             :  *
      11             :  * src/include/storage/buf_internals.h
      12             :  *
      13             :  *-------------------------------------------------------------------------
      14             :  */
      15             : #ifndef BUFMGR_INTERNALS_H
      16             : #define BUFMGR_INTERNALS_H
      17             : 
      18             : #include "pgstat.h"
      19             : #include "port/atomics.h"
      20             : #include "storage/buf.h"
      21             : #include "storage/bufmgr.h"
      22             : #include "storage/condition_variable.h"
      23             : #include "storage/lwlock.h"
      24             : #include "storage/shmem.h"
      25             : #include "storage/smgr.h"
      26             : #include "storage/spin.h"
      27             : #include "utils/relcache.h"
      28             : #include "utils/resowner.h"
      29             : 
      30             : /*
      31             :  * Buffer state is a single 32-bit variable in which the following data is combined.
      32             :  *
      33             :  * - 18 bits refcount
      34             :  * - 4 bits usage count
      35             :  * - 10 bits of flags
      36             :  *
      37             :  * Combining these values allows some operations to be performed without
      38             :  * locking the buffer header, by modifying them together with a CAS loop.
      39             :  *
      40             :  * The definition of buffer state components is below.
      41             :  */
      42             : #define BUF_REFCOUNT_ONE 1
      43             : #define BUF_REFCOUNT_MASK ((1U << 18) - 1)
      44             : #define BUF_USAGECOUNT_MASK 0x003C0000U
      45             : #define BUF_USAGECOUNT_ONE (1U << 18)
      46             : #define BUF_USAGECOUNT_SHIFT 18
      47             : #define BUF_FLAG_MASK 0xFFC00000U
      48             : 
      49             : /* Get refcount and usagecount from buffer state */
      50             : #define BUF_STATE_GET_REFCOUNT(state) ((state) & BUF_REFCOUNT_MASK)
      51             : #define BUF_STATE_GET_USAGECOUNT(state) (((state) & BUF_USAGECOUNT_MASK) >> BUF_USAGECOUNT_SHIFT)
      52             : 
      53             : /*
      54             :  * Flags for buffer descriptors
      55             :  *
      56             :  * Note: BM_TAG_VALID essentially means that there is a buffer hashtable
      57             :  * entry associated with the buffer's tag.
      58             :  */
      59             : #define BM_LOCKED               (1U << 22)    /* buffer header is locked */
      60             : #define BM_DIRTY                (1U << 23)    /* data needs writing */
      61             : #define BM_VALID                (1U << 24)    /* data is valid */
      62             : #define BM_TAG_VALID            (1U << 25)    /* tag is assigned */
      63             : #define BM_IO_IN_PROGRESS       (1U << 26)    /* read or write in progress */
      64             : #define BM_IO_ERROR             (1U << 27)    /* previous I/O failed */
      65             : #define BM_JUST_DIRTIED         (1U << 28)    /* dirtied since write started */
      66             : #define BM_PIN_COUNT_WAITER     (1U << 29)    /* have waiter for sole pin */
      67             : #define BM_CHECKPOINT_NEEDED    (1U << 30)    /* must write for checkpoint */
      68             : #define BM_PERMANENT            (1U << 31)    /* permanent buffer (not unlogged,
      69             :                                              * or init fork) */
      70             : /*
      71             :  * The maximum allowed value of usage_count represents a tradeoff between
      72             :  * accuracy and speed of the clock-sweep buffer management algorithm.  A
      73             :  * large value (comparable to NBuffers) would approximate LRU semantics.
      74             :  * But it can take as many as BM_MAX_USAGE_COUNT+1 complete cycles of
      75             :  * clock sweeps to find a free buffer, so in practice we don't want the
      76             :  * value to be very large.
      77             :  */
      78             : #define BM_MAX_USAGE_COUNT  5
      79             : 
      80             : /*
      81             :  * Buffer tag identifies which disk block the buffer contains.
      82             :  *
      83             :  * Note: the BufferTag data must be sufficient to determine where to write the
      84             :  * block, without reference to pg_class or pg_tablespace entries.  It's
      85             :  * possible that the backend flushing the buffer doesn't even believe the
      86             :  * relation is visible yet (its xact may have started before the xact that
      87             :  * created the rel).  The storage manager must be able to cope anyway.
      88             :  *
      89             :  * Note: if there are any pad bytes in the struct, InitBufferTag will have
      90             :  * to be fixed to zero them, since this struct is used as a hash key.
      91             :  */
      92             : typedef struct buftag
      93             : {
      94             :     Oid         spcOid;         /* tablespace oid */
      95             :     Oid         dbOid;          /* database oid */
      96             :     RelFileNumber relNumber;    /* relation file number */
      97             :     ForkNumber  forkNum;        /* fork number */
      98             :     BlockNumber blockNum;       /* blknum relative to begin of reln */
      99             : } BufferTag;
     100             : 
     101             : static inline RelFileNumber
     102   290394196 : BufTagGetRelNumber(const BufferTag *tag)
     103             : {
     104   290394196 :     return tag->relNumber;
     105             : }
     106             : 
     107             : static inline ForkNumber
     108    40295676 : BufTagGetForkNum(const BufferTag *tag)
     109             : {
     110    40295676 :     return tag->forkNum;
     111             : }
     112             : 
     113             : static inline void
     114   131269204 : BufTagSetRelForkDetails(BufferTag *tag, RelFileNumber relnumber,
     115             :                         ForkNumber forknum)
     116             : {
     117   131269204 :     tag->relNumber = relnumber;
     118   131269204 :     tag->forkNum = forknum;
     119   131269204 : }
     120             : 
     121             : static inline RelFileLocator
     122    33054310 : BufTagGetRelFileLocator(const BufferTag *tag)
     123             : {
     124             :     RelFileLocator rlocator;
     125             : 
     126    33054310 :     rlocator.spcOid = tag->spcOid;
     127    33054310 :     rlocator.dbOid = tag->dbOid;
     128    33054310 :     rlocator.relNumber = BufTagGetRelNumber(tag);
     129             : 
     130    33054310 :     return rlocator;
     131             : }
     132             : 
     133             : static inline void
     134    20246200 : ClearBufferTag(BufferTag *tag)
     135             : {
     136    20246200 :     tag->spcOid = InvalidOid;
     137    20246200 :     tag->dbOid = InvalidOid;
     138    20246200 :     BufTagSetRelForkDetails(tag, InvalidRelFileNumber, InvalidForkNumber);
     139    20246200 :     tag->blockNum = InvalidBlockNumber;
     140    20246200 : }
     141             : 
     142             : static inline void
     143   111023004 : InitBufferTag(BufferTag *tag, const RelFileLocator *rlocator,
     144             :               ForkNumber forkNum, BlockNumber blockNum)
     145             : {
     146   111023004 :     tag->spcOid = rlocator->spcOid;
     147   111023004 :     tag->dbOid = rlocator->dbOid;
     148   111023004 :     BufTagSetRelForkDetails(tag, rlocator->relNumber, forkNum);
     149   111023004 :     tag->blockNum = blockNum;
     150   111023004 : }
     151             : 
     152             : static inline bool
     153      204936 : BufferTagsEqual(const BufferTag *tag1, const BufferTag *tag2)
     154             : {
     155      409872 :     return (tag1->spcOid == tag2->spcOid) &&
     156      204936 :         (tag1->dbOid == tag2->dbOid) &&
     157      204936 :         (tag1->relNumber == tag2->relNumber) &&
     158      614738 :         (tag1->blockNum == tag2->blockNum) &&
     159      204866 :         (tag1->forkNum == tag2->forkNum);
     160             : }
     161             : 
     162             : static inline bool
     163   776846272 : BufTagMatchesRelFileLocator(const BufferTag *tag,
     164             :                             const RelFileLocator *rlocator)
     165             : {
     166  1116476536 :     return (tag->spcOid == rlocator->spcOid) &&
     167  1033608136 :         (tag->dbOid == rlocator->dbOid) &&
     168   256761864 :         (BufTagGetRelNumber(tag) == rlocator->relNumber);
     169             : }
     170             : 
     171             : 
     172             : /*
     173             :  * The shared buffer mapping table is partitioned to reduce contention.
     174             :  * To determine which partition lock a given tag requires, compute the tag's
     175             :  * hash code with BufTableHashCode(), then apply BufMappingPartitionLock().
     176             :  * NB: NUM_BUFFER_PARTITIONS must be a power of 2!
     177             :  */
     178             : static inline uint32
     179   111290980 : BufTableHashPartition(uint32 hashcode)
     180             : {
     181   111290980 :     return hashcode % NUM_BUFFER_PARTITIONS;
     182             : }
     183             : 
     184             : static inline LWLock *
     185   111290980 : BufMappingPartitionLock(uint32 hashcode)
     186             : {
     187   111290980 :     return &MainLWLockArray[BUFFER_MAPPING_LWLOCK_OFFSET +
     188   111290980 :                             BufTableHashPartition(hashcode)].lock;
     189             : }
     190             : 
     191             : static inline LWLock *
     192             : BufMappingPartitionLockByIndex(uint32 index)
     193             : {
     194             :     return &MainLWLockArray[BUFFER_MAPPING_LWLOCK_OFFSET + index].lock;
     195             : }
     196             : 
     197             : /*
     198             :  *  BufferDesc -- shared descriptor/state data for a single shared buffer.
     199             :  *
     200             :  * Note: Buffer header lock (BM_LOCKED flag) must be held to examine or change
     201             :  * tag, state or wait_backend_pgprocno fields.  In general, buffer header lock
     202             :  * is a spinlock which is combined with flags, refcount and usagecount into
     203             :  * a single atomic variable.  This layout allows us to do some operations in a
     204             :  * single atomic operation, without actually acquiring and releasing spinlock;
     205             :  * for instance, increase or decrease refcount.  buf_id field never changes
     206             :  * after initialization, so does not need locking.  freeNext is protected by
     207             :  * the buffer_strategy_lock not buffer header lock.  The LWLock can take care
     208             :  * of itself.  The buffer header lock is *not* used to control access to the
     209             :  * data in the buffer!
     210             :  *
     211             :  * It's assumed that nobody changes the state field while buffer header lock
     212             :  * is held.  Thus buffer header lock holder can do complex updates of the
     213             :  * state variable in a single write, simultaneously with lock release (clearing
     214             :  * the BM_LOCKED flag).  On the other hand, updating of state without holding
     215             :  * buffer header lock is restricted to CAS, which ensures that BM_LOCKED flag
     216             :  * is not set.  Atomic increment/decrement, OR/AND etc. are not allowed.
     217             :  *
     218             :  * An exception is that if we have the buffer pinned, its tag can't change
     219             :  * underneath us, so we can examine the tag without locking the buffer header.
     220             :  * Also, in places we do one-time reads of the flags without bothering to
     221             :  * lock the buffer header; this is generally for situations where we don't
     222             :  * expect the flag bit being tested to be changing.
     223             :  *
     224             :  * We can't physically remove items from a disk page if another backend has
     225             :  * the buffer pinned.  Hence, a backend may need to wait for all other pins
     226             :  * to go away.  This is signaled by storing its own pgprocno into
     227             :  * wait_backend_pgprocno and setting flag bit BM_PIN_COUNT_WAITER.  At present,
     228             :  * there can be only one such waiter per buffer.
     229             :  *
     230             :  * We use this same struct for local buffer headers, but the locks are not
     231             :  * used and not all of the flag bits are useful either. To avoid unnecessary
     232             :  * overhead, manipulations of the state field should be done without actual
     233             :  * atomic operations (i.e. only pg_atomic_read_u32() and
     234             :  * pg_atomic_unlocked_write_u32()).
     235             :  *
     236             :  * Be careful to avoid increasing the size of the struct when adding or
     237             :  * reordering members.  Keeping it below 64 bytes (the most common CPU
     238             :  * cache line size) is fairly important for performance.
     239             :  *
     240             :  * Per-buffer I/O condition variables are currently kept outside this struct in
     241             :  * a separate array.  They could be moved in here and still fit within that
     242             :  * limit on common systems, but for now that is not done.
     243             :  */
     244             : typedef struct BufferDesc
     245             : {
     246             :     BufferTag   tag;            /* ID of page contained in buffer */
     247             :     int         buf_id;         /* buffer's index number (from 0) */
     248             : 
     249             :     /* state of the tag, containing flags, refcount and usagecount */
     250             :     pg_atomic_uint32 state;
     251             : 
     252             :     int         wait_backend_pgprocno;  /* backend of pin-count waiter */
     253             :     int         freeNext;       /* link in freelist chain */
     254             :     LWLock      content_lock;   /* to lock access to buffer contents */
     255             : } BufferDesc;
     256             : 
     257             : /*
     258             :  * Concurrent access to buffer headers has proven to be more efficient if
     259             :  * they're cache line aligned. So we force the start of the BufferDescriptors
     260             :  * array to be on a cache line boundary and force the elements to be cache
     261             :  * line sized.
     262             :  *
     263             :  * XXX: As this primarily matters in highly concurrent workloads which
     264             :  * probably all are 64bit these days, and the space wastage would be a bit
     265             :  * more noticeable on 32bit systems, we don't force the stride to be cache
     266             :  * line sized on those. If somebody does actual performance testing, we can
     267             :  * reevaluate.
     268             :  *
     269             :  * Note that local buffer descriptors aren't forced to be aligned - as there's
     270             :  * no concurrent access to those it's unlikely to be beneficial.
     271             :  *
     272             :  * We use a 64-byte cache line size here, because that's the most common
     273             :  * size. Making it bigger would be a waste of memory. Even if running on a
     274             :  * platform with either 32 or 128 byte line sizes, it's good to align to
     275             :  * boundaries and avoid false sharing.
     276             :  */
     277             : #define BUFFERDESC_PAD_TO_SIZE  (SIZEOF_VOID_P == 8 ? 64 : 1)
     278             : 
     279             : typedef union BufferDescPadded
     280             : {
     281             :     BufferDesc  bufferdesc;
     282             :     char        pad[BUFFERDESC_PAD_TO_SIZE];
     283             : } BufferDescPadded;
     284             : 
     285             : /*
     286             :  * The PendingWriteback & WritebackContext structures are used to keep
     287             :  * information about pending flush requests to be issued to the OS.
     288             :  */
     289             : typedef struct PendingWriteback
     290             : {
     291             :     /* could store different types of pending flushes here */
     292             :     BufferTag   tag;
     293             : } PendingWriteback;
     294             : 
     295             : /* struct forward declared in bufmgr.h */
     296             : typedef struct WritebackContext
     297             : {
     298             :     /* pointer to the max number of writeback requests to coalesce */
     299             :     int        *max_pending;
     300             : 
     301             :     /* current number of pending writeback requests */
     302             :     int         nr_pending;
     303             : 
     304             :     /* pending requests */
     305             :     PendingWriteback pending_writebacks[WRITEBACK_MAX_PENDING_FLUSHES];
     306             : } WritebackContext;
     307             : 
     308             : /* in buf_init.c */
     309             : extern PGDLLIMPORT BufferDescPadded *BufferDescriptors;
     310             : extern PGDLLIMPORT ConditionVariableMinimallyPadded *BufferIOCVArray;
     311             : extern PGDLLIMPORT WritebackContext BackendWritebackContext;
     312             : 
     313             : /* in localbuf.c */
     314             : extern PGDLLIMPORT BufferDesc *LocalBufferDescriptors;
     315             : 
     316             : 
     317             : static inline BufferDesc *
     318  1063597314 : GetBufferDescriptor(uint32 id)
     319             : {
     320  1063597314 :     return &(BufferDescriptors[id]).bufferdesc;
     321             : }
     322             : 
     323             : static inline BufferDesc *
     324    15890626 : GetLocalBufferDescriptor(uint32 id)
     325             : {
     326    15890626 :     return &LocalBufferDescriptors[id];
     327             : }
     328             : 
     329             : static inline Buffer
     330   492880320 : BufferDescriptorGetBuffer(const BufferDesc *bdesc)
     331             : {
     332   492880320 :     return (Buffer) (bdesc->buf_id + 1);
     333             : }
     334             : 
     335             : static inline ConditionVariable *
     336    22169036 : BufferDescriptorGetIOCV(const BufferDesc *bdesc)
     337             : {
     338    22169036 :     return &(BufferIOCVArray[bdesc->buf_id]).cv;
     339             : }
     340             : 
     341             : static inline LWLock *
     342   317595586 : BufferDescriptorGetContentLock(const BufferDesc *bdesc)
     343             : {
     344   317595586 :     return (LWLock *) (&bdesc->content_lock);
     345             : }
     346             : 
     347             : /*
     348             :  * The freeNext field is either the index of the next freelist entry,
     349             :  * or one of these special values:
     350             :  */
     351             : #define FREENEXT_END_OF_LIST    (-1)
     352             : #define FREENEXT_NOT_IN_LIST    (-2)
     353             : 
     354             : /*
     355             :  * Functions for acquiring/releasing a shared buffer header's spinlock.  Do
     356             :  * not apply these to local buffers!
     357             :  */
     358             : extern uint32 LockBufHdr(BufferDesc *desc);
     359             : 
     360             : static inline void
     361    64760266 : UnlockBufHdr(BufferDesc *desc, uint32 buf_state)
     362             : {
     363    64760266 :     pg_write_barrier();
     364    64760266 :     pg_atomic_write_u32(&desc->state, buf_state & (~BM_LOCKED));
     365    64760266 : }
     366             : 
     367             : /* in bufmgr.c */
     368             : 
     369             : /*
     370             :  * Structure to sort buffers per file on checkpoints.
     371             :  *
     372             :  * This structure is allocated per buffer in shared memory, so it should be
     373             :  * kept as small as possible.
     374             :  */
     375             : typedef struct CkptSortItem
     376             : {
     377             :     Oid         tsId;
     378             :     RelFileNumber relNumber;
     379             :     ForkNumber  forkNum;
     380             :     BlockNumber blockNum;
     381             :     int         buf_id;
     382             : } CkptSortItem;
     383             : 
     384             : extern PGDLLIMPORT CkptSortItem *CkptBufferIds;
     385             : 
     386             : /* ResourceOwner callbacks to hold buffer I/Os and pins */
     387             : extern PGDLLIMPORT const ResourceOwnerDesc buffer_io_resowner_desc;
     388             : extern PGDLLIMPORT const ResourceOwnerDesc buffer_pin_resowner_desc;
     389             : 
     390             : /* Convenience wrappers over ResourceOwnerRemember/Forget */
     391             : static inline void
     392   130992088 : ResourceOwnerRememberBuffer(ResourceOwner owner, Buffer buffer)
     393             : {
     394   130992088 :     ResourceOwnerRemember(owner, Int32GetDatum(buffer), &buffer_pin_resowner_desc);
     395   130992088 : }
     396             : static inline void
     397   130983068 : ResourceOwnerForgetBuffer(ResourceOwner owner, Buffer buffer)
     398             : {
     399   130983068 :     ResourceOwnerForget(owner, Int32GetDatum(buffer), &buffer_pin_resowner_desc);
     400   130983068 : }
     401             : static inline void
     402     4383784 : ResourceOwnerRememberBufferIO(ResourceOwner owner, Buffer buffer)
     403             : {
     404     4383784 :     ResourceOwnerRemember(owner, Int32GetDatum(buffer), &buffer_io_resowner_desc);
     405     4383784 : }
     406             : static inline void
     407     4383754 : ResourceOwnerForgetBufferIO(ResourceOwner owner, Buffer buffer)
     408             : {
     409     4383754 :     ResourceOwnerForget(owner, Int32GetDatum(buffer), &buffer_io_resowner_desc);
     410     4383754 : }
     411             : 
     412             : /*
     413             :  * Internal buffer management routines
     414             :  */
     415             : /* bufmgr.c */
     416             : extern void WritebackContextInit(WritebackContext *context, int *max_pending);
     417             : extern void IssuePendingWritebacks(WritebackContext *wb_context, IOContext io_context);
     418             : extern void ScheduleBufferTagForWriteback(WritebackContext *wb_context,
     419             :                                           IOContext io_context, BufferTag *tag);
     420             : 
     421             : /* freelist.c */
     422             : extern IOContext IOContextForStrategy(BufferAccessStrategy strategy);
     423             : extern BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy,
     424             :                                      uint32 *buf_state, bool *from_ring);
     425             : extern void StrategyFreeBuffer(BufferDesc *buf);
     426             : extern bool StrategyRejectBuffer(BufferAccessStrategy strategy,
     427             :                                  BufferDesc *buf, bool from_ring);
     428             : 
     429             : extern int  StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc);
     430             : extern void StrategyNotifyBgWriter(int bgwprocno);
     431             : 
     432             : extern Size StrategyShmemSize(void);
     433             : extern void StrategyInitialize(bool init);
     434             : extern bool have_free_buffer(void);
     435             : 
     436             : /* buf_table.c */
     437             : extern Size BufTableShmemSize(int size);
     438             : extern void InitBufTable(int size);
     439             : extern uint32 BufTableHashCode(BufferTag *tagPtr);
     440             : extern int  BufTableLookup(BufferTag *tagPtr, uint32 hashcode);
     441             : extern int  BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id);
     442             : extern void BufTableDelete(BufferTag *tagPtr, uint32 hashcode);
     443             : 
     444             : /* localbuf.c */
     445             : extern bool PinLocalBuffer(BufferDesc *buf_hdr, bool adjust_usagecount);
     446             : extern void UnpinLocalBuffer(Buffer buffer);
     447             : extern void UnpinLocalBufferNoOwner(Buffer buffer);
     448             : extern PrefetchBufferResult PrefetchLocalBuffer(SMgrRelation smgr,
     449             :                                                 ForkNumber forkNum,
     450             :                                                 BlockNumber blockNum);
     451             : extern BufferDesc *LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum,
     452             :                                     BlockNumber blockNum, bool *foundPtr);
     453             : extern BlockNumber ExtendBufferedRelLocal(BufferManagerRelation bmr,
     454             :                                           ForkNumber fork,
     455             :                                           uint32 flags,
     456             :                                           uint32 extend_by,
     457             :                                           BlockNumber extend_upto,
     458             :                                           Buffer *buffers,
     459             :                                           uint32 *extended_by);
     460             : extern void MarkLocalBufferDirty(Buffer buffer);
     461             : extern void DropRelationLocalBuffers(RelFileLocator rlocator,
     462             :                                      ForkNumber forkNum,
     463             :                                      BlockNumber firstDelBlock);
     464             : extern void DropRelationAllLocalBuffers(RelFileLocator rlocator);
     465             : extern void AtEOXact_LocalBuffers(bool isCommit);
     466             : 
     467             : #endif                          /* BUFMGR_INTERNALS_H */

Generated by: LCOV version 1.14