LCOV - code coverage report
Current view:  top level - src/include/storage - buf_internals.h (source / functions)
Test:          PostgreSQL 17devel
Date:          2024-04-19 04:11:42
Coverage:      Lines: 66 / 66 hit (100.0 %)    Functions: 20 / 20 hit (100.0 %)
Legend:        Lines: hit | not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * buf_internals.h
       4             :  *    Internal definitions for buffer manager and the buffer replacement
       5             :  *    strategy.
       6             :  *
       7             :  *
       8             :  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
       9             :  * Portions Copyright (c) 1994, Regents of the University of California
      10             :  *
      11             :  * src/include/storage/buf_internals.h
      12             :  *
      13             :  *-------------------------------------------------------------------------
      14             :  */
      15             : #ifndef BUFMGR_INTERNALS_H
      16             : #define BUFMGR_INTERNALS_H
      17             : 
      18             : #include "pgstat.h"
      19             : #include "port/atomics.h"
      20             : #include "storage/buf.h"
      21             : #include "storage/bufmgr.h"
      22             : #include "storage/condition_variable.h"
      23             : #include "storage/latch.h"
      24             : #include "storage/lwlock.h"
      25             : #include "storage/shmem.h"
      26             : #include "storage/smgr.h"
      27             : #include "storage/spin.h"
      28             : #include "utils/relcache.h"
      29             : #include "utils/resowner.h"
      30             : 
      31             : /*
       32             :  * Buffer state is a single 32-bit variable where the following data is combined.
      33             :  *
      34             :  * - 18 bits refcount
      35             :  * - 4 bits usage count
      36             :  * - 10 bits of flags
      37             :  *
       38             :  * Combining these values allows us to perform some operations without locking
      39             :  * the buffer header, by modifying them together with a CAS loop.
      40             :  *
      41             :  * The definition of buffer state components is below.
      42             :  */
      43             : #define BUF_REFCOUNT_ONE 1
      44             : #define BUF_REFCOUNT_MASK ((1U << 18) - 1)
      45             : #define BUF_USAGECOUNT_MASK 0x003C0000U
      46             : #define BUF_USAGECOUNT_ONE (1U << 18)
      47             : #define BUF_USAGECOUNT_SHIFT 18
      48             : #define BUF_FLAG_MASK 0xFFC00000U
      49             : 
      50             : /* Get refcount and usagecount from buffer state */
      51             : #define BUF_STATE_GET_REFCOUNT(state) ((state) & BUF_REFCOUNT_MASK)
      52             : #define BUF_STATE_GET_USAGECOUNT(state) (((state) & BUF_USAGECOUNT_MASK) >> BUF_USAGECOUNT_SHIFT)
      53             : 
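/*
 * Illustrative sketch, not part of buf_internals.h: the CAS approach
 * described above, bumping the refcount without taking the buffer header
 * spinlock.  The real PinBuffer() in bufmgr.c additionally has to cope with
 * the BM_LOCKED flag bit defined further down.
 */
static inline void
example_pin_state(pg_atomic_uint32 *state_var)
{
    uint32      old_state = pg_atomic_read_u32(state_var);
    uint32      new_state;

    do
    {
        /* preserve flags and usage count, add one to the refcount bits */
        new_state = old_state + BUF_REFCOUNT_ONE;
    } while (!pg_atomic_compare_exchange_u32(state_var, &old_state, new_state));
}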
      54             : /*
      55             :  * Flags for buffer descriptors
      56             :  *
      57             :  * Note: BM_TAG_VALID essentially means that there is a buffer hashtable
      58             :  * entry associated with the buffer's tag.
      59             :  */
      60             : #define BM_LOCKED               (1U << 22)    /* buffer header is locked */
      61             : #define BM_DIRTY                (1U << 23)    /* data needs writing */
      62             : #define BM_VALID                (1U << 24)    /* data is valid */
      63             : #define BM_TAG_VALID            (1U << 25)    /* tag is assigned */
      64             : #define BM_IO_IN_PROGRESS       (1U << 26)    /* read or write in progress */
      65             : #define BM_IO_ERROR             (1U << 27)    /* previous I/O failed */
      66             : #define BM_JUST_DIRTIED         (1U << 28)    /* dirtied since write started */
      67             : #define BM_PIN_COUNT_WAITER     (1U << 29)    /* have waiter for sole pin */
      68             : #define BM_CHECKPOINT_NEEDED    (1U << 30)    /* must write for checkpoint */
      69             : #define BM_PERMANENT            (1U << 31)    /* permanent buffer (not unlogged,
      70             :                                              * or init fork) */
      71             : /*
      72             :  * The maximum allowed value of usage_count represents a tradeoff between
      73             :  * accuracy and speed of the clock-sweep buffer management algorithm.  A
      74             :  * large value (comparable to NBuffers) would approximate LRU semantics.
      75             :  * But it can take as many as BM_MAX_USAGE_COUNT+1 complete cycles of
      76             :  * clock sweeps to find a free buffer, so in practice we don't want the
      77             :  * value to be very large.
      78             :  */
      79             : #define BM_MAX_USAGE_COUNT  5
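/*
 * Illustrative sketch, not part of buf_internals.h: the per-buffer test the
 * clock sweep applies, as described above.  A buffer whose refcount and usage
 * count are both zero may be reclaimed; an unpinned but recently used buffer
 * is merely aged, which is why finding a victim can take up to
 * BM_MAX_USAGE_COUNT+1 passes.  The real logic lives in freelist.c's
 * StrategyGetBuffer().
 */
static inline bool
example_clock_sweep_step(uint32 *buf_state)
{
    if (BUF_STATE_GET_REFCOUNT(*buf_state) == 0)
    {
        if (BUF_STATE_GET_USAGECOUNT(*buf_state) == 0)
            return true;        /* unpinned and unused: eviction candidate */
        *buf_state -= BUF_USAGECOUNT_ONE;   /* age it and move on */
    }
    return false;               /* pinned or recently used: keep sweeping */
}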
      80             : 
      81             : /*
      82             :  * Buffer tag identifies which disk block the buffer contains.
      83             :  *
      84             :  * Note: the BufferTag data must be sufficient to determine where to write the
      85             :  * block, without reference to pg_class or pg_tablespace entries.  It's
      86             :  * possible that the backend flushing the buffer doesn't even believe the
      87             :  * relation is visible yet (its xact may have started before the xact that
      88             :  * created the rel).  The storage manager must be able to cope anyway.
      89             :  *
       90             :  * Note: if there are any pad bytes in the struct, InitBufferTag will have
      91             :  * to be fixed to zero them, since this struct is used as a hash key.
      92             :  */
      93             : typedef struct buftag
      94             : {
      95             :     Oid         spcOid;         /* tablespace oid */
      96             :     Oid         dbOid;          /* database oid */
      97             :     RelFileNumber relNumber;    /* relation file number */
      98             :     ForkNumber  forkNum;        /* fork number */
      99             :     BlockNumber blockNum;       /* blknum relative to begin of reln */
     100             : } BufferTag;
     101             : 
     102             : static inline RelFileNumber
     103   288192166 : BufTagGetRelNumber(const BufferTag *tag)
     104             : {
     105   288192166 :     return tag->relNumber;
     106             : }
     107             : 
     108             : static inline ForkNumber
     109    41641614 : BufTagGetForkNum(const BufferTag *tag)
     110             : {
     111    41641614 :     return tag->forkNum;
     112             : }
     113             : 
     114             : static inline void
     115   116101710 : BufTagSetRelForkDetails(BufferTag *tag, RelFileNumber relnumber,
     116             :                         ForkNumber forknum)
     117             : {
     118   116101710 :     tag->relNumber = relnumber;
     119   116101710 :     tag->forkNum = forknum;
     120   116101710 : }
     121             : 
     122             : static inline RelFileLocator
     123    35681626 : BufTagGetRelFileLocator(const BufferTag *tag)
     124             : {
     125             :     RelFileLocator rlocator;
     126             : 
     127    35681626 :     rlocator.spcOid = tag->spcOid;
     128    35681626 :     rlocator.dbOid = tag->dbOid;
     129    35681626 :     rlocator.relNumber = BufTagGetRelNumber(tag);
     130             : 
     131    35681626 :     return rlocator;
     132             : }
     133             : 
     134             : static inline void
     135    18310882 : ClearBufferTag(BufferTag *tag)
     136             : {
     137    18310882 :     tag->spcOid = InvalidOid;
     138    18310882 :     tag->dbOid = InvalidOid;
     139    18310882 :     BufTagSetRelForkDetails(tag, InvalidRelFileNumber, InvalidForkNumber);
     140    18310882 :     tag->blockNum = InvalidBlockNumber;
     141    18310882 : }
     142             : 
     143             : static inline void
     144    97790828 : InitBufferTag(BufferTag *tag, const RelFileLocator *rlocator,
     145             :               ForkNumber forkNum, BlockNumber blockNum)
     146             : {
     147    97790828 :     tag->spcOid = rlocator->spcOid;
     148    97790828 :     tag->dbOid = rlocator->dbOid;
     149    97790828 :     BufTagSetRelForkDetails(tag, rlocator->relNumber, forkNum);
     150    97790828 :     tag->blockNum = blockNum;
     151    97790828 : }
     152             : 
     153             : static inline bool
     154     1105846 : BufferTagsEqual(const BufferTag *tag1, const BufferTag *tag2)
     155             : {
     156     2211594 :     return (tag1->spcOid == tag2->spcOid) &&
     157     1105748 :         (tag1->dbOid == tag2->dbOid) &&
     158     1105746 :         (tag1->relNumber == tag2->relNumber) &&
     159     3314018 :         (tag1->blockNum == tag2->blockNum) &&
     160     1102424 :         (tag1->forkNum == tag2->forkNum);
     161             : }
     162             : 
     163             : static inline bool
     164   762675202 : BufTagMatchesRelFileLocator(const BufferTag *tag,
     165             :                             const RelFileLocator *rlocator)
     166             : {
     167  1091587934 :     return (tag->spcOid == rlocator->spcOid) &&
     168  1014688160 :         (tag->dbOid == rlocator->dbOid) &&
     169   252012958 :         (BufTagGetRelNumber(tag) == rlocator->relNumber);
     170             : }
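/*
 * Illustrative sketch, not part of buf_internals.h: building tags for a block
 * of a relation's main fork and running the comparisons defined above.  The
 * rlocator and blkno arguments are placeholders supplied by the caller.
 */
static inline bool
example_tag_checks(const RelFileLocator *rlocator, BlockNumber blkno)
{
    BufferTag   a;
    BufferTag   b;

    InitBufferTag(&a, rlocator, MAIN_FORKNUM, blkno);
    InitBufferTag(&b, rlocator, MAIN_FORKNUM, blkno);

    /* both checks hold for freshly built, identical tags */
    return BufferTagsEqual(&a, &b) &&
        BufTagMatchesRelFileLocator(&a, rlocator);
}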
     171             : 
     172             : 
     173             : /*
     174             :  * The shared buffer mapping table is partitioned to reduce contention.
     175             :  * To determine which partition lock a given tag requires, compute the tag's
     176             :  * hash code with BufTableHashCode(), then apply BufMappingPartitionLock().
     177             :  * NB: NUM_BUFFER_PARTITIONS must be a power of 2!
     178             :  */
     179             : static inline uint32
     180    96855262 : BufTableHashPartition(uint32 hashcode)
     181             : {
     182    96855262 :     return hashcode % NUM_BUFFER_PARTITIONS;
     183             : }
     184             : 
     185             : static inline LWLock *
     186    96855262 : BufMappingPartitionLock(uint32 hashcode)
     187             : {
     188    96855262 :     return &MainLWLockArray[BUFFER_MAPPING_LWLOCK_OFFSET +
     189    96855262 :                             BufTableHashPartition(hashcode)].lock;
     190             : }
     191             : 
     192             : static inline LWLock *
     193             : BufMappingPartitionLockByIndex(uint32 index)
     194             : {
     195             :     return &MainLWLockArray[BUFFER_MAPPING_LWLOCK_OFFSET + index].lock;
     196             : }
     197             : 
     198             : /*
     199             :  *  BufferDesc -- shared descriptor/state data for a single shared buffer.
     200             :  *
     201             :  * Note: Buffer header lock (BM_LOCKED flag) must be held to examine or change
     202             :  * tag, state or wait_backend_pgprocno fields.  In general, buffer header lock
     203             :  * is a spinlock which is combined with flags, refcount and usagecount into
     204             :  * single atomic variable.  This layout allow us to do some operations in a
     205             :  * single atomic operation, without actually acquiring and releasing spinlock;
     206             :  * for instance, increase or decrease refcount.  buf_id field never changes
     207             :  * after initialization, so does not need locking.  freeNext is protected by
     208             :  * the buffer_strategy_lock not buffer header lock.  The LWLock can take care
     209             :  * of itself.  The buffer header lock is *not* used to control access to the
     210             :  * data in the buffer!
     211             :  *
     212             :  * It's assumed that nobody changes the state field while buffer header lock
     213             :  * is held.  Thus buffer header lock holder can do complex updates of the
     214             :  * state variable in single write, simultaneously with lock release (cleaning
     215             :  * BM_LOCKED flag).  On the other hand, updating of state without holding
     216             :  * buffer header lock is restricted to CAS, which ensures that BM_LOCKED flag
     217             :  * is not set.  Atomic increment/decrement, OR/AND etc. are not allowed.
     218             :  *
     219             :  * An exception is that if we have the buffer pinned, its tag can't change
     220             :  * underneath us, so we can examine the tag without locking the buffer header.
     221             :  * Also, in places we do one-time reads of the flags without bothering to
     222             :  * lock the buffer header; this is generally for situations where we don't
     223             :  * expect the flag bit being tested to be changing.
     224             :  *
     225             :  * We can't physically remove items from a disk page if another backend has
     226             :  * the buffer pinned.  Hence, a backend may need to wait for all other pins
     227             :  * to go away.  This is signaled by storing its own pgprocno into
     228             :  * wait_backend_pgprocno and setting flag bit BM_PIN_COUNT_WAITER.  At present,
     229             :  * there can be only one such waiter per buffer.
     230             :  *
     231             :  * We use this same struct for local buffer headers, but the locks are not
     232             :  * used and not all of the flag bits are useful either. To avoid unnecessary
     233             :  * overhead, manipulations of the state field should be done without actual
     234             :  * atomic operations (i.e. only pg_atomic_read_u32() and
     235             :  * pg_atomic_unlocked_write_u32()).
     236             :  *
     237             :  * Be careful to avoid increasing the size of the struct when adding or
     238             :  * reordering members.  Keeping it below 64 bytes (the most common CPU
     239             :  * cache line size) is fairly important for performance.
     240             :  *
     241             :  * Per-buffer I/O condition variables are currently kept outside this struct in
     242             :  * a separate array.  They could be moved in here and still fit within that
     243             :  * limit on common systems, but for now that is not done.
     244             :  */
     245             : typedef struct BufferDesc
     246             : {
     247             :     BufferTag   tag;            /* ID of page contained in buffer */
     248             :     int         buf_id;         /* buffer's index number (from 0) */
     249             : 
     250             :     /* state of the tag, containing flags, refcount and usagecount */
     251             :     pg_atomic_uint32 state;
     252             : 
     253             :     int         wait_backend_pgprocno;  /* backend of pin-count waiter */
     254             :     int         freeNext;       /* link in freelist chain */
     255             :     LWLock      content_lock;   /* to lock access to buffer contents */
     256             : } BufferDesc;
     257             : 
     258             : /*
     259             :  * Concurrent access to buffer headers has proven to be more efficient if
     260             :  * they're cache line aligned. So we force the start of the BufferDescriptors
     261             :  * array to be on a cache line boundary and force the elements to be cache
     262             :  * line sized.
     263             :  *
      264             :  * XXX: As this primarily matters in highly concurrent workloads, which
     265             :  * probably all are 64bit these days, and the space wastage would be a bit
     266             :  * more noticeable on 32bit systems, we don't force the stride to be cache
     267             :  * line sized on those. If somebody does actual performance testing, we can
     268             :  * reevaluate.
     269             :  *
     270             :  * Note that local buffer descriptors aren't forced to be aligned - as there's
     271             :  * no concurrent access to those it's unlikely to be beneficial.
     272             :  *
     273             :  * We use a 64-byte cache line size here, because that's the most common
     274             :  * size. Making it bigger would be a waste of memory. Even if running on a
     275             :  * platform with either 32 or 128 byte line sizes, it's good to align to
     276             :  * boundaries and avoid false sharing.
     277             :  */
     278             : #define BUFFERDESC_PAD_TO_SIZE  (SIZEOF_VOID_P == 8 ? 64 : 1)
     279             : 
     280             : typedef union BufferDescPadded
     281             : {
     282             :     BufferDesc  bufferdesc;
     283             :     char        pad[BUFFERDESC_PAD_TO_SIZE];
     284             : } BufferDescPadded;
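/*
 * Illustrative sketch, not part of buf_internals.h: a compile-time check,
 * using StaticAssertDecl() from c.h, that BufferDesc still fits into the
 * padded stride on 64-bit builds (on 32-bit, the stride is intentionally 1).
 */
StaticAssertDecl(SIZEOF_VOID_P != 8 ||
                 sizeof(BufferDesc) <= BUFFERDESC_PAD_TO_SIZE,
                 "BufferDesc exceeds BUFFERDESC_PAD_TO_SIZE");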
     285             : 
     286             : /*
      287             :  * The PendingWriteback & WritebackContext structures are used to keep
     288             :  * information about pending flush requests to be issued to the OS.
     289             :  */
     290             : typedef struct PendingWriteback
     291             : {
     292             :     /* could store different types of pending flushes here */
     293             :     BufferTag   tag;
     294             : } PendingWriteback;
     295             : 
     296             : /* struct forward declared in bufmgr.h */
     297             : typedef struct WritebackContext
     298             : {
     299             :     /* pointer to the max number of writeback requests to coalesce */
     300             :     int        *max_pending;
     301             : 
     302             :     /* current number of pending writeback requests */
     303             :     int         nr_pending;
     304             : 
     305             :     /* pending requests */
     306             :     PendingWriteback pending_writebacks[WRITEBACK_MAX_PENDING_FLUSHES];
     307             : } WritebackContext;
     308             : 
     309             : /* in buf_init.c */
     310             : extern PGDLLIMPORT BufferDescPadded *BufferDescriptors;
     311             : extern PGDLLIMPORT ConditionVariableMinimallyPadded *BufferIOCVArray;
     312             : extern PGDLLIMPORT WritebackContext BackendWritebackContext;
     313             : 
     314             : /* in localbuf.c */
     315             : extern PGDLLIMPORT BufferDesc *LocalBufferDescriptors;
     316             : 
     317             : 
     318             : static inline BufferDesc *
     319   979669306 : GetBufferDescriptor(uint32 id)
     320             : {
     321   979669306 :     return &(BufferDescriptors[id]).bufferdesc;
     322             : }
     323             : 
     324             : static inline BufferDesc *
     325    15547100 : GetLocalBufferDescriptor(uint32 id)
     326             : {
     327    15547100 :     return &LocalBufferDescriptors[id];
     328             : }
     329             : 
     330             : static inline Buffer
     331   427966374 : BufferDescriptorGetBuffer(const BufferDesc *bdesc)
     332             : {
     333   427966374 :     return (Buffer) (bdesc->buf_id + 1);
     334             : }
     335             : 
     336             : static inline ConditionVariable *
     337    19669242 : BufferDescriptorGetIOCV(const BufferDesc *bdesc)
     338             : {
     339    19669242 :     return &(BufferIOCVArray[bdesc->buf_id]).cv;
     340             : }
     341             : 
     342             : static inline LWLock *
     343   284054772 : BufferDescriptorGetContentLock(const BufferDesc *bdesc)
     344             : {
     345   284054772 :     return (LWLock *) (&bdesc->content_lock);
     346             : }
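/*
 * Illustrative sketch, not part of buf_internals.h: shared buffer numbers are
 * 1-based while descriptor array indexes are 0-based, so converting back and
 * forth with the helpers above looks like this.
 */
static inline Buffer
example_buffer_roundtrip(Buffer buffer)
{
    BufferDesc *bufHdr = GetBufferDescriptor(buffer - 1);

    return BufferDescriptorGetBuffer(bufHdr);   /* yields the original number */
}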
     347             : 
     348             : /*
     349             :  * The freeNext field is either the index of the next freelist entry,
     350             :  * or one of these special values:
     351             :  */
     352             : #define FREENEXT_END_OF_LIST    (-1)
     353             : #define FREENEXT_NOT_IN_LIST    (-2)
     354             : 
     355             : /*
     356             :  * Functions for acquiring/releasing a shared buffer header's spinlock.  Do
     357             :  * not apply these to local buffers!
     358             :  */
     359             : extern uint32 LockBufHdr(BufferDesc *desc);
     360             : 
     361             : static inline void
     362    47544122 : UnlockBufHdr(BufferDesc *desc, uint32 buf_state)
     363             : {
     364    47544122 :     pg_write_barrier();
     365    47544122 :     pg_atomic_write_u32(&desc->state, buf_state & (~BM_LOCKED));
     366    47544122 : }
     367             : 
     368             : /* in bufmgr.c */
     369             : 
     370             : /*
     371             :  * Structure to sort buffers per file on checkpoints.
     372             :  *
     373             :  * This structure is allocated per buffer in shared memory, so it should be
     374             :  * kept as small as possible.
     375             :  */
     376             : typedef struct CkptSortItem
     377             : {
     378             :     Oid         tsId;
     379             :     RelFileNumber relNumber;
     380             :     ForkNumber  forkNum;
     381             :     BlockNumber blockNum;
     382             :     int         buf_id;
     383             : } CkptSortItem;
     384             : 
     385             : extern PGDLLIMPORT CkptSortItem *CkptBufferIds;
     386             : 
     387             : /* ResourceOwner callbacks to hold buffer I/Os and pins */
     388             : extern PGDLLIMPORT const ResourceOwnerDesc buffer_io_resowner_desc;
     389             : extern PGDLLIMPORT const ResourceOwnerDesc buffer_pin_resowner_desc;
     390             : 
     391             : /* Convenience wrappers over ResourceOwnerRemember/Forget */
     392             : static inline void
     393   114300814 : ResourceOwnerRememberBuffer(ResourceOwner owner, Buffer buffer)
     394             : {
     395   114300814 :     ResourceOwnerRemember(owner, Int32GetDatum(buffer), &buffer_pin_resowner_desc);
     396   114300814 : }
     397             : static inline void
     398   114292540 : ResourceOwnerForgetBuffer(ResourceOwner owner, Buffer buffer)
     399             : {
     400   114292540 :     ResourceOwnerForget(owner, Int32GetDatum(buffer), &buffer_pin_resowner_desc);
     401   114292540 : }
     402             : static inline void
     403     3507626 : ResourceOwnerRememberBufferIO(ResourceOwner owner, Buffer buffer)
     404             : {
     405     3507626 :     ResourceOwnerRemember(owner, Int32GetDatum(buffer), &buffer_io_resowner_desc);
     406     3507626 : }
     407             : static inline void
     408     3507596 : ResourceOwnerForgetBufferIO(ResourceOwner owner, Buffer buffer)
     409             : {
     410     3507596 :     ResourceOwnerForget(owner, Int32GetDatum(buffer), &buffer_io_resowner_desc);
     411     3507596 : }
     412             : 
     413             : /*
     414             :  * Internal buffer management routines
     415             :  */
     416             : /* bufmgr.c */
     417             : extern void WritebackContextInit(WritebackContext *context, int *max_pending);
     418             : extern void IssuePendingWritebacks(WritebackContext *wb_context, IOContext io_context);
     419             : extern void ScheduleBufferTagForWriteback(WritebackContext *wb_context,
     420             :                                           IOContext io_context, BufferTag *tag);
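/*
 * Illustrative sketch, not part of buf_internals.h: how a writeback context
 * is typically driven with the routines declared above.  The flush_after
 * argument is a hypothetical placeholder for the caller's *max_pending
 * setting, which bounds how many requests accumulate before being coalesced
 * and handed to the OS.
 */
static inline void
example_writeback_cycle(int *flush_after, BufferTag *tag)
{
    WritebackContext wb_context;

    WritebackContextInit(&wb_context, flush_after);

    /* queue one flush request; may issue early once *max_pending is reached */
    ScheduleBufferTagForWriteback(&wb_context, IOCONTEXT_NORMAL, tag);

    /* force out whatever is still pending */
    IssuePendingWritebacks(&wb_context, IOCONTEXT_NORMAL);
}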
     421             : 
     422             : /* freelist.c */
     423             : extern IOContext IOContextForStrategy(BufferAccessStrategy strategy);
     424             : extern BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy,
     425             :                                      uint32 *buf_state, bool *from_ring);
     426             : extern void StrategyFreeBuffer(BufferDesc *buf);
     427             : extern bool StrategyRejectBuffer(BufferAccessStrategy strategy,
     428             :                                  BufferDesc *buf, bool from_ring);
     429             : 
     430             : extern int  StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc);
     431             : extern void StrategyNotifyBgWriter(int bgwprocno);
     432             : 
     433             : extern Size StrategyShmemSize(void);
     434             : extern void StrategyInitialize(bool init);
     435             : extern bool have_free_buffer(void);
     436             : 
     437             : /* buf_table.c */
     438             : extern Size BufTableShmemSize(int size);
     439             : extern void InitBufTable(int size);
     440             : extern uint32 BufTableHashCode(BufferTag *tagPtr);
     441             : extern int  BufTableLookup(BufferTag *tagPtr, uint32 hashcode);
     442             : extern int  BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id);
     443             : extern void BufTableDelete(BufferTag *tagPtr, uint32 hashcode);
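/*
 * Illustrative sketch, not part of buf_internals.h: a read-only lookup in the
 * shared buffer mapping table, taking the tag's partition lock as described
 * next to BufMappingPartitionLock() above.
 */
static inline int
example_lookup_buffer(BufferTag *tag)
{
    uint32      hashcode = BufTableHashCode(tag);
    LWLock     *partitionLock = BufMappingPartitionLock(hashcode);
    int         buf_id;

    LWLockAcquire(partitionLock, LW_SHARED);
    buf_id = BufTableLookup(tag, hashcode);
    LWLockRelease(partitionLock);

    return buf_id;              /* buffer id, or -1 if not in the table */
}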
     444             : 
     445             : /* localbuf.c */
     446             : extern bool PinLocalBuffer(BufferDesc *buf_hdr, bool adjust_usagecount);
     447             : extern void UnpinLocalBuffer(Buffer buffer);
     448             : extern void UnpinLocalBufferNoOwner(Buffer buffer);
     449             : extern PrefetchBufferResult PrefetchLocalBuffer(SMgrRelation smgr,
     450             :                                                 ForkNumber forkNum,
     451             :                                                 BlockNumber blockNum);
     452             : extern BufferDesc *LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum,
     453             :                                     BlockNumber blockNum, bool *foundPtr);
     454             : extern BlockNumber ExtendBufferedRelLocal(BufferManagerRelation bmr,
     455             :                                           ForkNumber fork,
     456             :                                           uint32 flags,
     457             :                                           uint32 extend_by,
     458             :                                           BlockNumber extend_upto,
     459             :                                           Buffer *buffers,
     460             :                                           uint32 *extended_by);
     461             : extern void MarkLocalBufferDirty(Buffer buffer);
     462             : extern void DropRelationLocalBuffers(RelFileLocator rlocator,
     463             :                                      ForkNumber forkNum,
     464             :                                      BlockNumber firstDelBlock);
     465             : extern void DropRelationAllLocalBuffers(RelFileLocator rlocator);
     466             : extern void AtEOXact_LocalBuffers(bool isCommit);
     467             : 
     468             : #endif                          /* BUFMGR_INTERNALS_H */

Generated by: LCOV version 1.14