LCOV - code coverage report
Current view: top level - src/backend/storage/buffer - bufmgr.c (source / functions)
Test: PostgreSQL 19devel
Date: 2026-02-11 10:18:05
Coverage:              Hit     Total   Coverage
  Lines:              1890      2102     89.9 %
  Functions:           127       137     92.7 %
Legend: Lines: hit / not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * bufmgr.c
       4             :  *    buffer manager interface routines
       5             :  *
       6             :  * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
       7             :  * Portions Copyright (c) 1994, Regents of the University of California
       8             :  *
       9             :  *
      10             :  * IDENTIFICATION
      11             :  *    src/backend/storage/buffer/bufmgr.c
      12             :  *
      13             :  *-------------------------------------------------------------------------
      14             :  */
      15             : /*
      16             :  * Principal entry points:
      17             :  *
      18             :  * ReadBuffer() -- find or create a buffer holding the requested page,
      19             :  *      and pin it so that no one can destroy it while this process
      20             :  *      is using it.
      21             :  *
      22             :  * StartReadBuffer() -- as above, with separate wait step
      23             :  * StartReadBuffers() -- multiple block version
      24             :  * WaitReadBuffers() -- second step of above
      25             :  *
      26             :  * ReleaseBuffer() -- unpin a buffer
      27             :  *
      28             :  * MarkBufferDirty() -- mark a pinned buffer's contents as "dirty".
      29             :  *      The disk write is delayed until buffer replacement or checkpoint.
      30             :  *
      31             :  * See also these files:
      32             :  *      freelist.c -- chooses victim for buffer replacement
      33             :  *      buf_table.c -- manages the buffer lookup table
      34             :  */
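/*
 * [Editor's illustrative sketch -- not part of bufmgr.c.]  A typical caller
 * combines the entry points above roughly as follows.  The function name
 * my_touch_page() is hypothetical and WAL-logging is elided; this only
 * sketches the pin/lock/dirty/release pattern, assuming the caller already
 * holds a suitable relation-level lock:
 *
 *     static void
 *     my_touch_page(Relation rel, BlockNumber blkno)
 *     {
 *         Buffer      buf;
 *         Page        page;
 *
 *         buf = ReadBuffer(rel, blkno);               // find-or-read, pin
 *         LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);     // content lock
 *         page = BufferGetPage(buf);
 *         // ... modify the page and emit WAL as appropriate ...
 *         MarkBufferDirty(buf);                       // write is deferred
 *         UnlockReleaseBuffer(buf);                   // unlock and unpin
 *     }
 */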
      35             : #include "postgres.h"
      36             : 
      37             : #include <sys/file.h>
      38             : #include <unistd.h>
      39             : 
      40             : #include "access/tableam.h"
      41             : #include "access/xloginsert.h"
      42             : #include "access/xlogutils.h"
      43             : #ifdef USE_ASSERT_CHECKING
      44             : #include "catalog/pg_tablespace_d.h"
      45             : #endif
      46             : #include "catalog/storage.h"
      47             : #include "catalog/storage_xlog.h"
      48             : #include "executor/instrument.h"
      49             : #include "lib/binaryheap.h"
      50             : #include "miscadmin.h"
      51             : #include "pg_trace.h"
      52             : #include "pgstat.h"
      53             : #include "postmaster/bgwriter.h"
      54             : #include "storage/aio.h"
      55             : #include "storage/buf_internals.h"
      56             : #include "storage/bufmgr.h"
      57             : #include "storage/fd.h"
      58             : #include "storage/ipc.h"
      59             : #include "storage/lmgr.h"
      60             : #include "storage/proc.h"
      61             : #include "storage/proclist.h"
      62             : #include "storage/procsignal.h"
      63             : #include "storage/read_stream.h"
      64             : #include "storage/smgr.h"
      65             : #include "storage/standby.h"
      66             : #include "utils/memdebug.h"
      67             : #include "utils/ps_status.h"
      68             : #include "utils/rel.h"
      69             : #include "utils/resowner.h"
      70             : #include "utils/timestamp.h"
      71             : 
      72             : 
      73             : /* Note: these two macros only work on shared buffers, not local ones! */
      74             : #define BufHdrGetBlock(bufHdr)  ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
      75             : #define BufferGetLSN(bufHdr)    (PageGetLSN(BufHdrGetBlock(bufHdr)))
      76             : 
      77             : /* Note: this macro only works on local buffers, not shared ones! */
      78             : #define LocalBufHdrGetBlock(bufHdr) \
      79             :     LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]
      80             : 
      81             : /* Bits in SyncOneBuffer's return value */
      82             : #define BUF_WRITTEN             0x01
      83             : #define BUF_REUSABLE            0x02
      84             : 
      85             : #define RELS_BSEARCH_THRESHOLD      20
      86             : 
      87             : /*
       88             :  * This is the size (in blocks) above which we scan the entire buffer pool
       89             :  * to remove the buffers for all pages of the relation being dropped.  For
       90             :  * relations smaller than this threshold, we instead find the buffers by
       91             :  * doing lookups in the BufMapping table.
      92             :  */
      93             : #define BUF_DROP_FULL_SCAN_THRESHOLD        (uint64) (NBuffers / 32)
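/*
 * [Editor's illustrative sketch -- not part of bufmgr.c.]  The threshold is
 * intended to be used along these lines in the relation-drop paths further
 * below (nBlocksToInvalidate and nForkBlocks are hypothetical names for the
 * per-relation block counts):
 *
 *     if (nBlocksToInvalidate < BUF_DROP_FULL_SCAN_THRESHOLD)
 *     {
 *         // small relation: per-block lookups in the buffer mapping table
 *         for (fork = 0; fork <= MAX_FORKNUM; fork++)
 *             FindAndDropRelationBuffers(rlocator, fork,
 *                                        nForkBlocks[fork], 0);
 *         return;
 *     }
 *     // otherwise fall through to one sweep over all NBuffers headers
 */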
      94             : 
      95             : /*
      96             :  * This is separated out from PrivateRefCountEntry to allow for copying all
      97             :  * the data members via struct assignment.
      98             :  */
      99             : typedef struct PrivateRefCountData
     100             : {
     101             :     /*
      102             :      * How many times the buffer has been pinned by this backend.
     103             :      */
     104             :     int32       refcount;
     105             : 
     106             :     /*
     107             :      * Is the buffer locked by this backend? BUFFER_LOCK_UNLOCK indicates that
     108             :      * the buffer is not locked.
     109             :      */
     110             :     BufferLockMode lockmode;
     111             : } PrivateRefCountData;
     112             : 
     113             : typedef struct PrivateRefCountEntry
     114             : {
     115             :     /*
      116             :      * Note that this needs to be the same as the entry's corresponding
      117             :      * PrivateRefCountArrayKeys[i], if the entry is stored in the array.  We
      118             :      * store it in both places because it is used as the hashtable key,
      119             :      * because it is more convenient (passing around a PrivateRefCountEntry
      120             :      * suffices to identify the buffer), and because it is faster (checking
      121             :      * the keys array is faster when checking many entries, while checking
      122             :      * the entry itself is faster when checking just a single entry).
     123             :      */
     124             :     Buffer      buffer;
     125             : 
     126             :     PrivateRefCountData data;
     127             : } PrivateRefCountEntry;
     128             : 
     129             : /* 64 bytes, about the size of a cache line on common systems */
     130             : #define REFCOUNT_ARRAY_ENTRIES 8
     131             : 
     132             : /*
     133             :  * Status of buffers to checkpoint for a particular tablespace, used
     134             :  * internally in BufferSync.
     135             :  */
     136             : typedef struct CkptTsStatus
     137             : {
     138             :     /* oid of the tablespace */
     139             :     Oid         tsId;
     140             : 
     141             :     /*
     142             :      * Checkpoint progress for this tablespace. To make progress comparable
     143             :      * between tablespaces the progress is, for each tablespace, measured as a
     144             :      * number between 0 and the total number of to-be-checkpointed pages. Each
     145             :      * page checkpointed in this tablespace increments this space's progress
     146             :      * by progress_slice.
     147             :      */
     148             :     float8      progress;
     149             :     float8      progress_slice;
     150             : 
      151             :     /* number of to-be-checkpointed pages in this tablespace */
     152             :     int         num_to_scan;
     153             :     /* already processed pages in this tablespace */
     154             :     int         num_scanned;
     155             : 
     156             :     /* current offset in CkptBufferIds for this tablespace */
     157             :     int         index;
     158             : } CkptTsStatus;
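/*
 * [Editor's illustrative sketch -- not part of bufmgr.c.]  The progress
 * bookkeeping above is presumably maintained along these lines in
 * BufferSync(), where total_to_scan stands for the overall number of
 * to-be-checkpointed pages (a hypothetical name used only here):
 *
 *     ts_stat->progress_slice = (float8) total_to_scan / ts_stat->num_to_scan;
 *     ...
 *     ts_stat->progress += ts_stat->progress_slice;   // once per page written
 *
 * so every tablespace's progress runs from 0 to total_to_scan, which makes
 * progress directly comparable across tablespaces of different sizes.
 */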
     159             : 
     160             : /*
     161             :  * Type for array used to sort SMgrRelations
     162             :  *
      163             :  * FlushRelationsAllBuffers shares the same comparator function with
      164             :  * DropRelationsAllBuffers.  Pointers to this struct and to RelFileLocator
      165             :  * must be compatible.
     166             :  */
     167             : typedef struct SMgrSortArray
     168             : {
     169             :     RelFileLocator rlocator;    /* This must be the first member */
     170             :     SMgrRelation srel;
     171             : } SMgrSortArray;
     172             : 
     173             : /* GUC variables */
     174             : bool        zero_damaged_pages = false;
     175             : int         bgwriter_lru_maxpages = 100;
     176             : double      bgwriter_lru_multiplier = 2.0;
     177             : bool        track_io_timing = false;
     178             : 
     179             : /*
     180             :  * How many buffers PrefetchBuffer callers should try to stay ahead of their
     181             :  * ReadBuffer calls by.  Zero means "never prefetch".  This value is only used
     182             :  * for buffers not belonging to tablespaces that have their
     183             :  * effective_io_concurrency parameter set.
     184             :  */
     185             : int         effective_io_concurrency = DEFAULT_EFFECTIVE_IO_CONCURRENCY;
     186             : 
     187             : /*
     188             :  * Like effective_io_concurrency, but used by maintenance code paths that might
     189             :  * benefit from a higher setting because they work on behalf of many sessions.
     190             :  * Overridden by the tablespace setting of the same name.
     191             :  */
     192             : int         maintenance_io_concurrency = DEFAULT_MAINTENANCE_IO_CONCURRENCY;
     193             : 
     194             : /*
      195             :  * Limit on how many blocks should be handled in a single I/O operation.
     196             :  * StartReadBuffers() callers should respect it, as should other operations
     197             :  * that call smgr APIs directly.  It is computed as the minimum of underlying
     198             :  * GUCs io_combine_limit_guc and io_max_combine_limit.
     199             :  */
     200             : int         io_combine_limit = DEFAULT_IO_COMBINE_LIMIT;
     201             : int         io_combine_limit_guc = DEFAULT_IO_COMBINE_LIMIT;
     202             : int         io_max_combine_limit = DEFAULT_IO_COMBINE_LIMIT;
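/*
 * [Editor's illustrative sketch -- not part of bufmgr.c.]  Per the comment
 * above, the effective io_combine_limit is presumably kept in sync whenever
 * either underlying GUC changes, along the lines of:
 *
 *     io_combine_limit = Min(io_combine_limit_guc, io_max_combine_limit);
 */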
     203             : 
     204             : /*
     205             :  * GUC variables about triggering kernel writeback for buffers written; OS
     206             :  * dependent defaults are set via the GUC mechanism.
     207             :  */
     208             : int         checkpoint_flush_after = DEFAULT_CHECKPOINT_FLUSH_AFTER;
     209             : int         bgwriter_flush_after = DEFAULT_BGWRITER_FLUSH_AFTER;
     210             : int         backend_flush_after = DEFAULT_BACKEND_FLUSH_AFTER;
     211             : 
     212             : /* local state for LockBufferForCleanup */
     213             : static BufferDesc *PinCountWaitBuf = NULL;
     214             : 
     215             : /*
     216             :  * Backend-Private refcount management:
     217             :  *
     218             :  * Each buffer also has a private refcount that keeps track of the number of
     219             :  * times the buffer is pinned in the current process.  This is so that the
     220             :  * shared refcount needs to be modified only once if a buffer is pinned more
     221             :  * than once by an individual backend.  It's also used to check that no
     222             :  * buffers are still pinned at the end of transactions and when exiting. We
     223             :  * also use this mechanism to track whether this backend has a buffer locked,
     224             :  * and, if so, in what mode.
     225             :  *
     226             :  *
      227             :  * To avoid - as we used to - requiring an array with NBuffers entries to
      228             :  * keep track of backend-local pins, we use a small sequentially searched
      229             :  * array (PrivateRefCountArrayKeys, with the corresponding data stored in
      230             :  * PrivateRefCountArray) and an overflow hash table (PrivateRefCountHash)
      231             :  * for that purpose.
     232             :  *
     233             :  * Until no more than REFCOUNT_ARRAY_ENTRIES buffers are pinned at once, all
     234             :  * refcounts are kept track of in the array; after that, new array entries
     235             :  * displace old ones into the hash table. That way a frequently used entry
     236             :  * can't get "stuck" in the hashtable while infrequent ones clog the array.
     237             :  *
     238             :  * Note that in most scenarios the number of pinned buffers will not exceed
     239             :  * REFCOUNT_ARRAY_ENTRIES.
     240             :  *
     241             :  *
      242             :  * To enter a buffer into the refcount tracking mechanism, first reserve a free
     243             :  * entry using ReservePrivateRefCountEntry() and then later, if necessary,
     244             :  * fill it with NewPrivateRefCountEntry(). That split lets us avoid doing
     245             :  * memory allocations in NewPrivateRefCountEntry() which can be important
     246             :  * because in some scenarios it's called with a spinlock held...
     247             :  */
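/*
 * [Editor's illustrative sketch -- not part of bufmgr.c.]  The
 * reserve-then-fill protocol described above looks roughly like this in a
 * pinning path (heavily simplified; the buffer-header manipulation and
 * error handling of the real PinBuffer() are omitted):
 *
 *     ReservePrivateRefCountEntry();      // may search/allocate, do it early
 *     ResourceOwnerEnlarge(CurrentResourceOwner);
 *     ...
 *     ref = GetPrivateRefCountEntry(buffer, true);
 *     if (ref == NULL)                    // first pin by this backend
 *         ref = NewPrivateRefCountEntry(buffer);  // cannot fail or allocate
 *     ref->data.refcount++;
 *     ResourceOwnerRememberBuffer(CurrentResourceOwner, buffer);
 */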
     248             : static Buffer PrivateRefCountArrayKeys[REFCOUNT_ARRAY_ENTRIES];
     249             : static struct PrivateRefCountEntry PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES];
     250             : static HTAB *PrivateRefCountHash = NULL;
     251             : static int32 PrivateRefCountOverflowed = 0;
     252             : static uint32 PrivateRefCountClock = 0;
     253             : static int  ReservedRefCountSlot = -1;
     254             : static int  PrivateRefCountEntryLast = -1;
     255             : 
     256             : static uint32 MaxProportionalPins;
     257             : 
     258             : static void ReservePrivateRefCountEntry(void);
     259             : static PrivateRefCountEntry *NewPrivateRefCountEntry(Buffer buffer);
     260             : static PrivateRefCountEntry *GetPrivateRefCountEntry(Buffer buffer, bool do_move);
     261             : static inline int32 GetPrivateRefCount(Buffer buffer);
     262             : static void ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref);
     263             : 
     264             : /* ResourceOwner callbacks to hold in-progress I/Os and buffer pins */
     265             : static void ResOwnerReleaseBufferIO(Datum res);
     266             : static char *ResOwnerPrintBufferIO(Datum res);
     267             : static void ResOwnerReleaseBuffer(Datum res);
     268             : static char *ResOwnerPrintBuffer(Datum res);
     269             : 
     270             : const ResourceOwnerDesc buffer_io_resowner_desc =
     271             : {
     272             :     .name = "buffer io",
     273             :     .release_phase = RESOURCE_RELEASE_BEFORE_LOCKS,
     274             :     .release_priority = RELEASE_PRIO_BUFFER_IOS,
     275             :     .ReleaseResource = ResOwnerReleaseBufferIO,
     276             :     .DebugPrint = ResOwnerPrintBufferIO
     277             : };
     278             : 
     279             : const ResourceOwnerDesc buffer_resowner_desc =
     280             : {
     281             :     .name = "buffer",
     282             :     .release_phase = RESOURCE_RELEASE_BEFORE_LOCKS,
     283             :     .release_priority = RELEASE_PRIO_BUFFER_PINS,
     284             :     .ReleaseResource = ResOwnerReleaseBuffer,
     285             :     .DebugPrint = ResOwnerPrintBuffer
     286             : };
     287             : 
     288             : /*
     289             :  * Ensure that the PrivateRefCountArray has sufficient space to store one more
     290             :  * entry. This has to be called before using NewPrivateRefCountEntry() to fill
     291             :  * a new entry - but it's perfectly fine to not use a reserved entry.
     292             :  */
     293             : static void
     294   131982752 : ReservePrivateRefCountEntry(void)
     295             : {
     296             :     /* Already reserved (or freed), nothing to do */
     297   131982752 :     if (ReservedRefCountSlot != -1)
     298   123575302 :         return;
     299             : 
     300             :     /*
      301             :      * First search for a free entry in the array; that'll be sufficient in
      302             :      * the majority of cases.
     303             :      */
     304             :     {
     305             :         int         i;
     306             : 
     307    75667050 :         for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
     308             :         {
     309    67259600 :             if (PrivateRefCountArrayKeys[i] == InvalidBuffer)
     310             :             {
     311    49271200 :                 ReservedRefCountSlot = i;
     312             : 
     313             :                 /*
      314             :                  * We could return immediately, but iterating to the end of
      315             :                  * the array allows the compiler to auto-vectorize the loop.
     316             :                  */
     317             :             }
     318             :         }
     319             : 
     320     8407450 :         if (ReservedRefCountSlot != -1)
     321     8042712 :             return;
     322             :     }
     323             : 
     324             :     /*
     325             :      * No luck. All array entries are full. Move one array entry into the hash
     326             :      * table.
     327             :      */
     328             :     {
     329             :         /*
     330             :          * Move entry from the current clock position in the array into the
     331             :          * hashtable. Use that slot.
     332             :          */
     333             :         int         victim_slot;
     334             :         PrivateRefCountEntry *victim_entry;
     335             :         PrivateRefCountEntry *hashent;
     336             :         bool        found;
     337             : 
     338             :         /* select victim slot */
     339      364738 :         victim_slot = PrivateRefCountClock++ % REFCOUNT_ARRAY_ENTRIES;
     340      364738 :         victim_entry = &PrivateRefCountArray[victim_slot];
     341      364738 :         ReservedRefCountSlot = victim_slot;
     342             : 
     343             :         /* Better be used, otherwise we shouldn't get here. */
     344             :         Assert(PrivateRefCountArrayKeys[victim_slot] != InvalidBuffer);
     345             :         Assert(PrivateRefCountArray[victim_slot].buffer != InvalidBuffer);
     346             :         Assert(PrivateRefCountArrayKeys[victim_slot] == PrivateRefCountArray[victim_slot].buffer);
     347             : 
     348             :         /* enter victim array entry into hashtable */
     349      364738 :         hashent = hash_search(PrivateRefCountHash,
     350      364738 :                               &PrivateRefCountArrayKeys[victim_slot],
     351             :                               HASH_ENTER,
     352             :                               &found);
     353             :         Assert(!found);
     354             :         /* move data from the entry in the array to the hash entry */
     355      364738 :         hashent->data = victim_entry->data;
     356             : 
     357             :         /* clear the now free array slot */
     358      364738 :         PrivateRefCountArrayKeys[victim_slot] = InvalidBuffer;
     359      364738 :         victim_entry->buffer = InvalidBuffer;
     360             : 
     361             :         /* clear the whole data member, just for future proofing */
     362      364738 :         memset(&victim_entry->data, 0, sizeof(victim_entry->data));
     363      364738 :         victim_entry->data.refcount = 0;
     364      364738 :         victim_entry->data.lockmode = BUFFER_LOCK_UNLOCK;
     365             : 
     366      364738 :         PrivateRefCountOverflowed++;
     367             :     }
     368             : }
     369             : 
     370             : /*
     371             :  * Fill a previously reserved refcount entry.
     372             :  */
     373             : static PrivateRefCountEntry *
     374   119412602 : NewPrivateRefCountEntry(Buffer buffer)
     375             : {
     376             :     PrivateRefCountEntry *res;
     377             : 
     378             :     /* only allowed to be called when a reservation has been made */
     379             :     Assert(ReservedRefCountSlot != -1);
     380             : 
     381             :     /* use up the reserved entry */
     382   119412602 :     res = &PrivateRefCountArray[ReservedRefCountSlot];
     383             : 
     384             :     /* and fill it */
     385   119412602 :     PrivateRefCountArrayKeys[ReservedRefCountSlot] = buffer;
     386   119412602 :     res->buffer = buffer;
     387   119412602 :     res->data.refcount = 0;
     388   119412602 :     res->data.lockmode = BUFFER_LOCK_UNLOCK;
     389             : 
     390             :     /* update cache for the next lookup */
     391   119412602 :     PrivateRefCountEntryLast = ReservedRefCountSlot;
     392             : 
     393   119412602 :     ReservedRefCountSlot = -1;
     394             : 
     395   119412602 :     return res;
     396             : }
     397             : 
     398             : /*
     399             :  * Slow-path for GetPrivateRefCountEntry(). This is big enough to not be worth
     400             :  * inlining. This particularly seems to be true if the compiler is capable of
     401             :  * auto-vectorizing the code, as that imposes additional stack-alignment
     402             :  * requirements etc.
     403             :  */
     404             : static pg_noinline PrivateRefCountEntry *
     405   148042326 : GetPrivateRefCountEntrySlow(Buffer buffer, bool do_move)
     406             : {
     407             :     PrivateRefCountEntry *res;
     408   148042326 :     int         match = -1;
     409             :     int         i;
     410             : 
     411             :     /*
      412             :      * First search for references in the array; that'll be sufficient in
      413             :      * the majority of cases.
     414             :      */
     415  1332380934 :     for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
     416             :     {
     417  1184338608 :         if (PrivateRefCountArrayKeys[i] == buffer)
     418             :         {
     419    32779714 :             match = i;
     420             :             /* see ReservePrivateRefCountEntry() for why we don't return */
     421             :         }
     422             :     }
     423             : 
     424   148042326 :     if (likely(match != -1))
     425             :     {
     426             :         /* update cache for the next lookup */
     427    32779714 :         PrivateRefCountEntryLast = match;
     428             : 
     429    32779714 :         return &PrivateRefCountArray[match];
     430             :     }
     431             : 
     432             :     /*
     433             :      * By here we know that the buffer, if already pinned, isn't residing in
     434             :      * the array.
     435             :      *
     436             :      * Only look up the buffer in the hashtable if we've previously overflowed
     437             :      * into it.
     438             :      */
     439   115262612 :     if (PrivateRefCountOverflowed == 0)
     440   114391930 :         return NULL;
     441             : 
     442      870682 :     res = hash_search(PrivateRefCountHash, &buffer, HASH_FIND, NULL);
     443             : 
     444      870682 :     if (res == NULL)
     445      415766 :         return NULL;
     446      454916 :     else if (!do_move)
     447             :     {
     448             :         /* caller doesn't want us to move the hash entry into the array */
     449      261100 :         return res;
     450             :     }
     451             :     else
     452             :     {
     453             :         /* move buffer from hashtable into the free array slot */
     454             :         bool        found;
     455             :         PrivateRefCountEntry *free;
     456             : 
     457             :         /* Ensure there's a free array slot */
     458      193816 :         ReservePrivateRefCountEntry();
     459             : 
     460             :         /* Use up the reserved slot */
     461             :         Assert(ReservedRefCountSlot != -1);
     462      193816 :         free = &PrivateRefCountArray[ReservedRefCountSlot];
     463             :         Assert(PrivateRefCountArrayKeys[ReservedRefCountSlot] == free->buffer);
     464             :         Assert(free->buffer == InvalidBuffer);
     465             : 
     466             :         /* and fill it */
     467      193816 :         free->buffer = buffer;
     468      193816 :         free->data = res->data;
     469      193816 :         PrivateRefCountArrayKeys[ReservedRefCountSlot] = buffer;
     470             :         /* update cache for the next lookup */
     471      193816 :         PrivateRefCountEntryLast = match;
     472             : 
     473      193816 :         ReservedRefCountSlot = -1;
     474             : 
     475             : 
     476             :         /* delete from hashtable */
     477      193816 :         hash_search(PrivateRefCountHash, &buffer, HASH_REMOVE, &found);
     478             :         Assert(found);
     479             :         Assert(PrivateRefCountOverflowed > 0);
     480      193816 :         PrivateRefCountOverflowed--;
     481             : 
     482      193816 :         return free;
     483             :     }
     484             : }
     485             : 
     486             : /*
     487             :  * Return the PrivateRefCount entry for the passed buffer.
     488             :  *
      489             :  * Returns NULL if the buffer doesn't have a refcount entry.  Otherwise, if
      490             :  * do_move is true and the entry resides in the hashtable, the entry is
      491             :  * optimized for frequent access by moving it to the array.
     492             :  */
     493             : static inline PrivateRefCountEntry *
     494   628125582 : GetPrivateRefCountEntry(Buffer buffer, bool do_move)
     495             : {
     496             :     Assert(BufferIsValid(buffer));
     497             :     Assert(!BufferIsLocal(buffer));
     498             : 
     499             :     /*
     500             :      * It's very common to look up the same buffer repeatedly. To make that
     501             :      * fast, we have a one-entry cache.
     502             :      *
      503             :      * In contrast to the loop in GetPrivateRefCountEntrySlow(), here it is
      504             :      * faster to check PrivateRefCountArray[].buffer, as in the case of a hit
      505             :      * fewer addresses are computed and fewer cachelines are accessed.  In
      506             :      * GetPrivateRefCountEntrySlow()'s case, by contrast, checking
      507             :      * PrivateRefCountArrayKeys saves a lot of memory accesses.
     508             :      */
     509   628125582 :     if (likely(PrivateRefCountEntryLast != -1) &&
     510   628000154 :         likely(PrivateRefCountArray[PrivateRefCountEntryLast].buffer == buffer))
     511             :     {
     512   480083256 :         return &PrivateRefCountArray[PrivateRefCountEntryLast];
     513             :     }
     514             : 
     515             :     /*
     516             :      * The code for the cached lookup is small enough to be worth inlining
     517             :      * into the caller. In the miss case however, that empirically doesn't
     518             :      * seem worth it.
     519             :      */
     520   148042326 :     return GetPrivateRefCountEntrySlow(buffer, do_move);
     521             : }
     522             : 
     523             : /*
     524             :  * Returns how many times the passed buffer is pinned by this backend.
     525             :  *
     526             :  * Only works for shared memory buffers!
     527             :  */
     528             : static inline int32
     529     5766674 : GetPrivateRefCount(Buffer buffer)
     530             : {
     531             :     PrivateRefCountEntry *ref;
     532             : 
     533             :     Assert(BufferIsValid(buffer));
     534             :     Assert(!BufferIsLocal(buffer));
     535             : 
     536             :     /*
     537             :      * Not moving the entry - that's ok for the current users, but we might
     538             :      * want to change this one day.
     539             :      */
     540     5766674 :     ref = GetPrivateRefCountEntry(buffer, false);
     541             : 
     542     5766674 :     if (ref == NULL)
     543          58 :         return 0;
     544     5766616 :     return ref->data.refcount;
     545             : }
     546             : 
     547             : /*
     548             :  * Release resources used to track the reference count of a buffer which we no
     549             :  * longer have pinned and don't want to pin again immediately.
     550             :  */
     551             : static void
     552   119412602 : ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref)
     553             : {
     554             :     Assert(ref->data.refcount == 0);
     555             :     Assert(ref->data.lockmode == BUFFER_LOCK_UNLOCK);
     556             : 
     557   119412602 :     if (ref >= &PrivateRefCountArray[0] &&
     558             :         ref < &PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES])
     559             :     {
     560   119241680 :         ref->buffer = InvalidBuffer;
     561   119241680 :         PrivateRefCountArrayKeys[ref - PrivateRefCountArray] = InvalidBuffer;
     562             : 
     563             : 
     564             :         /*
     565             :          * Mark the just used entry as reserved - in many scenarios that
     566             :          * allows us to avoid ever having to search the array/hash for free
     567             :          * entries.
     568             :          */
     569   119241680 :         ReservedRefCountSlot = ref - PrivateRefCountArray;
     570             :     }
     571             :     else
     572             :     {
     573             :         bool        found;
     574      170922 :         Buffer      buffer = ref->buffer;
     575             : 
     576      170922 :         hash_search(PrivateRefCountHash, &buffer, HASH_REMOVE, &found);
     577             :         Assert(found);
     578             :         Assert(PrivateRefCountOverflowed > 0);
     579      170922 :         PrivateRefCountOverflowed--;
     580             :     }
     581   119412602 : }
     582             : 
     583             : /*
     584             :  * BufferIsPinned
     585             :  *      True iff the buffer is pinned (also checks for valid buffer number).
     586             :  *
     587             :  *      NOTE: what we check here is that *this* backend holds a pin on
     588             :  *      the buffer.  We do not care whether some other backend does.
     589             :  */
     590             : #define BufferIsPinned(bufnum) \
     591             : ( \
     592             :     !BufferIsValid(bufnum) ? \
     593             :         false \
     594             :     : \
     595             :         BufferIsLocal(bufnum) ? \
     596             :             (LocalRefCount[-(bufnum) - 1] > 0) \
     597             :         : \
     598             :     (GetPrivateRefCount(bufnum) > 0) \
     599             : )
     600             : 
     601             : 
     602             : static Buffer ReadBuffer_common(Relation rel,
     603             :                                 SMgrRelation smgr, char smgr_persistence,
     604             :                                 ForkNumber forkNum, BlockNumber blockNum,
     605             :                                 ReadBufferMode mode, BufferAccessStrategy strategy);
     606             : static BlockNumber ExtendBufferedRelCommon(BufferManagerRelation bmr,
     607             :                                            ForkNumber fork,
     608             :                                            BufferAccessStrategy strategy,
     609             :                                            uint32 flags,
     610             :                                            uint32 extend_by,
     611             :                                            BlockNumber extend_upto,
     612             :                                            Buffer *buffers,
     613             :                                            uint32 *extended_by);
     614             : static BlockNumber ExtendBufferedRelShared(BufferManagerRelation bmr,
     615             :                                            ForkNumber fork,
     616             :                                            BufferAccessStrategy strategy,
     617             :                                            uint32 flags,
     618             :                                            uint32 extend_by,
     619             :                                            BlockNumber extend_upto,
     620             :                                            Buffer *buffers,
     621             :                                            uint32 *extended_by);
     622             : static bool PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy,
     623             :                       bool skip_if_not_valid);
     624             : static void PinBuffer_Locked(BufferDesc *buf);
     625             : static void UnpinBuffer(BufferDesc *buf);
     626             : static void UnpinBufferNoOwner(BufferDesc *buf);
     627             : static void BufferSync(int flags);
     628             : static int  SyncOneBuffer(int buf_id, bool skip_recently_used,
     629             :                           WritebackContext *wb_context);
     630             : static void WaitIO(BufferDesc *buf);
     631             : static void AbortBufferIO(Buffer buffer);
     632             : static void shared_buffer_write_error_callback(void *arg);
     633             : static void local_buffer_write_error_callback(void *arg);
     634             : static inline BufferDesc *BufferAlloc(SMgrRelation smgr,
     635             :                                       char relpersistence,
     636             :                                       ForkNumber forkNum,
     637             :                                       BlockNumber blockNum,
     638             :                                       BufferAccessStrategy strategy,
     639             :                                       bool *foundPtr, IOContext io_context);
     640             : static bool AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress);
     641             : static void CheckReadBuffersOperation(ReadBuffersOperation *operation, bool is_complete);
     642             : static Buffer GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context);
     643             : static void FlushUnlockedBuffer(BufferDesc *buf, SMgrRelation reln,
     644             :                                 IOObject io_object, IOContext io_context);
     645             : static void FlushBuffer(BufferDesc *buf, SMgrRelation reln,
     646             :                         IOObject io_object, IOContext io_context);
     647             : static void FindAndDropRelationBuffers(RelFileLocator rlocator,
     648             :                                        ForkNumber forkNum,
     649             :                                        BlockNumber nForkBlock,
     650             :                                        BlockNumber firstDelBlock);
     651             : static void RelationCopyStorageUsingBuffer(RelFileLocator srclocator,
     652             :                                            RelFileLocator dstlocator,
     653             :                                            ForkNumber forkNum, bool permanent);
     654             : static void AtProcExit_Buffers(int code, Datum arg);
     655             : static void CheckForBufferLeaks(void);
     656             : #ifdef USE_ASSERT_CHECKING
     657             : static void AssertNotCatalogBufferLock(Buffer buffer, BufferLockMode mode);
     658             : #endif
     659             : static int  rlocator_comparator(const void *p1, const void *p2);
     660             : static inline int buffertag_comparator(const BufferTag *ba, const BufferTag *bb);
     661             : static inline int ckpt_buforder_comparator(const CkptSortItem *a, const CkptSortItem *b);
     662             : static int  ts_ckpt_progress_comparator(Datum a, Datum b, void *arg);
     663             : 
     664             : static void BufferLockAcquire(Buffer buffer, BufferDesc *buf_hdr, BufferLockMode mode);
     665             : static void BufferLockUnlock(Buffer buffer, BufferDesc *buf_hdr);
     666             : static bool BufferLockConditional(Buffer buffer, BufferDesc *buf_hdr, BufferLockMode mode);
     667             : static bool BufferLockHeldByMeInMode(BufferDesc *buf_hdr, BufferLockMode mode);
     668             : static bool BufferLockHeldByMe(BufferDesc *buf_hdr);
     669             : static inline void BufferLockDisown(Buffer buffer, BufferDesc *buf_hdr);
     670             : static inline int BufferLockDisownInternal(Buffer buffer, BufferDesc *buf_hdr);
     671             : static inline bool BufferLockAttempt(BufferDesc *buf_hdr, BufferLockMode mode);
     672             : static void BufferLockQueueSelf(BufferDesc *buf_hdr, BufferLockMode mode);
     673             : static void BufferLockDequeueSelf(BufferDesc *buf_hdr);
     674             : static void BufferLockWakeup(BufferDesc *buf_hdr, bool unlocked);
     675             : static void BufferLockProcessRelease(BufferDesc *buf_hdr, BufferLockMode mode, uint64 lockstate);
     676             : static inline uint64 BufferLockReleaseSub(BufferLockMode mode);
     677             : 
     678             : 
     679             : /*
     680             :  * Implementation of PrefetchBuffer() for shared buffers.
     681             :  */
     682             : PrefetchBufferResult
     683       64564 : PrefetchSharedBuffer(SMgrRelation smgr_reln,
     684             :                      ForkNumber forkNum,
     685             :                      BlockNumber blockNum)
     686             : {
     687       64564 :     PrefetchBufferResult result = {InvalidBuffer, false};
     688             :     BufferTag   newTag;         /* identity of requested block */
     689             :     uint32      newHash;        /* hash value for newTag */
     690             :     LWLock     *newPartitionLock;   /* buffer partition lock for it */
     691             :     int         buf_id;
     692             : 
     693             :     Assert(BlockNumberIsValid(blockNum));
     694             : 
     695             :     /* create a tag so we can lookup the buffer */
     696       64564 :     InitBufferTag(&newTag, &smgr_reln->smgr_rlocator.locator,
     697             :                   forkNum, blockNum);
     698             : 
     699             :     /* determine its hash code and partition lock ID */
     700       64564 :     newHash = BufTableHashCode(&newTag);
     701       64564 :     newPartitionLock = BufMappingPartitionLock(newHash);
     702             : 
     703             :     /* see if the block is in the buffer pool already */
     704       64564 :     LWLockAcquire(newPartitionLock, LW_SHARED);
     705       64564 :     buf_id = BufTableLookup(&newTag, newHash);
     706       64564 :     LWLockRelease(newPartitionLock);
     707             : 
     708             :     /* If not in buffers, initiate prefetch */
     709       64564 :     if (buf_id < 0)
     710             :     {
     711             : #ifdef USE_PREFETCH
     712             :         /*
     713             :          * Try to initiate an asynchronous read.  This returns false in
     714             :          * recovery if the relation file doesn't exist.
     715             :          */
     716       35104 :         if ((io_direct_flags & IO_DIRECT_DATA) == 0 &&
     717       17328 :             smgrprefetch(smgr_reln, forkNum, blockNum, 1))
     718             :         {
     719       17328 :             result.initiated_io = true;
     720             :         }
     721             : #endif                          /* USE_PREFETCH */
     722             :     }
     723             :     else
     724             :     {
     725             :         /*
     726             :          * Report the buffer it was in at that time.  The caller may be able
     727             :          * to avoid a buffer table lookup, but it's not pinned and it must be
     728             :          * rechecked!
     729             :          */
     730       46788 :         result.recent_buffer = buf_id + 1;
     731             :     }
     732             : 
     733             :     /*
     734             :      * If the block *is* in buffers, we do nothing.  This is not really ideal:
     735             :      * the block might be just about to be evicted, which would be stupid
     736             :      * since we know we are going to need it soon.  But the only easy answer
     737             :      * is to bump the usage_count, which does not seem like a great solution:
     738             :      * when the caller does ultimately touch the block, usage_count would get
     739             :      * bumped again, resulting in too much favoritism for blocks that are
     740             :      * involved in a prefetch sequence. A real fix would involve some
     741             :      * additional per-buffer state, and it's not clear that there's enough of
     742             :      * a problem to justify that.
     743             :      */
     744             : 
     745       64564 :     return result;
     746             : }
     747             : 
     748             : /*
     749             :  * PrefetchBuffer -- initiate asynchronous read of a block of a relation
     750             :  *
     751             :  * This is named by analogy to ReadBuffer but doesn't actually allocate a
     752             :  * buffer.  Instead it tries to ensure that a future ReadBuffer for the given
     753             :  * block will not be delayed by the I/O.  Prefetching is optional.
     754             :  *
     755             :  * There are three possible outcomes:
     756             :  *
     757             :  * 1.  If the block is already cached, the result includes a valid buffer that
     758             :  * could be used by the caller to avoid the need for a later buffer lookup, but
     759             :  * it's not pinned, so the caller must recheck it.
     760             :  *
     761             :  * 2.  If the kernel has been asked to initiate I/O, the initiated_io member is
     762             :  * true.  Currently there is no way to know if the data was already cached by
     763             :  * the kernel and therefore didn't really initiate I/O, and no way to know when
     764             :  * the I/O completes other than using synchronous ReadBuffer().
     765             :  *
      766             :  * 3.  Otherwise, the buffer wasn't already cached by PostgreSQL, and either
      767             :  * USE_PREFETCH is not defined (this build doesn't support prefetching due to
      768             :  * lack of a kernel facility), or direct I/O is enabled, or the underlying
      769             :  * relation file wasn't found and we are in recovery.  (If the relation file
      770             :  * wasn't found and we are not in recovery, an error is raised.)
     771             :  */
     772             : PrefetchBufferResult
     773       43024 : PrefetchBuffer(Relation reln, ForkNumber forkNum, BlockNumber blockNum)
     774             : {
     775             :     Assert(RelationIsValid(reln));
     776             :     Assert(BlockNumberIsValid(blockNum));
     777             : 
     778       43024 :     if (RelationUsesLocalBuffers(reln))
     779             :     {
     780             :         /* see comments in ReadBufferExtended */
     781        1566 :         if (RELATION_IS_OTHER_TEMP(reln))
     782           0 :             ereport(ERROR,
     783             :                     (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
     784             :                      errmsg("cannot access temporary tables of other sessions")));
     785             : 
     786             :         /* pass it off to localbuf.c */
     787        1566 :         return PrefetchLocalBuffer(RelationGetSmgr(reln), forkNum, blockNum);
     788             :     }
     789             :     else
     790             :     {
     791             :         /* pass it to the shared buffer version */
     792       41458 :         return PrefetchSharedBuffer(RelationGetSmgr(reln), forkNum, blockNum);
     793             :     }
     794             : }
     795             : 
     796             : /*
     797             :  * ReadRecentBuffer -- try to pin a block in a recently observed buffer
     798             :  *
     799             :  * Compared to ReadBuffer(), this avoids a buffer mapping lookup when it's
     800             :  * successful.  Return true if the buffer is valid and still has the expected
     801             :  * tag.  In that case, the buffer is pinned and the usage count is bumped.
     802             :  */
     803             : bool
     804        9146 : ReadRecentBuffer(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockNum,
     805             :                  Buffer recent_buffer)
     806             : {
     807             :     BufferDesc *bufHdr;
     808             :     BufferTag   tag;
     809             :     uint64      buf_state;
     810             : 
     811             :     Assert(BufferIsValid(recent_buffer));
     812             : 
     813        9146 :     ResourceOwnerEnlarge(CurrentResourceOwner);
     814        9146 :     ReservePrivateRefCountEntry();
     815        9146 :     InitBufferTag(&tag, &rlocator, forkNum, blockNum);
     816             : 
     817        9146 :     if (BufferIsLocal(recent_buffer))
     818             :     {
     819          64 :         int         b = -recent_buffer - 1;
     820             : 
     821          64 :         bufHdr = GetLocalBufferDescriptor(b);
     822          64 :         buf_state = pg_atomic_read_u64(&bufHdr->state);
     823             : 
     824             :         /* Is it still valid and holding the right tag? */
     825          64 :         if ((buf_state & BM_VALID) && BufferTagsEqual(&tag, &bufHdr->tag))
     826             :         {
     827          64 :             PinLocalBuffer(bufHdr, true);
     828             : 
     829          64 :             pgBufferUsage.local_blks_hit++;
     830             : 
     831          64 :             return true;
     832             :         }
     833             :     }
     834             :     else
     835             :     {
     836        9082 :         bufHdr = GetBufferDescriptor(recent_buffer - 1);
     837             : 
     838             :         /*
     839             :          * Is it still valid and holding the right tag?  We do an unlocked tag
     840             :          * comparison first, to make it unlikely that we'll increment the
     841             :          * usage counter of the wrong buffer, if someone calls us with a very
     842             :          * out of date recent_buffer.  Then we'll check it again if we get the
     843             :          * pin.
     844             :          */
     845       18090 :         if (BufferTagsEqual(&tag, &bufHdr->tag) &&
     846        9008 :             PinBuffer(bufHdr, NULL, true))
     847             :         {
     848        8996 :             if (BufferTagsEqual(&tag, &bufHdr->tag))
     849             :             {
     850        8996 :                 pgBufferUsage.shared_blks_hit++;
     851        8996 :                 return true;
     852             :             }
     853           0 :             UnpinBuffer(bufHdr);
     854             :         }
     855             :     }
     856             : 
     857          86 :     return false;
     858             : }
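/*
 * [Editor's illustrative sketch -- not part of bufmgr.c.]  PrefetchBuffer()
 * and ReadRecentBuffer() are meant to be combined so that the later read can
 * often skip the mapping-table lookup; because recent_buffer is returned
 * unpinned, falling back to ReadBufferExtended() is required.  rel, fork and
 * blkno are hypothetical caller variables:
 *
 *     PrefetchBufferResult pf = PrefetchBuffer(rel, fork, blkno);
 *     Buffer      buf;
 *
 *     // ... other work, giving any initiated I/O time to complete ...
 *
 *     if (BufferIsValid(pf.recent_buffer) &&
 *         ReadRecentBuffer(rel->rd_locator, fork, blkno, pf.recent_buffer))
 *         buf = pf.recent_buffer;         // pinned, tag re-verified
 *     else
 *         buf = ReadBufferExtended(rel, fork, blkno, RBM_NORMAL, NULL);
 */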
     859             : 
     860             : /*
     861             :  * ReadBuffer -- a shorthand for ReadBufferExtended, for reading from main
     862             :  *      fork with RBM_NORMAL mode and default strategy.
     863             :  */
     864             : Buffer
     865    88086076 : ReadBuffer(Relation reln, BlockNumber blockNum)
     866             : {
     867    88086076 :     return ReadBufferExtended(reln, MAIN_FORKNUM, blockNum, RBM_NORMAL, NULL);
     868             : }
     869             : 
     870             : /*
     871             :  * ReadBufferExtended -- returns a buffer containing the requested
     872             :  *      block of the requested relation.  If the blknum
     873             :  *      requested is P_NEW, extend the relation file and
     874             :  *      allocate a new block.  (Caller is responsible for
     875             :  *      ensuring that only one backend tries to extend a
     876             :  *      relation at the same time!)
     877             :  *
     878             :  * Returns: the buffer number for the buffer containing
     879             :  *      the block read.  The returned buffer has been pinned.
     880             :  *      Does not return on error --- elog's instead.
     881             :  *
      882             :  * When this function is called, reln is assumed to have been opened already.
     883             :  *
     884             :  * In RBM_NORMAL mode, the page is read from disk, and the page header is
     885             :  * validated.  An error is thrown if the page header is not valid.  (But
     886             :  * note that an all-zero page is considered "valid"; see
     887             :  * PageIsVerified().)
     888             :  *
     889             :  * RBM_ZERO_ON_ERROR is like the normal mode, but if the page header is not
     890             :  * valid, the page is zeroed instead of throwing an error. This is intended
     891             :  * for non-critical data, where the caller is prepared to repair errors.
     892             :  *
     893             :  * In RBM_ZERO_AND_LOCK mode, if the page isn't in buffer cache already, it's
     894             :  * filled with zeros instead of reading it from disk.  Useful when the caller
     895             :  * is going to fill the page from scratch, since this saves I/O and avoids
     896             :  * unnecessary failure if the page-on-disk has corrupt page headers.
     897             :  * The page is returned locked to ensure that the caller has a chance to
     898             :  * initialize the page before it's made visible to others.
     899             :  * Caution: do not use this mode to read a page that is beyond the relation's
     900             :  * current physical EOF; that is likely to cause problems in md.c when
     901             :  * the page is modified and written out. P_NEW is OK, though.
     902             :  *
     903             :  * RBM_ZERO_AND_CLEANUP_LOCK is the same as RBM_ZERO_AND_LOCK, but acquires
     904             :  * a cleanup-strength lock on the page.
     905             :  *
     906             :  * RBM_NORMAL_NO_LOG mode is treated the same as RBM_NORMAL here.
     907             :  *
     908             :  * If strategy is not NULL, a nondefault buffer access strategy is used.
     909             :  * See buffer/README for details.
     910             :  */
     911             : inline Buffer
     912   105756294 : ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum,
     913             :                    ReadBufferMode mode, BufferAccessStrategy strategy)
     914             : {
     915             :     Buffer      buf;
     916             : 
     917             :     /*
     918             :      * Reject attempts to read non-local temporary relations; we would be
     919             :      * likely to get wrong data since we have no visibility into the owning
     920             :      * session's local buffers.
     921             :      */
     922   105756294 :     if (RELATION_IS_OTHER_TEMP(reln))
     923           0 :         ereport(ERROR,
     924             :                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
     925             :                  errmsg("cannot access temporary tables of other sessions")));
     926             : 
     927             :     /*
     928             :      * Read the buffer, and update pgstat counters to reflect a cache hit or
     929             :      * miss.
     930             :      */
     931   105756294 :     buf = ReadBuffer_common(reln, RelationGetSmgr(reln), 0,
     932             :                             forkNum, blockNum, mode, strategy);
     933             : 
     934   105756248 :     return buf;
     935             : }
     936             : 
     937             : 
     938             : /*
     939             :  * ReadBufferWithoutRelcache -- like ReadBufferExtended, but doesn't require
     940             :  *      a relcache entry for the relation.
     941             :  *
     942             :  * Pass permanent = true for a RELPERSISTENCE_PERMANENT relation, and
     943             :  * permanent = false for a RELPERSISTENCE_UNLOGGED relation. This function
     944             :  * cannot be used for temporary relations (and making that work might be
     945             :  * difficult, unless we only want to read temporary relations for our own
     946             :  * ProcNumber).
     947             :  */
     948             : Buffer
     949    11595278 : ReadBufferWithoutRelcache(RelFileLocator rlocator, ForkNumber forkNum,
     950             :                           BlockNumber blockNum, ReadBufferMode mode,
     951             :                           BufferAccessStrategy strategy, bool permanent)
     952             : {
     953    11595278 :     SMgrRelation smgr = smgropen(rlocator, INVALID_PROC_NUMBER);
     954             : 
     955    11595278 :     return ReadBuffer_common(NULL, smgr,
     956             :                              permanent ? RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED,
     957             :                              forkNum, blockNum,
     958             :                              mode, strategy);
     959             : }
     960             : 
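                     : /*
                     :  * Editorial sketch, not part of bufmgr.c: read a block of a permanent
                     :  * relation identified only by its RelFileLocator, as is done during
                     :  * recovery when no relcache entry is available.  "rlocator" and "blkno"
                     :  * are assumed inputs; the function name is hypothetical.
                     :  */
                     : static void
                     : example_read_without_relcache(RelFileLocator rlocator, BlockNumber blkno)
                     : {
                     :     Buffer      buf;
                     : 
                     :     buf = ReadBufferWithoutRelcache(rlocator, MAIN_FORKNUM, blkno,
                     :                                     RBM_NORMAL, NULL, true /* permanent */ );
                     :     /* ... use the page via BufferGetPage(buf) ... */
                     :     ReleaseBuffer(buf);
                     : }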
     961             : /*
     962             :  * Convenience wrapper around ExtendBufferedRelBy() extending by one block.
     963             :  */
     964             : Buffer
     965       91926 : ExtendBufferedRel(BufferManagerRelation bmr,
     966             :                   ForkNumber forkNum,
     967             :                   BufferAccessStrategy strategy,
     968             :                   uint32 flags)
     969             : {
     970             :     Buffer      buf;
     971       91926 :     uint32      extend_by = 1;
     972             : 
     973       91926 :     ExtendBufferedRelBy(bmr, forkNum, strategy, flags, extend_by,
     974             :                         &buf, &extend_by);
     975             : 
     976       91926 :     return buf;
     977             : }
     978             : 
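                     : /*
                     :  * Editorial sketch, not part of bufmgr.c: the common way to append a single
                     :  * block.  With EB_LOCK_FIRST the returned buffer comes back pinned and
                     :  * exclusively locked, so the caller can initialize the new page before any
                     :  * other backend can see it.  "rel" is an assumed, already-open Relation.
                     :  */
                     : static Buffer
                     : example_extend_by_one(Relation rel)
                     : {
                     :     return ExtendBufferedRel(BMR_REL(rel), MAIN_FORKNUM, NULL, EB_LOCK_FIRST);
                     : }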
     979             : /*
     980             :  * Extend relation by multiple blocks.
     981             :  *
     982             :  * Tries to extend the relation by extend_by blocks. Depending on the
     983             :  * availability of resources the relation may end up being extended by a
     984             :  * smaller number of pages (unless an error is thrown, always by at least one
     985             :  * page). *extended_by is updated to the number of pages by which the relation
     986             :  * has been extended.
     987             :  *
     988             :  * buffers needs to be an array that is at least extend_by long. Upon
     989             :  * completion, the first extend_by array elements will point to a pinned
     990             :  * buffer.
     991             :  *
     992             :  * If EB_LOCK_FIRST is part of flags, the first returned buffer is
     993             :  * locked. This is useful for callers that want a buffer that is guaranteed to
     994             :  * be empty.
     995             :  */
     996             : BlockNumber
     997      321928 : ExtendBufferedRelBy(BufferManagerRelation bmr,
     998             :                     ForkNumber fork,
     999             :                     BufferAccessStrategy strategy,
    1000             :                     uint32 flags,
    1001             :                     uint32 extend_by,
    1002             :                     Buffer *buffers,
    1003             :                     uint32 *extended_by)
    1004             : {
    1005             :     Assert((bmr.rel != NULL) != (bmr.smgr != NULL));
    1006             :     Assert(bmr.smgr == NULL || bmr.relpersistence != '\0');
    1007             :     Assert(extend_by > 0);
    1008             : 
    1009      321928 :     if (bmr.relpersistence == '\0')
    1010      321928 :         bmr.relpersistence = bmr.rel->rd_rel->relpersistence;
    1011             : 
    1012      321928 :     return ExtendBufferedRelCommon(bmr, fork, strategy, flags,
    1013             :                                    extend_by, InvalidBlockNumber,
    1014             :                                    buffers, extended_by);
    1015             : }
    1016             : 
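                     : /*
                     :  * Editorial sketch, not part of bufmgr.c: bulk-extend a relation by up to
                     :  * eight blocks.  The relation may end up extended by fewer pages; the actual
                     :  * count is returned in extended_by, and every returned buffer is pinned and
                     :  * must eventually be released by the caller.  "rel" is an assumed open
                     :  * Relation; flags are passed as 0 to keep the sketch minimal.
                     :  */
                     : static void
                     : example_extend_by_many(Relation rel)
                     : {
                     :     Buffer      buffers[8];
                     :     uint32      extended_by = 0;
                     :     BlockNumber first_block;
                     : 
                     :     first_block = ExtendBufferedRelBy(BMR_REL(rel), MAIN_FORKNUM, NULL,
                     :                                       0, lengthof(buffers),
                     :                                       buffers, &extended_by);
                     : 
                     :     /* new blocks are first_block .. first_block + extended_by - 1 */
                     :     for (uint32 i = 0; i < extended_by; i++)
                     :         ReleaseBuffer(buffers[i]);
                     : }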
    1017             : /*
    1018             :  * Extend the relation so it is at least extend_to blocks large, return buffer
    1019             :  * (extend_to - 1).
    1020             :  *
    1021             :  * This is useful for callers that want to write a specific page, regardless
    1022             :  * of the current size of the relation (e.g. useful for visibilitymap and for
    1023             :  * of the current size of the relation (e.g. for the visibilitymap and for
    1024             :  */
    1025             : Buffer
    1026      104076 : ExtendBufferedRelTo(BufferManagerRelation bmr,
    1027             :                     ForkNumber fork,
    1028             :                     BufferAccessStrategy strategy,
    1029             :                     uint32 flags,
    1030             :                     BlockNumber extend_to,
    1031             :                     ReadBufferMode mode)
    1032             : {
    1033             :     BlockNumber current_size;
    1034      104076 :     uint32      extended_by = 0;
    1035      104076 :     Buffer      buffer = InvalidBuffer;
    1036             :     Buffer      buffers[64];
    1037             : 
    1038             :     Assert((bmr.rel != NULL) != (bmr.smgr != NULL));
    1039             :     Assert(bmr.smgr == NULL || bmr.relpersistence != '\0');
    1040             :     Assert(extend_to != InvalidBlockNumber && extend_to > 0);
    1041             : 
    1042      104076 :     if (bmr.relpersistence == '\0')
    1043       14358 :         bmr.relpersistence = bmr.rel->rd_rel->relpersistence;
    1044             : 
    1045             :     /*
    1046             :      * If desired, create the file if it doesn't exist.  If
    1047             :      * smgr_cached_nblocks[fork] is positive then it must exist; no need for
    1048             :      * an smgrexists call.
    1049             :      */
    1050      104076 :     if ((flags & EB_CREATE_FORK_IF_NEEDED) &&
    1051       14358 :         (BMR_GET_SMGR(bmr)->smgr_cached_nblocks[fork] == 0 ||
    1052          38 :          BMR_GET_SMGR(bmr)->smgr_cached_nblocks[fork] == InvalidBlockNumber) &&
    1053       14320 :         !smgrexists(BMR_GET_SMGR(bmr), fork))
    1054             :     {
    1055       14294 :         LockRelationForExtension(bmr.rel, ExclusiveLock);
    1056             : 
    1057             :         /* recheck, fork might have been created concurrently */
    1058       14294 :         if (!smgrexists(BMR_GET_SMGR(bmr), fork))
    1059       14288 :             smgrcreate(BMR_GET_SMGR(bmr), fork, flags & EB_PERFORMING_RECOVERY);
    1060             : 
    1061       14294 :         UnlockRelationForExtension(bmr.rel, ExclusiveLock);
    1062             :     }
    1063             : 
    1064             :     /*
    1065             :      * If requested, invalidate size cache, so that smgrnblocks asks the
    1066             :      * kernel.
    1067             :      */
    1068      104076 :     if (flags & EB_CLEAR_SIZE_CACHE)
    1069       14358 :         BMR_GET_SMGR(bmr)->smgr_cached_nblocks[fork] = InvalidBlockNumber;
    1070             : 
    1071             :     /*
    1072             :      * Estimate how many pages we'll need to extend by. This avoids acquiring
    1073             :      * more victim buffers than necessary.
    1074             :      */
    1075      104076 :     current_size = smgrnblocks(BMR_GET_SMGR(bmr), fork);
    1076             : 
    1077             :     /*
    1078             :      * Since no-one else can be looking at the page contents yet, there is no
    1079             :      * difference between an exclusive lock and a cleanup-strength lock. Note
    1080             :      * that we pass the original mode to ReadBuffer_common() below, when falling
    1081             :      * back to reading the buffer due to a concurrent relation extension.
    1082             :      */
    1083      104076 :     if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
    1084       88986 :         flags |= EB_LOCK_TARGET;
    1085             : 
    1086      212460 :     while (current_size < extend_to)
    1087             :     {
    1088      108384 :         uint32      num_pages = lengthof(buffers);
    1089             :         BlockNumber first_block;
    1090             : 
    1091      108384 :         if ((uint64) current_size + num_pages > extend_to)
    1092      108252 :             num_pages = extend_to - current_size;
    1093             : 
    1094      108384 :         first_block = ExtendBufferedRelCommon(bmr, fork, strategy, flags,
    1095             :                                               num_pages, extend_to,
    1096             :                                               buffers, &extended_by);
    1097             : 
    1098      108384 :         current_size = first_block + extended_by;
    1099             :         Assert(num_pages != 0 || current_size >= extend_to);
    1100             : 
    1101      231434 :         for (uint32 i = 0; i < extended_by; i++)
    1102             :         {
    1103      123050 :             if (first_block + i != extend_to - 1)
    1104       18980 :                 ReleaseBuffer(buffers[i]);
    1105             :             else
    1106      104070 :                 buffer = buffers[i];
    1107             :         }
    1108             :     }
    1109             : 
    1110             :     /*
    1111             :      * It's possible that another backend concurrently extended the relation.
    1112             :      * In that case read the buffer.
    1113             :      *
    1114             :      * XXX: Should we control this via a flag?
    1115             :      */
    1116      104076 :     if (buffer == InvalidBuffer)
    1117             :     {
    1118             :         Assert(extended_by == 0);
    1119           6 :         buffer = ReadBuffer_common(bmr.rel, BMR_GET_SMGR(bmr), bmr.relpersistence,
    1120             :                                    fork, extend_to - 1, mode, strategy);
    1121             :     }
    1122             : 
    1123      104076 :     return buffer;
    1124             : }
    1125             : 
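                     : /*
                     :  * Editorial sketch, not part of bufmgr.c: make sure block "blkno" exists,
                     :  * creating the fork on demand, and return its (pinned) buffer.  This mirrors
                     :  * how the visibility map grows its fork; the function name and argument
                     :  * choices are assumptions for illustration only.
                     :  */
                     : static Buffer
                     : example_extend_to_block(Relation rel, BlockNumber blkno)
                     : {
                     :     return ExtendBufferedRelTo(BMR_REL(rel), VISIBILITYMAP_FORKNUM, NULL,
                     :                                EB_CREATE_FORK_IF_NEEDED | EB_CLEAR_SIZE_CACHE,
                     :                                blkno + 1, RBM_ZERO_ON_ERROR);
                     : }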
    1126             : /*
    1127             :  * Lock and optionally zero a buffer, as part of the implementation of
    1128             :  * RBM_ZERO_AND_LOCK or RBM_ZERO_AND_CLEANUP_LOCK.  The buffer must be already
    1129             :  * pinned.  If the buffer is not already valid, it is zeroed and made valid.
    1130             :  */
    1131             : static void
    1132      654240 : ZeroAndLockBuffer(Buffer buffer, ReadBufferMode mode, bool already_valid)
    1133             : {
    1134             :     BufferDesc *bufHdr;
    1135             :     bool        need_to_zero;
    1136      654240 :     bool        isLocalBuf = BufferIsLocal(buffer);
    1137             : 
    1138             :     Assert(mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK);
    1139             : 
    1140      654240 :     if (already_valid)
    1141             :     {
    1142             :         /*
    1143             :          * If the caller already knew the buffer was valid, we can skip some
    1144             :          * header interaction.  The caller just wants to lock the buffer.
    1145             :          */
    1146       75254 :         need_to_zero = false;
    1147             :     }
    1148      578986 :     else if (isLocalBuf)
    1149             :     {
    1150             :         /* Simple case for non-shared buffers. */
    1151          48 :         bufHdr = GetLocalBufferDescriptor(-buffer - 1);
    1152          48 :         need_to_zero = StartLocalBufferIO(bufHdr, true, false);
    1153             :     }
    1154             :     else
    1155             :     {
    1156             :         /*
    1157             :          * Take BM_IO_IN_PROGRESS, or discover that BM_VALID has been set
    1158             :          * concurrently.  Even though we aren't doing I/O, that ensures that
    1159             :          * we don't zero a page that someone else has pinned.  An exclusive
    1160             :          * content lock wouldn't be enough, because readers are allowed to
    1161             :          * drop the content lock after determining that a tuple is visible
    1162             :          * (see buffer access rules in README).
    1163             :          */
    1164      578938 :         bufHdr = GetBufferDescriptor(buffer - 1);
    1165      578938 :         need_to_zero = StartBufferIO(bufHdr, true, false);
    1166             :     }
    1167             : 
    1168      654240 :     if (need_to_zero)
    1169             :     {
    1170      578986 :         memset(BufferGetPage(buffer), 0, BLCKSZ);
    1171             : 
    1172             :         /*
    1173             :          * Grab the buffer content lock before marking the page as valid, to
    1174             :          * make sure that no other backend sees the zeroed page before the
    1175             :          * caller has had a chance to initialize it.
    1176             :          *
    1177             :          * Since no-one else can be looking at the page contents yet, there is
    1178             :          * no difference between an exclusive lock and a cleanup-strength
    1179             :          * lock. (Note that we cannot use LockBuffer() or
    1180             :          * LockBufferForCleanup() here, because they assert that the buffer is
    1181             :          * already valid.)
    1182             :          */
    1183      578986 :         if (!isLocalBuf)
    1184      578938 :             LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
    1185             : 
    1186             :         /* Set BM_VALID, terminate IO, and wake up any waiters */
    1187      578986 :         if (isLocalBuf)
    1188          48 :             TerminateLocalBufferIO(bufHdr, false, BM_VALID, false);
    1189             :         else
    1190      578938 :             TerminateBufferIO(bufHdr, false, BM_VALID, true, false);
    1191             :     }
    1192       75254 :     else if (!isLocalBuf)
    1193             :     {
    1194             :         /*
    1195             :          * The buffer is valid, so we can't zero it.  The caller still expects
    1196             :          * the page to be locked on return.
    1197             :          */
    1198       75214 :         if (mode == RBM_ZERO_AND_LOCK)
    1199       75028 :             LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
    1200             :         else
    1201         186 :             LockBufferForCleanup(buffer);
    1202             :     }
    1203      654240 : }
    1204             : 
    1205             : /*
    1206             :  * Pin a buffer for a given block.  *foundPtr is set to true if the block was
    1207             :  * already present, or false if more work is required to either read it in or
    1208             :  * zero it.
    1209             :  */
    1210             : static pg_attribute_always_inline Buffer
    1211   125417564 : PinBufferForBlock(Relation rel,
    1212             :                   SMgrRelation smgr,
    1213             :                   char persistence,
    1214             :                   ForkNumber forkNum,
    1215             :                   BlockNumber blockNum,
    1216             :                   BufferAccessStrategy strategy,
    1217             :                   bool *foundPtr)
    1218             : {
    1219             :     BufferDesc *bufHdr;
    1220             :     IOContext   io_context;
    1221             :     IOObject    io_object;
    1222             : 
    1223             :     Assert(blockNum != P_NEW);
    1224             : 
    1225             :     /* Persistence should be set before */
    1226             :     Assert((persistence == RELPERSISTENCE_TEMP ||
    1227             :             persistence == RELPERSISTENCE_PERMANENT ||
    1228             :             persistence == RELPERSISTENCE_UNLOGGED));
    1229             : 
    1230   125417564 :     if (persistence == RELPERSISTENCE_TEMP)
    1231             :     {
    1232     2554570 :         io_context = IOCONTEXT_NORMAL;
    1233     2554570 :         io_object = IOOBJECT_TEMP_RELATION;
    1234             :     }
    1235             :     else
    1236             :     {
    1237   122862994 :         io_context = IOContextForStrategy(strategy);
    1238   122862994 :         io_object = IOOBJECT_RELATION;
    1239             :     }
    1240             : 
    1241             :     TRACE_POSTGRESQL_BUFFER_READ_START(forkNum, blockNum,
    1242             :                                        smgr->smgr_rlocator.locator.spcOid,
    1243             :                                        smgr->smgr_rlocator.locator.dbOid,
    1244             :                                        smgr->smgr_rlocator.locator.relNumber,
    1245             :                                        smgr->smgr_rlocator.backend);
    1246             : 
    1247   125417564 :     if (persistence == RELPERSISTENCE_TEMP)
    1248             :     {
    1249     2554570 :         bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, foundPtr);
    1250     2554558 :         if (*foundPtr)
    1251     2537776 :             pgBufferUsage.local_blks_hit++;
    1252             :     }
    1253             :     else
    1254             :     {
    1255   122862994 :         bufHdr = BufferAlloc(smgr, persistence, forkNum, blockNum,
    1256             :                              strategy, foundPtr, io_context);
    1257   122862994 :         if (*foundPtr)
    1258   119334244 :             pgBufferUsage.shared_blks_hit++;
    1259             :     }
    1260   125417552 :     if (rel)
    1261             :     {
    1262             :         /*
    1263             :          * While pgBufferUsage's "read" counter isn't bumped unless we reach
    1264             :          * WaitReadBuffers() (so, not for hits, and not for buffers that are
    1265             :          * zeroed instead), the per-relation stats always count them.
    1266             :          */
    1267   113345366 :         pgstat_count_buffer_read(rel);
    1268   113345366 :         if (*foundPtr)
    1269   110750342 :             pgstat_count_buffer_hit(rel);
    1270             :     }
    1271   125417552 :     if (*foundPtr)
    1272             :     {
    1273   121872020 :         pgstat_count_io_op(io_object, io_context, IOOP_HIT, 1, 0);
    1274   121872020 :         if (VacuumCostActive)
    1275     4776544 :             VacuumCostBalance += VacuumCostPageHit;
    1276             : 
    1277             :         TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
    1278             :                                           smgr->smgr_rlocator.locator.spcOid,
    1279             :                                           smgr->smgr_rlocator.locator.dbOid,
    1280             :                                           smgr->smgr_rlocator.locator.relNumber,
    1281             :                                           smgr->smgr_rlocator.backend,
    1282             :                                           true);
    1283             :     }
    1284             : 
    1285   125417552 :     return BufferDescriptorGetBuffer(bufHdr);
    1286             : }
    1287             : 
    1288             : /*
    1289             :  * ReadBuffer_common -- common logic for all ReadBuffer variants
    1290             :  *
    1291             :  * smgr is required, rel is optional unless using P_NEW.
    1292             :  */
    1293             : static pg_attribute_always_inline Buffer
    1294   117352476 : ReadBuffer_common(Relation rel, SMgrRelation smgr, char smgr_persistence,
    1295             :                   ForkNumber forkNum,
    1296             :                   BlockNumber blockNum, ReadBufferMode mode,
    1297             :                   BufferAccessStrategy strategy)
    1298             : {
    1299             :     ReadBuffersOperation operation;
    1300             :     Buffer      buffer;
    1301             :     int         flags;
    1302             :     char        persistence;
    1303             : 
    1304             :     /*
    1305             :      * Backward compatibility path; most code should use ExtendBufferedRel()
    1306             :      * instead, as acquiring the extension lock inside ExtendBufferedRel()
    1307             :      * scales a lot better.
    1308             :      */
    1309   117352476 :     if (unlikely(blockNum == P_NEW))
    1310             :     {
    1311         522 :         uint32      flags = EB_SKIP_EXTENSION_LOCK;
    1312             : 
    1313             :         /*
    1314             :          * Since no-one else can be looking at the page contents yet, there is
    1315             :          * no difference between an exclusive lock and a cleanup-strength
    1316             :          * lock.
    1317             :          */
    1318         522 :         if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
    1319           0 :             flags |= EB_LOCK_FIRST;
    1320             : 
    1321         522 :         return ExtendBufferedRel(BMR_REL(rel), forkNum, strategy, flags);
    1322             :     }
    1323             : 
    1324   117351954 :     if (rel)
    1325   105756676 :         persistence = rel->rd_rel->relpersistence;
    1326             :     else
    1327    11595278 :         persistence = smgr_persistence;
    1328             : 
    1329   117351954 :     if (unlikely(mode == RBM_ZERO_AND_CLEANUP_LOCK ||
    1330             :                  mode == RBM_ZERO_AND_LOCK))
    1331             :     {
    1332             :         bool        found;
    1333             : 
    1334      654240 :         buffer = PinBufferForBlock(rel, smgr, persistence,
    1335             :                                    forkNum, blockNum, strategy, &found);
    1336      654240 :         ZeroAndLockBuffer(buffer, mode, found);
    1337      654240 :         return buffer;
    1338             :     }
    1339             : 
    1340             :     /*
    1341             :      * Signal that we are going to immediately wait. If we're immediately
    1342             :      * waiting, there is no benefit in actually executing the IO
    1343             :      * asynchronously; it would just add dispatch overhead.
    1344             :      */
    1345   116697714 :     flags = READ_BUFFERS_SYNCHRONOUSLY;
    1346   116697714 :     if (mode == RBM_ZERO_ON_ERROR)
    1347     2650546 :         flags |= READ_BUFFERS_ZERO_ON_ERROR;
    1348   116697714 :     operation.smgr = smgr;
    1349   116697714 :     operation.rel = rel;
    1350   116697714 :     operation.persistence = persistence;
    1351   116697714 :     operation.forknum = forkNum;
    1352   116697714 :     operation.strategy = strategy;
    1353   116697714 :     if (StartReadBuffer(&operation,
    1354             :                         &buffer,
    1355             :                         blockNum,
    1356             :                         flags))
    1357     1459418 :         WaitReadBuffers(&operation);
    1358             : 
    1359   116697668 :     return buffer;
    1360             : }
    1361             : 
    1362             : static pg_attribute_always_inline bool
    1363   124418222 : StartReadBuffersImpl(ReadBuffersOperation *operation,
    1364             :                      Buffer *buffers,
    1365             :                      BlockNumber blockNum,
    1366             :                      int *nblocks,
    1367             :                      int flags,
    1368             :                      bool allow_forwarding)
    1369             : {
    1370   124418222 :     int         actual_nblocks = *nblocks;
    1371   124418222 :     int         maxcombine = 0;
    1372             :     bool        did_start_io;
    1373             : 
    1374             :     Assert(*nblocks == 1 || allow_forwarding);
    1375             :     Assert(*nblocks > 0);
    1376             :     Assert(*nblocks <= MAX_IO_COMBINE_LIMIT);
    1377             : 
    1378   127384770 :     for (int i = 0; i < actual_nblocks; ++i)
    1379             :     {
    1380             :         bool        found;
    1381             : 
    1382   124766630 :         if (allow_forwarding && buffers[i] != InvalidBuffer)
    1383        3306 :         {
    1384             :             BufferDesc *bufHdr;
    1385             : 
    1386             :             /*
    1387             :              * This is a buffer that was pinned by an earlier call to
    1388             :              * StartReadBuffers(), but couldn't be handled in one operation at
    1389             :              * that time.  The operation was split, and the caller has passed
    1390             :              * an already pinned buffer back to us to handle the rest of the
    1391             :              * operation.  It must continue at the expected block number.
    1392             :              */
    1393             :             Assert(BufferGetBlockNumber(buffers[i]) == blockNum + i);
    1394             : 
    1395             :             /*
    1396             :              * It might be an already valid buffer (a hit) that followed the
    1397             :              * final contiguous block of an earlier I/O (a miss) marking the
    1398             :              * end of it, or a buffer that some other backend has since made
    1399             :              * valid by performing the I/O for us, in which case we can handle
    1400             :              * it as a hit now.  It is safe to check for a BM_VALID flag with
    1401             :              * a relaxed load, because we got a fresh view of it while pinning
    1402             :              * it in the previous call.
    1403             :              *
    1404             :              * On the other hand if we don't see BM_VALID yet, it must be an
    1405             :              * I/O that was split by the previous call and we need to try to
    1406             :              * start a new I/O from this block.  We're also racing against any
    1407             :              * other backend that might start the I/O or even manage to mark
    1408             :              * it BM_VALID after this check, but StartBufferIO() will handle
    1409             :              * those cases.
    1410             :              */
    1411        3306 :             if (BufferIsLocal(buffers[i]))
    1412           4 :                 bufHdr = GetLocalBufferDescriptor(-buffers[i] - 1);
    1413             :             else
    1414        3302 :                 bufHdr = GetBufferDescriptor(buffers[i] - 1);
    1415             :             Assert(pg_atomic_read_u64(&bufHdr->state) & BM_TAG_VALID);
    1416        3306 :             found = pg_atomic_read_u64(&bufHdr->state) & BM_VALID;
    1417             :         }
    1418             :         else
    1419             :         {
    1420   124763312 :             buffers[i] = PinBufferForBlock(operation->rel,
    1421             :                                            operation->smgr,
    1422   124763324 :                                            operation->persistence,
    1423             :                                            operation->forknum,
    1424             :                                            blockNum + i,
    1425             :                                            operation->strategy,
    1426             :                                            &found);
    1427             :         }
    1428             : 
    1429   124766618 :         if (found)
    1430             :         {
    1431             :             /*
    1432             :              * We have a hit.  If it's the first block in the requested range,
    1433             :              * we can return it immediately and report that WaitReadBuffers()
    1434             :              * does not need to be called.  If the initial value of *nblocks
    1435             :              * was larger, the caller will have to call again for the rest.
    1436             :              */
    1437   121800070 :             if (i == 0)
    1438             :             {
    1439   121796764 :                 *nblocks = 1;
    1440             : 
    1441             : #ifdef USE_ASSERT_CHECKING
    1442             : 
    1443             :                 /*
    1444             :                  * Initialize enough of ReadBuffersOperation to make
    1445             :                  * CheckReadBuffersOperation() work. Outside of assertions
    1446             :                  * that's not necessary when no IO is issued.
    1447             :                  */
    1448             :                 operation->buffers = buffers;
    1449             :                 operation->blocknum = blockNum;
    1450             :                 operation->nblocks = 1;
    1451             :                 operation->nblocks_done = 1;
    1452             :                 CheckReadBuffersOperation(operation, true);
    1453             : #endif
    1454   121796764 :                 return false;
    1455             :             }
    1456             : 
    1457             :             /*
    1458             :              * Otherwise we already have an I/O to perform, but this block
    1459             :              * can't be included as it is already valid.  Split the I/O here.
    1460             :              * There may or may not be more blocks requiring I/O after this
    1461             :              * one, we haven't checked, but they can't be contiguous with this
    1462             :              * one in the way.  We'll leave this buffer pinned, forwarding it
    1463             :              * to the next call, avoiding the need to unpin it here and re-pin
    1464             :              * it in the next call.
    1465             :              */
    1466        3306 :             actual_nblocks = i;
    1467        3306 :             break;
    1468             :         }
    1469             :         else
    1470             :         {
    1471             :             /*
    1472             :              * Check how many blocks we can cover with the same IO. The smgr
    1473             :              * implementation might e.g. be limited due to a segment boundary.
    1474             :              */
    1475     2966548 :             if (i == 0 && actual_nblocks > 1)
    1476             :             {
    1477       70028 :                 maxcombine = smgrmaxcombine(operation->smgr,
    1478             :                                             operation->forknum,
    1479             :                                             blockNum);
    1480       70028 :                 if (unlikely(maxcombine < actual_nblocks))
    1481             :                 {
    1482           0 :                     elog(DEBUG2, "limiting nblocks at %u from %u to %u",
    1483             :                          blockNum, actual_nblocks, maxcombine);
    1484           0 :                     actual_nblocks = maxcombine;
    1485             :                 }
    1486             :             }
    1487             :         }
    1488             :     }
    1489     2621446 :     *nblocks = actual_nblocks;
    1490             : 
    1491             :     /* Populate information needed for I/O. */
    1492     2621446 :     operation->buffers = buffers;
    1493     2621446 :     operation->blocknum = blockNum;
    1494     2621446 :     operation->flags = flags;
    1495     2621446 :     operation->nblocks = actual_nblocks;
    1496     2621446 :     operation->nblocks_done = 0;
    1497     2621446 :     pgaio_wref_clear(&operation->io_wref);
    1498             : 
    1499             :     /*
    1500             :      * When using AIO, start the IO in the background. If not, issue prefetch
    1501             :      * requests if desired by the caller.
    1502             :      *
    1503             :      * The reason we have a dedicated path for IOMETHOD_SYNC here is to
    1504             :      * de-risk the introduction of AIO somewhat. It's a large architectural
    1505             :      * change, with lots of chances for unanticipated performance effects.
    1506             :      *
    1507             :      * Use of IOMETHOD_SYNC already leads to not actually performing IO
    1508             :      * asynchronously, but without the check here we'd execute IO earlier than
    1509             :      * we used to. Eventually this IOMETHOD_SYNC specific path should go away.
    1510             :      */
    1511     2621446 :     if (io_method != IOMETHOD_SYNC)
    1512             :     {
    1513             :         /*
    1514             :          * Try to start IO asynchronously. It's possible that no IO needs to
    1515             :          * be started, if another backend already performed the IO.
    1516             :          *
    1517             :          * Note that if an IO is started, it might not cover the entire
    1518             :          * requested range, e.g. because an intermediary block has been read
    1519             :          * in by another backend.  In that case any "trailing" buffers we
    1520             :          * already pinned above will be "forwarded" by read_stream.c to the
    1521             :          * next call to StartReadBuffers().
    1522             :          *
    1523             :          * This is signalled to the caller by decrementing *nblocks *and*
    1524             :          * reducing operation->nblocks. The latter is done here, but not in
    1525             :          * WaitReadBuffers(), where we can't "shorten" the overall read size
    1526             :          * anymore; we need to retry until the read is done in its entirety
    1527             :          * or has failed.
    1528             :          */
    1529     2619290 :         did_start_io = AsyncReadBuffers(operation, nblocks);
    1530             : 
    1531     2619260 :         operation->nblocks = *nblocks;
    1532             :     }
    1533             :     else
    1534             :     {
    1535        2156 :         operation->flags |= READ_BUFFERS_SYNCHRONOUSLY;
    1536             : 
    1537        2156 :         if (flags & READ_BUFFERS_ISSUE_ADVICE)
    1538             :         {
    1539             :             /*
    1540             :              * In theory we should only do this if PinBufferForBlock() had to
    1541             :              * allocate new buffers above.  That way, if two calls to
    1542             :              * StartReadBuffers() were made for the same blocks before
    1543             :              * WaitReadBuffers(), only the first would issue the advice.
    1544             :              * That'd be a better simulation of true asynchronous I/O, which
    1545             :              * would only start the I/O once, but isn't done here for
    1546             :              * simplicity.
    1547             :              */
    1548           4 :             smgrprefetch(operation->smgr,
    1549             :                          operation->forknum,
    1550             :                          blockNum,
    1551             :                          actual_nblocks);
    1552             :         }
    1553             : 
    1554             :         /*
    1555             :          * Indicate that WaitReadBuffers() should be called. WaitReadBuffers()
    1556             :          * will initiate the necessary IO.
    1557             :          */
    1558        2156 :         did_start_io = true;
    1559             :     }
    1560             : 
    1561     2621416 :     CheckReadBuffersOperation(operation, !did_start_io);
    1562             : 
    1563     2621416 :     return did_start_io;
    1564             : }
    1565             : 
    1566             : /*
    1567             :  * Begin reading a range of blocks beginning at blockNum and extending for
    1568             :  * *nblocks.  *nblocks and the buffers array are in/out parameters.  On entry,
    1569             :  * the buffers elements covered by *nblocks must hold either InvalidBuffer or
    1570             :  * buffers forwarded by an earlier call to StartReadBuffers() that was split
    1571             :  * and is now being continued.  On return, *nblocks holds the number of blocks
    1572             :  * accepted by this operation.  If it is less than the original number then
    1573             :  * this operation has been split, but buffer elements up to the original
    1574             :  * requested size may hold forwarded buffers to be used for a continuing
    1575             :  * operation.  The caller must either start a new I/O beginning at the block
    1576             :  * immediately following the blocks accepted by this call and pass those
    1577             :  * buffers back in, or release them if it chooses not to.  It shouldn't make
    1578             :  * any other use of or assumptions about forwarded buffers.
    1579             :  *
    1580             :  * If false is returned, no I/O is necessary and the buffers covered by
    1581             :  * *nblocks on exit are valid and ready to be accessed.  If true is returned,
    1582             :  * an I/O has been started, and WaitReadBuffers() must be called with the same
    1583             :  * operation object before the buffers covered by *nblocks on exit can be
    1584             :  * accessed.  Along with the operation object, the caller-supplied array of
    1585             :  * buffers must remain valid until WaitReadBuffers() is called, and any
    1586             :  * forwarded buffers must also be preserved for a continuing call unless
    1587             :  * they are explicitly released.
    1588             :  */
    1589             : bool
    1590     3655994 : StartReadBuffers(ReadBuffersOperation *operation,
    1591             :                  Buffer *buffers,
    1592             :                  BlockNumber blockNum,
    1593             :                  int *nblocks,
    1594             :                  int flags)
    1595             : {
    1596     3655994 :     return StartReadBuffersImpl(operation, buffers, blockNum, nblocks, flags,
    1597             :                                 true /* expect forwarded buffers */ );
    1598             : }
    1599             : 
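                     : /*
                     :  * Editorial sketch, not part of bufmgr.c: the two-step read API for a short
                     :  * contiguous range.  The operation setup mirrors ReadBuffer_common() above.
                     :  * Splitting and buffer forwarding (a smaller *nblocks on return, with extra
                     :  * pinned buffers left in the array) are ignored for brevity; real callers
                     :  * such as read_stream.c must handle them.  "rel" and "blkno" are assumed
                     :  * inputs; the function name is hypothetical.
                     :  */
                     : static void
                     : example_start_and_wait(Relation rel, BlockNumber blkno)
                     : {
                     :     ReadBuffersOperation operation;
                     :     Buffer      buffers[4] = {InvalidBuffer, InvalidBuffer,
                     :                               InvalidBuffer, InvalidBuffer};
                     :     int         nblocks = lengthof(buffers);
                     : 
                     :     operation.smgr = RelationGetSmgr(rel);
                     :     operation.rel = rel;
                     :     operation.persistence = rel->rd_rel->relpersistence;
                     :     operation.forknum = MAIN_FORKNUM;
                     :     operation.strategy = NULL;
                     : 
                     :     if (StartReadBuffers(&operation, buffers, blkno, &nblocks,
                     :                          READ_BUFFERS_SYNCHRONOUSLY))
                     :         WaitReadBuffers(&operation);
                     : 
                     :     /* buffers[0 .. nblocks - 1] are now valid and pinned */
                     :     for (int i = 0; i < nblocks; i++)
                     :         ReleaseBuffer(buffers[i]);
                     : }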
    1600             : /*
    1601             :  * Single block version of StartReadBuffers().  This might save a few
    1602             :  * instructions when called from another translation unit, because it is
    1603             :  * specialized for nblocks == 1.
    1604             :  *
    1605             :  * This version does not support "forwarded" buffers: they cannot be created
    1606             :  * by reading only one block and *buffer is ignored on entry.
    1607             :  */
    1608             : bool
    1609   120762228 : StartReadBuffer(ReadBuffersOperation *operation,
    1610             :                 Buffer *buffer,
    1611             :                 BlockNumber blocknum,
    1612             :                 int flags)
    1613             : {
    1614   120762228 :     int         nblocks = 1;
    1615             :     bool        result;
    1616             : 
    1617   120762228 :     result = StartReadBuffersImpl(operation, buffer, blocknum, &nblocks, flags,
    1618             :                                   false /* single block, no forwarding */ );
    1619             :     Assert(nblocks == 1);       /* single block can't be short */
    1620             : 
    1621   120762198 :     return result;
    1622             : }
    1623             : 
    1624             : /*
    1625             :  * Perform sanity checks on the ReadBuffersOperation.
    1626             :  */
    1627             : static void
    1628     7855736 : CheckReadBuffersOperation(ReadBuffersOperation *operation, bool is_complete)
    1629             : {
    1630             : #ifdef USE_ASSERT_CHECKING
    1631             :     Assert(operation->nblocks_done <= operation->nblocks);
    1632             :     Assert(!is_complete || operation->nblocks == operation->nblocks_done);
    1633             : 
    1634             :     for (int i = 0; i < operation->nblocks; i++)
    1635             :     {
    1636             :         Buffer      buffer = operation->buffers[i];
    1637             :         BufferDesc *buf_hdr = BufferIsLocal(buffer) ?
    1638             :             GetLocalBufferDescriptor(-buffer - 1) :
    1639             :             GetBufferDescriptor(buffer - 1);
    1640             : 
    1641             :         Assert(BufferGetBlockNumber(buffer) == operation->blocknum + i);
    1642             :         Assert(pg_atomic_read_u64(&buf_hdr->state) & BM_TAG_VALID);
    1643             : 
    1644             :         if (i < operation->nblocks_done)
    1645             :             Assert(pg_atomic_read_u64(&buf_hdr->state) & BM_VALID);
    1646             :     }
    1647             : #endif
    1648     7855736 : }
    1649             : 
    1650             : /* helper for ReadBuffersCanStartIO(), to avoid repetition */
    1651             : static inline bool
    1652     2966580 : ReadBuffersCanStartIOOnce(Buffer buffer, bool nowait)
    1653             : {
    1654     2966580 :     if (BufferIsLocal(buffer))
    1655       16734 :         return StartLocalBufferIO(GetLocalBufferDescriptor(-buffer - 1),
    1656             :                                   true, nowait);
    1657             :     else
    1658     2949846 :         return StartBufferIO(GetBufferDescriptor(buffer - 1), true, nowait);
    1659             : }
    1660             : 
    1661             : /*
    1662             :  * Helper for AsyncReadBuffers that tries to get the buffer ready for IO.
    1663             :  */
    1664             : static inline bool
    1665     2966580 : ReadBuffersCanStartIO(Buffer buffer, bool nowait)
    1666             : {
    1667             :     /*
    1668             :      * If this backend currently has staged IO, we need to submit the pending
    1669             :      * IO before waiting for the right to issue IO, to avoid the potential for
    1670             :      * deadlocks (and, more commonly, unnecessary delays for other backends).
    1671             :      */
    1672     2966580 :     if (!nowait && pgaio_have_staged())
    1673             :     {
    1674        1156 :         if (ReadBuffersCanStartIOOnce(buffer, true))
    1675        1156 :             return true;
    1676             : 
    1677             :         /*
    1678             :          * Unfortunately StartBufferIO() returning false doesn't allow us to
    1679             :          * distinguish between the buffer already being valid and IO already
    1680             :          * being in progress. Since IO already being in progress is quite
    1681             :          * rare, this approach seems fine.
    1682             :          */
    1683           0 :         pgaio_submit_staged();
    1684             :     }
    1685             : 
    1686     2965424 :     return ReadBuffersCanStartIOOnce(buffer, nowait);
    1687             : }
    1688             : 
    1689             : /*
    1690             :  * Helper for WaitReadBuffers() that processes the results of a readv
    1691             :  * operation, raising an error if necessary.
    1692             :  */
    1693             : static void
    1694     2616120 : ProcessReadBuffersResult(ReadBuffersOperation *operation)
    1695             : {
    1696     2616120 :     PgAioReturn *aio_ret = &operation->io_return;
    1697     2616120 :     PgAioResultStatus rs = aio_ret->result.status;
    1698     2616120 :     int         newly_read_blocks = 0;
    1699             : 
    1700             :     Assert(pgaio_wref_valid(&operation->io_wref));
    1701             :     Assert(aio_ret->result.status != PGAIO_RS_UNKNOWN);
    1702             : 
    1703             :     /*
    1704             :      * SMGR reports the number of blocks successfully read as the result of
    1705             :      * the IO operation. Thus we can simply add that to ->nblocks_done.
    1706             :      */
    1707             : 
    1708     2616120 :     if (likely(rs != PGAIO_RS_ERROR))
    1709     2616062 :         newly_read_blocks = aio_ret->result.result;
    1710             : 
    1711     2616120 :     if (rs == PGAIO_RS_ERROR || rs == PGAIO_RS_WARNING)
    1712          90 :         pgaio_result_report(aio_ret->result, &aio_ret->target_data,
    1713             :                             rs == PGAIO_RS_ERROR ? ERROR : WARNING);
    1714     2616030 :     else if (aio_ret->result.status == PGAIO_RS_PARTIAL)
    1715             :     {
    1716             :         /*
    1717             :          * We'll retry, so we just emit a debug message to the server log (or
    1718             :          * not even that in prod scenarios).
    1719             :          */
    1720          20 :         pgaio_result_report(aio_ret->result, &aio_ret->target_data, DEBUG1);
    1721          20 :         elog(DEBUG3, "partial read, will retry");
    1722             :     }
    1723             : 
    1724             :     Assert(newly_read_blocks > 0);
    1725             :     Assert(newly_read_blocks <= MAX_IO_COMBINE_LIMIT);
    1726             : 
    1727     2616062 :     operation->nblocks_done += newly_read_blocks;
    1728             : 
    1729             :     Assert(operation->nblocks_done <= operation->nblocks);
    1730     2616062 : }
    1731             : 
    1732             : void
    1733     2616102 : WaitReadBuffers(ReadBuffersOperation *operation)
    1734             : {
    1735     2616102 :     PgAioReturn *aio_ret = &operation->io_return;
    1736             :     IOContext   io_context;
    1737             :     IOObject    io_object;
    1738             : 
    1739     2616102 :     if (operation->persistence == RELPERSISTENCE_TEMP)
    1740             :     {
    1741        2980 :         io_context = IOCONTEXT_NORMAL;
    1742        2980 :         io_object = IOOBJECT_TEMP_RELATION;
    1743             :     }
    1744             :     else
    1745             :     {
    1746     2613122 :         io_context = IOContextForStrategy(operation->strategy);
    1747     2613122 :         io_object = IOOBJECT_RELATION;
    1748             :     }
    1749             : 
    1750             :     /*
    1751             :      * If we get here without an IO operation having been issued, the
    1752             :      * io_method == IOMETHOD_SYNC path must have been used. Otherwise the
    1753             :      * caller should not have called WaitReadBuffers().
    1754             :      *
    1755             :      * In the case of IOMETHOD_SYNC, we start the IO in WaitReadBuffers(), as
    1756             :      * we did before the introduction of AIO. This is done as part of the
    1757             :      * retry logic below; no extra code is required.
    1758             :      *
    1759             :      * This path is expected to eventually go away.
    1760             :      */
    1761     2616102 :     if (!pgaio_wref_valid(&operation->io_wref) && io_method != IOMETHOD_SYNC)
    1762           0 :         elog(ERROR, "waiting for read operation that didn't read");
    1763             : 
    1764             :     /*
    1765             :      * To handle partial reads, and IOMETHOD_SYNC, we re-issue IO until we're
    1766             :      * done. We may need multiple retries, not just because we could get
    1767             :      * multiple partial reads, but also because some of the remaining
    1768             :      * to-be-read buffers may have been read in by other backends, limiting
    1769             :      * the IO size.
    1770             :      */
    1771             :     while (true)
    1772        2176 :     {
    1773             :         int         ignored_nblocks_progress;
    1774             : 
    1775     2618278 :         CheckReadBuffersOperation(operation, false);
    1776             : 
    1777             :         /*
    1778             :          * If there is an IO associated with the operation, we may need to
    1779             :          * wait for it.
    1780             :          */
    1781     2618278 :         if (pgaio_wref_valid(&operation->io_wref))
    1782             :         {
    1783             :             /*
    1784             :              * Track the time spent waiting for the IO to complete. As
    1785             :              * tracking a wait even if we don't actually need to wait
    1786             :              *
    1787             :              * a) is not cheap, due to the timestamping overhead
    1788             :              *
    1789             :              * b) reports some time as waiting, even if we never waited
    1790             :              *
    1791             :              * we first check if we already know the IO is complete.
    1792             :              */
    1793     2616122 :             if (aio_ret->result.status == PGAIO_RS_UNKNOWN &&
    1794     1141032 :                 !pgaio_wref_check_done(&operation->io_wref))
    1795             :             {
    1796      286554 :                 instr_time  io_start = pgstat_prepare_io_time(track_io_timing);
    1797             : 
    1798      286554 :                 pgaio_wref_wait(&operation->io_wref);
    1799             : 
    1800             :                 /*
    1801             :                  * The IO operation itself was already counted earlier, in
    1802             :                  * AsyncReadBuffers(), this just accounts for the wait time.
    1803             :                  */
    1804      286552 :                 pgstat_count_io_op_time(io_object, io_context, IOOP_READ,
    1805             :                                         io_start, 0, 0);
    1806             :             }
    1807             :             else
    1808             :             {
    1809             :                 Assert(pgaio_wref_check_done(&operation->io_wref));
    1810             :             }
    1811             : 
    1812             :             /*
    1813             :              * We now are sure the IO completed. Check the results. This
    1814             :              * includes reporting on errors if there were any.
    1815             :              */
    1816     2616120 :             ProcessReadBuffersResult(operation);
    1817             :         }
    1818             : 
    1819             :         /*
    1820             :          * Most of the time, the one IO we already started will read in
    1821             :          * everything.  But we need to deal with partial reads and buffers not
    1822             :          * needing IO anymore.
    1823             :          */
    1824     2618218 :         if (operation->nblocks_done == operation->nblocks)
    1825     2616042 :             break;
    1826             : 
    1827        2176 :         CHECK_FOR_INTERRUPTS();
    1828             : 
    1829             :         /*
    1830             :          * This may only complete the IO partially, either because some
    1831             :          * buffers were already valid, or because of a partial read.
    1832             :          *
    1833             :          * NB: In contrast to after the AsyncReadBuffers() call in
    1834             :          * StartReadBuffers(), we do *not* reduce
    1835             :          * ReadBuffersOperation->nblocks here, callers expect the full
    1836             :          * ReadBuffersOperation->nblocks here; callers expect the full
    1837             :          * have been queued).
    1838             :          */
    1839        2176 :         AsyncReadBuffers(operation, &ignored_nblocks_progress);
    1840             :     }
    1841             : 
    1842     2616042 :     CheckReadBuffersOperation(operation, true);
    1843             : 
    1844             :     /* NB: READ_DONE tracepoint was already executed in completion callback */
    1845     2616042 : }
    1846             : 
    1847             : /*
    1848             :  * Initiate IO for the ReadBuffersOperation
    1849             :  *
    1850             :  * This function only starts a single IO at a time. The IO may cover fewer
    1851             :  * than the remaining to-be-read blocks, if one of the buffers has
    1852             :  * concurrently been read in. If the first to-be-read buffer is already valid,
    1853             :  * no IO will be issued.
    1854             :  *
    1855             :  * To support retries after partial reads, the first operation->nblocks_done
    1856             :  * buffers are skipped.
    1857             :  *
    1858             :  * On return *nblocks_progress is updated to reflect the number of buffers
    1859             :  * affected by the call. If the first buffer is valid, *nblocks_progress is
    1860             :  * set to 1 and operation->nblocks_done is incremented.
    1861             :  *
    1862             :  * Returns true if IO was initiated, false if no IO was necessary.
    1863             :  */
    1864             : static bool
    1865     2621466 : AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress)
    1866             : {
    1867     2621466 :     Buffer     *buffers = &operation->buffers[0];
    1868     2621466 :     int         flags = operation->flags;
    1869     2621466 :     BlockNumber blocknum = operation->blocknum;
    1870     2621466 :     ForkNumber  forknum = operation->forknum;
    1871     2621466 :     char        persistence = operation->persistence;
    1872     2621466 :     int16       nblocks_done = operation->nblocks_done;
    1873     2621466 :     Buffer     *io_buffers = &operation->buffers[nblocks_done];
    1874     2621466 :     int         io_buffers_len = 0;
    1875             :     PgAioHandle *ioh;
    1876     2621466 :     uint32      ioh_flags = 0;
    1877             :     void       *io_pages[MAX_IO_COMBINE_LIMIT];
    1878             :     IOContext   io_context;
    1879             :     IOObject    io_object;
    1880             :     bool        did_start_io;
    1881             : 
    1882             :     /*
    1883             :      * When this IO is executed synchronously, either because the caller will
    1884             :      * immediately block waiting for the IO or because IOMETHOD_SYNC is used,
    1885             :      * the AIO subsystem needs to know.
    1886             :      */
    1887     2621466 :     if (flags & READ_BUFFERS_SYNCHRONOUSLY)
    1888     1462238 :         ioh_flags |= PGAIO_HF_SYNCHRONOUS;
    1889             : 
    1890     2621466 :     if (persistence == RELPERSISTENCE_TEMP)
    1891             :     {
    1892        3568 :         io_context = IOCONTEXT_NORMAL;
    1893        3568 :         io_object = IOOBJECT_TEMP_RELATION;
    1894        3568 :         ioh_flags |= PGAIO_HF_REFERENCES_LOCAL;
    1895             :     }
    1896             :     else
    1897             :     {
    1898     2617898 :         io_context = IOContextForStrategy(operation->strategy);
    1899     2617898 :         io_object = IOOBJECT_RELATION;
    1900             :     }
    1901             : 
    1902             :     /*
    1903             :      * If zero_damaged_pages is enabled, add the READ_BUFFERS_ZERO_ON_ERROR
    1904             :      * flag. The reason for that is that, hopefully, zero_damaged_pages isn't
    1905             :      * set globally, but on a per-session basis. The completion callback,
    1906             :      * which may be run in other processes, e.g. in IO workers, may have a
    1907             :      * different value of the zero_damaged_pages GUC.
    1908             :      *
    1909             :      * XXX: We probably should eventually use a different flag for
    1910             :      * zero_damaged_pages, so we can report different log levels / error codes
    1911             :      * for zero_damaged_pages and ZERO_ON_ERROR.
    1912             :      */
    1913     2621466 :     if (zero_damaged_pages)
    1914          32 :         flags |= READ_BUFFERS_ZERO_ON_ERROR;
    1915             : 
    1916             :     /*
    1917             :      * For the same reason as with zero_damaged_pages we need to use this
    1918             :      * backend's ignore_checksum_failure value.
    1919             :      */
    1920     2621466 :     if (ignore_checksum_failure)
    1921          16 :         flags |= READ_BUFFERS_IGNORE_CHECKSUM_FAILURES;
    1922             : 
    1923             : 
    1924             :     /*
    1925             :      * To be allowed to report stats in the local completion callback we need
    1926             :      * to prepare to report stats now. This ensures we can safely report the
    1927             :      * checksum failure even in a critical section.
    1928             :      */
    1929     2621466 :     pgstat_prepare_report_checksum_failure(operation->smgr->smgr_rlocator.locator.dbOid);
    1930             : 
    1931             :     /*
    1932             :      * Get IO handle before ReadBuffersCanStartIO(), as pgaio_io_acquire()
    1933             :      * might block, which we don't want after setting IO_IN_PROGRESS.
    1934             :      *
    1935             :      * If we need to wait for IO before we can get a handle, submit
    1936             :      * already-staged IO first, so that other backends don't need to wait.
    1937             :      * There wouldn't be a deadlock risk, as pgaio_io_acquire() just needs to
    1938             :      * wait for already submitted IO, which doesn't require additional locks,
    1939             :      * but it could still cause undesirable waits.
    1940             :      *
    1941             :      * A secondary benefit is that this would allow us to measure the time in
    1942             :      * pgaio_io_acquire() without causing undue timer overhead in the common,
    1943             :      * pgaio_io_acquire() without causing undue timer overhead in the common,
    1944             :      * non-blocking case.  However, the pgstats infrastructure currently
    1945             :      * doesn't allow that, as it a) asserts that an operation can't have
    1946             :      * time without operations and b) doesn't have an API to report
    1947             :      */
    1948     2621466 :     ioh = pgaio_io_acquire_nb(CurrentResourceOwner, &operation->io_return);
    1949     2621466 :     if (unlikely(!ioh))
    1950             :     {
    1951        6028 :         pgaio_submit_staged();
    1952             : 
    1953        6028 :         ioh = pgaio_io_acquire(CurrentResourceOwner, &operation->io_return);
    1954             :     }
    1955             : 
    1956             :     /*
    1957             :      * Check if we can start IO on the first to-be-read buffer.
    1958             :      *
    1959             :      * If an I/O is already in progress in another backend, we want to wait
    1960             :      * for the outcome: either done, or something went wrong and we will
    1961             :      * retry.
    1962             :      */
    1963     2621466 :     if (!ReadBuffersCanStartIO(buffers[nblocks_done], false))
    1964             :     {
    1965             :         /*
    1966             :          * Someone else has already completed this block, we're done.
    1967             :          *
    1968             :          * When IO is necessary, ->nblocks_done is updated in
    1969             :          * ProcessReadBuffersResult(), but that is not called if no IO is
    1970             :          * necessary. Thus update here.
    1971             :          */
    1972        4714 :         operation->nblocks_done += 1;
    1973        4714 :         *nblocks_progress = 1;
    1974             : 
    1975        4714 :         pgaio_io_release(ioh);
    1976        4714 :         pgaio_wref_clear(&operation->io_wref);
    1977        4714 :         did_start_io = false;
    1978             : 
    1979             :         /*
    1980             :          * Report and track this as a 'hit' for this backend, even though it
    1981             :          * must have started out as a miss in PinBufferForBlock(). The other
    1982             :          * backend will track this as a 'read'.
    1983             :          */
    1984             :         TRACE_POSTGRESQL_BUFFER_READ_DONE(forknum, blocknum + operation->nblocks_done,
    1985             :                                           operation->smgr->smgr_rlocator.locator.spcOid,
    1986             :                                           operation->smgr->smgr_rlocator.locator.dbOid,
    1987             :                                           operation->smgr->smgr_rlocator.locator.relNumber,
    1988             :                                           operation->smgr->smgr_rlocator.backend,
    1989             :                                           true);
    1990             : 
    1991        4714 :         if (persistence == RELPERSISTENCE_TEMP)
    1992           0 :             pgBufferUsage.local_blks_hit += 1;
    1993             :         else
    1994        4714 :             pgBufferUsage.shared_blks_hit += 1;
    1995             : 
    1996        4714 :         if (operation->rel)
    1997        4714 :             pgstat_count_buffer_hit(operation->rel);
    1998             : 
    1999        4714 :         pgstat_count_io_op(io_object, io_context, IOOP_HIT, 1, 0);
    2000             : 
    2001        4714 :         if (VacuumCostActive)
    2002          38 :             VacuumCostBalance += VacuumCostPageHit;
    2003             :     }
    2004             :     else
    2005             :     {
    2006             :         instr_time  io_start;
    2007             : 
    2008             :         /* We found a buffer that we need to read in. */
    2009             :         Assert(io_buffers[0] == buffers[nblocks_done]);
    2010     2616752 :         io_pages[0] = BufferGetBlock(buffers[nblocks_done]);
    2011     2616752 :         io_buffers_len = 1;
    2012             : 
    2013             :         /*
    2014             :          * How many neighboring-on-disk blocks can we scatter-read into other
    2015             :          * buffers at the same time?  In this case we don't wait if we see an
    2016             :          * I/O already in progress.  We already set BM_IO_IN_PROGRESS for the
    2017             :          * head block, so we should get on with that I/O as soon as possible.
    2018             :          */
    2019     2961866 :         for (int i = nblocks_done + 1; i < operation->nblocks; i++)
    2020             :         {
    2021      345114 :             if (!ReadBuffersCanStartIO(buffers[i], true))
    2022           0 :                 break;
    2023             :             /* Must be consecutive block numbers. */
    2024             :             Assert(BufferGetBlockNumber(buffers[i - 1]) ==
    2025             :                    BufferGetBlockNumber(buffers[i]) - 1);
    2026             :             Assert(io_buffers[io_buffers_len] == buffers[i]);
    2027             : 
    2028      345114 :             io_pages[io_buffers_len++] = BufferGetBlock(buffers[i]);
    2029             :         }
    2030             : 
    2031             :         /* get a reference to wait for in WaitReadBuffers() */
    2032     2616752 :         pgaio_io_get_wref(ioh, &operation->io_wref);
    2033             : 
    2034             :         /* provide the list of buffers to the completion callbacks */
    2035     2616752 :         pgaio_io_set_handle_data_32(ioh, (uint32 *) io_buffers, io_buffers_len);
    2036             : 
    2037     2616752 :         pgaio_io_register_callbacks(ioh,
    2038             :                                     persistence == RELPERSISTENCE_TEMP ?
    2039             :                                     PGAIO_HCB_LOCAL_BUFFER_READV :
    2040             :                                     PGAIO_HCB_SHARED_BUFFER_READV,
    2041             :                                     flags);
    2042             : 
    2043     2616752 :         pgaio_io_set_flag(ioh, ioh_flags);
    2044             : 
    2045             :         /* ---
    2046             :          * Even though we're trying to issue IO asynchronously, track the time
    2047             :          * in smgrstartreadv():
    2048             :          * - if io_method == IOMETHOD_SYNC, we will always perform the IO
    2049             :          *   immediately
    2050             :          * - the io method might not support the IO (e.g. worker IO for a temp
    2051             :          *   table)
    2052             :          * ---
    2053             :          */
    2054     2616752 :         io_start = pgstat_prepare_io_time(track_io_timing);
    2055     2616752 :         smgrstartreadv(ioh, operation->smgr, forknum,
    2056             :                        blocknum + nblocks_done,
    2057             :                        io_pages, io_buffers_len);
    2058     2616722 :         pgstat_count_io_op_time(io_object, io_context, IOOP_READ,
    2059     2616722 :                                 io_start, 1, io_buffers_len * BLCKSZ);
    2060             : 
    2061     2616722 :         if (persistence == RELPERSISTENCE_TEMP)
    2062        3568 :             pgBufferUsage.local_blks_read += io_buffers_len;
    2063             :         else
    2064     2613154 :             pgBufferUsage.shared_blks_read += io_buffers_len;
    2065             : 
    2066             :         /*
    2067             :          * Track vacuum cost when issuing IO, not after waiting for it.
    2068             :          * Otherwise we could end up issuing a lot of IO in a short timespan,
    2069             :          * despite a low cost limit.
    2070             :          */
    2071     2616722 :         if (VacuumCostActive)
    2072       49874 :             VacuumCostBalance += VacuumCostPageMiss * io_buffers_len;
    2073             : 
    2074     2616722 :         *nblocks_progress = io_buffers_len;
    2075     2616722 :         did_start_io = true;
    2076             :     }
    2077             : 
    2078     2621436 :     return did_start_io;
    2079             : }
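
The handle-acquisition pattern above -- try the non-blocking acquire first, and if that fails submit any already-staged IO before falling back to the blocking acquire -- can be factored as a small helper. A minimal sketch, assuming the pgaio function signatures as used in this file (with "postgres.h" and "storage/aio.h" included, as at the top of the file); the helper name is hypothetical:

    /*
     * Hypothetical helper, not part of bufmgr.c: acquire an AIO handle,
     * flushing already-staged IO first if no handle is free, so that other
     * backends don't have to wait on our unsubmitted IO.
     */
    static PgAioHandle *
    acquire_handle_submitting_staged(struct ResourceOwnerData *owner,
                                     PgAioReturn *ret)
    {
        PgAioHandle *ioh;

        ioh = pgaio_io_acquire_nb(owner, ret);
        if (ioh == NULL)
        {
            pgaio_submit_staged();              /* unblock other backends */
            ioh = pgaio_io_acquire(owner, ret); /* waits only for already
                                                 * submitted IO */
        }
        return ioh;
    }
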
    2080             : 
    2081             : /*
    2082             :  * BufferAlloc -- subroutine for PinBufferForBlock.  Handles lookup of a shared
    2083             :  *      buffer.  If no buffer exists already, selects a replacement victim and
    2084             :  *      evicts the old page, but does NOT read in the new page.
    2085             :  *
    2086             :  * "strategy" can be a buffer replacement strategy object, or NULL for
    2087             :  * the default strategy.  The selected buffer's usage_count is advanced when
    2088             :  * using the default strategy, but otherwise possibly not (see PinBuffer).
    2089             :  *
    2090             :  * The returned buffer is pinned and is already marked as holding the
    2091             :  * desired page.  If it already did have the desired page, *foundPtr is
    2092             :  * set true.  Otherwise, *foundPtr is set false.
    2093             :  *
    2094             :  * io_context is computed by the caller and passed in, so that we avoid
    2095             :  * calling IOContextForStrategy() here when there is a shared buffers hit
    2096             :  * and no IO statistics need be captured.
    2097             :  *
    2098             :  * No locks are held either at entry or exit.
    2099             :  */
    2100             : static pg_attribute_always_inline BufferDesc *
    2101   122862994 : BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
    2102             :             BlockNumber blockNum,
    2103             :             BufferAccessStrategy strategy,
    2104             :             bool *foundPtr, IOContext io_context)
    2105             : {
    2106             :     BufferTag   newTag;         /* identity of requested block */
    2107             :     uint32      newHash;        /* hash value for newTag */
    2108             :     LWLock     *newPartitionLock;   /* buffer partition lock for it */
    2109             :     int         existing_buf_id;
    2110             :     Buffer      victim_buffer;
    2111             :     BufferDesc *victim_buf_hdr;
    2112             :     uint64      victim_buf_state;
    2113   122862994 :     uint64      set_bits = 0;
    2114             : 
    2115             :     /* Make sure we will have room to remember the buffer pin */
    2116   122862994 :     ResourceOwnerEnlarge(CurrentResourceOwner);
    2117   122862994 :     ReservePrivateRefCountEntry();
    2118             : 
    2119             :     /* create a tag so we can lookup the buffer */
    2120   122862994 :     InitBufferTag(&newTag, &smgr->smgr_rlocator.locator, forkNum, blockNum);
    2121             : 
    2122             :     /* determine its hash code and partition lock ID */
    2123   122862994 :     newHash = BufTableHashCode(&newTag);
    2124   122862994 :     newPartitionLock = BufMappingPartitionLock(newHash);
    2125             : 
    2126             :     /* see if the block is in the buffer pool already */
    2127   122862994 :     LWLockAcquire(newPartitionLock, LW_SHARED);
    2128   122862994 :     existing_buf_id = BufTableLookup(&newTag, newHash);
    2129   122862994 :     if (existing_buf_id >= 0)
    2130             :     {
    2131             :         BufferDesc *buf;
    2132             :         bool        valid;
    2133             : 
    2134             :         /*
    2135             :          * Found it.  Now, pin the buffer so no one can steal it from the
    2136             :          * buffer pool, and check to see if the correct data has been loaded
    2137             :          * into the buffer.
    2138             :          */
    2139   119337894 :         buf = GetBufferDescriptor(existing_buf_id);
    2140             : 
    2141   119337894 :         valid = PinBuffer(buf, strategy, false);
    2142             : 
    2143             :         /* Can release the mapping lock as soon as we've pinned it */
    2144   119337894 :         LWLockRelease(newPartitionLock);
    2145             : 
    2146   119337894 :         *foundPtr = true;
    2147             : 
    2148   119337894 :         if (!valid)
    2149             :         {
    2150             :             /*
    2151             :              * We can only get here if (a) someone else is still reading in
    2152             :              * the page, (b) a previous read attempt failed, or (c) someone
    2153             :              * called StartReadBuffers() but not yet WaitReadBuffers().
    2154             :              */
    2155        4238 :             *foundPtr = false;
    2156             :         }
    2157             : 
    2158   119337894 :         return buf;
    2159             :     }
    2160             : 
    2161             :     /*
    2162             :      * Didn't find it in the buffer pool.  We'll have to initialize a new
    2163             :      * buffer.  Remember to unlock the mapping lock while doing the work.
    2164             :      */
    2165     3525100 :     LWLockRelease(newPartitionLock);
    2166             : 
    2167             :     /*
    2168             :      * Acquire a victim buffer. Somebody else might try to do the same, as we
    2169             :      * don't hold any conflicting locks. If so, we'll have to undo our work
    2170             :      * later.
    2171             :      */
    2172     3525100 :     victim_buffer = GetVictimBuffer(strategy, io_context);
    2173     3525100 :     victim_buf_hdr = GetBufferDescriptor(victim_buffer - 1);
    2174             : 
    2175             :     /*
    2176             :      * Try to make a hashtable entry for the buffer under its new tag. If
    2177             :      * somebody else inserted another buffer for the tag, we'll release the
    2178             :      * victim buffer we acquired and use the already inserted one.
    2179             :      */
    2180     3525100 :     LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
    2181     3525100 :     existing_buf_id = BufTableInsert(&newTag, newHash, victim_buf_hdr->buf_id);
    2182     3525100 :     if (existing_buf_id >= 0)
    2183             :     {
    2184             :         BufferDesc *existing_buf_hdr;
    2185             :         bool        valid;
    2186             : 
    2187             :         /*
    2188             :          * Got a collision. Someone has already done what we were about to do.
    2189             :          * We'll just handle this as if it were found in the buffer pool in
    2190             :          * the first place.  First, give up the buffer we were planning to
    2191             :          * use.
    2192             :          *
    2193             :          * We could do this after releasing the partition lock, but then we'd
    2194             :          * have to call ResourceOwnerEnlarge() & ReservePrivateRefCountEntry()
    2195             :          * before acquiring the lock, for the rare case of such a collision.
    2196             :          */
    2197        1164 :         UnpinBuffer(victim_buf_hdr);
    2198             : 
    2199             :         /* remaining code should match code at top of routine */
    2200             : 
    2201        1164 :         existing_buf_hdr = GetBufferDescriptor(existing_buf_id);
    2202             : 
    2203        1164 :         valid = PinBuffer(existing_buf_hdr, strategy, false);
    2204             : 
    2205             :         /* Can release the mapping lock as soon as we've pinned it */
    2206        1164 :         LWLockRelease(newPartitionLock);
    2207             : 
    2208        1164 :         *foundPtr = true;
    2209             : 
    2210        1164 :         if (!valid)
    2211             :         {
    2212             :             /*
    2213             :              * We can only get here if (a) someone else is still reading in
    2214             :              * the page, (b) a previous read attempt failed, or (c) someone
    2215             :              * called StartReadBuffers() but not yet WaitReadBuffers().
    2216             :              */
    2217         576 :             *foundPtr = false;
    2218             :         }
    2219             : 
    2220        1164 :         return existing_buf_hdr;
    2221             :     }
    2222             : 
    2223             :     /*
    2224             :      * Need to lock the buffer header too in order to change its tag.
    2225             :      */
    2226     3523936 :     victim_buf_state = LockBufHdr(victim_buf_hdr);
    2227             : 
    2228             :     /* some sanity checks while we hold the buffer header lock */
    2229             :     Assert(BUF_STATE_GET_REFCOUNT(victim_buf_state) == 1);
    2230             :     Assert(!(victim_buf_state & (BM_TAG_VALID | BM_VALID | BM_DIRTY | BM_IO_IN_PROGRESS)));
    2231             : 
    2232     3523936 :     victim_buf_hdr->tag = newTag;
    2233             : 
    2234             :     /*
    2235             :      * Make sure BM_PERMANENT is set for buffers that must be written at every
    2236             :      * checkpoint.  Unlogged buffers only need to be written at shutdown
    2237             :      * checkpoints, except for their "init" forks, which need to be treated
    2238             :      * just like permanent relations.
    2239             :      */
    2240     3523936 :     set_bits |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
    2241     3523936 :     if (relpersistence == RELPERSISTENCE_PERMANENT || forkNum == INIT_FORKNUM)
    2242     3523228 :         set_bits |= BM_PERMANENT;
    2243             : 
    2244     3523936 :     UnlockBufHdrExt(victim_buf_hdr, victim_buf_state,
    2245             :                     set_bits, 0, 0);
    2246             : 
    2247     3523936 :     LWLockRelease(newPartitionLock);
    2248             : 
    2249             :     /*
    2250             :      * Buffer contents are currently invalid.
    2251             :      */
    2252     3523936 :     *foundPtr = false;
    2253             : 
    2254     3523936 :     return victim_buf_hdr;
    2255             : }
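
Seen from the caller's side, the contract in the header comment boils down to checking *foundPtr after the call. The following is an illustrative sketch only (the real caller is PinBufferForBlock(); variable names are placeholders):

    {
        bool        found;
        BufferDesc *bufHdr;

        bufHdr = BufferAlloc(smgr, persistence, forkNum, blockNum,
                             strategy, &found, io_context);
        if (found)
        {
            /* valid page already present; just use the pinned buffer */
        }
        else
        {
            /*
             * Buffer is pinned but its contents are invalid; the caller is
             * responsible for reading the page in (see StartBufferIO() and
             * AsyncReadBuffers() in this file).
             */
        }
    }
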
    2256             : 
    2257             : /*
    2258             :  * InvalidateBuffer -- mark a shared buffer invalid.
    2259             :  *
    2260             :  * The buffer header spinlock must be held at entry.  We drop it before
    2261             :  * returning.  (This is sane because the caller must have locked the
    2262             :  * buffer in order to be sure it should be dropped.)
    2263             :  *
    2264             :  * This is used only in contexts such as dropping a relation.  We assume
    2265             :  * that no other backend could possibly be interested in using the page,
    2266             :  * so the only reason the buffer might be pinned is if someone else is
    2267             :  * trying to write it out.  We have to let them finish before we can
    2268             :  * reclaim the buffer.
    2269             :  *
    2270             :  * The buffer could get reclaimed by someone else while we are waiting
    2271             :  * to acquire the necessary locks; if so, don't mess it up.
    2272             :  */
    2273             : static void
    2274      213218 : InvalidateBuffer(BufferDesc *buf)
    2275             : {
    2276             :     BufferTag   oldTag;
    2277             :     uint32      oldHash;        /* hash value for oldTag */
    2278             :     LWLock     *oldPartitionLock;   /* buffer partition lock for it */
    2279             :     uint32      oldFlags;
    2280             :     uint64      buf_state;
    2281             : 
    2282             :     /* Save the original buffer tag before dropping the spinlock */
    2283      213218 :     oldTag = buf->tag;
    2284             : 
    2285      213218 :     UnlockBufHdr(buf);
    2286             : 
    2287             :     /*
    2288             :      * Need to compute the old tag's hashcode and partition lock ID. XXX is it
    2289             :      * worth storing the hashcode in BufferDesc so we need not recompute it
    2290             :      * here?  Probably not.
    2291             :      */
    2292      213218 :     oldHash = BufTableHashCode(&oldTag);
    2293      213218 :     oldPartitionLock = BufMappingPartitionLock(oldHash);
    2294             : 
    2295      213228 : retry:
    2296             : 
    2297             :     /*
    2298             :      * Acquire exclusive mapping lock in preparation for changing the buffer's
    2299             :      * association.
    2300             :      */
    2301      213228 :     LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
    2302             : 
    2303             :     /* Re-lock the buffer header */
    2304      213228 :     buf_state = LockBufHdr(buf);
    2305             : 
    2306             :     /* If it's changed while we were waiting for lock, do nothing */
    2307      213228 :     if (!BufferTagsEqual(&buf->tag, &oldTag))
    2308             :     {
    2309          10 :         UnlockBufHdr(buf);
    2310          10 :         LWLockRelease(oldPartitionLock);
    2311          10 :         return;
    2312             :     }
    2313             : 
    2314             :     /*
    2315             :      * We assume the reason for it to be pinned is that either we were
    2316             :      * asynchronously reading the page in before erroring out or someone else
    2317             :      * is flushing the page out.  Wait for the IO to finish.  (This could be
    2318             :      * an infinite loop if the refcount is messed up... it would be nice to
    2319             :      * time out after a while, but there seems no way to be sure how many loops
    2320             :      * may be needed.  Note that if the other guy has pinned the buffer but
    2321             :      * not yet done StartBufferIO, WaitIO will fall through and we'll
    2322             :      * effectively be busy-looping here.)
    2323             :      */
    2324      213218 :     if (BUF_STATE_GET_REFCOUNT(buf_state) != 0)
    2325             :     {
    2326          10 :         UnlockBufHdr(buf);
    2327          10 :         LWLockRelease(oldPartitionLock);
    2328             :         /* safety check: should definitely not be our *own* pin */
    2329          10 :         if (GetPrivateRefCount(BufferDescriptorGetBuffer(buf)) > 0)
    2330           0 :             elog(ERROR, "buffer is pinned in InvalidateBuffer");
    2331          10 :         WaitIO(buf);
    2332          10 :         goto retry;
    2333             :     }
    2334             : 
    2335             :     /*
    2336             :      * An invalidated buffer should not have any backends waiting to lock the
    2337             :      * buffer, therefore BM_LOCK_WAKE_IN_PROGRESS should not be set.
    2338             :      */
    2339             :     Assert(!(buf_state & BM_LOCK_WAKE_IN_PROGRESS));
    2340             : 
    2341             :     /*
    2342             :      * Clear out the buffer's tag and flags.  We must do this to ensure that
    2343             :      * linear scans of the buffer array don't think the buffer is valid.
    2344             :      */
    2345      213208 :     oldFlags = buf_state & BUF_FLAG_MASK;
    2346      213208 :     ClearBufferTag(&buf->tag);
    2347             : 
    2348      213208 :     UnlockBufHdrExt(buf, buf_state,
    2349             :                     0,
    2350             :                     BUF_FLAG_MASK | BUF_USAGECOUNT_MASK,
    2351             :                     0);
    2352             : 
    2353             :     /*
    2354             :      * Remove the buffer from the lookup hashtable, if it was in there.
    2355             :      */
    2356      213208 :     if (oldFlags & BM_TAG_VALID)
    2357      213208 :         BufTableDelete(&oldTag, oldHash);
    2358             : 
    2359             :     /*
    2360             :      * Done with mapping lock.
    2361             :      */
    2362      213208 :     LWLockRelease(oldPartitionLock);
    2363             : }
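
Because InvalidateBuffer() both expects the buffer header spinlock at entry and releases it itself, a caller's locking sequence looks roughly like this (illustrative sketch only; the drop decision is a hypothetical placeholder):

    {
        uint64      buf_state;

        buf_state = LockBufHdr(bufHdr);
        if ((buf_state & BM_TAG_VALID) && caller_wants_to_drop_it)
            InvalidateBuffer(bufHdr);   /* releases the spinlock itself */
        else
            UnlockBufHdr(bufHdr);
    }
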
    2364             : 
    2365             : /*
    2366             :  * Helper routine for GetVictimBuffer()
    2367             :  *
    2368             :  * Needs to be called on a buffer with a valid tag, pinned, but without the
    2369             :  * buffer header spinlock held.
    2370             :  *
    2371             :  * Returns true if the buffer can be reused, in which case the buffer is only
    2372             :  * pinned by this backend and marked as invalid, false otherwise.
    2373             :  */
    2374             : static bool
    2375     2490482 : InvalidateVictimBuffer(BufferDesc *buf_hdr)
    2376             : {
    2377             :     uint64      buf_state;
    2378             :     uint32      hash;
    2379             :     LWLock     *partition_lock;
    2380             :     BufferTag   tag;
    2381             : 
    2382             :     Assert(GetPrivateRefCount(BufferDescriptorGetBuffer(buf_hdr)) == 1);
    2383             : 
    2384             :     /* have buffer pinned, so it's safe to read tag without lock */
    2385     2490482 :     tag = buf_hdr->tag;
    2386             : 
    2387     2490482 :     hash = BufTableHashCode(&tag);
    2388     2490482 :     partition_lock = BufMappingPartitionLock(hash);
    2389             : 
    2390     2490482 :     LWLockAcquire(partition_lock, LW_EXCLUSIVE);
    2391             : 
    2392             :     /* lock the buffer header */
    2393     2490482 :     buf_state = LockBufHdr(buf_hdr);
    2394             : 
    2395             :     /*
    2396             :      * We have the buffer pinned, so nobody else should have been able to
    2397             :      * unset this concurrently.
    2398             :      */
    2399             :     Assert(buf_state & BM_TAG_VALID);
    2400             :     Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
    2401             :     Assert(BufferTagsEqual(&buf_hdr->tag, &tag));
    2402             : 
    2403             :     /*
    2404             :      * If somebody else pinned the buffer since, or even worse, dirtied it,
    2405             :      * give up on this buffer: It's clearly in use.
    2406             :      */
    2407     2490482 :     if (BUF_STATE_GET_REFCOUNT(buf_state) != 1 || (buf_state & BM_DIRTY))
    2408             :     {
    2409             :         Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
    2410             : 
    2411        1114 :         UnlockBufHdr(buf_hdr);
    2412        1114 :         LWLockRelease(partition_lock);
    2413             : 
    2414        1114 :         return false;
    2415             :     }
    2416             : 
    2417             :     /*
    2418             :      * An invalidated buffer should not have any backends waiting to lock the
    2419             :      * buffer, therefore BM_LOCK_WAKE_IN_PROGRESS should not be set.
    2420             :      */
    2421             :     Assert(!(buf_state & BM_LOCK_WAKE_IN_PROGRESS));
    2422             : 
    2423             :     /*
    2424             :      * Clear out the buffer's tag and flags and usagecount.  This is not
    2425             :      * strictly required, as BM_TAG_VALID/BM_VALID needs to be checked before
    2426             :      * doing anything with the buffer. But currently it's beneficial, as the
    2427             :      * cheaper pre-checks done by several linear scans of shared buffers use
    2428             :      * the tag (see e.g. FlushDatabaseBuffers()).
    2429             :      */
    2430     2489368 :     ClearBufferTag(&buf_hdr->tag);
    2431     2489368 :     UnlockBufHdrExt(buf_hdr, buf_state,
    2432             :                     0,
    2433             :                     BUF_FLAG_MASK | BUF_USAGECOUNT_MASK,
    2434             :                     0);
    2435             : 
    2436             :     Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
    2437             : 
    2438             :     /* finally delete buffer from the buffer mapping table */
    2439     2489368 :     BufTableDelete(&tag, hash);
    2440             : 
    2441     2489368 :     LWLockRelease(partition_lock);
    2442             : 
    2443     2489368 :     buf_state = pg_atomic_read_u64(&buf_hdr->state);
    2444             :     Assert(!(buf_state & (BM_DIRTY | BM_VALID | BM_TAG_VALID)));
    2445             :     Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
    2446             :     Assert(BUF_STATE_GET_REFCOUNT(pg_atomic_read_u64(&buf_hdr->state)) > 0);
    2447             : 
    2448     2489368 :     return true;
    2449             : }
    2450             : 
    2451             : static Buffer
    2452     3975192 : GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context)
    2453             : {
    2454             :     BufferDesc *buf_hdr;
    2455             :     Buffer      buf;
    2456             :     uint64      buf_state;
    2457             :     bool        from_ring;
    2458             : 
    2459             :     /*
    2460             :      * Ensure, before we pin a victim buffer, that there's a free refcount
    2461             :      * entry and resource owner slot for the pin.
    2462             :      */
    2463     3975192 :     ReservePrivateRefCountEntry();
    2464     3975192 :     ResourceOwnerEnlarge(CurrentResourceOwner);
    2465             : 
    2466             :     /* we return here if a prospective victim buffer gets used concurrently */
    2467       13344 : again:
    2468             : 
    2469             :     /*
    2470             :      * Select a victim buffer.  The buffer is returned pinned and owned by
    2471             :      * this backend.
    2472             :      */
    2473     3988536 :     buf_hdr = StrategyGetBuffer(strategy, &buf_state, &from_ring);
    2474     3988536 :     buf = BufferDescriptorGetBuffer(buf_hdr);
    2475             : 
    2476             :     /*
    2477             :      * We shouldn't have any other pins for this buffer.
    2478             :      */
    2479     3988536 :     CheckBufferIsPinnedOnce(buf);
    2480             : 
    2481             :     /*
    2482             :      * If the buffer was dirty, try to write it out.  There is a race
    2483             :      * condition here, in that someone might dirty it after we released the
    2484             :      * buffer header lock above, or even while we are writing it out (since
    2485             :      * our share-lock won't prevent hint-bit updates).  We will recheck the
    2486             :      * dirty bit after re-locking the buffer header.
    2487             :      */
    2488     3988536 :     if (buf_state & BM_DIRTY)
    2489             :     {
    2490             :         Assert(buf_state & BM_TAG_VALID);
    2491             :         Assert(buf_state & BM_VALID);
    2492             : 
    2493             :         /*
    2494             :          * We need a share-lock on the buffer contents to write it out (else
    2495             :          * we might write invalid data, eg because someone else is compacting
    2496             :          * the page contents while we write).  We must use a conditional lock
    2497             :          * acquisition here to avoid deadlock.  Even though the buffer was not
    2498             :          * pinned (and therefore surely not locked) when StrategyGetBuffer
    2499             :          * returned it, someone else could have pinned and exclusive-locked it
    2500             :          * by the time we get here. If we try to get the lock unconditionally,
    2501             :          * we'd block waiting for them; if they later block waiting for us,
    2502             :          * deadlock ensues. (This has been observed to happen when two
    2503             :          * backends are both trying to split btree index pages, and the second
    2504             :          * one just happens to be trying to split the page the first one got
    2505             :          * from StrategyGetBuffer.)
    2506             :          */
    2507      549310 :         if (!BufferLockConditional(buf, buf_hdr, BUFFER_LOCK_SHARE))
    2508             :         {
    2509             :             /*
    2510             :              * Someone else has locked the buffer, so give it up and loop back
    2511             :              * to get another one.
    2512             :              */
    2513           0 :             UnpinBuffer(buf_hdr);
    2514           0 :             goto again;
    2515             :         }
    2516             : 
    2517             :         /*
    2518             :          * If using a nondefault strategy, and writing the buffer would
    2519             :          * require a WAL flush, let the strategy decide whether to go ahead
    2520             :          * and write/reuse the buffer or to choose another victim.  We need a
    2521             :          * lock to inspect the page LSN, so this can't be done inside
    2522             :          * StrategyGetBuffer.
    2523             :          */
    2524      549310 :         if (strategy != NULL)
    2525             :         {
    2526             :             XLogRecPtr  lsn;
    2527             : 
    2528             :             /* Read the LSN while holding buffer header lock */
    2529      160640 :             buf_state = LockBufHdr(buf_hdr);
    2530      160640 :             lsn = BufferGetLSN(buf_hdr);
    2531      160640 :             UnlockBufHdr(buf_hdr);
    2532             : 
    2533      160640 :             if (XLogNeedsFlush(lsn)
    2534       19458 :                 && StrategyRejectBuffer(strategy, buf_hdr, from_ring))
    2535             :             {
    2536       12230 :                 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
    2537       12230 :                 UnpinBuffer(buf_hdr);
    2538       12230 :                 goto again;
    2539             :             }
    2540             :         }
    2541             : 
    2542             :         /* OK, do the I/O */
    2543      537080 :         FlushBuffer(buf_hdr, NULL, IOOBJECT_RELATION, io_context);
    2544      537080 :         LockBuffer(buf, BUFFER_LOCK_UNLOCK);
    2545             : 
    2546      537080 :         ScheduleBufferTagForWriteback(&BackendWritebackContext, io_context,
    2547             :                                       &buf_hdr->tag);
    2548             :     }
    2549             : 
    2550             : 
    2551     3976306 :     if (buf_state & BM_VALID)
    2552             :     {
    2553             :         /*
    2554             :          * When a BufferAccessStrategy is in use, blocks evicted from shared
    2555             :          * buffers are counted as IOOP_EVICT in the corresponding context
    2556             :          * (e.g. IOCONTEXT_BULKWRITE). Shared buffers are evicted by a
    2557             :          * strategy in two cases: 1) while initially claiming buffers for the
    2558             :          * strategy in two cases: 1) while initially claiming buffers for the
    2559             :          * strategy ring, and 2) to replace an existing strategy ring buffer
    2560             :          *
    2561             :          * Blocks evicted from buffers already in the strategy ring are
    2562             :          * counted as IOOP_REUSE in the corresponding strategy context.
    2563             :          *
    2564             :          * At this point, we can accurately count evictions and reuses,
    2565             :          * because we have successfully claimed the valid buffer. Previously,
    2566             :          * we may have been forced to release the buffer due to concurrent
    2567             :          * pinners or erroring out.
    2568             :          */
    2569     2486196 :         pgstat_count_io_op(IOOBJECT_RELATION, io_context,
    2570     2486196 :                            from_ring ? IOOP_REUSE : IOOP_EVICT, 1, 0);
    2571             :     }
    2572             : 
    2573             :     /*
    2574             :      * If the buffer has an entry in the buffer mapping table, delete it. This
    2575             :      * can fail because another backend could have pinned or dirtied the
    2576             :      * buffer.
    2577             :      */
    2578     3976306 :     if ((buf_state & BM_TAG_VALID) && !InvalidateVictimBuffer(buf_hdr))
    2579             :     {
    2580        1114 :         UnpinBuffer(buf_hdr);
    2581        1114 :         goto again;
    2582             :     }
    2583             : 
    2584             :     /* a final set of sanity checks */
    2585             : #ifdef USE_ASSERT_CHECKING
    2586             :     buf_state = pg_atomic_read_u64(&buf_hdr->state);
    2587             : 
    2588             :     Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 1);
    2589             :     Assert(!(buf_state & (BM_TAG_VALID | BM_VALID | BM_DIRTY)));
    2590             : 
    2591             :     CheckBufferIsPinnedOnce(buf);
    2592             : #endif
    2593             : 
    2594     3975192 :     return buf;
    2595             : }
    2596             : 
    2597             : /*
    2598             :  * Return the maximum number of buffers that a backend should try to pin at once,
    2599             :  * to avoid exceeding its fair share.  This is the highest value that
    2600             :  * GetAdditionalPinLimit() could ever return.  Note that it may be zero on a
    2601             :  * system with a very small buffer pool relative to max_connections.
    2602             :  */
    2603             : uint32
    2604     1279056 : GetPinLimit(void)
    2605             : {
    2606     1279056 :     return MaxProportionalPins;
    2607             : }
    2608             : 
    2609             : /*
    2610             :  * Return the maximum number of additional buffers that this backend should
    2611             :  * pin if it wants to stay under the per-backend limit, considering the number
    2612             :  * of buffers it has already pinned.  Unlike LimitAdditionalPins(), the limit
    2613             :  * returned by this function can be zero.
    2614             :  */
    2615             : uint32
    2616     7289978 : GetAdditionalPinLimit(void)
    2617             : {
    2618             :     uint32      estimated_pins_held;
    2619             : 
    2620             :     /*
    2621             :      * We get the number of "overflowed" pins for free, but don't know the
    2622             :      * number of pins in PrivateRefCountArray.  The cost of calculating that
    2623             :      * exactly doesn't seem worth it, so just assume the max.
    2624             :      */
    2625     7289978 :     estimated_pins_held = PrivateRefCountOverflowed + REFCOUNT_ARRAY_ENTRIES;
    2626             : 
    2627             :     /* Is this backend already holding more than its fair share? */
    2628     7289978 :     if (estimated_pins_held > MaxProportionalPins)
    2629     2490880 :         return 0;
    2630             : 
    2631     4799098 :     return MaxProportionalPins - estimated_pins_held;
    2632             : }
    2633             : 
    2634             : /*
    2635             :  * Limit the number of pins a batch operation may additionally acquire, to
    2636             :  * avoid running out of pinnable buffers.
    2637             :  *
    2638             :  * One additional pin is always allowed, on the assumption that the operation
    2639             :  * requires at least one to make progress.
    2640             :  */
    2641             : void
    2642      407414 : LimitAdditionalPins(uint32 *additional_pins)
    2643             : {
    2644             :     uint32      limit;
    2645             : 
    2646      407414 :     if (*additional_pins <= 1)
    2647      387264 :         return;
    2648             : 
    2649       20150 :     limit = GetAdditionalPinLimit();
    2650       20150 :     limit = Max(limit, 1);
    2651       20150 :     if (limit < *additional_pins)
    2652       11032 :         *additional_pins = limit;
    2653             : }
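
A hedged usage sketch contrasting the two interfaces (the batch size and variable names are hypothetical; ExtendBufferedRelShared() below uses LimitAdditionalPins() in the same way):

    {
        uint32      want_pins = 32;         /* hypothetical batch size */

        /* A caller that can cope with acquiring zero additional pins: */
        want_pins = Min(want_pins, GetAdditionalPinLimit());

        /* A caller that needs at least one pin to make progress: */
        want_pins = 32;
        LimitAdditionalPins(&want_pins);    /* result stays >= 1 here */
    }
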
    2654             : 
    2655             : /*
    2656             :  * Logic shared between ExtendBufferedRelBy() and ExtendBufferedRelTo(), just
    2657             :  * to avoid duplicating the tracing and relpersistence-related logic.
    2658             :  */
    2659             : static BlockNumber
    2660      430312 : ExtendBufferedRelCommon(BufferManagerRelation bmr,
    2661             :                         ForkNumber fork,
    2662             :                         BufferAccessStrategy strategy,
    2663             :                         uint32 flags,
    2664             :                         uint32 extend_by,
    2665             :                         BlockNumber extend_upto,
    2666             :                         Buffer *buffers,
    2667             :                         uint32 *extended_by)
    2668             : {
    2669             :     BlockNumber first_block;
    2670             : 
    2671             :     TRACE_POSTGRESQL_BUFFER_EXTEND_START(fork,
    2672             :                                          BMR_GET_SMGR(bmr)->smgr_rlocator.locator.spcOid,
    2673             :                                          BMR_GET_SMGR(bmr)->smgr_rlocator.locator.dbOid,
    2674             :                                          BMR_GET_SMGR(bmr)->smgr_rlocator.locator.relNumber,
    2675             :                                          BMR_GET_SMGR(bmr)->smgr_rlocator.backend,
    2676             :                                          extend_by);
    2677             : 
    2678      430312 :     if (bmr.relpersistence == RELPERSISTENCE_TEMP)
    2679       22898 :         first_block = ExtendBufferedRelLocal(bmr, fork, flags,
    2680             :                                              extend_by, extend_upto,
    2681             :                                              buffers, &extend_by);
    2682             :     else
    2683      407414 :         first_block = ExtendBufferedRelShared(bmr, fork, strategy, flags,
    2684             :                                               extend_by, extend_upto,
    2685             :                                               buffers, &extend_by);
    2686      430312 :     *extended_by = extend_by;
    2687             : 
    2688             :     TRACE_POSTGRESQL_BUFFER_EXTEND_DONE(fork,
    2689             :                                         BMR_GET_SMGR(bmr)->smgr_rlocator.locator.spcOid,
    2690             :                                         BMR_GET_SMGR(bmr)->smgr_rlocator.locator.dbOid,
    2691             :                                         BMR_GET_SMGR(bmr)->smgr_rlocator.locator.relNumber,
    2692             :                                         BMR_GET_SMGR(bmr)->smgr_rlocator.backend,
    2693             :                                         *extended_by,
    2694             :                                         first_block);
    2695             : 
    2696      430312 :     return first_block;
    2697             : }
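
For context, a caller-side sketch of one of the public entry points that funnel into this function; the ExtendBufferedRelBy() signature, BMR_REL() and EB_LOCK_FIRST are assumed to match bufmgr.h in this tree, and rel and the batch size are placeholders:

    {
        Buffer      victim_buffers[4];
        uint32      extended_by = 0;
        BlockNumber first_new_block;

        first_new_block = ExtendBufferedRelBy(BMR_REL(rel), MAIN_FORKNUM,
                                              NULL,     /* default strategy */
                                              EB_LOCK_FIRST,
                                              lengthof(victim_buffers),
                                              victim_buffers, &extended_by);
        /* victim_buffers[0 .. extended_by - 1] are pinned; the first is locked */
    }
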
    2698             : 
    2699             : /*
    2700             :  * Implementation of ExtendBufferedRelBy() and ExtendBufferedRelTo() for
    2701             :  * shared buffers.
    2702             :  */
    2703             : static BlockNumber
    2704      407414 : ExtendBufferedRelShared(BufferManagerRelation bmr,
    2705             :                         ForkNumber fork,
    2706             :                         BufferAccessStrategy strategy,
    2707             :                         uint32 flags,
    2708             :                         uint32 extend_by,
    2709             :                         BlockNumber extend_upto,
    2710             :                         Buffer *buffers,
    2711             :                         uint32 *extended_by)
    2712             : {
    2713             :     BlockNumber first_block;
    2714      407414 :     IOContext   io_context = IOContextForStrategy(strategy);
    2715             :     instr_time  io_start;
    2716             : 
    2717      407414 :     LimitAdditionalPins(&extend_by);
    2718             : 
    2719             :     /*
    2720             :      * Acquire victim buffers for extension without holding extension lock.
    2721             :      * Writing out victim buffers is the most expensive part of extending the
    2722             :      * relation, particularly when doing so requires WAL flushes. Zeroing out
    2723             :      * the buffers is also quite expensive, so do that before holding the
    2724             :      * extension lock as well.
    2725             :      *
    2726             :      * These pages are pinned by us and not valid. While we hold the pin they
    2727             :      * can't be acquired as victim buffers by another backend.
    2728             :      */
    2729      857506 :     for (uint32 i = 0; i < extend_by; i++)
    2730             :     {
    2731             :         Block       buf_block;
    2732             : 
    2733      450092 :         buffers[i] = GetVictimBuffer(strategy, io_context);
    2734      450092 :         buf_block = BufHdrGetBlock(GetBufferDescriptor(buffers[i] - 1));
    2735             : 
    2736             :         /* new buffers are zero-filled */
    2737      450092 :         MemSet(buf_block, 0, BLCKSZ);
    2738             :     }
    2739             : 
    2740             :     /*
    2741             :      * Lock relation against concurrent extensions, unless requested not to.
    2742             :      *
    2743             :      * We use the same extension lock for all forks. That's unnecessarily
    2744             :      * restrictive, but currently extensions for forks don't happen often
    2745             :      * enough to make it worth locking more granularly.
    2746             :      *
    2747             :      * Note that another backend might have extended the relation by the time
    2748             :      * we get the lock.
    2749             :      */
    2750      407414 :     if (!(flags & EB_SKIP_EXTENSION_LOCK))
    2751      302968 :         LockRelationForExtension(bmr.rel, ExclusiveLock);
    2752             : 
    2753             :     /*
    2754             :      * If requested, invalidate size cache, so that smgrnblocks asks the
    2755             :      * kernel.
    2756             :      */
    2757      407414 :     if (flags & EB_CLEAR_SIZE_CACHE)
    2758       15740 :         BMR_GET_SMGR(bmr)->smgr_cached_nblocks[fork] = InvalidBlockNumber;
    2759             : 
    2760      407414 :     first_block = smgrnblocks(BMR_GET_SMGR(bmr), fork);
    2761             : 
    2762             :     /*
    2763             :      * Now that we have the accurate relation size, check if the caller wants
    2764             :      * us to extend only up to a specific size. If there were concurrent
    2765             :      * extensions, we might have acquired too many buffers and need to release
    2766             :      * them.
    2767             :      */
    2768      407414 :     if (extend_upto != InvalidBlockNumber)
    2769             :     {
    2770      108046 :         uint32      orig_extend_by = extend_by;
    2771             : 
    2772      108046 :         if (first_block > extend_upto)
    2773           0 :             extend_by = 0;
    2774      108046 :         else if ((uint64) first_block + extend_by > extend_upto)
    2775           6 :             extend_by = extend_upto - first_block;
    2776             : 
    2777      108064 :         for (uint32 i = extend_by; i < orig_extend_by; i++)
    2778             :         {
    2779          18 :             BufferDesc *buf_hdr = GetBufferDescriptor(buffers[i] - 1);
    2780             : 
    2781          18 :             UnpinBuffer(buf_hdr);
    2782             :         }
    2783             : 
    2784      108046 :         if (extend_by == 0)
    2785             :         {
    2786           6 :             if (!(flags & EB_SKIP_EXTENSION_LOCK))
    2787           6 :                 UnlockRelationForExtension(bmr.rel, ExclusiveLock);
    2788           6 :             *extended_by = extend_by;
    2789           6 :             return first_block;
    2790             :         }
    2791             :     }
    2792             : 
    2793             :     /* Fail if relation is already at maximum possible length */
    2794      407408 :     if ((uint64) first_block + extend_by >= MaxBlockNumber)
    2795           0 :         ereport(ERROR,
    2796             :                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
    2797             :                  errmsg("cannot extend relation %s beyond %u blocks",
    2798             :                         relpath(BMR_GET_SMGR(bmr)->smgr_rlocator, fork).str,
    2799             :                         MaxBlockNumber)));
    2800             : 
    2801             :     /*
    2802             :      * Insert buffers into buffer table, mark as IO_IN_PROGRESS.
    2803             :      *
    2804             :      * This needs to happen before we extend the relation, because as soon as
    2805             :      * we do, other backends can start to read in those pages.
    2806             :      */
    2807      857482 :     for (uint32 i = 0; i < extend_by; i++)
    2808             :     {
    2809      450074 :         Buffer      victim_buf = buffers[i];
    2810      450074 :         BufferDesc *victim_buf_hdr = GetBufferDescriptor(victim_buf - 1);
    2811             :         BufferTag   tag;
    2812             :         uint32      hash;
    2813             :         LWLock     *partition_lock;
    2814             :         int         existing_id;
    2815             : 
    2816             :         /* in case we need to pin an existing buffer below */
    2817      450074 :         ResourceOwnerEnlarge(CurrentResourceOwner);
    2818      450074 :         ReservePrivateRefCountEntry();
    2819             : 
    2820      450074 :         InitBufferTag(&tag, &BMR_GET_SMGR(bmr)->smgr_rlocator.locator, fork,
    2821             :                       first_block + i);
    2822      450074 :         hash = BufTableHashCode(&tag);
    2823      450074 :         partition_lock = BufMappingPartitionLock(hash);
    2824             : 
    2825      450074 :         LWLockAcquire(partition_lock, LW_EXCLUSIVE);
    2826             : 
    2827      450074 :         existing_id = BufTableInsert(&tag, hash, victim_buf_hdr->buf_id);
    2828             : 
    2829             :         /*
    2830             :          * We get here only in the corner case where we are trying to extend
    2831             :          * the relation but we found a pre-existing buffer. This can happen
    2832             :          * because a prior attempt at extending the relation failed, and
    2833             :          * because mdread doesn't complain about reads beyond EOF (when
    2834             :          * zero_damaged_pages is ON) and so a previous attempt to read a block
    2835             :          * beyond EOF could have left a "valid" zero-filled buffer.
    2836             :          *
    2837             :          * This has also been observed when the relation was overwritten by an
    2838             :          * external process. Since the legitimate cases should always have
    2839             :          * left a zero-filled buffer, complain if not PageIsNew.
    2840             :          */
    2841      450074 :         if (existing_id >= 0)
    2842             :         {
    2843           0 :             BufferDesc *existing_hdr = GetBufferDescriptor(existing_id);
    2844             :             Block       buf_block;
    2845             :             bool        valid;
    2846             : 
    2847             :             /*
    2848             :              * Pin the existing buffer before releasing the partition lock,
    2849             :              * preventing it from being evicted.
    2850             :              */
    2851           0 :             valid = PinBuffer(existing_hdr, strategy, false);
    2852             : 
    2853           0 :             LWLockRelease(partition_lock);
    2854           0 :             UnpinBuffer(victim_buf_hdr);
    2855             : 
    2856           0 :             buffers[i] = BufferDescriptorGetBuffer(existing_hdr);
    2857           0 :             buf_block = BufHdrGetBlock(existing_hdr);
    2858             : 
    2859           0 :             if (valid && !PageIsNew((Page) buf_block))
    2860           0 :                 ereport(ERROR,
    2861             :                         (errmsg("unexpected data beyond EOF in block %u of relation \"%s\"",
    2862             :                                 existing_hdr->tag.blockNum,
    2863             :                                 relpath(BMR_GET_SMGR(bmr)->smgr_rlocator, fork).str)));
    2864             : 
    2865             :             /*
    2866             :              * We *must* do smgr[zero]extend before succeeding, else the page
    2867             :              * will not be reserved by the kernel, and the next P_NEW call
    2868             :              * will decide to return the same page.  Clear the BM_VALID bit,
    2869             :              * do StartBufferIO() and proceed.
    2870             :              *
    2871             :              * Loop to handle the very small possibility that someone re-sets
    2872             :              * BM_VALID between our clearing it and StartBufferIO inspecting
    2873             :              * it.
    2874             :              */
    2875             :             do
    2876             :             {
    2877           0 :                 pg_atomic_fetch_and_u64(&existing_hdr->state, ~BM_VALID);
    2878           0 :             } while (!StartBufferIO(existing_hdr, true, false));
    2879             :         }
    2880             :         else
    2881             :         {
    2882             :             uint64      buf_state;
    2883      450074 :             uint64      set_bits = 0;
    2884             : 
    2885      450074 :             buf_state = LockBufHdr(victim_buf_hdr);
    2886             : 
    2887             :             /* some sanity checks while we hold the buffer header lock */
    2888             :             Assert(!(buf_state & (BM_VALID | BM_TAG_VALID | BM_DIRTY | BM_JUST_DIRTIED)));
    2889             :             Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 1);
    2890             : 
    2891      450074 :             victim_buf_hdr->tag = tag;
    2892             : 
    2893      450074 :             set_bits |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
    2894      450074 :             if (bmr.relpersistence == RELPERSISTENCE_PERMANENT || fork == INIT_FORKNUM)
    2895      439418 :                 set_bits |= BM_PERMANENT;
    2896             : 
    2897      450074 :             UnlockBufHdrExt(victim_buf_hdr, buf_state,
    2898             :                             set_bits, 0,
    2899             :                             0);
    2900             : 
    2901      450074 :             LWLockRelease(partition_lock);
    2902             : 
    2903             :             /* XXX: could combine the locked operations in it with the above */
    2904      450074 :             StartBufferIO(victim_buf_hdr, true, false);
    2905             :         }
    2906             :     }
    2907             : 
    2908      407408 :     io_start = pgstat_prepare_io_time(track_io_timing);
    2909             : 
    2910             :     /*
    2911             :      * Note: if smgrzeroextend fails, we will end up with buffers that are
    2912             :      * allocated but not marked BM_VALID.  The next relation extension will
    2913             :      * still select the same block number (because the relation didn't get any
    2914             :      * longer on disk) and so future attempts to extend the relation will find
    2915             :      * the same buffers (if they have not been recycled) but come right back
    2916             :      * here to try smgrzeroextend again.
    2917             :      *
    2918             :      * We don't need to set a checksum for all-zero pages.
    2919             :      */
    2920      407408 :     smgrzeroextend(BMR_GET_SMGR(bmr), fork, first_block, extend_by, false);
    2921             : 
    2922             :     /*
    2923             :      * Release the file-extension lock; it's now OK for someone else to extend
    2924             :      * the relation some more.
    2925             :      *
    2926             :      * We remove BM_IO_IN_PROGRESS after this, as waking up waiting
    2927             :      * backends can take noticeable time.
    2928             :      */
    2929      407408 :     if (!(flags & EB_SKIP_EXTENSION_LOCK))
    2930      302962 :         UnlockRelationForExtension(bmr.rel, ExclusiveLock);
    2931             : 
    2932      407408 :     pgstat_count_io_op_time(IOOBJECT_RELATION, io_context, IOOP_EXTEND,
    2933      407408 :                             io_start, 1, extend_by * BLCKSZ);
    2934             : 
    2935             :     /* Set BM_VALID, terminate IO, and wake up any waiters */
    2936      857482 :     for (uint32 i = 0; i < extend_by; i++)
    2937             :     {
    2938      450074 :         Buffer      buf = buffers[i];
    2939      450074 :         BufferDesc *buf_hdr = GetBufferDescriptor(buf - 1);
    2940      450074 :         bool        lock = false;
    2941             : 
    2942      450074 :         if (flags & EB_LOCK_FIRST && i == 0)
    2943      298834 :             lock = true;
    2944      151240 :         else if (flags & EB_LOCK_TARGET)
    2945             :         {
    2946             :             Assert(extend_upto != InvalidBlockNumber);
    2947       90182 :             if (first_block + i + 1 == extend_upto)
    2948       88986 :                 lock = true;
    2949             :         }
    2950             : 
    2951      450074 :         if (lock)
    2952      387820 :             LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
    2953             : 
    2954      450074 :         TerminateBufferIO(buf_hdr, false, BM_VALID, true, false);
    2955             :     }
    2956             : 
    2957      407408 :     pgBufferUsage.shared_blks_written += extend_by;
    2958             : 
    2959      407408 :     *extended_by = extend_by;
    2960             : 
    2961      407408 :     return first_block;
    2962             : }
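
/*
 * Illustrative sketch of the caller side of the extension path above.  This
 * hypothetical helper is not part of bufmgr.c; it relies on the public
 * ExtendBufferedRel() API and the BMR_REL()/EB_LOCK_FIRST conveniences as
 * found in recent releases, so the exact incantation may differ in this tree.
 */
#ifdef NOT_USED
static Buffer
example_append_page(Relation rel)
{
    Buffer      buf;

    /* Extend by one block; EB_LOCK_FIRST returns it exclusive-locked. */
    buf = ExtendBufferedRel(BMR_REL(rel), MAIN_FORKNUM, NULL, EB_LOCK_FIRST);

    /* The new page is zero-filled; the caller initializes and dirties it. */
    PageInit(BufferGetPage(buf), BufferGetPageSize(buf), 0);
    MarkBufferDirty(buf);

    return buf;
}
#endif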
    2963             : 
    2964             : /*
    2965             :  * BufferIsLockedByMe
    2966             :  *
    2967             :  *      Checks if this backend has the buffer locked in any mode.
    2968             :  *
    2969             :  * Buffer must be pinned.
    2970             :  */
    2971             : bool
    2972           0 : BufferIsLockedByMe(Buffer buffer)
    2973             : {
    2974             :     BufferDesc *bufHdr;
    2975             : 
    2976             :     Assert(BufferIsPinned(buffer));
    2977             : 
    2978           0 :     if (BufferIsLocal(buffer))
    2979             :     {
    2980             :         /* Content locks are not maintained for local buffers. */
    2981           0 :         return true;
    2982             :     }
    2983             :     else
    2984             :     {
    2985           0 :         bufHdr = GetBufferDescriptor(buffer - 1);
    2986           0 :         return BufferLockHeldByMe(bufHdr);
    2987             :     }
    2988             : }
    2989             : 
    2990             : /*
    2991             :  * BufferIsLockedByMeInMode
    2992             :  *
    2993             :  *      Checks if this backend has the buffer locked in the specified mode.
    2994             :  *
    2995             :  * Buffer must be pinned.
    2996             :  */
    2997             : bool
    2998           0 : BufferIsLockedByMeInMode(Buffer buffer, BufferLockMode mode)
    2999             : {
    3000             :     BufferDesc *bufHdr;
    3001             : 
    3002             :     Assert(BufferIsPinned(buffer));
    3003             : 
    3004           0 :     if (BufferIsLocal(buffer))
    3005             :     {
    3006             :         /* Content locks are not maintained for local buffers. */
    3007           0 :         return true;
    3008             :     }
    3009             :     else
    3010             :     {
    3011           0 :         bufHdr = GetBufferDescriptor(buffer - 1);
    3012           0 :         return BufferLockHeldByMeInMode(bufHdr, mode);
    3013             :     }
    3014             : }
    3015             : 
    3016             : /*
    3017             :  * BufferIsDirty
    3018             :  *
    3019             :  *      Checks if buffer is already dirty.
    3020             :  *
    3021             :  * Buffer must be pinned and exclusive-locked.  (Without an exclusive lock,
    3022             :  * the result may be stale before it's returned.)
    3023             :  */
    3024             : bool
    3025           0 : BufferIsDirty(Buffer buffer)
    3026             : {
    3027             :     BufferDesc *bufHdr;
    3028             : 
    3029             :     Assert(BufferIsPinned(buffer));
    3030             : 
    3031           0 :     if (BufferIsLocal(buffer))
    3032             :     {
    3033           0 :         int         bufid = -buffer - 1;
    3034             : 
    3035           0 :         bufHdr = GetLocalBufferDescriptor(bufid);
    3036             :         /* Content locks are not maintained for local buffers. */
    3037             :     }
    3038             :     else
    3039             :     {
    3040           0 :         bufHdr = GetBufferDescriptor(buffer - 1);
    3041             :         Assert(BufferIsLockedByMeInMode(buffer, BUFFER_LOCK_EXCLUSIVE));
    3042             :     }
    3043             : 
    3044           0 :     return pg_atomic_read_u64(&bufHdr->state) & BM_DIRTY;
    3045             : }
    3046             : 
    3047             : /*
    3048             :  * MarkBufferDirty
    3049             :  *
    3050             :  *      Marks buffer contents as dirty (actual write happens later).
    3051             :  *
    3052             :  * Buffer must be pinned and exclusive-locked.  (If caller does not hold
    3053             :  * exclusive lock, then somebody could be in process of writing the buffer,
    3054             :  * leading to risk of bad data written to disk.)
    3055             :  */
    3056             : void
    3057    43758036 : MarkBufferDirty(Buffer buffer)
    3058             : {
    3059             :     BufferDesc *bufHdr;
    3060             :     uint64      buf_state;
    3061             :     uint64      old_buf_state;
    3062             : 
    3063    43758036 :     if (!BufferIsValid(buffer))
    3064           0 :         elog(ERROR, "bad buffer ID: %d", buffer);
    3065             : 
    3066    43758036 :     if (BufferIsLocal(buffer))
    3067             :     {
    3068     2444796 :         MarkLocalBufferDirty(buffer);
    3069     2444796 :         return;
    3070             :     }
    3071             : 
    3072    41313240 :     bufHdr = GetBufferDescriptor(buffer - 1);
    3073             : 
    3074             :     Assert(BufferIsPinned(buffer));
    3075             :     Assert(BufferIsLockedByMeInMode(buffer, BUFFER_LOCK_EXCLUSIVE));
    3076             : 
    3077             :     /*
    3078             :      * NB: We have to wait until the buffer header spinlock is no longer
    3079             :      * held, as TerminateBufferIO() relies on that spinlock.
    3080             :      */
    3081    41313240 :     old_buf_state = pg_atomic_read_u64(&bufHdr->state);
    3082             :     for (;;)
    3083             :     {
    3084    41313770 :         if (old_buf_state & BM_LOCKED)
    3085         696 :             old_buf_state = WaitBufHdrUnlocked(bufHdr);
    3086             : 
    3087    41313770 :         buf_state = old_buf_state;
    3088             : 
    3089             :         Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
    3090    41313770 :         buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
    3091             : 
    3092    41313770 :         if (pg_atomic_compare_exchange_u64(&bufHdr->state, &old_buf_state,
    3093             :                                            buf_state))
    3094    41313240 :             break;
    3095             :     }
    3096             : 
    3097             :     /*
    3098             :      * If the buffer was not dirty already, do vacuum accounting.
    3099             :      */
    3100    41313240 :     if (!(old_buf_state & BM_DIRTY))
    3101             :     {
    3102     1331324 :         pgBufferUsage.shared_blks_dirtied++;
    3103     1331324 :         if (VacuumCostActive)
    3104       17546 :             VacuumCostBalance += VacuumCostPageDirty;
    3105             :     }
    3106             : }
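
/*
 * Illustrative sketch (a hypothetical helper, not part of bufmgr.c) of the
 * usual pattern callers follow around MarkBufferDirty(): pin the buffer,
 * take its content lock exclusively, modify the page, mark it dirty, then
 * unlock and unpin.  WAL-logging is omitted for brevity.
 */
#ifdef NOT_USED
static void
example_touch_page(Relation rel, BlockNumber blkno)
{
    Buffer      buf = ReadBuffer(rel, blkno);

    LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);

    /* ... modify the page contents via BufferGetPage(buf) ... */

    MarkBufferDirty(buf);
    UnlockReleaseBuffer(buf);
}
#endif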
    3107             : 
    3108             : /*
    3109             :  * ReleaseAndReadBuffer -- combine ReleaseBuffer() and ReadBuffer()
    3110             :  *
    3111             :  * Formerly, this saved one cycle of acquiring/releasing the BufMgrLock
    3112             :  * compared to calling the two routines separately.  Now it's mainly just
    3113             :  * a convenience function.  However, if the passed buffer is valid and
    3114             :  * already contains the desired block, we just return it as-is; and that
    3115             :  * does save considerable work compared to a full release and reacquire.
    3116             :  *
    3117             :  * Note: it is OK to pass buffer == InvalidBuffer, indicating that no old
    3118             :  * buffer actually needs to be released.  This case is the same as ReadBuffer,
    3119             :  * but can save some tests in the caller.
    3120             :  */
    3121             : Buffer
    3122    58812920 : ReleaseAndReadBuffer(Buffer buffer,
    3123             :                      Relation relation,
    3124             :                      BlockNumber blockNum)
    3125             : {
    3126    58812920 :     ForkNumber  forkNum = MAIN_FORKNUM;
    3127             :     BufferDesc *bufHdr;
    3128             : 
    3129    58812920 :     if (BufferIsValid(buffer))
    3130             :     {
    3131             :         Assert(BufferIsPinned(buffer));
    3132    35493098 :         if (BufferIsLocal(buffer))
    3133             :         {
    3134       73728 :             bufHdr = GetLocalBufferDescriptor(-buffer - 1);
    3135       80772 :             if (bufHdr->tag.blockNum == blockNum &&
    3136       14088 :                 BufTagMatchesRelFileLocator(&bufHdr->tag, &relation->rd_locator) &&
    3137        7044 :                 BufTagGetForkNum(&bufHdr->tag) == forkNum)
    3138        7044 :                 return buffer;
    3139       66684 :             UnpinLocalBuffer(buffer);
    3140             :         }
    3141             :         else
    3142             :         {
    3143    35419370 :             bufHdr = GetBufferDescriptor(buffer - 1);
    3144             :             /* we have pin, so it's ok to examine tag without spinlock */
    3145    47462094 :             if (bufHdr->tag.blockNum == blockNum &&
    3146    24085448 :                 BufTagMatchesRelFileLocator(&bufHdr->tag, &relation->rd_locator) &&
    3147    12042724 :                 BufTagGetForkNum(&bufHdr->tag) == forkNum)
    3148    12042724 :                 return buffer;
    3149    23376646 :             UnpinBuffer(bufHdr);
    3150             :         }
    3151             :     }
    3152             : 
    3153    46763152 :     return ReadBuffer(relation, blockNum);
    3154             : }
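
/*
 * Illustrative sketch (a hypothetical helper, not part of bufmgr.c):
 * ReleaseAndReadBuffer() lets a scan walk from block to block while holding
 * at most one pin, and it is a no-op when consecutive requests land on the
 * same block.
 */
#ifdef NOT_USED
static void
example_walk_blocks(Relation rel, BlockNumber nblocks)
{
    Buffer      buf = InvalidBuffer;

    for (BlockNumber blkno = 0; blkno < nblocks; blkno++)
    {
        /* Drops the previous pin (if any) and pins the requested block. */
        buf = ReleaseAndReadBuffer(buf, rel, blkno);

        /* ... examine the page under an appropriate content lock ... */
    }

    if (BufferIsValid(buf))
        ReleaseBuffer(buf);
}
#endif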
    3155             : 
    3156             : /*
    3157             :  * PinBuffer -- make buffer unavailable for replacement.
    3158             :  *
    3159             :  * For the default access strategy, the buffer's usage_count is incremented
    3160             :  * when we first pin it; for other strategies we just make sure the usage_count
    3161             :  * isn't zero.  (The idea of the latter is that we don't want synchronized
    3162             :  * heap scans to inflate the count, but we need it to not be zero to discourage
    3163             :  * other backends from stealing buffers from our ring.  As long as we cycle
    3164             :  * through the ring faster than the global clock-sweep cycles, buffers in
    3165             :  * our ring won't be chosen as victims for replacement by other backends.)
    3166             :  *
    3167             :  * This should be applied only to shared buffers, never local ones.
    3168             :  *
    3169             :  * Since buffers are pinned/unpinned very frequently, pin buffers without
    3170             :  * taking the buffer header lock; instead, update the state variable in a
    3171             :  * loop of CAS operations.  Hopefully it's just a single CAS.
    3172             :  *
    3173             :  * Note that ResourceOwnerEnlarge() and ReservePrivateRefCountEntry()
    3174             :  * must have been done already.
    3175             :  *
    3176             :  * Returns true if buffer is BM_VALID, else false.  This provision allows
    3177             :  * some callers to avoid an extra spinlock cycle.  If skip_if_not_valid is
    3178             :  * true, then a false return value also indicates that the buffer was
    3179             :  * (recently) invalid and has not been pinned.
    3180             :  */
    3181             : static bool
    3182   119348066 : PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy,
    3183             :           bool skip_if_not_valid)
    3184             : {
    3185   119348066 :     Buffer      b = BufferDescriptorGetBuffer(buf);
    3186             :     bool        result;
    3187             :     PrivateRefCountEntry *ref;
    3188             : 
    3189             :     Assert(!BufferIsLocal(b));
    3190             :     Assert(ReservedRefCountSlot != -1);
    3191             : 
    3192   119348066 :     ref = GetPrivateRefCountEntry(b, true);
    3193             : 
    3194   119348066 :     if (ref == NULL)
    3195             :     {
    3196             :         uint64      buf_state;
    3197             :         uint64      old_buf_state;
    3198             : 
    3199   114808558 :         old_buf_state = pg_atomic_read_u64(&buf->state);
    3200             :         for (;;)
    3201             :         {
    3202   114852816 :             if (unlikely(skip_if_not_valid && !(old_buf_state & BM_VALID)))
    3203          12 :                 return false;
    3204             : 
    3205             :             /*
    3206             :              * We're not allowed to increase the refcount while the buffer
    3207             :              * header spinlock is held. Wait for the lock to be released.
    3208             :              */
    3209   114852804 :             if (old_buf_state & BM_LOCKED)
    3210         830 :                 old_buf_state = WaitBufHdrUnlocked(buf);
    3211             : 
    3212   114852804 :             buf_state = old_buf_state;
    3213             : 
    3214             :             /* increase refcount */
    3215   114852804 :             buf_state += BUF_REFCOUNT_ONE;
    3216             : 
    3217   114852804 :             if (strategy == NULL)
    3218             :             {
    3219             :                 /* Default case: increase usagecount unless already max. */
    3220   113416286 :                 if (BUF_STATE_GET_USAGECOUNT(buf_state) < BM_MAX_USAGE_COUNT)
    3221     6797918 :                     buf_state += BUF_USAGECOUNT_ONE;
    3222             :             }
    3223             :             else
    3224             :             {
    3225             :                 /*
    3226             :                  * Ring buffers shouldn't evict others from pool.  Thus we
    3227             :                  * don't make usagecount more than 1.
    3228             :                  */
    3229     1436518 :                 if (BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
    3230       70064 :                     buf_state += BUF_USAGECOUNT_ONE;
    3231             :             }
    3232             : 
    3233   114852804 :             if (pg_atomic_compare_exchange_u64(&buf->state, &old_buf_state,
    3234             :                                                buf_state))
    3235             :             {
    3236   114808546 :                 result = (buf_state & BM_VALID) != 0;
    3237             : 
    3238   114808546 :                 TrackNewBufferPin(b);
    3239   114808546 :                 break;
    3240             :             }
    3241             :         }
    3242             :     }
    3243             :     else
    3244             :     {
    3245             :         /*
    3246             :          * If we previously pinned the buffer, it is likely to be valid, but
    3247             :          * it may not be if StartReadBuffers() was called and
    3248             :          * WaitReadBuffers() hasn't been called yet.  We'll check by loading
    3249             :          * the flags without locking.  This is racy, but it's OK to return
    3250             :          * false spuriously: when WaitReadBuffers() calls StartBufferIO(),
    3251             :          * it'll see that it's now valid.
    3252             :          *
    3253             :          * Note: We deliberately avoid a Valgrind client request here.
    3254             :          * Individual access methods can optionally superimpose buffer page
    3255             :          * client requests on top of our client requests to enforce that
    3256             :          * buffers are only accessed while locked (and pinned).  It's possible
    3257             :          * that the buffer page is legitimately non-accessible here.  We
    3258             :          * cannot meddle with that.
    3259             :          */
    3260     4539508 :         result = (pg_atomic_read_u64(&buf->state) & BM_VALID) != 0;
    3261             : 
    3262             :         Assert(ref->data.refcount > 0);
    3263     4539508 :         ref->data.refcount++;
    3264     4539508 :         ResourceOwnerRememberBuffer(CurrentResourceOwner, b);
    3265             :     }
    3266             : 
    3267   119348054 :     return result;
    3268             : }
    3269             : 
    3270             : /*
    3271             :  * PinBuffer_Locked -- as above, but caller already locked the buffer header.
    3272             :  * The spinlock is released before return.
    3273             :  *
    3274             :  * As this function is called with the spinlock held, the caller must
    3275             :  * previously have called ReservePrivateRefCountEntry() and
    3276             :  * ResourceOwnerEnlarge(CurrentResourceOwner).
    3277             :  *
    3278             :  * Currently, no callers of this function want to modify the buffer's
    3279             :  * usage_count at all, so there's no need for a strategy parameter.
    3280             :  * Also we don't bother with a BM_VALID test (the caller could check that for
    3281             :  * itself).
    3282             :  *
    3283             :  * Also all callers only ever use this function when it's known that the
    3284             :  * buffer can't have a preexisting pin by this backend. That allows us to skip
    3285             :  * searching the private refcount array & hash, which is a boon, because the
    3286             :  * spinlock is still held.
    3287             :  *
    3288             :  * Note: use of this routine is frequently mandatory, not just an optimization
    3289             :  * to save a spin lock/unlock cycle, because we need to pin a buffer before
    3290             :  * its state can change under us.
    3291             :  */
    3292             : static void
    3293      616450 : PinBuffer_Locked(BufferDesc *buf)
    3294             : {
    3295             :     uint64      old_buf_state;
    3296             : 
    3297             :     /*
    3298             :      * As explained above, we don't expect any preexisting pins.  That allows
    3299             :      * us to manipulate the PrivateRefCount entry after releasing the spinlock.
    3300             :      */
    3301             :     Assert(GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf), false) == NULL);
    3302             : 
    3303             :     /*
    3304             :      * Since we hold the buffer spinlock, we can update the buffer state and
    3305             :      * release the lock in one operation.
    3306             :      */
    3307      616450 :     old_buf_state = pg_atomic_read_u64(&buf->state);
    3308             : 
    3309      616450 :     UnlockBufHdrExt(buf, old_buf_state,
    3310             :                     0, 0, 1);
    3311             : 
    3312      616450 :     TrackNewBufferPin(BufferDescriptorGetBuffer(buf));
    3313      616450 : }
    3314             : 
    3315             : /*
    3316             :  * Support for waking up another backend that is waiting for the cleanup lock
    3317             :  * to be released using BM_PIN_COUNT_WAITER.
    3318             :  *
    3319             :  * See LockBufferForCleanup().
    3320             :  *
    3321             :  * Expected to be called just after releasing a buffer pin (in a BufferDesc,
    3322             :  * not just reducing the backend-local pincount for the buffer).
    3323             :  */
    3324             : static void
    3325         164 : WakePinCountWaiter(BufferDesc *buf)
    3326             : {
    3327             :     /*
    3328             :      * Acquire the buffer header lock and re-check that there's a waiter.
    3329             :      * Another backend could have unpinned this buffer and already woken up
    3330             :      * the waiter.
    3331             :      *
    3332             :      * There's no danger of the buffer being replaced after we unpinned it
    3333             :      * above, as it's pinned by the waiter. The waiter removes
    3334             :      * BM_PIN_COUNT_WAITER if it stops waiting for a reason other than this
    3335             :      * backend waking it up.
    3336             :      */
    3337         164 :     uint64      buf_state = LockBufHdr(buf);
    3338             : 
    3339         164 :     if ((buf_state & BM_PIN_COUNT_WAITER) &&
    3340         164 :         BUF_STATE_GET_REFCOUNT(buf_state) == 1)
    3341         164 :     {
    3342             :         /* we just released the last pin other than the waiter's */
    3343         164 :         int         wait_backend_pgprocno = buf->wait_backend_pgprocno;
    3344             : 
    3345         164 :         UnlockBufHdrExt(buf, buf_state,
    3346             :                         0, BM_PIN_COUNT_WAITER,
    3347             :                         0);
    3348         164 :         ProcSendSignal(wait_backend_pgprocno);
    3349             :     }
    3350             :     else
    3351           0 :         UnlockBufHdr(buf);
    3352         164 : }
    3353             : 
    3354             : /*
    3355             :  * UnpinBuffer -- make buffer available for replacement.
    3356             :  *
    3357             :  * This should be applied only to shared buffers, never local ones.  This
    3358             :  * always adjusts CurrentResourceOwner.
    3359             :  */
    3360             : static void
    3361   147014112 : UnpinBuffer(BufferDesc *buf)
    3362             : {
    3363   147014112 :     Buffer      b = BufferDescriptorGetBuffer(buf);
    3364             : 
    3365   147014112 :     ResourceOwnerForgetBuffer(CurrentResourceOwner, b);
    3366   147014112 :     UnpinBufferNoOwner(buf);
    3367   147014112 : }
    3368             : 
    3369             : static void
    3370   147023338 : UnpinBufferNoOwner(BufferDesc *buf)
    3371             : {
    3372             :     PrivateRefCountEntry *ref;
    3373   147023338 :     Buffer      b = BufferDescriptorGetBuffer(buf);
    3374             : 
    3375             :     Assert(!BufferIsLocal(b));
    3376             : 
    3377             :     /* not moving as we're likely deleting it soon anyway */
    3378   147023338 :     ref = GetPrivateRefCountEntry(b, false);
    3379             :     Assert(ref != NULL);
    3380             :     Assert(ref->data.refcount > 0);
    3381   147023338 :     ref->data.refcount--;
    3382   147023338 :     if (ref->data.refcount == 0)
    3383             :     {
    3384             :         uint64      old_buf_state;
    3385             : 
    3386             :         /*
    3387             :          * Mark buffer non-accessible to Valgrind.
    3388             :          *
    3389             :          * Note that the buffer may have already been marked non-accessible
    3390             :          * within access method code that enforces that buffers are only
    3391             :          * accessed while a buffer lock is held.
    3392             :          */
    3393             :         VALGRIND_MAKE_MEM_NOACCESS(BufHdrGetBlock(buf), BLCKSZ);
    3394             : 
    3395             :         /*
    3396             :          * I'd better not still hold the buffer content lock. Can't use
    3397             :          * BufferIsLockedByMe(), as that asserts the buffer is pinned.
    3398             :          */
    3399             :         Assert(!BufferLockHeldByMe(buf));
    3400             : 
    3401             :         /* decrement the shared reference count */
    3402   119413532 :         old_buf_state = pg_atomic_fetch_sub_u64(&buf->state, BUF_REFCOUNT_ONE);
    3403             : 
    3404             :         /* Support LockBufferForCleanup() */
    3405   119413532 :         if (old_buf_state & BM_PIN_COUNT_WAITER)
    3406         164 :             WakePinCountWaiter(buf);
    3407             : 
    3408   119413532 :         ForgetPrivateRefCountEntry(ref);
    3409             :     }
    3410   147023338 : }
    3411             : 
    3412             : /*
    3413             :  * Set up backend-local tracking of a buffer pinned for the first time by this
    3414             :  * backend.
    3415             :  */
    3416             : inline void
    3417   119413532 : TrackNewBufferPin(Buffer buf)
    3418             : {
    3419             :     PrivateRefCountEntry *ref;
    3420             : 
    3421   119413532 :     ref = NewPrivateRefCountEntry(buf);
    3422   119413532 :     ref->data.refcount++;
    3423             : 
    3424   119413532 :     ResourceOwnerRememberBuffer(CurrentResourceOwner, buf);
    3425             : 
    3426             :     /*
    3427             :      * This is the first pin of this page by this backend, so mark the page
    3428             :      * as defined to Valgrind.  While the page contents might not actually be
    3429             :      * valid yet, we don't currently guarantee that such pages are marked
    3430             :      * undefined or non-accessible.
    3431             :      *
    3432             :      * It's not necessarily the prettiest to do this here, but otherwise we'd
    3433             :      * need this block of code in multiple places.
    3434             :      */
    3435             :     VALGRIND_MAKE_MEM_DEFINED(BufHdrGetBlock(GetBufferDescriptor(buf - 1)),
    3436             :                               BLCKSZ);
    3437   119413532 : }
    3438             : 
    3439             : #define ST_SORT sort_checkpoint_bufferids
    3440             : #define ST_ELEMENT_TYPE CkptSortItem
    3441             : #define ST_COMPARE(a, b) ckpt_buforder_comparator(a, b)
    3442             : #define ST_SCOPE static
    3443             : #define ST_DEFINE
    3444             : #include "lib/sort_template.h"
    3445             : 
    3446             : /*
    3447             :  * BufferSync -- Write out all dirty buffers in the pool.
    3448             :  *
    3449             :  * This is called at checkpoint time to write out all dirty shared buffers.
    3450             :  * The checkpoint request flags should be passed in.  If CHECKPOINT_FAST is
    3451             :  * set, we disable delays between writes; if CHECKPOINT_IS_SHUTDOWN,
    3452             :  * CHECKPOINT_END_OF_RECOVERY or CHECKPOINT_FLUSH_UNLOGGED is set, we write
    3453             :  * even unlogged buffers, which are otherwise skipped.  The remaining flags
    3454             :  * currently have no effect here.
    3455             :  */
    3456             : static void
    3457        3574 : BufferSync(int flags)
    3458             : {
    3459             :     uint64      buf_state;
    3460             :     int         buf_id;
    3461             :     int         num_to_scan;
    3462             :     int         num_spaces;
    3463             :     int         num_processed;
    3464             :     int         num_written;
    3465        3574 :     CkptTsStatus *per_ts_stat = NULL;
    3466             :     Oid         last_tsid;
    3467             :     binaryheap *ts_heap;
    3468             :     int         i;
    3469        3574 :     uint64      mask = BM_DIRTY;
    3470             :     WritebackContext wb_context;
    3471             : 
    3472             :     /*
    3473             :      * Unless this is a shutdown checkpoint or we have been explicitly told
    3474             :      * otherwise, we write only permanent, dirty buffers.  But at shutdown or
    3475             :      * end of recovery, we write all dirty buffers.
    3476             :      */
    3477        3574 :     if (!((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
    3478             :                     CHECKPOINT_FLUSH_UNLOGGED))))
    3479        1994 :         mask |= BM_PERMANENT;
    3480             : 
    3481             :     /*
    3482             :      * Loop over all buffers, and mark the ones that need to be written with
    3483             :      * BM_CHECKPOINT_NEEDED.  Count them as we go (num_to_scan), so that we
    3484             :      * can estimate how much work needs to be done.
    3485             :      *
    3486             :      * This allows us to write only those pages that were dirty when the
    3487             :      * checkpoint began, and not those that get dirtied while it proceeds.
    3488             :      * Whenever a page with BM_CHECKPOINT_NEEDED is written out, either by us
    3489             :      * later in this function, or by normal backends or the bgwriter cleaning
    3490             :      * scan, the flag is cleared.  Any buffer dirtied after this point won't
    3491             :      * have the flag set.
    3492             :      *
    3493             :      * Note that if we fail to write some buffer, we may leave buffers with
    3494             :      * BM_CHECKPOINT_NEEDED still set.  This is OK since any such buffer would
    3495             :      * certainly need to be written for the next checkpoint attempt, too.
    3496             :      */
    3497        3574 :     num_to_scan = 0;
    3498    24746902 :     for (buf_id = 0; buf_id < NBuffers; buf_id++)
    3499             :     {
    3500    24743328 :         BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
    3501    24743328 :         uint64      set_bits = 0;
    3502             : 
    3503             :         /*
    3504             :          * Header spinlock is enough to examine BM_DIRTY, see comment in
    3505             :          * SyncOneBuffer.
    3506             :          */
    3507    24743328 :         buf_state = LockBufHdr(bufHdr);
    3508             : 
    3509    24743328 :         if ((buf_state & mask) == mask)
    3510             :         {
    3511             :             CkptSortItem *item;
    3512             : 
    3513      589498 :             set_bits = BM_CHECKPOINT_NEEDED;
    3514             : 
    3515      589498 :             item = &CkptBufferIds[num_to_scan++];
    3516      589498 :             item->buf_id = buf_id;
    3517      589498 :             item->tsId = bufHdr->tag.spcOid;
    3518      589498 :             item->relNumber = BufTagGetRelNumber(&bufHdr->tag);
    3519      589498 :             item->forkNum = BufTagGetForkNum(&bufHdr->tag);
    3520      589498 :             item->blockNum = bufHdr->tag.blockNum;
    3521             :         }
    3522             : 
    3523    24743328 :         UnlockBufHdrExt(bufHdr, buf_state,
    3524             :                         set_bits, 0,
    3525             :                         0);
    3526             : 
    3527             :         /* Check for barrier events in case NBuffers is large. */
    3528    24743328 :         if (ProcSignalBarrierPending)
    3529           0 :             ProcessProcSignalBarrier();
    3530             :     }
    3531             : 
    3532        3574 :     if (num_to_scan == 0)
    3533        1374 :         return;                 /* nothing to do */
    3534             : 
    3535        2200 :     WritebackContextInit(&wb_context, &checkpoint_flush_after);
    3536             : 
    3537             :     TRACE_POSTGRESQL_BUFFER_SYNC_START(NBuffers, num_to_scan);
    3538             : 
    3539             :     /*
    3540             :      * Sort buffers that need to be written to reduce the likelihood of random
    3541             :      * IO. The sorting is also important for the implementation of balancing
    3542             :      * writes between tablespaces. Without balancing writes we'd potentially
    3543             :      * end up writing to the tablespaces one-by-one; possibly overloading the
    3544             :      * end up writing to the tablespaces one-by-one, possibly overloading the
    3545             :      */
    3546        2200 :     sort_checkpoint_bufferids(CkptBufferIds, num_to_scan);
    3547             : 
    3548        2200 :     num_spaces = 0;
    3549             : 
    3550             :     /*
    3551             :      * Allocate progress status for each tablespace with buffers that need to
    3552             :      * be flushed. This requires the to-be-flushed array to be sorted.
    3553             :      */
    3554        2200 :     last_tsid = InvalidOid;
    3555      591698 :     for (i = 0; i < num_to_scan; i++)
    3556             :     {
    3557             :         CkptTsStatus *s;
    3558             :         Oid         cur_tsid;
    3559             : 
    3560      589498 :         cur_tsid = CkptBufferIds[i].tsId;
    3561             : 
    3562             :         /*
    3563             :          * Grow the array of per-tablespace status structs every time a new
    3564             :          * tablespace is found.
    3565             :          */
    3566      589498 :         if (last_tsid == InvalidOid || last_tsid != cur_tsid)
    3567        3314 :         {
    3568             :             Size        sz;
    3569             : 
    3570        3314 :             num_spaces++;
    3571             : 
    3572             :             /*
    3573             :              * Not worth adding grow-by-power-of-2 logic here - even with a
    3574             :              * few hundred tablespaces this should be fine.
    3575             :              */
    3576        3314 :             sz = sizeof(CkptTsStatus) * num_spaces;
    3577             : 
    3578        3314 :             if (per_ts_stat == NULL)
    3579        2200 :                 per_ts_stat = (CkptTsStatus *) palloc(sz);
    3580             :             else
    3581        1114 :                 per_ts_stat = (CkptTsStatus *) repalloc(per_ts_stat, sz);
    3582             : 
    3583        3314 :             s = &per_ts_stat[num_spaces - 1];
    3584        3314 :             memset(s, 0, sizeof(*s));
    3585        3314 :             s->tsId = cur_tsid;
    3586             : 
    3587             :             /*
    3588             :              * The first buffer in this tablespace. As CkptBufferIds is sorted
    3589             :              * by tablespace, all (s->num_to_scan) buffers in this tablespace
    3590             :              * will follow afterwards.
    3591             :              */
    3592        3314 :             s->index = i;
    3593             : 
    3594             :             /*
    3595             :              * progress_slice will be determined once we know how many buffers
    3596             :              * are in each tablespace, i.e. after this loop.
    3597             :              */
    3598             : 
    3599        3314 :             last_tsid = cur_tsid;
    3600             :         }
    3601             :         else
    3602             :         {
    3603      586184 :             s = &per_ts_stat[num_spaces - 1];
    3604             :         }
    3605             : 
    3606      589498 :         s->num_to_scan++;
    3607             : 
    3608             :         /* Check for barrier events. */
    3609      589498 :         if (ProcSignalBarrierPending)
    3610           0 :             ProcessProcSignalBarrier();
    3611             :     }
    3612             : 
    3613             :     Assert(num_spaces > 0);
    3614             : 
    3615             :     /*
    3616             :      * Build a min-heap over the write-progress in the individual tablespaces,
    3617             :      * and compute how large a portion of the total progress a single
    3618             :      * processed buffer is.
    3619             :      */
    3620        2200 :     ts_heap = binaryheap_allocate(num_spaces,
    3621             :                                   ts_ckpt_progress_comparator,
    3622             :                                   NULL);
    3623             : 
    3624        5514 :     for (i = 0; i < num_spaces; i++)
    3625             :     {
    3626        3314 :         CkptTsStatus *ts_stat = &per_ts_stat[i];
    3627             : 
    3628        3314 :         ts_stat->progress_slice = (float8) num_to_scan / ts_stat->num_to_scan;
    3629             : 
    3630        3314 :         binaryheap_add_unordered(ts_heap, PointerGetDatum(ts_stat));
    3631             :     }
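
    /*
     * For illustration, assuming num_to_scan = 300 buffers split 200/100
     * across two tablespaces: the slices are 300/200 = 1.5 and 300/100 = 3.0.
     * Each processed buffer advances its tablespace's progress by its slice,
     * so both tablespaces reach a progress of 300 at the same time, and the
     * min-heap below interleaves their writes roughly 2:1 instead of draining
     * one tablespace before starting on the other.
     */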
    3632             : 
    3633        2200 :     binaryheap_build(ts_heap);
    3634             : 
    3635             :     /*
    3636             :      * Iterate through to-be-checkpointed buffers and write the ones (still)
    3637             :      * marked with BM_CHECKPOINT_NEEDED. The writes are balanced between
    3638             :      * tablespaces; otherwise the sorting would lead to only one tablespace
    3639             :      * receiving writes at a time, making inefficient use of the hardware.
    3640             :      */
    3641        2200 :     num_processed = 0;
    3642        2200 :     num_written = 0;
    3643      591698 :     while (!binaryheap_empty(ts_heap))
    3644             :     {
    3645      589498 :         BufferDesc *bufHdr = NULL;
    3646             :         CkptTsStatus *ts_stat = (CkptTsStatus *)
    3647      589498 :             DatumGetPointer(binaryheap_first(ts_heap));
    3648             : 
    3649      589498 :         buf_id = CkptBufferIds[ts_stat->index].buf_id;
    3650             :         Assert(buf_id != -1);
    3651             : 
    3652      589498 :         bufHdr = GetBufferDescriptor(buf_id);
    3653             : 
    3654      589498 :         num_processed++;
    3655             : 
    3656             :         /*
    3657             :          * We don't need to acquire the lock here, because we're only looking
    3658             :          * at a single bit. It's possible that someone else writes the buffer
    3659             :          * and clears the flag right after we check, but that doesn't matter
    3660             :          * since SyncOneBuffer will then do nothing.  However, there is a
    3661             :          * further race condition: it's conceivable that between the time we
    3662             :          * examine the bit here and the time SyncOneBuffer acquires the lock,
    3663             :          * someone else not only wrote the buffer but replaced it with another
    3664             :          * page and dirtied it.  In that improbable case, SyncOneBuffer will
    3665             :          * write the buffer though we didn't need to.  It doesn't seem worth
    3666             :          * guarding against this, though.
    3667             :          */
    3668      589498 :         if (pg_atomic_read_u64(&bufHdr->state) & BM_CHECKPOINT_NEEDED)
    3669             :         {
    3670      549116 :             if (SyncOneBuffer(buf_id, false, &wb_context) & BUF_WRITTEN)
    3671             :             {
    3672             :                 TRACE_POSTGRESQL_BUFFER_SYNC_WRITTEN(buf_id);
    3673      549116 :                 PendingCheckpointerStats.buffers_written++;
    3674      549116 :                 num_written++;
    3675             :             }
    3676             :         }
    3677             : 
    3678             :         /*
    3679             :          * Measure progress independently of actually having to flush the
    3680             :          * buffer; otherwise writes become unbalanced.
    3681             :          */
    3682      589498 :         ts_stat->progress += ts_stat->progress_slice;
    3683      589498 :         ts_stat->num_scanned++;
    3684      589498 :         ts_stat->index++;
    3685             : 
    3686             :         /* Have all the buffers from the tablespace been processed? */
    3687      589498 :         if (ts_stat->num_scanned == ts_stat->num_to_scan)
    3688             :         {
    3689        3314 :             binaryheap_remove_first(ts_heap);
    3690             :         }
    3691             :         else
    3692             :         {
    3693             :             /* update heap with the new progress */
    3694      586184 :             binaryheap_replace_first(ts_heap, PointerGetDatum(ts_stat));
    3695             :         }
    3696             : 
    3697             :         /*
    3698             :          * Sleep to throttle our I/O rate.
    3699             :          *
    3700             :          * (This will check for barrier events even if it doesn't sleep.)
    3701             :          */
    3702      589498 :         CheckpointWriteDelay(flags, (double) num_processed / num_to_scan);
    3703             :     }
    3704             : 
    3705             :     /*
    3706             :      * Issue all pending flushes.  Only the checkpointer calls BufferSync(),
    3707             :      * so the IOContext will always be IOCONTEXT_NORMAL.
    3708             :      */
    3709        2200 :     IssuePendingWritebacks(&wb_context, IOCONTEXT_NORMAL);
    3710             : 
    3711        2200 :     pfree(per_ts_stat);
    3712        2200 :     per_ts_stat = NULL;
    3713        2200 :     binaryheap_free(ts_heap);
    3714             : 
    3715             :     /*
    3716             :      * Update checkpoint statistics. As noted above, this doesn't include
    3717             :      * buffers written by other backends or bgwriter scan.
    3718             :      */
    3719        2200 :     CheckpointStats.ckpt_bufs_written += num_written;
    3720             : 
    3721             :     TRACE_POSTGRESQL_BUFFER_SYNC_DONE(NBuffers, num_written, num_to_scan);
    3722             : }
    3723             : 
    3724             : /*
    3725             :  * BgBufferSync -- Write out some dirty buffers in the pool.
    3726             :  *
    3727             :  * This is called periodically by the background writer process.
    3728             :  *
    3729             :  * Returns true if it's appropriate for the bgwriter process to go into
    3730             :  * low-power hibernation mode.  (This happens if the strategy clock-sweep
    3731             :  * has been "lapped" and no buffer allocations have occurred recently,
    3732             :  * or if the bgwriter has been effectively disabled by setting
    3733             :  * bgwriter_lru_maxpages to 0.)
    3734             :  */
    3735             : bool
    3736       28976 : BgBufferSync(WritebackContext *wb_context)
    3737             : {
    3738             :     /* info obtained from freelist.c */
    3739             :     int         strategy_buf_id;
    3740             :     uint32      strategy_passes;
    3741             :     uint32      recent_alloc;
    3742             : 
    3743             :     /*
    3744             :      * Information saved between calls so we can determine the strategy
    3745             :      * point's advance rate and avoid scanning already-cleaned buffers.
    3746             :      */
    3747             :     static bool saved_info_valid = false;
    3748             :     static int  prev_strategy_buf_id;
    3749             :     static uint32 prev_strategy_passes;
    3750             :     static int  next_to_clean;
    3751             :     static uint32 next_passes;
    3752             : 
    3753             :     /* Moving averages of allocation rate and clean-buffer density */
    3754             :     static float smoothed_alloc = 0;
    3755             :     static float smoothed_density = 10.0;
    3756             : 
    3757             :     /* Potentially these could be tunables, but for now, not */
    3758       28976 :     float       smoothing_samples = 16;
    3759       28976 :     float       scan_whole_pool_milliseconds = 120000.0;
    3760             : 
    3761             :     /* Used to compute how far we scan ahead */
    3762             :     long        strategy_delta;
    3763             :     int         bufs_to_lap;
    3764             :     int         bufs_ahead;
    3765             :     float       scans_per_alloc;
    3766             :     int         reusable_buffers_est;
    3767             :     int         upcoming_alloc_est;
    3768             :     int         min_scan_buffers;
    3769             : 
    3770             :     /* Variables for the scanning loop proper */
    3771             :     int         num_to_scan;
    3772             :     int         num_written;
    3773             :     int         reusable_buffers;
    3774             : 
    3775             :     /* Variables for final smoothed_density update */
    3776             :     long        new_strategy_delta;
    3777             :     uint32      new_recent_alloc;
    3778             : 
    3779             :     /*
    3780             :      * Find out where the clock-sweep currently is, and how many buffer
    3781             :      * allocations have happened since our last call.
    3782             :      */
    3783       28976 :     strategy_buf_id = StrategySyncStart(&strategy_passes, &recent_alloc);
    3784             : 
    3785             :     /* Report buffer alloc counts to pgstat */
    3786       28976 :     PendingBgWriterStats.buf_alloc += recent_alloc;
    3787             : 
    3788             :     /*
    3789             :      * If we're not running the LRU scan, just stop after doing the stats
    3790             :      * stuff.  We mark the saved state invalid so that we can recover sanely
    3791             :      * if the LRU scan is turned back on later.
    3792             :      */
    3793       28976 :     if (bgwriter_lru_maxpages <= 0)
    3794             :     {
    3795          74 :         saved_info_valid = false;
    3796          74 :         return true;
    3797             :     }
    3798             : 
    3799             :     /*
    3800             :      * Compute strategy_delta = how many buffers have been scanned by the
    3801             :      * clock-sweep since last time.  If first time through, assume none. Then
    3802             :      * see if we are still ahead of the clock-sweep, and if so, how many
    3803             :      * buffers we could scan before we'd catch up with it and "lap" it.
    3804             :      * Note: the weird-looking coding of the xxx_passes comparisons is there
    3805             :      * to avoid bogus behavior when the passes counts wrap around.
    3806             :      */
    3807       28902 :     if (saved_info_valid)
    3808             :     {
    3809       27772 :         int32       passes_delta = strategy_passes - prev_strategy_passes;
    3810             : 
    3811       27772 :         strategy_delta = strategy_buf_id - prev_strategy_buf_id;
    3812       27772 :         strategy_delta += (long) passes_delta * NBuffers;
    3813             : 
    3814             :         Assert(strategy_delta >= 0);
    3815             : 
    3816       27772 :         if ((int32) (next_passes - strategy_passes) > 0)
    3817             :         {
    3818             :             /* we're one pass ahead of the strategy point */
    3819        5796 :             bufs_to_lap = strategy_buf_id - next_to_clean;
    3820             : #ifdef BGW_DEBUG
    3821             :             elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
    3822             :                  next_passes, next_to_clean,
    3823             :                  strategy_passes, strategy_buf_id,
    3824             :                  strategy_delta, bufs_to_lap);
    3825             : #endif
    3826             :         }
    3827       21976 :         else if (next_passes == strategy_passes &&
    3828       16596 :                  next_to_clean >= strategy_buf_id)
    3829             :         {
    3830             :             /* on same pass, but ahead or at least not behind */
    3831       14908 :             bufs_to_lap = NBuffers - (next_to_clean - strategy_buf_id);
    3832             : #ifdef BGW_DEBUG
    3833             :             elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
    3834             :                  next_passes, next_to_clean,
    3835             :                  strategy_passes, strategy_buf_id,
    3836             :                  strategy_delta, bufs_to_lap);
    3837             : #endif
    3838             :         }
    3839             :         else
    3840             :         {
    3841             :             /*
    3842             :              * We're behind, so skip forward to the strategy point and start
    3843             :              * cleaning from there.
    3844             :              */
    3845             : #ifdef BGW_DEBUG
    3846             :             elog(DEBUG2, "bgwriter behind: bgw %u-%u strategy %u-%u delta=%ld",
    3847             :                  next_passes, next_to_clean,
    3848             :                  strategy_passes, strategy_buf_id,
    3849             :                  strategy_delta);
    3850             : #endif
    3851        7068 :             next_to_clean = strategy_buf_id;
    3852        7068 :             next_passes = strategy_passes;
    3853        7068 :             bufs_to_lap = NBuffers;
    3854             :         }
    3855             :     }
    3856             :     else
    3857             :     {
    3858             :         /*
    3859             :          * Initializing at startup or after LRU scanning had been off. Always
    3860             :          * start at the strategy point.
    3861             :          */
    3862             : #ifdef BGW_DEBUG
    3863             :         elog(DEBUG2, "bgwriter initializing: strategy %u-%u",
    3864             :              strategy_passes, strategy_buf_id);
    3865             : #endif
    3866        1130 :         strategy_delta = 0;
    3867        1130 :         next_to_clean = strategy_buf_id;
    3868        1130 :         next_passes = strategy_passes;
    3869        1130 :         bufs_to_lap = NBuffers;
    3870             :     }
    3871             : 
    3872             :     /* Update saved info for next time */
    3873       28902 :     prev_strategy_buf_id = strategy_buf_id;
    3874       28902 :     prev_strategy_passes = strategy_passes;
    3875       28902 :     saved_info_valid = true;
    3876             : 
    3877             :     /*
    3878             :      * Compute how many buffers had to be scanned for each new allocation,
    3879             :      * i.e., 1/density of reusable buffers, and track a moving average of that.
    3880             :      *
    3881             :      * If the strategy point didn't move, we don't update the density estimate.
    3882             :      */
    3883       28902 :     if (strategy_delta > 0 && recent_alloc > 0)
    3884             :     {
    3885       15186 :         scans_per_alloc = (float) strategy_delta / (float) recent_alloc;
    3886       15186 :         smoothed_density += (scans_per_alloc - smoothed_density) /
    3887             :             smoothing_samples;
    3888             :     }
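
    /*
     * For illustration, assuming smoothing_samples = 16 and a previous
     * smoothed_density of 10.0: if the clock-sweep advanced 320 buffers
     * (strategy_delta) while 80 buffers were allocated (recent_alloc), then
     * scans_per_alloc = 320 / 80 = 4.0 and the new smoothed_density is
     * 10.0 + (4.0 - 10.0) / 16 = 9.625.
     */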
    3889             : 
    3890             :     /*
    3891             :      * Estimate how many reusable buffers there are between the current
    3892             :      * strategy point and where we've scanned ahead to, based on the smoothed
    3893             :      * density estimate.
    3894             :      */
    3895       28902 :     bufs_ahead = NBuffers - bufs_to_lap;
    3896       28902 :     reusable_buffers_est = (float) bufs_ahead / smoothed_density;
    3897             : 
    3898             :     /*
    3899             :      * Track a moving average of recent buffer allocations.  Here, rather than
    3900             :      * a true average we want a fast-attack, slow-decline behavior: we
    3901             :      * immediately follow any increase.
    3902             :      */
    3903       28902 :     if (smoothed_alloc <= (float) recent_alloc)
    3904        7866 :         smoothed_alloc = recent_alloc;
    3905             :     else
    3906       21036 :         smoothed_alloc += ((float) recent_alloc - smoothed_alloc) /
    3907             :             smoothing_samples;
    3908             : 
    3909             :     /* Scale the estimate by a GUC to allow more aggressive tuning. */
    3910       28902 :     upcoming_alloc_est = (int) (smoothed_alloc * bgwriter_lru_multiplier);
    3911             : 
    3912             :     /*
    3913             :      * If recent_alloc remains at zero for many cycles, smoothed_alloc will
    3914             :      * eventually underflow to zero, and the underflows produce annoying
    3915             :      * kernel warnings on some platforms.  Once upcoming_alloc_est has gone to
    3916             :      * zero, there's no point in tracking smaller and smaller values of
    3917             :      * smoothed_alloc, so just reset it to exactly zero to avoid this
    3918             :      * syndrome.  It will pop back up as soon as recent_alloc increases.
    3919             :      */
    3920       28902 :     if (upcoming_alloc_est == 0)
    3921        4814 :         smoothed_alloc = 0;
    3922             : 
    3923             :     /*
    3924             :      * Even in cases where there's been little or no buffer allocation
    3925             :      * activity, we want to make a small amount of progress through the buffer
    3926             :      * cache so that as many reusable buffers as possible are clean after an
    3927             :      * idle period.
    3928             :      *
    3929             :      * (scan_whole_pool_milliseconds / BgWriterDelay) computes how many times
    3930             :      * the BGW will be called during the scan_whole_pool time; slice the
    3931             :      * buffer pool into that many sections.
    3932             :      */
    3933       28902 :     min_scan_buffers = (int) (NBuffers / (scan_whole_pool_milliseconds / BgWriterDelay));
    3934             : 
    3935       28902 :     if (upcoming_alloc_est < (min_scan_buffers + reusable_buffers_est))
    3936             :     {
    3937             : #ifdef BGW_DEBUG
    3938             :         elog(DEBUG2, "bgwriter: alloc_est=%d too small, using min=%d + reusable_est=%d",
    3939             :              upcoming_alloc_est, min_scan_buffers, reusable_buffers_est);
    3940             : #endif
    3941       13138 :         upcoming_alloc_est = min_scan_buffers + reusable_buffers_est;
    3942             :     }
    3943             : 
    3944             :     /*
    3945             :      * Now write out dirty reusable buffers, working forward from the
    3946             :      * next_to_clean point, until we have lapped the strategy scan, or cleaned
    3947             :      * enough buffers to match our estimate of the next cycle's allocation
    3948             :      * requirements, or hit the bgwriter_lru_maxpages limit.
    3949             :      */
    3950             : 
    3951       28902 :     num_to_scan = bufs_to_lap;
    3952       28902 :     num_written = 0;
    3953       28902 :     reusable_buffers = reusable_buffers_est;
    3954             : 
    3955             :     /* Execute the LRU scan */
    3956     3962042 :     while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est)
    3957             :     {
    3958     3933146 :         int         sync_state = SyncOneBuffer(next_to_clean, true,
    3959             :                                                wb_context);
    3960             : 
    3961     3933146 :         if (++next_to_clean >= NBuffers)
    3962             :         {
    3963        6906 :             next_to_clean = 0;
    3964        6906 :             next_passes++;
    3965             :         }
    3966     3933146 :         num_to_scan--;
    3967             : 
    3968     3933146 :         if (sync_state & BUF_WRITTEN)
    3969             :         {
    3970       58164 :             reusable_buffers++;
    3971       58164 :             if (++num_written >= bgwriter_lru_maxpages)
    3972             :             {
    3973           6 :                 PendingBgWriterStats.maxwritten_clean++;
    3974           6 :                 break;
    3975             :             }
    3976             :         }
    3977     3874982 :         else if (sync_state & BUF_REUSABLE)
    3978     2938564 :             reusable_buffers++;
    3979             :     }
    3980             : 
    3981       28902 :     PendingBgWriterStats.buf_written_clean += num_written;
    3982             : 
    3983             : #ifdef BGW_DEBUG
    3984             :     elog(DEBUG1, "bgwriter: recent_alloc=%u smoothed=%.2f delta=%ld ahead=%d density=%.2f reusable_est=%d upcoming_est=%d scanned=%d wrote=%d reusable=%d",
    3985             :          recent_alloc, smoothed_alloc, strategy_delta, bufs_ahead,
    3986             :          smoothed_density, reusable_buffers_est, upcoming_alloc_est,
    3987             :          bufs_to_lap - num_to_scan,
    3988             :          num_written,
    3989             :          reusable_buffers - reusable_buffers_est);
    3990             : #endif
    3991             : 
    3992             :     /*
    3993             :      * Consider the above scan as being like a new allocation scan.
    3994             :      * Characterize its density and update the smoothed one based on it. This
    3995             :      * effectively halves the moving average period in cases where both the
    3996             :      * strategy and the background writer are doing some useful scanning,
    3997             :      * which is helpful because a long memory isn't as desirable on the
    3998             :      * density estimates.
    3999             :      */
    4000       28902 :     new_strategy_delta = bufs_to_lap - num_to_scan;
    4001       28902 :     new_recent_alloc = reusable_buffers - reusable_buffers_est;
    4002       28902 :     if (new_strategy_delta > 0 && new_recent_alloc > 0)
    4003             :     {
    4004       23632 :         scans_per_alloc = (float) new_strategy_delta / (float) new_recent_alloc;
    4005       23632 :         smoothed_density += (scans_per_alloc - smoothed_density) /
    4006             :             smoothing_samples;
    4007             : 
    4008             : #ifdef BGW_DEBUG
    4009             :         elog(DEBUG2, "bgwriter: cleaner density alloc=%u scan=%ld density=%.2f new smoothed=%.2f",
    4010             :              new_recent_alloc, new_strategy_delta,
    4011             :              scans_per_alloc, smoothed_density);
    4012             : #endif
    4013             :     }
    4014             : 
    4015             :     /* Return true if OK to hibernate */
    4016       28902 :     return (bufs_to_lap == 0 && recent_alloc == 0);
    4017             : }
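
The interaction of these estimates is easiest to see with concrete numbers. The following standalone sketch reruns the same arithmetic with values invented purely for illustration (the pool size, delay, whole-pool scan target, smoothed allocation rate, density, and remaining lap distance are all assumptions, not values from any real server):

    #include <stdio.h>

    int
    main(void)
    {
        /* All of the following values are assumptions for this example only. */
        int     NBuffers = 16384;            /* e.g. 128MB of 8KB shared buffers */
        int     BgWriterDelay = 200;         /* bgwriter_delay in milliseconds */
        int     scan_whole_pool_ms = 120000; /* target time to lap the whole pool */
        float   smoothed_alloc = 50.0f;      /* recent allocations per cycle */
        double  lru_multiplier = 2.0;        /* bgwriter_lru_multiplier */
        float   smoothed_density = 4.0f;     /* buffers scanned per allocation */
        int     bufs_to_lap = 4000;          /* distance left to the strategy point */

        int     bufs_ahead = NBuffers - bufs_to_lap;
        int     reusable_est = (int) (bufs_ahead / smoothed_density);
        int     upcoming_est = (int) (smoothed_alloc * lru_multiplier);
        int     min_scan = NBuffers / (scan_whole_pool_ms / BgWriterDelay);

        if (upcoming_est < min_scan + reusable_est)
            upcoming_est = min_scan + reusable_est;

        /* Prints: min_scan=27 reusable_est=3096 upcoming_est=3123 */
        printf("min_scan=%d reusable_est=%d upcoming_est=%d\n",
               min_scan, reusable_est, upcoming_est);
        return 0;
    }

With these particular numbers the allocation-based estimate (100) is smaller than the minimum-progress floor plus the reusable buffers already scanned ahead, so the LRU scan only has to find about 27 more reusable buffers before it may stop (or it stops earlier on lapping the strategy point or hitting bgwriter_lru_maxpages).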
    4018             : 
    4019             : /*
    4020             :  * SyncOneBuffer -- process a single buffer during syncing.
    4021             :  *
    4022             :  * If skip_recently_used is true, we don't write currently-pinned buffers, nor
    4023             :  * buffers marked recently used, as these are not replacement candidates.
    4024             :  *
    4025             :  * Returns a bitmask containing the following flag bits:
    4026             :  *  BUF_WRITTEN: we wrote the buffer.
    4027             :  *  BUF_REUSABLE: buffer is available for replacement, i.e., it has
    4028             :  *      pin count 0 and usage count 0.
    4029             :  *
    4030             :  * (BUF_WRITTEN could be set in error if FlushBuffer finds the buffer clean
    4031             :  * after locking it, but we don't care all that much.)
    4032             :  */
    4033             : static int
    4034     4482262 : SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context)
    4035             : {
    4036     4482262 :     BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
    4037     4482262 :     int         result = 0;
    4038             :     uint64      buf_state;
    4039             :     BufferTag   tag;
    4040             : 
    4041             :     /* Make sure we can handle the pin */
    4042     4482262 :     ReservePrivateRefCountEntry();
    4043     4482262 :     ResourceOwnerEnlarge(CurrentResourceOwner);
    4044             : 
    4045             :     /*
    4046             :      * Check whether buffer needs writing.
    4047             :      *
    4048             :      * We can make this check without taking the buffer content lock so long
    4049             :      * as we mark pages dirty in access methods *before* logging changes with
    4050             :      * XLogInsert(): if someone marks the buffer dirty just after our check we
    4051             :      * don't worry: our checkpoint.redo points before the log record for the
    4052             :      * upcoming changes, so we are not required to write such a dirty buffer.
    4053             :      */
    4054     4482262 :     buf_state = LockBufHdr(bufHdr);
    4055             : 
    4056     4482262 :     if (BUF_STATE_GET_REFCOUNT(buf_state) == 0 &&
    4057     4472812 :         BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
    4058             :     {
    4059     3000704 :         result |= BUF_REUSABLE;
    4060             :     }
    4061     1481558 :     else if (skip_recently_used)
    4062             :     {
    4063             :         /* Caller told us not to write recently-used buffers */
    4064      936418 :         UnlockBufHdr(bufHdr);
    4065      936418 :         return result;
    4066             :     }
    4067             : 
    4068     3545844 :     if (!(buf_state & BM_VALID) || !(buf_state & BM_DIRTY))
    4069             :     {
    4070             :         /* It's clean, so nothing to do */
    4071     2938564 :         UnlockBufHdr(bufHdr);
    4072     2938564 :         return result;
    4073             :     }
    4074             : 
    4075             :     /*
    4076             :      * Pin it, share-lock it, write it.  (FlushBuffer will do nothing if the
    4077             :      * buffer is clean by the time we've locked it.)
    4078             :      */
    4079      607280 :     PinBuffer_Locked(bufHdr);
    4080             : 
    4081      607280 :     FlushUnlockedBuffer(bufHdr, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
    4082             : 
    4083      607280 :     tag = bufHdr->tag;
    4084             : 
    4085      607280 :     UnpinBuffer(bufHdr);
    4086             : 
    4087             :     /*
    4088             :      * SyncOneBuffer() is only called by checkpointer and bgwriter, so
    4089             :      * IOContext will always be IOCONTEXT_NORMAL.
    4090             :      */
    4091      607280 :     ScheduleBufferTagForWriteback(wb_context, IOCONTEXT_NORMAL, &tag);
    4092             : 
    4093      607280 :     return result | BUF_WRITTEN;
    4094             : }
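
The result bitmask admits only a few combinations. As a minimal sketch (a hypothetical helper, not part of bufmgr.c) of how a caller could interpret those bits, e.g. for debug logging:

    /*
     * Hypothetical helper (not part of bufmgr.c): classify a SyncOneBuffer()
     * result purely from its flag bits.
     */
    static const char *
    classify_sync_result(int sync_state)
    {
        if ((sync_state & BUF_WRITTEN) && (sync_state & BUF_REUSABLE))
            return "dirty replacement candidate; written";
        if (sync_state & BUF_WRITTEN)
            return "dirty and in use; written (skip_recently_used was false)";
        if (sync_state & BUF_REUSABLE)
            return "clean replacement candidate; nothing to write";
        return "in use: skipped as recently used, or clean";
    }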
    4095             : 
    4096             : /*
    4097             :  *      AtEOXact_Buffers - clean up at end of transaction.
    4098             :  *
    4099             :  *      As of PostgreSQL 8.0, buffer pins should get released by the
    4100             :  *      ResourceOwner mechanism.  This routine is just a debugging
    4101             :  *      cross-check that no pins remain.
    4102             :  */
    4103             : void
    4104     1025086 : AtEOXact_Buffers(bool isCommit)
    4105             : {
    4106     1025086 :     CheckForBufferLeaks();
    4107             : 
    4108     1025086 :     AtEOXact_LocalBuffers(isCommit);
    4109             : 
    4110             :     Assert(PrivateRefCountOverflowed == 0);
    4111     1025086 : }
    4112             : 
    4113             : /*
    4114             :  * Initialize access to shared buffer pool
    4115             :  *
    4116             :  * This is called during backend startup (whether standalone or under the
    4117             :  * postmaster).  It sets up for this backend's access to the already-existing
    4118             :  * buffer pool.
    4119             :  */
    4120             : void
    4121       45352 : InitBufferManagerAccess(void)
    4122             : {
    4123             :     HASHCTL     hash_ctl;
    4124             : 
    4125             :     /*
    4126             :      * An advisory limit on the number of pins each backend should hold, based
    4127             :      * on shared_buffers and the maximum number of connections possible.
    4128             :      * That's very pessimistic, but outside toy-sized shared_buffers it should
    4129             :      * allow plenty of pins.  LimitAdditionalPins() and
    4130             :      * GetAdditionalPinLimit() can be used to check the remaining balance.
    4131             :      */
    4132       45352 :     MaxProportionalPins = NBuffers / (MaxBackends + NUM_AUXILIARY_PROCS);
    4133             : 
    4134       45352 :     memset(&PrivateRefCountArray, 0, sizeof(PrivateRefCountArray));
    4135       45352 :     memset(&PrivateRefCountArrayKeys, 0, sizeof(PrivateRefCountArrayKeys));
    4136             : 
    4137       45352 :     hash_ctl.keysize = sizeof(Buffer);
    4138       45352 :     hash_ctl.entrysize = sizeof(PrivateRefCountEntry);
    4139             : 
    4140       45352 :     PrivateRefCountHash = hash_create("PrivateRefCount", 100, &hash_ctl,
    4141             :                                       HASH_ELEM | HASH_BLOBS);
    4142             : 
    4143             :     /*
    4144             :      * AtProcExit_Buffers needs LWLock access, and thereby has to be called at
    4145             :      * the corresponding phase of backend shutdown.
    4146             :      */
    4147             :     Assert(MyProc != NULL);
    4148       45352 :     on_shmem_exit(AtProcExit_Buffers, 0);
    4149       45352 : }
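
As a worked example of the advisory limit (all numbers assumed for illustration): with NBuffers = 16384 (128MB of 8KB shared buffers) and roughly 110 slots in the divisor (say 100 backends plus auxiliary processes), MaxProportionalPins comes out to 16384 / 110 ≈ 148 pins per backend; with a toy-sized pool of NBuffers = 128 and the same divisor it drops to just 1, which is why the limit is advisory rather than hard.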
    4150             : 
    4151             : /*
    4152             :  * During backend exit, ensure that we released all shared-buffer locks and
    4153             :  * assert that we have no remaining pins.
    4154             :  */
    4155             : static void
    4156       45352 : AtProcExit_Buffers(int code, Datum arg)
    4157             : {
    4158       45352 :     UnlockBuffers();
    4159             : 
    4160       45352 :     CheckForBufferLeaks();
    4161             : 
    4162             :     /* localbuf.c needs a chance too */
    4163       45352 :     AtProcExit_LocalBuffers();
    4164       45352 : }
    4165             : 
    4166             : /*
    4167             :  *      CheckForBufferLeaks - ensure this backend holds no buffer pins
    4168             :  *
    4169             :  *      As of PostgreSQL 8.0, buffer pins should get released by the
    4170             :  *      ResourceOwner mechanism.  This routine is just a debugging
    4171             :  *      cross-check that no pins remain.
    4172             :  */
    4173             : static void
    4174     1070438 : CheckForBufferLeaks(void)
    4175             : {
    4176             : #ifdef USE_ASSERT_CHECKING
    4177             :     int         RefCountErrors = 0;
    4178             :     PrivateRefCountEntry *res;
    4179             :     int         i;
    4180             :     char       *s;
    4181             : 
    4182             :     /* check the array */
    4183             :     for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
    4184             :     {
    4185             :         if (PrivateRefCountArrayKeys[i] != InvalidBuffer)
    4186             :         {
    4187             :             res = &PrivateRefCountArray[i];
    4188             : 
    4189             :             s = DebugPrintBufferRefcount(res->buffer);
    4190             :             elog(WARNING, "buffer refcount leak: %s", s);
    4191             :             pfree(s);
    4192             : 
    4193             :             RefCountErrors++;
    4194             :         }
    4195             :     }
    4196             : 
    4197             :     /* if necessary search the hash */
    4198             :     if (PrivateRefCountOverflowed)
    4199             :     {
    4200             :         HASH_SEQ_STATUS hstat;
    4201             : 
    4202             :         hash_seq_init(&hstat, PrivateRefCountHash);
    4203             :         while ((res = (PrivateRefCountEntry *) hash_seq_search(&hstat)) != NULL)
    4204             :         {
    4205             :             s = DebugPrintBufferRefcount(res->buffer);
    4206             :             elog(WARNING, "buffer refcount leak: %s", s);
    4207             :             pfree(s);
    4208             :             RefCountErrors++;
    4209             :         }
    4210             :     }
    4211             : 
    4212             :     Assert(RefCountErrors == 0);
    4213             : #endif
    4214     1070438 : }
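
CheckForBufferLeaks() walks the same two-tier private-refcount bookkeeping that InitBufferManagerAccess() sets up: a small fixed array probed first, with a dynahash table holding any overflow. A simplified sketch of a lookup over that structure (not the actual bufmgr.c helper, which differs in detail) might look like:

    /*
     * Simplified sketch of a private-refcount lookup: probe the fixed-size
     * array first and consult the hash table only if entries have overflowed
     * into it.  Not the real bufmgr.c helper.
     */
    static PrivateRefCountEntry *
    sketch_get_refcount_entry(Buffer buffer)
    {
        /* fast path: the small array that covers the common case */
        for (int i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
        {
            if (PrivateRefCountArrayKeys[i] == buffer)
                return &PrivateRefCountArray[i];
        }

        /* slow path: only needed once entries have spilled into the hash */
        if (PrivateRefCountOverflowed > 0)
            return (PrivateRefCountEntry *)
                hash_search(PrivateRefCountHash, &buffer, HASH_FIND, NULL);

        return NULL;
    }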
    4215             : 
    4216             : #ifdef USE_ASSERT_CHECKING
    4217             : /*
    4218             :  * Check for exclusive-locked catalog buffers.  This is the core of
    4219             :  * AssertCouldGetRelation().
    4220             :  *
    4221             :  * A backend would self-deadlock on the content lock if the catalog scan read
    4222             :  * the exclusive-locked buffer.  The main threat is exclusive-locked buffers
    4223             :  * of catalogs used in relcache, because a catcache search on any catalog may
    4224             :  * build that catalog's relcache entry.  We don't have an inventory of
    4225             :  * catalogs relcache uses, so just check buffers of most catalogs.
    4226             :  *
    4227             :  * It's better to minimize waits while holding an exclusive buffer lock, so it
    4228             :  * would be nice to broaden this check not to be catalog-specific.  However,
    4229             :  * bttextcmp() accesses pg_collation, and non-core opclasses might similarly
    4230             :  * read tables.  That is deadlock-free as long as there's no loop in the
    4231             :  * dependency graph: modifying table A may cause an opclass to read table B,
    4232             :  * but it must not cause a read of table A.
    4233             :  */
    4234             : void
    4235             : AssertBufferLocksPermitCatalogRead(void)
    4236             : {
    4237             :     PrivateRefCountEntry *res;
    4238             : 
    4239             :     /* check the array */
    4240             :     for (int i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
    4241             :     {
    4242             :         if (PrivateRefCountArrayKeys[i] != InvalidBuffer)
    4243             :         {
    4244             :             res = &PrivateRefCountArray[i];
    4245             : 
    4246             :             if (res->buffer == InvalidBuffer)
    4247             :                 continue;
    4248             : 
    4249             :             AssertNotCatalogBufferLock(res->buffer, res->data.lockmode);
    4250             :         }
    4251             :     }
    4252             : 
    4253             :     /* if necessary search the hash */
    4254             :     if (PrivateRefCountOverflowed)
    4255             :     {
    4256             :         HASH_SEQ_STATUS hstat;
    4257             : 
    4258             :         hash_seq_init(&hstat, PrivateRefCountHash);
    4259             :         while ((res = (PrivateRefCountEntry *) hash_seq_search(&hstat)) != NULL)
    4260             :         {
    4261             :             AssertNotCatalogBufferLock(res->buffer, res->data.lockmode);
    4262             :         }
    4263             :     }
    4264             : }
    4265             : 
    4266             : static void
    4267             : AssertNotCatalogBufferLock(Buffer buffer, BufferLockMode mode)
    4268             : {
    4269             :     BufferDesc *bufHdr = GetBufferDescriptor(buffer - 1);
    4270             :     BufferTag   tag;
    4271             :     Oid         relid;
    4272             : 
    4273             :     if (mode != BUFFER_LOCK_EXCLUSIVE)
    4274             :         return;
    4275             : 
    4276             :     tag = bufHdr->tag;
    4277             : 
    4278             :     /*
    4279             :      * This relNumber==relid assumption holds until a catalog experiences
    4280             :      * VACUUM FULL or similar.  After a command like that, relNumber will be
    4281             :      * in the normal (non-catalog) range, and we lose the ability to detect
    4282             :      * hazardous access to that catalog.  Calling RelidByRelfilenumber() would
    4283             :      * close that gap, but RelidByRelfilenumber() might then deadlock with a
    4284             :      * held lock.
    4285             :      */
    4286             :     relid = tag.relNumber;
    4287             : 
    4288             :     if (IsCatalogTextUniqueIndexOid(relid)) /* see comments at the callee */
    4289             :         return;
    4290             : 
    4291             :     Assert(!IsCatalogRelationOid(relid));
    4292             : }
    4293             : #endif
    4294             : 
    4295             : 
    4296             : /*
    4297             :  * Helper routine to describe a buffer, for warnings about unexpected pins
    4298             :  */
    4299             : char *
    4300          80 : DebugPrintBufferRefcount(Buffer buffer)
    4301             : {
    4302             :     BufferDesc *buf;
    4303             :     int32       loccount;
    4304             :     char       *result;
    4305             :     ProcNumber  backend;
    4306             :     uint64      buf_state;
    4307             : 
    4308             :     Assert(BufferIsValid(buffer));
    4309          80 :     if (BufferIsLocal(buffer))
    4310             :     {
    4311          32 :         buf = GetLocalBufferDescriptor(-buffer - 1);
    4312          32 :         loccount = LocalRefCount[-buffer - 1];
    4313          32 :         backend = MyProcNumber;
    4314             :     }
    4315             :     else
    4316             :     {
    4317          48 :         buf = GetBufferDescriptor(buffer - 1);
    4318          48 :         loccount = GetPrivateRefCount(buffer);
    4319          48 :         backend = INVALID_PROC_NUMBER;
    4320             :     }
    4321             : 
    4322             :     /* theoretically we should lock the bufHdr here */
    4323          80 :     buf_state = pg_atomic_read_u64(&buf->state);
    4324             : 
    4325          80 :     result = psprintf("[%03d] (rel=%s, blockNum=%u, flags=0x%" PRIx64 ", refcount=%u %d)",
    4326             :                       buffer,
    4327          80 :                       relpathbackend(BufTagGetRelFileLocator(&buf->tag), backend,
    4328             :                                      BufTagGetForkNum(&buf->tag)).str,
    4329             :                       buf->tag.blockNum, buf_state & BUF_FLAG_MASK,
    4330             :                       BUF_STATE_GET_REFCOUNT(buf_state), loccount);
    4331          80 :     return result;
    4332             : }
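
Given the format string above, a leak warning built from this helper would look roughly like the following (the buffer number, relation path, flags value, and counts here are made up for illustration):

    WARNING:  buffer refcount leak: [042] (rel=base/16384/2619, blockNum=7, flags=0x106, refcount=1 1)

where the last two numbers are the shared refcount from the buffer header followed by this backend's own pin count.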
    4333             : 
    4334             : /*
    4335             :  * CheckPointBuffers
    4336             :  *
    4337             :  * Flush all dirty blocks in buffer pool to disk at checkpoint time.
    4338             :  *
    4339             :  * Note: temporary relations do not participate in checkpoints, so they don't
    4340             :  * need to be flushed.
    4341             :  */
    4342             : void
    4343        3574 : CheckPointBuffers(int flags)
    4344             : {
    4345        3574 :     BufferSync(flags);
    4346        3574 : }
    4347             : 
    4348             : /*
    4349             :  * BufferGetBlockNumber
    4350             :  *      Returns the block number associated with a buffer.
    4351             :  *
    4352             :  * Note:
    4353             :  *      Assumes that the buffer is valid and pinned, else the
    4354             :  *      value may be obsolete immediately...
    4355             :  */
    4356             : BlockNumber
    4357   101225014 : BufferGetBlockNumber(Buffer buffer)
    4358             : {
    4359             :     BufferDesc *bufHdr;
    4360             : 
    4361             :     Assert(BufferIsPinned(buffer));
    4362             : 
    4363   101225014 :     if (BufferIsLocal(buffer))
    4364     3808214 :         bufHdr = GetLocalBufferDescriptor(-buffer - 1);
    4365             :     else
    4366    97416800 :         bufHdr = GetBufferDescriptor(buffer - 1);
    4367             : 
    4368             :     /* pinned, so OK to read tag without spinlock */
    4369   101225014 :     return bufHdr->tag.blockNum;
    4370             : }
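
The branch above reflects the Buffer numbering convention used throughout this file: positive values denote shared buffers, so Buffer 1 maps to shared descriptor slot 0 via GetBufferDescriptor(buffer - 1), while negative values denote backend-local buffers, so Buffer -1 maps to local descriptor slot 0 via GetLocalBufferDescriptor(-buffer - 1).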
    4371             : 
    4372             : /*
    4373             :  * BufferGetTag
    4374             :  *      Returns the relfilelocator, fork number and block number associated with
    4375             :  *      a buffer.
    4376             :  */
    4377             : void
    4378    31778690 : BufferGetTag(Buffer buffer, RelFileLocator *rlocator, ForkNumber *forknum,
    4379             :              BlockNumber *blknum)
    4380             : {
    4381             :     BufferDesc *bufHdr;
    4382             : 
    4383             :     /* Do the same checks as BufferGetBlockNumber. */
    4384             :     Assert(BufferIsPinned(buffer));
    4385             : 
    4386    31778690 :     if (BufferIsLocal(buffer))
    4387           0 :         bufHdr = GetLocalBufferDescriptor(-buffer - 1);
    4388             :     else
    4389    31778690 :         bufHdr = GetBufferDescriptor(buffer - 1);
    4390             : 
    4391             :     /* pinned, so OK to read tag without spinlock */
    4392    31778690 :     *rlocator = BufTagGetRelFileLocator(&bufHdr->tag);
    4393    31778690 :     *forknum = BufTagGetForkNum(&bufHdr->tag);
    4394    31778690 :     *blknum = bufHdr->tag.blockNum;
    4395    31778690 : }
    4396             : 
    4397             : /*
    4398             :  * FlushBuffer
    4399             :  *      Physically write out a shared buffer.
    4400             :  *
    4401             :  * NOTE: this actually just passes the buffer contents to the kernel; the
    4402             :  * real write to disk won't happen until the kernel feels like it.  This
    4403             :  * is okay from our point of view since we can redo the changes from WAL.
    4404             :  * However, we will need to force the changes to disk via fsync before
    4405             :  * we can checkpoint WAL.
    4406             :  *
    4407             :  * The caller must hold a pin on the buffer and have share-locked the
    4408             :  * buffer contents.  (Note: a share-lock does not prevent updates of
    4409             :  * hint bits in the buffer, so the page could change while the write
    4410             :  * is in progress, but we assume that that will not invalidate the data
    4411             :  * written.)
    4412             :  *
    4413             :  * If the caller has an smgr reference for the buffer's relation, pass it
    4414             :  * as the second parameter.  If not, pass NULL.
    4415             :  */
    4416             : static void
    4417     1151278 : FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
    4418             :             IOContext io_context)
    4419             : {
    4420             :     XLogRecPtr  recptr;
    4421             :     ErrorContextCallback errcallback;
    4422             :     instr_time  io_start;
    4423             :     Block       bufBlock;
    4424             :     char       *bufToWrite;
    4425             :     uint64      buf_state;
    4426             : 
    4427             :     /*
    4428             :      * Try to start an I/O operation.  If StartBufferIO returns false, then
    4429             :      * someone else flushed the buffer before we could, so we need not do
    4430             :      * anything.
    4431             :      */
    4432     1151278 :     if (!StartBufferIO(buf, false, false))
    4433          24 :         return;
    4434             : 
    4435             :     /* Setup error traceback support for ereport() */
    4436     1151254 :     errcallback.callback = shared_buffer_write_error_callback;
    4437     1151254 :     errcallback.arg = buf;
    4438     1151254 :     errcallback.previous = error_context_stack;
    4439     1151254 :     error_context_stack = &errcallback;
    4440             : 
    4441             :     /* Find smgr relation for buffer */
    4442     1151254 :     if (reln == NULL)
    4443     1146440 :         reln = smgropen(BufTagGetRelFileLocator(&buf->tag), INVALID_PROC_NUMBER);
    4444             : 
    4445             :     TRACE_POSTGRESQL_BUFFER_FLUSH_START(BufTagGetForkNum(&buf->tag),
    4446             :                                         buf->tag.blockNum,
    4447             :                                         reln->smgr_rlocator.locator.spcOid,
    4448             :                                         reln->smgr_rlocator.locator.dbOid,
    4449             :                                         reln->smgr_rlocator.locator.relNumber);
    4450             : 
    4451     1151254 :     buf_state = LockBufHdr(buf);
    4452             : 
    4453             :     /*
    4454             :      * Run PageGetLSN while holding header lock, since we don't have the
    4455             :      * buffer locked exclusively in all cases.
    4456             :      */
    4457     1151254 :     recptr = BufferGetLSN(buf);
    4458             : 
    4459             :     /* To check if block content changes while flushing. - vadim 01/17/97 */
    4460     1151254 :     UnlockBufHdrExt(buf, buf_state,
    4461             :                     0, BM_JUST_DIRTIED,
    4462             :                     0);
    4463             : 
    4464             :     /*
    4465             :      * Force XLOG flush up to buffer's LSN.  This implements the basic WAL
    4466             :      * rule that log updates must hit disk before any of the data-file changes
    4467             :      * they describe do.
    4468             :      *
    4469             :      * However, this rule does not apply to unlogged relations, which will be
    4470             :      * lost after a crash anyway.  Most unlogged relation pages do not bear
    4471             :      * LSNs since we never emit WAL records for them, and therefore flushing
    4472             :      * up through the buffer LSN would be useless, but harmless.  However,
    4473             :      * GiST indexes use LSNs internally to track page-splits, and therefore
    4474             :      * unlogged GiST pages bear "fake" LSNs generated by
    4475             :      * GetFakeLSNForUnloggedRel.  It is unlikely but possible that the fake
    4476             :      * LSN counter could advance past the WAL insertion point; and if it did
    4477             :      * happen, attempting to flush WAL through that location would fail, with
    4478             :      * disastrous system-wide consequences.  To make sure that can't happen,
    4479             :      * skip the flush if the buffer isn't permanent.
    4480             :      */
    4481     1151254 :     if (buf_state & BM_PERMANENT)
    4482     1147634 :         XLogFlush(recptr);
    4483             : 
    4484             :     /*
    4485             :      * Now it's safe to write the buffer to disk. Note that no one else should
    4486             :      * have been able to write it, while we were busy with log flushing,
    4487             :      * because we got the exclusive right to perform I/O by setting the
    4488             :      * BM_IO_IN_PROGRESS bit.
    4489             :      */
    4490     1151254 :     bufBlock = BufHdrGetBlock(buf);
    4491             : 
    4492             :     /*
    4493             :      * Update page checksum if desired.  Since we have only shared lock on the
    4494             :      * buffer, other processes might be updating hint bits in it, so we must
    4495             :      * copy the page to private storage if we do checksumming.
    4496             :      */
    4497     1151254 :     bufToWrite = PageSetChecksumCopy((Page) bufBlock, buf->tag.blockNum);
    4498             : 
    4499     1151254 :     io_start = pgstat_prepare_io_time(track_io_timing);
    4500             : 
    4501             :     /*
    4502             :      * bufToWrite is either the shared buffer or a copy, as appropriate.
    4503             :      */
    4504     1151254 :     smgrwrite(reln,
    4505     1151254 :               BufTagGetForkNum(&buf->tag),
    4506             :               buf->tag.blockNum,
    4507             :               bufToWrite,
    4508             :               false);
    4509             : 
    4510             :     /*
    4511             :      * When a strategy is in use, only flushes of dirty buffers already in the
    4512             :      * strategy ring are counted as strategy writes (IOCONTEXT
    4513             :      * [BULKREAD|BULKWRITE|VACUUM] IOOP_WRITE) for the purpose of IO
    4514             :      * statistics tracking.
    4515             :      *
    4516             :      * If a shared buffer initially added to the ring must be flushed before
    4517             :      * being used, this is counted as an IOCONTEXT_NORMAL IOOP_WRITE.
    4518             :      *
    4519             :      * If a shared buffer which was added to the ring later because the
    4520             :      * current strategy buffer is pinned or in use or because all strategy
    4521             :      * buffers were dirty and rejected (for BAS_BULKREAD operations only)
    4522             :      * requires flushing, this is counted as an IOCONTEXT_NORMAL IOOP_WRITE
    4523             :      * (from_ring will be false).
    4524             :      *
    4525             :      * When a strategy is not in use, the write can only be a "regular" write
    4526             :      * of a dirty shared buffer (IOCONTEXT_NORMAL IOOP_WRITE).
    4527             :      */
    4528     1151254 :     pgstat_count_io_op_time(IOOBJECT_RELATION, io_context,
    4529             :                             IOOP_WRITE, io_start, 1, BLCKSZ);
    4530             : 
    4531     1151254 :     pgBufferUsage.shared_blks_written++;
    4532             : 
    4533             :     /*
    4534             :      * Mark the buffer as clean (unless BM_JUST_DIRTIED has become set) and
    4535             :      * end the BM_IO_IN_PROGRESS state.
    4536             :      */
    4537     1151254 :     TerminateBufferIO(buf, true, 0, true, false);
    4538             : 
    4539             :     TRACE_POSTGRESQL_BUFFER_FLUSH_DONE(BufTagGetForkNum(&buf->tag),
    4540             :                                        buf->tag.blockNum,
    4541             :                                        reln->smgr_rlocator.locator.spcOid,
    4542             :                                        reln->smgr_rlocator.locator.dbOid,
    4543             :                                        reln->smgr_rlocator.locator.relNumber);
    4544             : 
    4545             :     /* Pop the error context stack */
    4546     1151254 :     error_context_stack = errcallback.previous;
    4547             : }
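
The ordering enforced here is the WAL-before-data rule in miniature: suppose (purely for illustration) the page LSN read under the header lock is 0/1A2B3C40; XLogFlush() then guarantees WAL is durable at least up to that point before smgrwrite() hands the page to the kernel, and only the later checkpoint-time fsync makes the data-file write itself durable, as the NOTE at the top of this function explains.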
    4548             : 
    4549             : /*
    4550             :  * Convenience wrapper around FlushBuffer() that locks/unlocks the buffer
    4551             :  * before/after calling FlushBuffer().
    4552             :  */
    4553             : static void
    4554      614040 : FlushUnlockedBuffer(BufferDesc *buf, SMgrRelation reln,
    4555             :                     IOObject io_object, IOContext io_context)
    4556             : {
    4557      614040 :     Buffer      buffer = BufferDescriptorGetBuffer(buf);
    4558             : 
    4559      614040 :     BufferLockAcquire(buffer, buf, BUFFER_LOCK_SHARE);
    4560      614040 :     FlushBuffer(buf, reln, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
    4561      614040 :     BufferLockUnlock(buffer, buf);
    4562      614040 : }
    4563             : 
    4564             : /*
    4565             :  * RelationGetNumberOfBlocksInFork
    4566             :  *      Determines the current number of pages in the specified relation fork.
    4567             :  *
    4568             :  * Note that the accuracy of the result will depend on the details of the
    4569             :  * relation's storage. For builtin AMs it'll be accurate, but for external AMs
    4570             :  * it might not be.
    4571             :  */
    4572             : BlockNumber
    4573     3802054 : RelationGetNumberOfBlocksInFork(Relation relation, ForkNumber forkNum)
    4574             : {
    4575     3802054 :     if (RELKIND_HAS_TABLE_AM(relation->rd_rel->relkind))
    4576             :     {
    4577             :         /*
    4578             :          * Not every table AM uses BLCKSZ wide fixed size blocks. Therefore
    4579             :          * tableam returns the size in bytes - but for the purpose of this
    4580             :          * routine, we want the number of blocks. Therefore divide, rounding
    4581             :          * up.
    4582             :          */
    4583             :         uint64      szbytes;
    4584             : 
    4585     2872744 :         szbytes = table_relation_size(relation, forkNum);
    4586             : 
    4587     2872706 :         return (szbytes + (BLCKSZ - 1)) / BLCKSZ;
    4588             :     }
    4589      929310 :     else if (RELKIND_HAS_STORAGE(relation->rd_rel->relkind))
    4590             :     {
    4591      929310 :         return smgrnblocks(RelationGetSmgr(relation), forkNum);
    4592             :     }
    4593             :     else
    4594             :         Assert(false);
    4595             : 
    4596           0 :     return 0;                   /* keep compiler quiet */
    4597             : }
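
The round-up division in the table-AM branch, (szbytes + (BLCKSZ - 1)) / BLCKSZ, counts a partially filled block as a whole one: with the default BLCKSZ of 8192, a fork of 8192 bytes reports 1 block, 8193 bytes reports 2, and 0 bytes reports 0.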
    4598             : 
    4599             : /*
    4600             :  * BufferIsPermanent
    4601             :  *      Determines whether a buffer will potentially still be around after
    4602             :  *      a crash.  Caller must hold a buffer pin.
    4603             :  */
    4604             : bool
    4605    19168026 : BufferIsPermanent(Buffer buffer)
    4606             : {
    4607             :     BufferDesc *bufHdr;
    4608             : 
    4609             :     /* Local buffers are used only for temp relations. */
    4610    19168026 :     if (BufferIsLocal(buffer))
    4611     1254082 :         return false;
    4612             : 
    4613             :     /* Make sure we've got a real buffer, and that we hold a pin on it. */
    4614             :     Assert(BufferIsValid(buffer));
    4615             :     Assert(BufferIsPinned(buffer));
    4616             : 
    4617             :     /*
    4618             :      * BM_PERMANENT can't be changed while we hold a pin on the buffer, so we
    4619             :      * need not bother with the buffer header spinlock.  Even if someone else
    4620             :      * changes the buffer header state while we're doing this, the state is
    4621             :      * changed atomically, so we'll read the old value or the new value, but
    4622             :      * not random garbage.
    4623             :      */
    4624    17913944 :     bufHdr = GetBufferDescriptor(buffer - 1);
    4625    17913944 :     return (pg_atomic_read_u64(&bufHdr->state) & BM_PERMANENT) != 0;
    4626             : }
    4627             : 
    4628             : /*
    4629             :  * BufferGetLSNAtomic
    4630             :  *      Retrieves the LSN of the buffer atomically using a buffer header lock.
    4631             :  *      This is necessary for some callers who may not have an exclusive lock
    4632             :  *      on the buffer.
    4633             :  */
    4634             : XLogRecPtr
    4635    14268304 : BufferGetLSNAtomic(Buffer buffer)
    4636             : {
    4637    14268304 :     char       *page = BufferGetPage(buffer);
    4638             :     BufferDesc *bufHdr;
    4639             :     XLogRecPtr  lsn;
    4640             : 
    4641             :     /*
    4642             :      * If we don't need locking for correctness, fastpath out.
    4643             :      */
    4644    14268304 :     if (!XLogHintBitIsNeeded() || BufferIsLocal(buffer))
    4645      478398 :         return PageGetLSN(page);
    4646             : 
    4647             :     /* Make sure we've got a real buffer, and that we hold a pin on it. */
    4648             :     Assert(BufferIsValid(buffer));
    4649             :     Assert(BufferIsPinned(buffer));
    4650             : 
    4651    13789906 :     bufHdr = GetBufferDescriptor(buffer - 1);
    4652    13789906 :     LockBufHdr(bufHdr);
    4653    13789906 :     lsn = PageGetLSN(page);
    4654    13789906 :     UnlockBufHdr(bufHdr);
    4655             : 
    4656    13789906 :     return lsn;
    4657             : }
    4658             : 
    4659             : /* ---------------------------------------------------------------------
    4660             :  *      DropRelationBuffers
    4661             :  *
    4662             :  *      This function removes from the buffer pool all the pages of the
    4663             :  *      specified relation forks that have block numbers >= firstDelBlock.
    4664             :  *      (In particular, with firstDelBlock = 0, all pages are removed.)
    4665             :  *      Dirty pages are simply dropped, without bothering to write them
    4666             :  *      out first.  Therefore, this is NOT rollback-able, and so should be
    4667             :  *      used only with extreme caution!
    4668             :  *
    4669             :  *      Currently, this is called only from smgr.c when the underlying file
    4670             :  *      is about to be deleted or truncated (firstDelBlock is needed for
    4671             :  *      the truncation case).  The data in the affected pages would therefore
    4672             :  *      be deleted momentarily anyway, and there is no point in writing it.
    4673             :  *      It is the responsibility of higher-level code to ensure that the
    4674             :  *      deletion or truncation does not lose any data that could be needed
    4675             :  *      later.  It is also the responsibility of higher-level code to ensure
    4676             :  *      that no other process could be trying to load more pages of the
    4677             :  *      relation into buffers.
    4678             :  * --------------------------------------------------------------------
    4679             :  */
    4680             : void
    4681        1290 : DropRelationBuffers(SMgrRelation smgr_reln, ForkNumber *forkNum,
    4682             :                     int nforks, BlockNumber *firstDelBlock)
    4683             : {
    4684             :     int         i;
    4685             :     int         j;
    4686             :     RelFileLocatorBackend rlocator;
    4687             :     BlockNumber nForkBlock[MAX_FORKNUM];
    4688        1290 :     uint64      nBlocksToInvalidate = 0;
    4689             : 
    4690        1290 :     rlocator = smgr_reln->smgr_rlocator;
    4691             : 
    4692             :     /* If it's a local relation, it's localbuf.c's problem. */
    4693        1290 :     if (RelFileLocatorBackendIsTemp(rlocator))
    4694             :     {
    4695         750 :         if (rlocator.backend == MyProcNumber)
    4696         750 :             DropRelationLocalBuffers(rlocator.locator, forkNum, nforks,
    4697             :                                      firstDelBlock);
    4698             : 
    4699         830 :         return;
    4700             :     }
    4701             : 
    4702             :     /*
    4703             :      * To remove all the pages of the specified relation forks from the buffer
    4704             :      * pool, we need to scan the entire buffer pool but we can optimize it by
    4705             :      * finding the buffers from BufMapping table provided we know the exact
    4706             :      * size of each fork of the relation. The exact size is required to ensure
    4707             :      * that we don't leave any buffer for the relation being dropped as
    4708             :      * otherwise the background writer or checkpointer can lead to a PANIC
    4709             :      * error while flushing buffers corresponding to files that don't exist.
    4710             :      *
    4711             :      * To know the exact size, we rely on the size cached for each fork
    4712             :      * during recovery, which limits the optimization to recovery and to
    4713             :      * standbys; we could easily extend it once we have a shared cache for
    4714             :      * relation sizes.
    4715             :      *
    4716             :      * In recovery, we cache the value returned by the first lseek(SEEK_END)
    4717             :      * and future writes keep the cached value up-to-date. See
    4718             :      * smgrextend. It is possible that the value of the first lseek is smaller
    4719             :      * than the actual number of existing blocks in the file due to buggy
    4720             :      * Linux kernels that might not have accounted for the recent write. But
    4721             :      * that should be fine because there must not be any buffers after that
    4722             :      * file size.
    4723             :      */
    4724         746 :     for (i = 0; i < nforks; i++)
    4725             :     {
    4726             :         /* Get the number of blocks for a relation's fork */
    4727         636 :         nForkBlock[i] = smgrnblocks_cached(smgr_reln, forkNum[i]);
    4728             : 
    4729         636 :         if (nForkBlock[i] == InvalidBlockNumber)
    4730             :         {
    4731         430 :             nBlocksToInvalidate = InvalidBlockNumber;
    4732         430 :             break;
    4733             :         }
    4734             : 
    4735             :         /* calculate the number of blocks to be invalidated */
    4736         206 :         nBlocksToInvalidate += (nForkBlock[i] - firstDelBlock[i]);
    4737             :     }
    4738             : 
    4739             :     /*
    4740             :      * We apply the optimization iff the total number of blocks to invalidate
    4741             :      * is below the BUF_DROP_FULL_SCAN_THRESHOLD.
    4742             :      */
    4743         540 :     if (BlockNumberIsValid(nBlocksToInvalidate) &&
    4744         110 :         nBlocksToInvalidate < BUF_DROP_FULL_SCAN_THRESHOLD)
    4745             :     {
    4746         218 :         for (j = 0; j < nforks; j++)
    4747         138 :             FindAndDropRelationBuffers(rlocator.locator, forkNum[j],
    4748         138 :                                        nForkBlock[j], firstDelBlock[j]);
    4749          80 :         return;
    4750             :     }
    4751             : 
    4752     5846476 :     for (i = 0; i < NBuffers; i++)
    4753             :     {
    4754     5846016 :         BufferDesc *bufHdr = GetBufferDescriptor(i);
    4755             : 
    4756             :         /*
    4757             :          * We can make this a tad faster by prechecking the buffer tag before
    4758             :          * we attempt to lock the buffer; this saves a lot of lock
    4759             :          * acquisitions in typical cases.  It should be safe because the
    4760             :          * caller must have AccessExclusiveLock on the relation, or some other
    4761             :          * reason to be certain that no one is loading new pages of the rel
    4762             :          * into the buffer pool.  (Otherwise we might well miss such pages
    4763             :          * entirely.)  Therefore, while the tag might be changing while we
    4764             :          * look at it, it can't be changing *to* a value we care about, only
    4765             :          * *away* from such a value.  So false negatives are impossible, and
    4766             :          * false positives are safe because we'll recheck after getting the
    4767             :          * buffer lock.
    4768             :          *
    4769             :          * We could check forkNum and blockNum as well as the rlocator, but
    4770             :          * the incremental win from doing so seems small.
    4771             :          */
    4772     5846016 :         if (!BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator.locator))
    4773     5829570 :             continue;
    4774             : 
    4775       16446 :         LockBufHdr(bufHdr);
    4776             : 
    4777       41204 :         for (j = 0; j < nforks; j++)
    4778             :         {
    4779       29090 :             if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator.locator) &&
    4780       29090 :                 BufTagGetForkNum(&bufHdr->tag) == forkNum[j] &&
    4781       16238 :                 bufHdr->tag.blockNum >= firstDelBlock[j])
    4782             :             {
    4783        4332 :                 InvalidateBuffer(bufHdr);   /* releases spinlock */
    4784        4332 :                 break;
    4785             :             }
    4786             :         }
    4787       16446 :         if (j >= nforks)
    4788       12114 :             UnlockBufHdr(bufHdr);
    4789             :     }
    4790             : }
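
As a rough illustration of the threshold test above (assuming, for this example, that BUF_DROP_FULL_SCAN_THRESHOLD works out to a few hundred buffers with NBuffers = 16384): truncating a fork whose cached size shows only 200 blocks past firstDelBlock can be handled with 200 targeted lookups in the buffer mapping table via FindAndDropRelationBuffers(), whereas a fork whose size is not cached (nForkBlock = InvalidBlockNumber) or that is too large falls through to the loop that inspects every one of the NBuffers headers.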
    4791             : 
    4792             : /* ---------------------------------------------------------------------
    4793             :  *      DropRelationsAllBuffers
    4794             :  *
    4795             :  *      This function removes from the buffer pool all the pages of all
    4796             :  *      forks of the specified relations.  It's equivalent to calling
    4797             :  *      DropRelationBuffers once per fork per relation with firstDelBlock = 0.
    4798             :  *      --------------------------------------------------------------------
    4799             :  */
    4800             : void
    4801       28642 : DropRelationsAllBuffers(SMgrRelation *smgr_reln, int nlocators)
    4802             : {
    4803             :     int         i;
    4804       28642 :     int         n = 0;
    4805             :     SMgrRelation *rels;
    4806             :     BlockNumber (*block)[MAX_FORKNUM + 1];
    4807       28642 :     uint64      nBlocksToInvalidate = 0;
    4808             :     RelFileLocator *locators;
    4809       28642 :     bool        cached = true;
    4810             :     bool        use_bsearch;
    4811             : 
    4812       28642 :     if (nlocators == 0)
    4813           0 :         return;
    4814             : 
    4815       28642 :     rels = palloc_array(SMgrRelation, nlocators);   /* non-local relations */
    4816             : 
    4817             :     /* If it's a local relation, it's localbuf.c's problem. */
    4818      125212 :     for (i = 0; i < nlocators; i++)
    4819             :     {
    4820       96570 :         if (RelFileLocatorBackendIsTemp(smgr_reln[i]->smgr_rlocator))
    4821             :         {
    4822        6532 :             if (smgr_reln[i]->smgr_rlocator.backend == MyProcNumber)
    4823        6532 :                 DropRelationAllLocalBuffers(smgr_reln[i]->smgr_rlocator.locator);
    4824             :         }
    4825             :         else
    4826       90038 :             rels[n++] = smgr_reln[i];
    4827             :     }
    4828             : 
    4829             :     /*
    4830             :      * If there are no non-local relations, then we're done. Release the
    4831             :      * memory and return.
    4832             :      */
    4833       28642 :     if (n == 0)
    4834             :     {
    4835        1720 :         pfree(rels);
    4836        1720 :         return;
    4837             :     }
    4838             : 
    4839             :     /*
    4840             :      * This is used to remember the number of blocks for all the relations'
    4841             :      * forks.
    4842             :      */
    4843             :     block = (BlockNumber (*)[MAX_FORKNUM + 1])
    4844       26922 :         palloc(sizeof(BlockNumber) * n * (MAX_FORKNUM + 1));
    4845             : 
    4846             :     /*
    4847             :      * We can avoid scanning the entire buffer pool if we know the exact size
    4848             :      * of each of the given relation forks. See DropRelationBuffers.
    4849             :      */
    4850       56430 :     for (i = 0; i < n && cached; i++)
    4851             :     {
    4852       46512 :         for (int j = 0; j <= MAX_FORKNUM; j++)
    4853             :         {
    4854             :             /* Get the number of blocks for a relation's fork. */
    4855       42290 :             block[i][j] = smgrnblocks_cached(rels[i], j);
    4856             : 
    4857             :             /* We need only consider the relation forks that exist. */
    4858       42290 :             if (block[i][j] == InvalidBlockNumber)
    4859             :             {
    4860       37746 :                 if (!smgrexists(rels[i], j))
    4861       12460 :                     continue;
    4862       25286 :                 cached = false;
    4863       25286 :                 break;
    4864             :             }
    4865             : 
    4866             :             /* calculate the total number of blocks to be invalidated */
    4867        4544 :             nBlocksToInvalidate += block[i][j];
    4868             :         }
    4869             :     }
    4870             : 
    4871             :     /*
    4872             :      * We apply the optimization iff the total number of blocks to invalidate
    4873             :      * is below the BUF_DROP_FULL_SCAN_THRESHOLD.
    4874             :      */
    4875       26922 :     if (cached && nBlocksToInvalidate < BUF_DROP_FULL_SCAN_THRESHOLD)
    4876             :     {
    4877        2722 :         for (i = 0; i < n; i++)
    4878             :         {
    4879        7500 :             for (int j = 0; j <= MAX_FORKNUM; j++)
    4880             :             {
    4881             :                 /* ignore relation forks that don't exist */
    4882        6000 :                 if (!BlockNumberIsValid(block[i][j]))
    4883        4482 :                     continue;
    4884             : 
    4885             :                 /* drop all the buffers for a particular relation fork */
    4886        1518 :                 FindAndDropRelationBuffers(rels[i]->smgr_rlocator.locator,
    4887        1518 :                                            j, block[i][j], 0);
    4888             :             }
    4889             :         }
    4890             : 
    4891        1222 :         pfree(block);
    4892        1222 :         pfree(rels);
    4893        1222 :         return;
    4894             :     }
    4895             : 
    4896       25700 :     pfree(block);
    4897       25700 :     locators = palloc_array(RelFileLocator, n); /* non-local relations */
    4898      114238 :     for (i = 0; i < n; i++)
    4899       88538 :         locators[i] = rels[i]->smgr_rlocator.locator;
    4900             : 
    4901             :     /*
    4902             :      * For a low number of relations to drop, just use a simple walk-through to
    4903             :      * save the bsearch overhead. The threshold is more of a guess than an
    4904             :      * exactly determined value, as it depends on many factors (CPU and RAM
    4905             :      * speeds, amount of shared buffers, etc.).
    4906             :      */
    4907       25700 :     use_bsearch = n > RELS_BSEARCH_THRESHOLD;
    4908             : 
    4909             :     /* sort the list of rlocators if necessary */
    4910       25700 :     if (use_bsearch)
    4911         348 :         qsort(locators, n, sizeof(RelFileLocator), rlocator_comparator);
    4912             : 
    4913   276936292 :     for (i = 0; i < NBuffers; i++)
    4914             :     {
    4915   276910592 :         RelFileLocator *rlocator = NULL;
    4916   276910592 :         BufferDesc *bufHdr = GetBufferDescriptor(i);
    4917             : 
    4918             :         /*
    4919             :          * As in DropRelationBuffers, an unlocked precheck should be safe and
    4920             :          * saves some cycles.
    4921             :          */
    4922             : 
    4923   276910592 :         if (!use_bsearch)
    4924             :         {
    4925             :             int         j;
    4926             : 
    4927  1110004416 :             for (j = 0; j < n; j++)
    4928             :             {
    4929   837020424 :                 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &locators[j]))
    4930             :                 {
    4931      175688 :                     rlocator = &locators[j];
    4932      175688 :                     break;
    4933             :                 }
    4934             :             }
    4935             :         }
    4936             :         else
    4937             :         {
    4938             :             RelFileLocator locator;
    4939             : 
    4940     3750912 :             locator = BufTagGetRelFileLocator(&bufHdr->tag);
    4941     3750912 :             rlocator = bsearch(&locator,
    4942             :                                locators, n, sizeof(RelFileLocator),
    4943             :                                rlocator_comparator);
    4944             :         }
    4945             : 
    4946             :         /* buffer doesn't belong to any of the given relfilelocators; skip it */
    4947   276910592 :         if (rlocator == NULL)
    4948   276731878 :             continue;
    4949             : 
    4950      178714 :         LockBufHdr(bufHdr);
    4951      178714 :         if (BufTagMatchesRelFileLocator(&bufHdr->tag, rlocator))
    4952      178714 :             InvalidateBuffer(bufHdr);   /* releases spinlock */
    4953             :         else
    4954           0 :             UnlockBufHdr(bufHdr);
    4955             :     }
    4956             : 
    4957       25700 :     pfree(locators);
    4958       25700 :     pfree(rels);
    4959             : }
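/*
 * Editor's illustrative aside (not part of bufmgr.c): the pattern above
 * (plain linear scan up to RELS_BSEARCH_THRESHOLD relations; qsort() once and
 * bsearch() per buffer beyond it), reduced to standalone C.  Key, key_cmp(),
 * KEYS_BSEARCH_THRESHOLD and the sample data are invented for this sketch;
 * only qsort()/bsearch() from <stdlib.h> are assumed.
 */
#include <stdio.h>
#include <stdlib.h>

typedef struct Key
{
    unsigned int db;
    unsigned int rel;
} Key;

static int
key_cmp(const void *a, const void *b)
{
    const Key  *ka = (const Key *) a;
    const Key  *kb = (const Key *) b;

    if (ka->db != kb->db)
        return (ka->db < kb->db) ? -1 : 1;
    if (ka->rel != kb->rel)
        return (ka->rel < kb->rel) ? -1 : 1;
    return 0;
}

#define KEYS_BSEARCH_THRESHOLD 20

int
main(void)
{
    Key         keys[] = {{1, 42}, {1, 7}, {2, 13}};
    int         nkeys = 3;
    Key         probe = {1, 7};
    const Key  *match = NULL;
    int         use_bsearch = (nkeys > KEYS_BSEARCH_THRESHOLD);

    /* Sort once up front, but only when the list is big enough to benefit. */
    if (use_bsearch)
        qsort(keys, nkeys, sizeof(Key), key_cmp);

    if (!use_bsearch)
    {
        /* small list: a simple walk avoids the qsort/bsearch overhead */
        for (int i = 0; i < nkeys; i++)
        {
            if (key_cmp(&probe, &keys[i]) == 0)
            {
                match = &keys[i];
                break;
            }
        }
    }
    else
        match = bsearch(&probe, keys, nkeys, sizeof(Key), key_cmp);

    printf("probe %s\n", match ? "found" : "not found");
    return 0;
}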
    4960             : 
    4961             : /* ---------------------------------------------------------------------
    4962             :  *      FindAndDropRelationBuffers
    4963             :  *
    4964             :  *      This function performs a lookup in the BufMapping table and removes
    4965             :  *      from the buffer pool all pages of the specified relation fork that have
    4966             :  *      block numbers >= firstDelBlock. (In particular, with firstDelBlock = 0,
    4967             :  *      all pages are removed.)
    4968             :  * --------------------------------------------------------------------
    4969             :  */
    4970             : static void
    4971        1656 : FindAndDropRelationBuffers(RelFileLocator rlocator, ForkNumber forkNum,
    4972             :                            BlockNumber nForkBlock,
    4973             :                            BlockNumber firstDelBlock)
    4974             : {
    4975             :     BlockNumber curBlock;
    4976             : 
    4977        3988 :     for (curBlock = firstDelBlock; curBlock < nForkBlock; curBlock++)
    4978             :     {
    4979             :         uint32      bufHash;    /* hash value for tag */
    4980             :         BufferTag   bufTag;     /* identity of requested block */
    4981             :         LWLock     *bufPartitionLock;   /* buffer partition lock for it */
    4982             :         int         buf_id;
    4983             :         BufferDesc *bufHdr;
    4984             : 
    4985             :         /* create a tag so we can lookup the buffer */
    4986        2332 :         InitBufferTag(&bufTag, &rlocator, forkNum, curBlock);
    4987             : 
    4988             :         /* determine its hash code and partition lock ID */
    4989        2332 :         bufHash = BufTableHashCode(&bufTag);
    4990        2332 :         bufPartitionLock = BufMappingPartitionLock(bufHash);
    4991             : 
    4992             :         /* Check that it is in the buffer pool. If not, do nothing. */
    4993        2332 :         LWLockAcquire(bufPartitionLock, LW_SHARED);
    4994        2332 :         buf_id = BufTableLookup(&bufTag, bufHash);
    4995        2332 :         LWLockRelease(bufPartitionLock);
    4996             : 
    4997        2332 :         if (buf_id < 0)
    4998         242 :             continue;
    4999             : 
    5000        2090 :         bufHdr = GetBufferDescriptor(buf_id);
    5001             : 
    5002             :         /*
    5003             :          * We need to lock the buffer header and recheck if the buffer is
    5004             :          * still associated with the same block because the buffer could be
    5005             :          * evicted by some other backend loading blocks for a different
    5006             :          * relation after we release the lock on the BufMapping table.
    5007             :          */
    5008        2090 :         LockBufHdr(bufHdr);
    5009             : 
    5010        4180 :         if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator) &&
    5011        2090 :             BufTagGetForkNum(&bufHdr->tag) == forkNum &&
    5012        2090 :             bufHdr->tag.blockNum >= firstDelBlock)
    5013        2090 :             InvalidateBuffer(bufHdr);   /* releases spinlock */
    5014             :         else
    5015           0 :             UnlockBufHdr(bufHdr);
    5016             :     }
    5017        1656 : }
    5018             : 
    5019             : /* ---------------------------------------------------------------------
    5020             :  *      DropDatabaseBuffers
    5021             :  *
    5022             :  *      This function removes all the buffers in the buffer cache for a
    5023             :  *      particular database.  Dirty pages are simply dropped, without
    5024             :  *      bothering to write them out first.  This is used when we destroy a
    5025             :  *      database, to avoid trying to flush data to disk when the directory
    5026             :  *      tree no longer exists.  The implementation is quite similar to
    5027             :  *      DropRelationBuffers(), which is for destroying just one relation.
    5028             :  * --------------------------------------------------------------------
    5029             :  */
    5030             : void
    5031         152 : DropDatabaseBuffers(Oid dbid)
    5032             : {
    5033             :     int         i;
    5034             : 
    5035             :     /*
    5036             :      * We needn't consider local buffers, since by assumption the target
    5037             :      * database isn't our own.
    5038             :      */
    5039             : 
    5040     1092504 :     for (i = 0; i < NBuffers; i++)
    5041             :     {
    5042     1092352 :         BufferDesc *bufHdr = GetBufferDescriptor(i);
    5043             : 
    5044             :         /*
    5045             :          * As in DropRelationBuffers, an unlocked precheck should be safe and
    5046             :          * saves some cycles.
    5047             :          */
    5048     1092352 :         if (bufHdr->tag.dbOid != dbid)
    5049     1064270 :             continue;
    5050             : 
    5051       28082 :         LockBufHdr(bufHdr);
    5052       28082 :         if (bufHdr->tag.dbOid == dbid)
    5053       28082 :             InvalidateBuffer(bufHdr);   /* releases spinlock */
    5054             :         else
    5055           0 :             UnlockBufHdr(bufHdr);
    5056             :     }
    5057         152 : }
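/*
 * Editor's illustrative aside (not part of bufmgr.c): the "unlocked precheck,
 * then lock and recheck" idiom used by the scans above, reduced to standalone
 * C11 with POSIX threads.  Entry, invalidate_entry() and
 * drop_entries_for_owner() are invented for this sketch.
 */
#include <pthread.h>
#include <stdatomic.h>

typedef struct Entry
{
    pthread_mutex_t lock;
    _Atomic unsigned int owner;     /* 0 means "free" in this sketch */
} Entry;

/* Drop one entry; like InvalidateBuffer(), it releases the per-entry lock. */
static void
invalidate_entry(Entry *e)
{
    atomic_store(&e->owner, 0);
    pthread_mutex_unlock(&e->lock);
}

static void
drop_entries_for_owner(Entry *entries, int n, unsigned int owner)
{
    for (int i = 0; i < n; i++)
    {
        Entry      *e = &entries[i];

        /*
         * Unlocked precheck: a stale read is harmless because we recheck
         * below while holding the per-entry lock; the precheck merely avoids
         * taking the lock for the common non-matching case.
         */
        if (atomic_load_explicit(&e->owner, memory_order_relaxed) != owner)
            continue;

        pthread_mutex_lock(&e->lock);
        if (atomic_load(&e->owner) == owner)
            invalidate_entry(e);    /* releases the lock */
        else
            pthread_mutex_unlock(&e->lock);
    }
}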
    5058             : 
    5059             : /* ---------------------------------------------------------------------
    5060             :  *      FlushRelationBuffers
    5061             :  *
    5062             :  *      This function writes all dirty pages of a relation out to disk
    5063             :  *      (or more accurately, out to kernel disk buffers), ensuring that the
    5064             :  *      kernel has an up-to-date view of the relation.
    5065             :  *
    5066             :  *      Generally, the caller should be holding AccessExclusiveLock on the
    5067             :  *      target relation to ensure that no other backend is busy dirtying
    5068             :  *      more blocks of the relation; the effects can't be expected to last
    5069             :  *      after the lock is released.
    5070             :  *
    5071             :  *      XXX currently it sequentially searches the buffer pool, should be
    5072             :  *      changed to more clever ways of searching.  This routine is not
    5073             :  *      used in any performance-critical code paths, so it's not worth
    5074             :  *      adding additional overhead to normal paths to make it go faster.
    5075             :  * --------------------------------------------------------------------
    5076             :  */
    5077             : void
    5078         276 : FlushRelationBuffers(Relation rel)
    5079             : {
    5080             :     int         i;
    5081             :     BufferDesc *bufHdr;
    5082         276 :     SMgrRelation srel = RelationGetSmgr(rel);
    5083             : 
    5084         276 :     if (RelationUsesLocalBuffers(rel))
    5085             :     {
    5086        1818 :         for (i = 0; i < NLocBuffer; i++)
    5087             :         {
    5088             :             uint64      buf_state;
    5089             : 
    5090        1800 :             bufHdr = GetLocalBufferDescriptor(i);
    5091        1800 :             if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator) &&
    5092         600 :                 ((buf_state = pg_atomic_read_u64(&bufHdr->state)) &
    5093             :                  (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
    5094             :             {
    5095             :                 ErrorContextCallback errcallback;
    5096             : 
    5097             :                 /* Setup error traceback support for ereport() */
    5098         600 :                 errcallback.callback = local_buffer_write_error_callback;
    5099         600 :                 errcallback.arg = bufHdr;
    5100         600 :                 errcallback.previous = error_context_stack;
    5101         600 :                 error_context_stack = &errcallback;
    5102             : 
    5103             :                 /* Make sure we can handle the pin */
    5104         600 :                 ReservePrivateRefCountEntry();
    5105         600 :                 ResourceOwnerEnlarge(CurrentResourceOwner);
    5106             : 
    5107             :                 /*
    5108             :                  * Pin/unpin mostly to make valgrind work, but it also seems
    5109             :                  * like the right thing to do.
    5110             :                  */
    5111         600 :                 PinLocalBuffer(bufHdr, false);
    5112             : 
    5113             : 
    5114         600 :                 FlushLocalBuffer(bufHdr, srel);
    5115             : 
    5116         600 :                 UnpinLocalBuffer(BufferDescriptorGetBuffer(bufHdr));
    5117             : 
    5118             :                 /* Pop the error context stack */
    5119         600 :                 error_context_stack = errcallback.previous;
    5120             :             }
    5121             :         }
    5122             : 
    5123          18 :         return;
    5124             :     }
    5125             : 
    5126     3024386 :     for (i = 0; i < NBuffers; i++)
    5127             :     {
    5128             :         uint64      buf_state;
    5129             : 
    5130     3024128 :         bufHdr = GetBufferDescriptor(i);
    5131             : 
    5132             :         /*
    5133             :          * As in DropRelationBuffers, an unlocked precheck should be safe and
    5134             :          * saves some cycles.
    5135             :          */
    5136     3024128 :         if (!BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator))
    5137     3023706 :             continue;
    5138             : 
    5139             :         /* Make sure we can handle the pin */
    5140         422 :         ReservePrivateRefCountEntry();
    5141         422 :         ResourceOwnerEnlarge(CurrentResourceOwner);
    5142             : 
    5143         422 :         buf_state = LockBufHdr(bufHdr);
    5144         422 :         if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator) &&
    5145         422 :             (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
    5146             :         {
    5147         342 :             PinBuffer_Locked(bufHdr);
    5148         342 :             FlushUnlockedBuffer(bufHdr, srel, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
    5149         342 :             UnpinBuffer(bufHdr);
    5150             :         }
    5151             :         else
    5152          80 :             UnlockBufHdr(bufHdr);
    5153             :     }
    5154             : }
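/*
 * Editor's illustrative aside (not part of bufmgr.c): the error-context
 * push/pop idiom used in the local-buffer branch above, as it might appear in
 * extension code.  my_write_error_callback() and flush_one_block() are
 * invented; ErrorContextCallback, error_context_stack and errcontext() are
 * the regular elog.h facilities.
 */
#include "postgres.h"

static void
my_write_error_callback(void *arg)
{
    int         blocknum = *(int *) arg;

    errcontext("while flushing block %d", blocknum);
}

static void
flush_one_block(int blocknum)
{
    ErrorContextCallback errcallback;

    /* Setup error traceback support, so any ereport() mentions the block. */
    errcallback.callback = my_write_error_callback;
    errcallback.arg = &blocknum;
    errcallback.previous = error_context_stack;
    error_context_stack = &errcallback;

    /* ... work that may ereport(ERROR, ...) goes here ... */

    /* Pop the error context stack */
    error_context_stack = errcallback.previous;
}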
    5155             : 
    5156             : /* ---------------------------------------------------------------------
    5157             :  *      FlushRelationsAllBuffers
    5158             :  *
    5159             :  *      This function flushes out of the buffer pool all the pages of all
    5160             :  *      forks of the specified smgr relations.  It's equivalent to calling
    5161             :  *      FlushRelationBuffers once per relation.  The relations are assumed not
    5162             :  *      to use local buffers.
    5163             :  * --------------------------------------------------------------------
    5164             :  */
    5165             : void
    5166          12 : FlushRelationsAllBuffers(SMgrRelation *smgrs, int nrels)
    5167             : {
    5168             :     int         i;
    5169             :     SMgrSortArray *srels;
    5170             :     bool        use_bsearch;
    5171             : 
    5172          12 :     if (nrels == 0)
    5173           0 :         return;
    5174             : 
    5175             :     /* fill-in array for qsort */
    5176          12 :     srels = palloc_array(SMgrSortArray, nrels);
    5177             : 
    5178          32 :     for (i = 0; i < nrels; i++)
    5179             :     {
    5180             :         Assert(!RelFileLocatorBackendIsTemp(smgrs[i]->smgr_rlocator));
    5181             : 
    5182          20 :         srels[i].rlocator = smgrs[i]->smgr_rlocator.locator;
    5183          20 :         srels[i].srel = smgrs[i];
    5184             :     }
    5185             : 
    5186             :     /*
    5187             :      * Save the bsearch overhead for a low number of relations to sync. See
    5188             :      * DropRelationsAllBuffers for details.
    5189             :      */
    5190          12 :     use_bsearch = nrels > RELS_BSEARCH_THRESHOLD;
    5191             : 
    5192             :     /* sort the list of SMgrRelations if necessary */
    5193          12 :     if (use_bsearch)
    5194           0 :         qsort(srels, nrels, sizeof(SMgrSortArray), rlocator_comparator);
    5195             : 
    5196      196620 :     for (i = 0; i < NBuffers; i++)
    5197             :     {
    5198      196608 :         SMgrSortArray *srelent = NULL;
    5199      196608 :         BufferDesc *bufHdr = GetBufferDescriptor(i);
    5200             :         uint64      buf_state;
    5201             : 
    5202             :         /*
    5203             :          * As in DropRelationBuffers, an unlocked precheck should be safe and
    5204             :          * saves some cycles.
    5205             :          */
    5206             : 
    5207      196608 :         if (!use_bsearch)
    5208             :         {
    5209             :             int         j;
    5210             : 
    5211      519726 :             for (j = 0; j < nrels; j++)
    5212             :             {
    5213      327662 :                 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &srels[j].rlocator))
    5214             :                 {
    5215        4544 :                     srelent = &srels[j];
    5216        4544 :                     break;
    5217             :                 }
    5218             :             }
    5219             :         }
    5220             :         else
    5221             :         {
    5222             :             RelFileLocator rlocator;
    5223             : 
    5224           0 :             rlocator = BufTagGetRelFileLocator(&bufHdr->tag);
    5225           0 :             srelent = bsearch(&rlocator,
    5226             :                               srels, nrels, sizeof(SMgrSortArray),
    5227             :                               rlocator_comparator);
    5228             :         }
    5229             : 
    5230             :         /* buffer doesn't belong to any of the given relfilelocators; skip it */
    5231      196608 :         if (srelent == NULL)
    5232      192064 :             continue;
    5233             : 
    5234             :         /* Make sure we can handle the pin */
    5235        4544 :         ReservePrivateRefCountEntry();
    5236        4544 :         ResourceOwnerEnlarge(CurrentResourceOwner);
    5237             : 
    5238        4544 :         buf_state = LockBufHdr(bufHdr);
    5239        4544 :         if (BufTagMatchesRelFileLocator(&bufHdr->tag, &srelent->rlocator) &&
    5240        4544 :             (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
    5241             :         {
    5242        4472 :             PinBuffer_Locked(bufHdr);
    5243        4472 :             FlushUnlockedBuffer(bufHdr, srelent->srel, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
    5244        4472 :             UnpinBuffer(bufHdr);
    5245             :         }
    5246             :         else
    5247          72 :             UnlockBufHdr(bufHdr);
    5248             :     }
    5249             : 
    5250          12 :     pfree(srels);
    5251             : }
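/*
 * Editor's illustrative aside (not part of bufmgr.c): the bsearch() above
 * compares a bare RelFileLocator probe against SMgrSortArray elements with
 * rlocator_comparator; that works because the locator is the struct's first
 * member.  The same trick in standalone C, with invented names Key, Wrapper
 * and key_comparator():
 */
#include <stdio.h>
#include <stdlib.h>

typedef struct Key
{
    int         k;
} Key;

typedef struct Wrapper
{
    Key         key;            /* must be first: the comparator sees only this */
    const char *payload;
} Wrapper;

static int
key_comparator(const void *a, const void *b)
{
    /* Valid for Key * and Wrapper * alike, since Key sits at offset 0. */
    const Key  *ka = (const Key *) a;
    const Key  *kb = (const Key *) b;

    return (ka->k > kb->k) - (ka->k < kb->k);
}

int
main(void)
{
    Wrapper     items[] = {{{3}, "three"}, {{1}, "one"}, {{2}, "two"}};
    Key         probe = {2};
    Wrapper    *hit;

    qsort(items, 3, sizeof(Wrapper), key_comparator);
    hit = bsearch(&probe, items, 3, sizeof(Wrapper), key_comparator);
    printf("%s\n", hit ? hit->payload : "not found");
    return 0;
}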
    5252             : 
    5253             : /* ---------------------------------------------------------------------
    5254             :  *      RelationCopyStorageUsingBuffer
    5255             :  *
    5256             :  *      Copy a fork's data using the buffer manager.  Same as RelationCopyStorage,
    5257             :  *      but instead of using smgrread and smgrextend, this copies using bufmgr APIs.
    5258             :  *
    5259             :  *      Refer to the comments atop CreateAndCopyRelationData() for details about
    5260             :  *      the 'permanent' parameter.
    5261             :  * --------------------------------------------------------------------
    5262             :  */
    5263             : static void
    5264      150852 : RelationCopyStorageUsingBuffer(RelFileLocator srclocator,
    5265             :                                RelFileLocator dstlocator,
    5266             :                                ForkNumber forkNum, bool permanent)
    5267             : {
    5268             :     Buffer      srcBuf;
    5269             :     Buffer      dstBuf;
    5270             :     Page        srcPage;
    5271             :     Page        dstPage;
    5272             :     bool        use_wal;
    5273             :     BlockNumber nblocks;
    5274             :     BlockNumber blkno;
    5275             :     PGIOAlignedBlock buf;
    5276             :     BufferAccessStrategy bstrategy_src;
    5277             :     BufferAccessStrategy bstrategy_dst;
    5278             :     BlockRangeReadStreamPrivate p;
    5279             :     ReadStream *src_stream;
    5280             :     SMgrRelation src_smgr;
    5281             : 
    5282             :     /*
    5283             :      * In general, we want to write WAL whenever wal_level > 'minimal', but we
    5284             :      * can skip it when copying any fork of an unlogged relation other than
    5285             :      * the init fork.
    5286             :      */
    5287      150852 :     use_wal = XLogIsNeeded() && (permanent || forkNum == INIT_FORKNUM);
    5288             : 
    5289             :     /* Get number of blocks in the source relation. */
    5290      150852 :     nblocks = smgrnblocks(smgropen(srclocator, INVALID_PROC_NUMBER),
    5291             :                           forkNum);
    5292             : 
    5293             :     /* Nothing to copy; just return. */
    5294      150852 :     if (nblocks == 0)
    5295       26318 :         return;
    5296             : 
    5297             :     /*
    5298             :      * Bulk-extend the destination relation to the same size as the source
    5299             :      * relation before starting to copy block by block.
    5300             :      */
    5301      124534 :     memset(buf.data, 0, BLCKSZ);
    5302      124534 :     smgrextend(smgropen(dstlocator, INVALID_PROC_NUMBER), forkNum, nblocks - 1,
    5303             :                buf.data, true);
    5304             : 
    5305             :     /* This is a bulk operation, so use buffer access strategies. */
    5306      124534 :     bstrategy_src = GetAccessStrategy(BAS_BULKREAD);
    5307      124534 :     bstrategy_dst = GetAccessStrategy(BAS_BULKWRITE);
    5308             : 
    5309             :     /* Initialize streaming read */
    5310      124534 :     p.current_blocknum = 0;
    5311      124534 :     p.last_exclusive = nblocks;
    5312      124534 :     src_smgr = smgropen(srclocator, INVALID_PROC_NUMBER);
    5313             : 
    5314             :     /*
    5315             :      * It is safe to use batchmode as block_range_read_stream_cb takes no
    5316             :      * locks.
    5317             :      */
    5318      124534 :     src_stream = read_stream_begin_smgr_relation(READ_STREAM_FULL |
    5319             :                                                  READ_STREAM_USE_BATCHING,
    5320             :                                                  bstrategy_src,
    5321             :                                                  src_smgr,
    5322             :                                                  permanent ? RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED,
    5323             :                                                  forkNum,
    5324             :                                                  block_range_read_stream_cb,
    5325             :                                                  &p,
    5326             :                                                  0);
    5327             : 
    5328             :     /* Iterate over each block of the source relation file. */
    5329      601434 :     for (blkno = 0; blkno < nblocks; blkno++)
    5330             :     {
    5331      476904 :         CHECK_FOR_INTERRUPTS();
    5332             : 
    5333             :         /* Read block from source relation. */
    5334      476904 :         srcBuf = read_stream_next_buffer(src_stream, NULL);
    5335      476900 :         LockBuffer(srcBuf, BUFFER_LOCK_SHARE);
    5336      476900 :         srcPage = BufferGetPage(srcBuf);
    5337             : 
    5338      476900 :         dstBuf = ReadBufferWithoutRelcache(dstlocator, forkNum,
    5339             :                                            BufferGetBlockNumber(srcBuf),
    5340             :                                            RBM_ZERO_AND_LOCK, bstrategy_dst,
    5341             :                                            permanent);
    5342      476900 :         dstPage = BufferGetPage(dstBuf);
    5343             : 
    5344      476900 :         START_CRIT_SECTION();
    5345             : 
    5346             :         /* Copy page data from the source to the destination. */
    5347      476900 :         memcpy(dstPage, srcPage, BLCKSZ);
    5348      476900 :         MarkBufferDirty(dstBuf);
    5349             : 
    5350             :         /* WAL-log the copied page. */
    5351      476900 :         if (use_wal)
    5352      275262 :             log_newpage_buffer(dstBuf, true);
    5353             : 
    5354      476900 :         END_CRIT_SECTION();
    5355             : 
    5356      476900 :         UnlockReleaseBuffer(dstBuf);
    5357      476900 :         UnlockReleaseBuffer(srcBuf);
    5358             :     }
    5359             :     Assert(read_stream_next_buffer(src_stream, NULL) == InvalidBuffer);
    5360      124530 :     read_stream_end(src_stream);
    5361             : 
    5362      124530 :     FreeAccessStrategy(bstrategy_src);
    5363      124530 :     FreeAccessStrategy(bstrategy_dst);
    5364             : }
    5365             : 
    5366             : /* ---------------------------------------------------------------------
    5367             :  *      CreateAndCopyRelationData
    5368             :  *
    5369             :  *      Create destination relation storage and copy all forks from the
    5370             :  *      source relation to the destination.
    5371             :  *
    5372             :  *      Pass permanent as true for permanent relations and false for
    5373             :  *      unlogged relations.  Currently this API is not supported for
    5374             :  *      temporary relations.
    5375             :  * --------------------------------------------------------------------
    5376             :  */
    5377             : void
    5378      113400 : CreateAndCopyRelationData(RelFileLocator src_rlocator,
    5379             :                           RelFileLocator dst_rlocator, bool permanent)
    5380             : {
    5381             :     char        relpersistence;
    5382             :     SMgrRelation src_rel;
    5383             :     SMgrRelation dst_rel;
    5384             : 
    5385             :     /* Set the relpersistence. */
    5386      113400 :     relpersistence = permanent ?
    5387             :         RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED;
    5388             : 
    5389      113400 :     src_rel = smgropen(src_rlocator, INVALID_PROC_NUMBER);
    5390      113400 :     dst_rel = smgropen(dst_rlocator, INVALID_PROC_NUMBER);
    5391             : 
    5392             :     /*
    5393             :      * Create and copy all forks of the relation.  During CREATE DATABASE we
    5394             :      * have a separate cleanup mechanism that deletes the complete database
    5395             :      * directory.  Therefore, each individual relation doesn't need to be
    5396             :      * registered for cleanup.
    5397             :      */
    5398      113400 :     RelationCreateStorage(dst_rlocator, relpersistence, false);
    5399             : 
    5400             :     /* copy main fork. */
    5401      113400 :     RelationCopyStorageUsingBuffer(src_rlocator, dst_rlocator, MAIN_FORKNUM,
    5402             :                                    permanent);
    5403             : 
    5404             :     /* copy those extra forks that exist */
    5405      113396 :     for (ForkNumber forkNum = MAIN_FORKNUM + 1;
    5406      453584 :          forkNum <= MAX_FORKNUM; forkNum++)
    5407             :     {
    5408      340188 :         if (smgrexists(src_rel, forkNum))
    5409             :         {
    5410       37452 :             smgrcreate(dst_rel, forkNum, false);
    5411             : 
    5412             :             /*
    5413             :              * WAL log creation if the relation is persistent, or this is the
    5414             :              * init fork of an unlogged relation.
    5415             :              */
    5416       37452 :             if (permanent || forkNum == INIT_FORKNUM)
    5417       37452 :                 log_smgrcreate(&dst_rlocator, forkNum);
    5418             : 
    5419             :             /* Copy a fork's data, block by block. */
    5420       37452 :             RelationCopyStorageUsingBuffer(src_rlocator, dst_rlocator, forkNum,
    5421             :                                            permanent);
    5422             :         }
    5423             :     }
    5424      113396 : }
    5425             : 
    5426             : /* ---------------------------------------------------------------------
    5427             :  *      FlushDatabaseBuffers
    5428             :  *
    5429             :  *      This function writes all dirty pages of a database out to disk
    5430             :  *      (or more accurately, out to kernel disk buffers), ensuring that the
    5431             :  *      kernel has an up-to-date view of the database.
    5432             :  *
    5433             :  *      Generally, the caller should be holding an appropriate lock to ensure
    5434             :  *      no other backend is active in the target database; otherwise more
    5435             :  *      pages could get dirtied.
    5436             :  *
    5437             :  *      Note we don't worry about flushing any pages of temporary relations.
    5438             :  *      It's assumed these wouldn't be interesting.
    5439             :  * --------------------------------------------------------------------
    5440             :  */
    5441             : void
    5442          10 : FlushDatabaseBuffers(Oid dbid)
    5443             : {
    5444             :     int         i;
    5445             :     BufferDesc *bufHdr;
    5446             : 
    5447        1290 :     for (i = 0; i < NBuffers; i++)
    5448             :     {
    5449             :         uint64      buf_state;
    5450             : 
    5451        1280 :         bufHdr = GetBufferDescriptor(i);
    5452             : 
    5453             :         /*
    5454             :          * As in DropRelationBuffers, an unlocked precheck should be safe and
    5455             :          * saves some cycles.
    5456             :          */
    5457        1280 :         if (bufHdr->tag.dbOid != dbid)
    5458         996 :             continue;
    5459             : 
    5460             :         /* Make sure we can handle the pin */
    5461         284 :         ReservePrivateRefCountEntry();
    5462         284 :         ResourceOwnerEnlarge(CurrentResourceOwner);
    5463             : 
    5464         284 :         buf_state = LockBufHdr(bufHdr);
    5465         284 :         if (bufHdr->tag.dbOid == dbid &&
    5466         284 :             (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
    5467             :         {
    5468           0 :             PinBuffer_Locked(bufHdr);
    5469           0 :             FlushUnlockedBuffer(bufHdr, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
    5470           0 :             UnpinBuffer(bufHdr);
    5471             :         }
    5472             :         else
    5473         284 :             UnlockBufHdr(bufHdr);
    5474             :     }
    5475          10 : }
    5476             : 
    5477             : /*
    5478             :  * Flush a previously locked (in shared or exclusive mode) and pinned buffer
    5479             :  * to the OS.
    5480             :  */
    5481             : void
    5482         158 : FlushOneBuffer(Buffer buffer)
    5483             : {
    5484             :     BufferDesc *bufHdr;
    5485             : 
    5486             :     /* currently not needed, but no fundamental reason not to support */
    5487             :     Assert(!BufferIsLocal(buffer));
    5488             : 
    5489             :     Assert(BufferIsPinned(buffer));
    5490             : 
    5491         158 :     bufHdr = GetBufferDescriptor(buffer - 1);
    5492             : 
    5493             :     Assert(BufferIsLockedByMe(buffer));
    5494             : 
    5495         158 :     FlushBuffer(bufHdr, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
    5496         158 : }
    5497             : 
    5498             : /*
    5499             :  * ReleaseBuffer -- release the pin on a buffer
    5500             :  */
    5501             : void
    5502   126227274 : ReleaseBuffer(Buffer buffer)
    5503             : {
    5504   126227274 :     if (!BufferIsValid(buffer))
    5505           0 :         elog(ERROR, "bad buffer ID: %d", buffer);
    5506             : 
    5507   126227274 :     if (BufferIsLocal(buffer))
    5508     3220784 :         UnpinLocalBuffer(buffer);
    5509             :     else
    5510   123006490 :         UnpinBuffer(GetBufferDescriptor(buffer - 1));
    5511   126227274 : }
    5512             : 
    5513             : /*
    5514             :  * UnlockReleaseBuffer -- release the content lock and pin on a buffer
    5515             :  *
    5516             :  * This is just a shorthand for a common combination.
    5517             :  */
    5518             : void
    5519    37797268 : UnlockReleaseBuffer(Buffer buffer)
    5520             : {
    5521    37797268 :     LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
    5522    37797268 :     ReleaseBuffer(buffer);
    5523    37797268 : }
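/*
 * Editor's illustrative aside (not part of bufmgr.c): a typical caller-side
 * sequence for the pin/lock routines above, as it might appear in extension
 * code.  inspect_block() is invented; ReadBuffer(), LockBuffer(),
 * BufferGetPage() and UnlockReleaseBuffer() are the regular bufmgr.h entry
 * points.
 */
#include "postgres.h"

#include "storage/bufmgr.h"
#include "utils/rel.h"

static void
inspect_block(Relation rel, BlockNumber blkno)
{
    Buffer      buf;
    Page        page;

    buf = ReadBuffer(rel, blkno);       /* find or read the page, and pin it */
    LockBuffer(buf, BUFFER_LOCK_SHARE); /* content lock, enough for reading */

    page = BufferGetPage(buf);
    elog(DEBUG1, "block %u is %s",
         blkno, PageIsNew(page) ? "uninitialized" : "initialized");

    UnlockReleaseBuffer(buf);           /* drop the content lock, then the pin */
}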
    5524             : 
    5525             : /*
    5526             :  * IncrBufferRefCount
    5527             :  *      Increment the pin count on a buffer that we have *already* pinned
    5528             :  *      at least once.
    5529             :  *
    5530             :  *      This function cannot be used on a buffer we do not have pinned,
    5531             :  *      because it doesn't change the shared buffer state.
    5532             :  */
    5533             : void
    5534    23779728 : IncrBufferRefCount(Buffer buffer)
    5535             : {
    5536             :     Assert(BufferIsPinned(buffer));
    5537    23779728 :     ResourceOwnerEnlarge(CurrentResourceOwner);
    5538    23779728 :     if (BufferIsLocal(buffer))
    5539      709430 :         LocalRefCount[-buffer - 1]++;
    5540             :     else
    5541             :     {
    5542             :         PrivateRefCountEntry *ref;
    5543             : 
    5544    23070298 :         ref = GetPrivateRefCountEntry(buffer, true);
    5545             :         Assert(ref != NULL);
    5546    23070298 :         ref->data.refcount++;
    5547             :     }
    5548    23779728 :     ResourceOwnerRememberBuffer(CurrentResourceOwner, buffer);
    5549    23779728 : }
    5550             : 
    5551             : /*
    5552             :  * MarkBufferDirtyHint
    5553             :  *
    5554             :  *  Mark a buffer dirty for non-critical changes.
    5555             :  *
    5556             :  * This is essentially the same as MarkBufferDirty, except:
    5557             :  *
    5558             :  * 1. The caller does not write WAL; so if checksums are enabled, we may need
    5559             :  *    to write an XLOG_FPI_FOR_HINT WAL record to protect against torn pages.
    5560             :  * 2. The caller might have only share-lock instead of exclusive-lock on the
    5561             :  *    buffer's content lock.
    5562             :  * 3. This function does not guarantee that the buffer is always marked dirty
    5563             :  *    (due to a race condition), so it cannot be used for important changes.
    5564             :  */
    5565             : void
    5566    20142316 : MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
    5567             : {
    5568             :     BufferDesc *bufHdr;
    5569    20142316 :     Page        page = BufferGetPage(buffer);
    5570             : 
    5571    20142316 :     if (!BufferIsValid(buffer))
    5572           0 :         elog(ERROR, "bad buffer ID: %d", buffer);
    5573             : 
    5574    20142316 :     if (BufferIsLocal(buffer))
    5575             :     {
    5576     1270438 :         MarkLocalBufferDirty(buffer);
    5577     1270438 :         return;
    5578             :     }
    5579             : 
    5580    18871878 :     bufHdr = GetBufferDescriptor(buffer - 1);
    5581             : 
    5582             :     Assert(GetPrivateRefCount(buffer) > 0);
    5583             :     /* here, either share or exclusive lock is OK */
    5584             :     Assert(BufferIsLockedByMe(buffer));
    5585             : 
    5586             :     /*
    5587             :      * This routine might get called many times on the same page, if we are
    5588             :      * making the first scan after commit of an xact that added/deleted many
    5589             :      * tuples. So, be as quick as we can if the buffer is already dirty.  We
    5590             :      * do this by not acquiring the spinlock if it looks like the status bits
    5591             :      * are already set.  Since we make this test unlocked, there's a chance we
    5592             :      * might fail to notice that the flags have just been cleared, and fail
    5593             :      * to reset them, due to memory-ordering issues.  But since this function
    5594             :      * is only intended to be used in cases where failing to write out the
    5595             :      * data would be harmless anyway, it doesn't really matter.
    5596             :      */
    5597    18871878 :     if ((pg_atomic_read_u64(&bufHdr->state) & (BM_DIRTY | BM_JUST_DIRTIED)) !=
    5598             :         (BM_DIRTY | BM_JUST_DIRTIED))
    5599             :     {
    5600     1625552 :         XLogRecPtr  lsn = InvalidXLogRecPtr;
    5601     1625552 :         bool        dirtied = false;
    5602     1625552 :         bool        delayChkptFlags = false;
    5603             :         uint64      buf_state;
    5604             : 
    5605             :         /*
    5606             :          * If we need to protect hint bit updates from torn writes, WAL-log a
    5607             :          * full page image of the page. This full page image is only necessary
    5608             :          * if the hint bit update is the first change to the page since the
    5609             :          * last checkpoint.
    5610             :          *
    5611             :          * We don't check full_page_writes here because that logic is included
    5612             :          * when we call XLogInsert() since the value changes dynamically.
    5613             :          */
    5614     1625552 :         if (XLogHintBitIsNeeded() &&
    5615     1623370 :             (pg_atomic_read_u64(&bufHdr->state) & BM_PERMANENT))
    5616             :         {
    5617             :             /*
    5618             :              * If we must not write WAL, due to a relfilelocator-specific
    5619             :              * condition or being in recovery, don't dirty the page.  We can
    5620             :              * set the hint, just not dirty the page as a result, so the hint
    5621             :              * is lost when we evict the page or shut down.
    5622             :              *
    5623             :              * See src/backend/storage/page/README for longer discussion.
    5624             :              */
    5625     1747898 :             if (RecoveryInProgress() ||
    5626      124592 :                 RelFileLocatorSkippingWAL(BufTagGetRelFileLocator(&bufHdr->tag)))
    5627     1501210 :                 return;
    5628             : 
    5629             :             /*
    5630             :              * If the block is already dirty because we either made a change
    5631             :              * or set a hint already, then we don't need to write a full page
    5632             :              * image.  Note that aggressive cleaning of blocks dirtied by hint
    5633             :              * bit setting would increase the call rate. Bulk setting of hint
    5634             :              * bits would reduce the call rate...
    5635             :              *
    5636             :              * We must issue the WAL record before we mark the buffer dirty.
    5637             :              * Otherwise we might write the page before we write the WAL. That
    5638             :              * causes a race condition, since a checkpoint might occur between
    5639             :              * writing the WAL record and marking the buffer dirty. We solve
    5640             :              * that with a kluge, but one that is already in use during
    5641             :              * transaction commit to prevent race conditions. Basically, we
    5642             :              * simply prevent the checkpoint WAL record from being written
    5643             :              * until we have marked the buffer dirty. We don't start the
    5644             :              * checkpoint flush until we have marked dirty, so our checkpoint
    5645             :              * must flush the change to disk successfully or the checkpoint
    5646             :              * never gets written, so crash recovery will fix it.
    5647             :              *
    5648             :              * It's possible we may enter here without an xid, so it is
    5649             :              * essential that CreateCheckPoint waits for virtual transactions
    5650             :              * rather than full transactionids.
    5651             :              */
    5652             :             Assert((MyProc->delayChkptFlags & DELAY_CHKPT_START) == 0);
    5653      122096 :             MyProc->delayChkptFlags |= DELAY_CHKPT_START;
    5654      122096 :             delayChkptFlags = true;
    5655      122096 :             lsn = XLogSaveBufferForHint(buffer, buffer_std);
    5656             :         }
    5657             : 
    5658      124342 :         buf_state = LockBufHdr(bufHdr);
    5659             : 
    5660             :         Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
    5661             : 
    5662      124342 :         if (!(buf_state & BM_DIRTY))
    5663             :         {
    5664      124236 :             dirtied = true;     /* Means "will be dirtied by this action" */
    5665             : 
    5666             :             /*
    5667             :              * Set the page LSN if we wrote a backup block. We aren't supposed
    5668             :              * to set this when only holding a share lock but as long as we
    5669             :              * serialise it somehow we're OK. We choose to set LSN while
    5670             :              * holding the buffer header lock, which causes any reader of an
    5671             :              * LSN who holds only a share lock to also obtain a buffer header
    5672             :              * lock before using PageGetLSN(), which is enforced in
    5673             :              * BufferGetLSNAtomic().
    5674             :              *
    5675             :              * If checksums are enabled, you might think we should reset the
    5676             :              * checksum here. That will happen when the page is written
    5677             :              * sometime later in this checkpoint cycle.
    5678             :              */
    5679      124236 :             if (XLogRecPtrIsValid(lsn))
    5680       64300 :                 PageSetLSN(page, lsn);
    5681             :         }
    5682             : 
    5683      124342 :         UnlockBufHdrExt(bufHdr, buf_state,
    5684             :                         BM_DIRTY | BM_JUST_DIRTIED,
    5685             :                         0, 0);
    5686             : 
    5687      124342 :         if (delayChkptFlags)
    5688      122096 :             MyProc->delayChkptFlags &= ~DELAY_CHKPT_START;
    5689             : 
    5690      124342 :         if (dirtied)
    5691             :         {
    5692      124236 :             pgBufferUsage.shared_blks_dirtied++;
    5693      124236 :             if (VacuumCostActive)
    5694        3668 :                 VacuumCostBalance += VacuumCostPageDirty;
    5695             :         }
    5696             :     }
    5697             : }
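/*
 * Editor's illustrative aside (not part of bufmgr.c): the kind of caller that
 * MarkBufferDirtyHint() is meant for, loosely modelled on how heap visibility
 * hint bits are set.  set_committed_hint() is invented; the point is that the
 * update is redundant information, so losing it (per point 3 above) is
 * acceptable and no WAL is written by the caller.
 */
#include "postgres.h"

#include "access/htup_details.h"
#include "storage/bufmgr.h"

static void
set_committed_hint(HeapTupleHeader tuple, Buffer buffer)
{
    /* The caller holds a pin and at least a share lock on 'buffer'. */
    tuple->t_infomask |= HEAP_XMIN_COMMITTED;

    /*
     * Hint only: bufmgr may still emit a full-page image when checksums or
     * wal_log_hints require it, as described above.
     */
    MarkBufferDirtyHint(buffer, true);
}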
    5698             : 
    5699             : /*
    5700             :  * Release buffer content locks for shared buffers.
    5701             :  *
    5702             :  * Used to clean up after errors.
    5703             :  *
    5704             :  * Currently, we can expect that resource owner cleanup, via
    5705             :  * ResOwnerReleaseBufferPin(), took care of releasing buffer content locks per
    5706             :  * se; the only thing we need to deal with here is clearing any PIN_COUNT
    5707             :  * request that was in progress.
    5708             :  */
    5709             : void
    5710      107254 : UnlockBuffers(void)
    5711             : {
    5712      107254 :     BufferDesc *buf = PinCountWaitBuf;
    5713             : 
    5714      107254 :     if (buf)
    5715             :     {
    5716             :         uint64      buf_state;
    5717           0 :         uint64      unset_bits = 0;
    5718             : 
    5719           0 :         buf_state = LockBufHdr(buf);
    5720             : 
    5721             :         /*
    5722             :          * Don't complain if flag bit not set; it could have been reset but we
    5723             :          * got a cancel/die interrupt before getting the signal.
    5724             :          */
    5725           0 :         if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
    5726           0 :             buf->wait_backend_pgprocno == MyProcNumber)
    5727           0 :             unset_bits = BM_PIN_COUNT_WAITER;
    5728             : 
    5729           0 :         UnlockBufHdrExt(buf, buf_state,
    5730             :                         0, unset_bits,
    5731             :                         0);
    5732             : 
    5733           0 :         PinCountWaitBuf = NULL;
    5734             :     }
    5735      107254 : }
    5736             : 
    5737             : /*
    5738             :  * Acquire the buffer content lock in the specified mode
    5739             :  *
    5740             :  * If the lock is not available, sleep until it is.
    5741             :  *
    5742             :  * Side effect: cancel/die interrupts are held off until lock release.
    5743             :  *
    5744             :  * This uses almost the same locking approach as lwlock.c's
    5745             :  * LWLockAcquire(). See documentation at the top of lwlock.c for a more
    5746             :  * detailed discussion.
    5747             :  *
    5748             :  * The reason that this, and most of the other BufferLock* functions, take both
    5749             :  * the Buffer and the BufferDesc* as parameters is that repeatedly looking up
    5750             :  * one from the other shows up noticeably in profiles.
    5751             :  *
    5752             :  * Callers should provide a constant for mode, for more efficient code
    5753             :  * generation.
    5754             :  */
    5755             : static inline void
    5756   163240836 : BufferLockAcquire(Buffer buffer, BufferDesc *buf_hdr, BufferLockMode mode)
    5757             : {
    5758             :     PrivateRefCountEntry *entry;
    5759   163240836 :     int         extraWaits = 0;
    5760             : 
    5761             :     /*
    5762             :      * Get a reference to the refcount entry before we acquire the lock; it
    5763             :      * seems better to do the lookup before holding the lock.
    5764             :      */
    5765   163240836 :     entry = GetPrivateRefCountEntry(buffer, true);
    5766             : 
    5767             :     /*
    5768             :      * We better not already hold a lock on the buffer.
    5769             :      */
    5770             :     Assert(entry->data.lockmode == BUFFER_LOCK_UNLOCK);
    5771             : 
    5772             :     /*
    5773             :      * Lock out cancel/die interrupts until we exit the code section protected
    5774             :      * by the content lock.  This ensures that interrupts will not interfere
    5775             :      * with manipulations of data structures in shared memory.
    5776             :      */
    5777   163240836 :     HOLD_INTERRUPTS();
    5778             : 
    5779             :     for (;;)
    5780       40890 :     {
    5781   163281726 :         uint32      wait_event = 0; /* initialized to avoid compiler warning */
    5782             :         bool        mustwait;
    5783             : 
    5784             :         /*
    5785             :          * Try to grab the lock the first time, we're not in the waitqueue
    5786             :          * yet/anymore.
    5787             :          */
    5788   163281726 :         mustwait = BufferLockAttempt(buf_hdr, mode);
    5789             : 
    5790   163281726 :         if (likely(!mustwait))
    5791             :         {
    5792   163238686 :             break;
    5793             :         }
    5794             : 
    5795             :         /*
    5796             :          * Ok, at this point we couldn't grab the lock on the first try. We
    5797             :          * cannot simply queue ourselves to the end of the list and wait to be
    5798             :          * woken up, because by now the lock could long since have been released.
    5799             :          * Instead, add ourselves to the queue and try to grab the lock again. If
    5800             :          * we succeed, we need to revert the queueing and be happy; otherwise we
    5801             :          * recheck the lock. If we still couldn't grab it, we know that the
    5802             :          * other locker will see our queue entries when releasing since they
    5803             :          * existed before we checked for the lock.
    5804             :          */
    5805             : 
    5806             :         /* add to the queue */
    5807       43040 :         BufferLockQueueSelf(buf_hdr, mode);
    5808             : 
    5809             :         /* we're now guaranteed to be woken up if necessary */
    5810       43040 :         mustwait = BufferLockAttempt(buf_hdr, mode);
    5811             : 
    5812             :         /* ok, grabbed the lock the second time round, need to undo queueing */
    5813       43040 :         if (!mustwait)
    5814             :         {
    5815        2150 :             BufferLockDequeueSelf(buf_hdr);
    5816        2150 :             break;
    5817             :         }
    5818             : 
    5819       40890 :         switch (mode)
    5820             :         {
    5821       22896 :             case BUFFER_LOCK_EXCLUSIVE:
    5822       22896 :                 wait_event = WAIT_EVENT_BUFFER_EXCLUSIVE;
    5823       22896 :                 break;
    5824           0 :             case BUFFER_LOCK_SHARE_EXCLUSIVE:
    5825           0 :                 wait_event = WAIT_EVENT_BUFFER_SHARE_EXCLUSIVE;
    5826           0 :                 break;
    5827       17994 :             case BUFFER_LOCK_SHARE:
    5828       17994 :                 wait_event = WAIT_EVENT_BUFFER_SHARED;
    5829       17994 :                 break;
    5830             :             case BUFFER_LOCK_UNLOCK:
    5831             :                 pg_unreachable();
    5832             : 
    5833             :         }
    5834       40890 :         pgstat_report_wait_start(wait_event);
    5835             : 
    5836             :         /*
    5837             :          * Wait until awakened.
    5838             :          *
    5839             :          * It is possible that we get awakened for a reason other than being
    5840             :          * signaled by BufferLockWakeup().  If so, loop back and wait again.
    5841             :          * Once we've gotten the lock, re-increment the sema by the number of
    5842             :          * additional signals received.
    5843             :          */
    5844             :         for (;;)
    5845             :         {
    5846       40890 :             PGSemaphoreLock(MyProc->sem);
    5847       40890 :             if (MyProc->lwWaiting == LW_WS_NOT_WAITING)
    5848       40890 :                 break;
    5849           0 :             extraWaits++;
    5850             :         }
    5851             : 
    5852       40890 :         pgstat_report_wait_end();
    5853             : 
    5854             :         /* Retrying, allow BufferLockRelease to release waiters again. */
    5855       40890 :         pg_atomic_fetch_and_u64(&buf_hdr->state, ~BM_LOCK_WAKE_IN_PROGRESS);
    5856             :     }
    5857             : 
    5858             :     /* Remember that we now hold this lock */
    5859   163240836 :     entry->data.lockmode = mode;
    5860             : 
    5861             :     /*
    5862             :      * Fix the process wait semaphore's count for any absorbed wakeups.
    5863             :      */
    5864   163240836 :     while (unlikely(extraWaits-- > 0))
    5865           0 :         PGSemaphoreUnlock(MyProc->sem);
    5866   163240836 : }
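/*
 * Editor's illustrative aside (not part of bufmgr.c): why the header comment
 * asks callers to pass a constant 'mode'.  When a static inline function is
 * called with a compile-time constant, inlining plus constant propagation
 * lets the compiler fold the mode-dependent branches away.  Mode, acquire()
 * and the counters are invented for this sketch.
 */
#include <stdio.h>

typedef enum Mode
{
    MODE_SHARED,
    MODE_EXCLUSIVE
} Mode;

static unsigned long shared_acquires;
static unsigned long exclusive_acquires;

static inline void
acquire(Mode mode)
{
    /* With a constant argument this switch vanishes after inlining. */
    switch (mode)
    {
        case MODE_SHARED:
            shared_acquires++;
            break;
        case MODE_EXCLUSIVE:
            exclusive_acquires++;
            break;
    }
}

int
main(void)
{
    acquire(MODE_SHARED);       /* constant mode: reduces to one increment */
    acquire(MODE_EXCLUSIVE);
    printf("%lu shared, %lu exclusive\n", shared_acquires, exclusive_acquires);
    return 0;
}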
    5867             : 
    5868             : /*
    5869             :  * Release a previously acquired buffer content lock.
    5870             :  */
    5871             : static void
    5872   166455268 : BufferLockUnlock(Buffer buffer, BufferDesc *buf_hdr)
    5873             : {
    5874             :     BufferLockMode mode;
    5875             :     uint64      oldstate;
    5876             :     uint64      sub;
    5877             : 
    5878   166455268 :     mode = BufferLockDisownInternal(buffer, buf_hdr);
    5879             : 
    5880             :     /*
    5881             :      * Release my hold on lock, after that it can immediately be acquired by
    5882             :      * others, even if we still have to wakeup other waiters.
    5883             :      */
    5884   166455268 :     sub = BufferLockReleaseSub(mode);
    5885             : 
    5886   166455268 :     oldstate = pg_atomic_sub_fetch_u64(&buf_hdr->state, sub);
    5887             : 
    5888   166455268 :     BufferLockProcessRelease(buf_hdr, mode, oldstate);
    5889             : 
    5890             :     /*
    5891             :      * Now okay to allow cancel/die interrupts.
    5892             :      */
    5893   166455268 :     RESUME_INTERRUPTS();
    5894   166455268 : }
    5895             : 
    5896             : 
    5897             : /*
    5898             :  * Acquire the content lock for the buffer, but only if we don't have to wait.
    5899             :  *
    5900             :  * It is allowed to try to conditionally acquire a lock on a buffer that this
    5901             :  * backend has already locked, but the lock acquisition will always fail, even
    5902             :  * if the new lock acquisition does not conflict with an already held lock
    5903             :  * (e.g. two share locks). This is because we currently do not have space to
    5904             :  * track multiple lock ownerships of the same buffer within one backend.  That
    5905             :  * is ok for the current uses of BufferLockConditional().
    5906             :  */
    5907             : static bool
    5908     3216438 : BufferLockConditional(Buffer buffer, BufferDesc *buf_hdr, BufferLockMode mode)
    5909             : {
    5910     3216438 :     PrivateRefCountEntry *entry = GetPrivateRefCountEntry(buffer, true);
    5911             :     bool        mustwait;
    5912             : 
    5913             :     /*
    5914             :      * As described above, if we're trying to lock a buffer this backend
    5915             :      * already has locked, return false, independent of the existing and
    5916             :      * desired lock level.
    5917             :      */
    5918     3216438 :     if (entry->data.lockmode != BUFFER_LOCK_UNLOCK)
    5919           0 :         return false;
    5920             : 
    5921             :     /*
    5922             :      * Lock out cancel/die interrupts until we exit the code section protected
    5923             :      * by the content lock.  This ensures that interrupts will not interfere
    5924             :      * with manipulations of data structures in shared memory.
    5925             :      */
    5926     3216438 :     HOLD_INTERRUPTS();
    5927             : 
    5928             :     /* Check for the lock */
    5929     3216438 :     mustwait = BufferLockAttempt(buf_hdr, mode);
    5930             : 
    5931     3216438 :     if (mustwait)
    5932             :     {
    5933             :         /* Failed to get lock, so release interrupt holdoff */
    5934        2006 :         RESUME_INTERRUPTS();
    5935             :     }
    5936             :     else
    5937             :     {
    5938     3214432 :         entry->data.lockmode = mode;
    5939             :     }
    5940             : 
    5941     3216438 :     return !mustwait;
    5942             : }
    5943             : 
    5944             : /*
    5945             :  * Internal function that tries to atomically acquire the content lock in the
    5946             :  * passed in mode.
    5947             :  *
    5948             :  * This function will not block waiting for a lock to become free - that's the
    5949             :  * caller's job.
    5950             :  *
    5951             :  * Similar to LWLockAttemptLock().
    5952             :  */
    5953             : static inline bool
    5954   166541204 : BufferLockAttempt(BufferDesc *buf_hdr, BufferLockMode mode)
    5955             : {
    5956             :     uint64      old_state;
    5957             : 
    5958             :     /*
    5959             :      * Read once outside the loop, later iterations will get the newer value
    5960             :      * via compare & exchange.
    5961             :      */
    5962   166541204 :     old_state = pg_atomic_read_u64(&buf_hdr->state);
    5963             : 
    5964             :     /* loop until we've determined whether we could acquire the lock or not */
    5965             :     while (true)
    5966       33088 :     {
    5967             :         uint64      desired_state;
    5968             :         bool        lock_free;
    5969             : 
    5970   166574292 :         desired_state = old_state;
    5971             : 
    5972   166574292 :         if (mode == BUFFER_LOCK_EXCLUSIVE)
    5973             :         {
    5974    51046890 :             lock_free = (old_state & BM_LOCK_MASK) == 0;
    5975    51046890 :             if (lock_free)
    5976    50997090 :                 desired_state += BM_LOCK_VAL_EXCLUSIVE;
    5977             :         }
    5978   115527402 :         else if (mode == BUFFER_LOCK_SHARE_EXCLUSIVE)
    5979             :         {
    5980           0 :             lock_free = (old_state & (BM_LOCK_VAL_EXCLUSIVE | BM_LOCK_VAL_SHARE_EXCLUSIVE)) == 0;
    5981           0 :             if (lock_free)
    5982           0 :                 desired_state += BM_LOCK_VAL_SHARE_EXCLUSIVE;
    5983             :         }
    5984             :         else
    5985             :         {
    5986   115527402 :             lock_free = (old_state & BM_LOCK_VAL_EXCLUSIVE) == 0;
    5987   115527402 :             if (lock_free)
    5988   115490274 :                 desired_state += BM_LOCK_VAL_SHARED;
    5989             :         }
    5990             : 
    5991             :         /*
    5992             :          * Attempt to swap in the state we are expecting. If we didn't see
    5993             :          * the lock as free, that's just the old value. If we saw it as free,
    5994             :          * we'll attempt to mark it acquired. The reason that we always swap
    5995             :          * in the value is that this doubles as a memory barrier. We could try
    5996             :          * to be smarter and only swap in values if we saw the lock as free,
    5997             :          * but benchmarks haven't shown that to be beneficial so far.
    5998             :          *
    5999             :          * Retry if the value changed since we last looked at it.
    6000             :          */
    6001   166574292 :         if (likely(pg_atomic_compare_exchange_u64(&buf_hdr->state,
    6002             :                                                   &old_state, desired_state)))
    6003             :         {
    6004   166541204 :             if (lock_free)
    6005             :             {
    6006             :                 /* Great! Got the lock. */
    6007   166455268 :                 return false;
    6008             :             }
    6009             :             else
    6010       85936 :                 return true;    /* somebody else has the lock */
    6011             :         }
    6012             :     }
    6013             : 
    6014             :     pg_unreachable();
    6015             : }
    6016             : 
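/*
 * Editor's note, a worked example of the state arithmetic above (derived from
 * the code shown here; the concrete bit values are not spelled out in this
 * excerpt): with two share lockers the lock portion of ->state holds
 * 2 * BM_LOCK_VAL_SHARED.  An exclusive acquirer then sees
 * (state & BM_LOCK_MASK) != 0 and must wait, while a third share acquirer
 * only checks BM_LOCK_VAL_EXCLUSIVE, finds it clear, and adds another
 * BM_LOCK_VAL_SHARED.  Each release subtracts exactly what it added (see
 * BufferLockReleaseSub() below), so the lock field drops back to zero when
 * the last holder releases.
 */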
    6017             : /*
    6018             :  * Add ourselves to the end of the content lock's wait queue.
    6019             :  */
    6020             : static void
    6021       43040 : BufferLockQueueSelf(BufferDesc *buf_hdr, BufferLockMode mode)
    6022             : {
    6023             :     /*
    6024             :      * If we don't have a PGPROC structure, there's no way to wait. This
    6025             :      * should never occur, since MyProc should only be null during shared
    6026             :      * memory initialization.
    6027             :      */
    6028       43040 :     if (MyProc == NULL)
    6029           0 :         elog(PANIC, "cannot wait without a PGPROC structure");
    6030             : 
    6031       43040 :     if (MyProc->lwWaiting != LW_WS_NOT_WAITING)
    6032           0 :         elog(PANIC, "queueing for lock while waiting on another one");
    6033             : 
    6034       43040 :     LockBufHdr(buf_hdr);
    6035             : 
    6036             :     /* setting the flag is protected by the spinlock */
    6037       43040 :     pg_atomic_fetch_or_u64(&buf_hdr->state, BM_LOCK_HAS_WAITERS);
    6038             : 
    6039             :     /*
    6040             :      * These are currently used both for lwlocks and buffer content locks,
    6041             :      * which is acceptable, although not pretty, because a backend can't wait
    6042             :      * for both types of locks at the same time.
    6043             :      */
    6044       43040 :     MyProc->lwWaiting = LW_WS_WAITING;
    6045       43040 :     MyProc->lwWaitMode = mode;
    6046             : 
    6047       43040 :     proclist_push_tail(&buf_hdr->lock_waiters, MyProcNumber, lwWaitLink);
    6048             : 
    6049             :     /* Can release the mutex now */
    6050       43040 :     UnlockBufHdr(buf_hdr);
    6051       43040 : }
    6052             : 
    6053             : /*
    6054             :  * Remove ourselves from the waitlist.
    6055             :  *
    6056             :  * This is used if we queued ourselves because we thought we needed to sleep
    6057             :  * but, after further checking, we discovered that we don't actually need to
    6058             :  * do so.
    6059             :  */
    6060             : static void
    6061        2150 : BufferLockDequeueSelf(BufferDesc *buf_hdr)
    6062             : {
    6063             :     bool        on_waitlist;
    6064             : 
    6065        2150 :     LockBufHdr(buf_hdr);
    6066             : 
    6067        2150 :     on_waitlist = MyProc->lwWaiting == LW_WS_WAITING;
    6068        2150 :     if (on_waitlist)
    6069        1568 :         proclist_delete(&buf_hdr->lock_waiters, MyProcNumber, lwWaitLink);
    6070             : 
    6071        2150 :     if (proclist_is_empty(&buf_hdr->lock_waiters) &&
    6072        2064 :         (pg_atomic_read_u64(&buf_hdr->state) & BM_LOCK_HAS_WAITERS) != 0)
    6073             :     {
    6074        1484 :         pg_atomic_fetch_and_u64(&buf_hdr->state, ~BM_LOCK_HAS_WAITERS);
    6075             :     }
    6076             : 
    6077             :     /* XXX: combine with fetch_and above? */
    6078        2150 :     UnlockBufHdr(buf_hdr);
    6079             : 
    6080             :     /* clear waiting state again, nice for debugging */
    6081        2150 :     if (on_waitlist)
    6082        1568 :         MyProc->lwWaiting = LW_WS_NOT_WAITING;
    6083             :     else
    6084             :     {
    6085         582 :         int         extraWaits = 0;
    6086             : 
    6087             : 
    6088             :         /*
    6089             :          * Somebody else dequeued us and has woken us up, or will do so
    6090             :          * shortly. Deal with the superfluous absorption of that wakeup.
    6091             :          */
    6092             : 
    6093             :         /*
    6094             :          * Clear BM_LOCK_WAKE_IN_PROGRESS if somebody woke us before we
    6095             :          * removed ourselves - they'll have set it.
    6096             :          */
    6097         582 :         pg_atomic_fetch_and_u64(&buf_hdr->state, ~BM_LOCK_WAKE_IN_PROGRESS);
    6098             : 
    6099             :         /*
    6100             :          * Now wait for the scheduled wakeup, otherwise our ->lwWaiting would
    6101             :          * get reset at some inconvenient point later. Most of the time this
    6102             :          * will immediately return.
    6103             :          */
    6104             :         for (;;)
    6105             :         {
    6106         582 :             PGSemaphoreLock(MyProc->sem);
    6107         582 :             if (MyProc->lwWaiting == LW_WS_NOT_WAITING)
    6108         582 :                 break;
    6109           0 :             extraWaits++;
    6110             :         }
    6111             : 
    6112             :         /*
    6113             :          * Fix the process wait semaphore's count for any absorbed wakeups.
    6114             :          */
    6115         582 :         while (extraWaits-- > 0)
    6116           0 :             PGSemaphoreUnlock(MyProc->sem);
    6117             :     }
    6118        2150 : }
    6119             : 
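/*
 * Editor's note (not part of bufmgr.c): a hedged sketch of the acquire-side
 * protocol that makes BufferLockDequeueSelf() necessary, assuming the
 * acquisition path follows the usual "queue first, then re-check" pattern
 * used for lwlocks:
 *
 *   if (!BufferLockAttempt(buf_hdr, mode))   -- got the lock, done
 *   BufferLockQueueSelf(buf_hdr, mode);      -- register as a waiter
 *   if (!BufferLockAttempt(buf_hdr, mode))   -- freed since the first try?
 *       BufferLockDequeueSelf(buf_hdr);      -- yes: undo the queueing
 *   else
 *       sleep on MyProc->sem until BufferLockWakeup() signals us
 *
 * The second attempt closes the race where the current holder releases the
 * lock between the failed first attempt and the enqueue, in which case no
 * wakeup would ever arrive.
 */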
    6120             : /*
    6121             :  * Stop treating the lock as held by the current backend.
    6122             :  *
    6123             :  * After calling this function it is the caller's responsibility to ensure
    6124             :  * that the lock gets released, even in case of an error. This is only
    6125             :  * desirable if the lock is going to be released in a different process than
    6126             :  * the one that acquired it.
    6127             :  */
    6128             : static inline void
    6129           0 : BufferLockDisown(Buffer buffer, BufferDesc *buf_hdr)
    6130             : {
    6131           0 :     BufferLockDisownInternal(buffer, buf_hdr);
    6132           0 :     RESUME_INTERRUPTS();
    6133           0 : }
    6134             : 
    6135             : /*
    6136             :  * Stop treating the lock as held by the current backend.
    6137             :  *
    6138             :  * This is the code that can be shared between actually releasing a lock
    6139             :  * (BufferLockUnlock()) and just not tracking ownership of the lock anymore
    6140             :  * without releasing the lock (BufferLockDisown()).
    6141             :  */
    6142             : static inline int
    6143   166455268 : BufferLockDisownInternal(Buffer buffer, BufferDesc *buf_hdr)
    6144             : {
    6145             :     BufferLockMode mode;
    6146             :     PrivateRefCountEntry *ref;
    6147             : 
    6148   166455268 :     ref = GetPrivateRefCountEntry(buffer, false);
    6149   166455268 :     if (ref == NULL)
    6150           0 :         elog(ERROR, "lock %d is not held", buffer);
    6151   166455268 :     mode = ref->data.lockmode;
    6152   166455268 :     ref->data.lockmode = BUFFER_LOCK_UNLOCK;
    6153             : 
    6154   166455268 :     return mode;
    6155             : }
    6156             : 
    6157             : /*
    6158             :  * Wake up all the waiters that currently have a chance to acquire the lock.
    6159             :  *
    6160             :  * wake_exclusive indicates whether exclusive lock waiters should be woken up.
    6161             :  */
    6162             : static void
    6163       39356 : BufferLockWakeup(BufferDesc *buf_hdr, bool wake_exclusive)
    6164             : {
    6165       39356 :     bool        new_wake_in_progress = false;
    6166       39356 :     bool        wake_share_exclusive = true;
    6167             :     proclist_head wakeup;
    6168             :     proclist_mutable_iter iter;
    6169             : 
    6170       39356 :     proclist_init(&wakeup);
    6171             : 
    6172             :     /* lock wait list while collecting backends to wake up */
    6173       39356 :     LockBufHdr(buf_hdr);
    6174             : 
    6175       59590 :     proclist_foreach_modify(iter, &buf_hdr->lock_waiters, lwWaitLink)
    6176             :     {
    6177       43468 :         PGPROC     *waiter = GetPGProcByNumber(iter.cur);
    6178             : 
    6179             :         /*
    6180             :          * Already woke up a conflicting lock, so skip over this wait list
    6181             :          * entry.
    6182             :          */
    6183       43468 :         if (!wake_exclusive && waiter->lwWaitMode == BUFFER_LOCK_EXCLUSIVE)
    6184        1996 :             continue;
    6185       41472 :         if (!wake_share_exclusive && waiter->lwWaitMode == BUFFER_LOCK_SHARE_EXCLUSIVE)
    6186           0 :             continue;
    6187             : 
    6188       41472 :         proclist_delete(&buf_hdr->lock_waiters, iter.cur, lwWaitLink);
    6189       41472 :         proclist_push_tail(&wakeup, iter.cur, lwWaitLink);
    6190             : 
    6191             :         /*
    6192             :          * Prevent additional wakeups until the woken backend gets to run and
    6193             :          * retries. Backends that are just waiting for the lock to become free
    6194             :          * don't retry automatically.
    6195             :          */
    6196       41472 :         new_wake_in_progress = true;
    6197             : 
    6198             :         /*
    6199             :          * Signal that the process isn't on the wait list anymore. This lets
    6200             :          * BufferLockDequeueSelf() tell from lwWaiting whether it is still on
    6201             :          * the list and can simply proclist_delete() itself, or whether it has
    6202             :          * already been removed.
    6203             :          */
    6204             :         Assert(waiter->lwWaiting == LW_WS_WAITING);
    6205       41472 :         waiter->lwWaiting = LW_WS_PENDING_WAKEUP;
    6206             : 
    6207             :         /*
    6208             :          * Don't wake up further waiters after waking a conflicting waiter.
    6209             :          */
    6210       41472 :         if (waiter->lwWaitMode == BUFFER_LOCK_SHARE)
    6211             :         {
    6212             :             /*
    6213             :              * Share locks conflict with exclusive locks.
    6214             :              */
    6215       18238 :             wake_exclusive = false;
    6216             :         }
    6217       23234 :         else if (waiter->lwWaitMode == BUFFER_LOCK_SHARE_EXCLUSIVE)
    6218             :         {
    6219             :             /*
    6220             :              * Share-exclusive locks conflict with share-exclusive and
    6221             :              * exclusive locks.
    6222             :              */
    6223           0 :             wake_exclusive = false;
    6224           0 :             wake_share_exclusive = false;
    6225             :         }
    6226       23234 :         else if (waiter->lwWaitMode == BUFFER_LOCK_EXCLUSIVE)
    6227             :         {
    6228             :             /*
    6229             :              * Exclusive locks conflict with all other locks, there's no point
    6230             :              * in waking up anybody else.
    6231             :              */
    6232       23234 :             break;
    6233             :         }
    6234             :     }
    6235             : 
    6236             :     Assert(proclist_is_empty(&wakeup) || pg_atomic_read_u64(&buf_hdr->state) & BM_LOCK_HAS_WAITERS);
    6237             : 
    6238             :     /* unset required flags, and release lock, in one fell swoop */
    6239             :     {
    6240             :         uint64      old_state;
    6241             :         uint64      desired_state;
    6242             : 
    6243       39356 :         old_state = pg_atomic_read_u64(&buf_hdr->state);
    6244             :         while (true)
    6245             :         {
    6246       39390 :             desired_state = old_state;
    6247             : 
    6248             :             /* compute desired flags */
    6249             : 
    6250       39390 :             if (new_wake_in_progress)
    6251       39034 :                 desired_state |= BM_LOCK_WAKE_IN_PROGRESS;
    6252             :             else
    6253         356 :                 desired_state &= ~BM_LOCK_WAKE_IN_PROGRESS;
    6254             : 
    6255       39390 :             if (proclist_is_empty(&buf_hdr->lock_waiters))
    6256       33006 :                 desired_state &= ~BM_LOCK_HAS_WAITERS;
    6257             : 
    6258       39390 :             desired_state &= ~BM_LOCKED;    /* release lock */
    6259             : 
    6260       39390 :             if (pg_atomic_compare_exchange_u64(&buf_hdr->state, &old_state,
    6261             :                                                desired_state))
    6262       39356 :                 break;
    6263             :         }
    6264             :     }
    6265             : 
    6266             :     /* Awaken any waiters I removed from the queue. */
    6267       80828 :     proclist_foreach_modify(iter, &wakeup, lwWaitLink)
    6268             :     {
    6269       41472 :         PGPROC     *waiter = GetPGProcByNumber(iter.cur);
    6270             : 
    6271       41472 :         proclist_delete(&wakeup, iter.cur, lwWaitLink);
    6272             : 
    6273             :         /*
    6274             :          * Guarantee that lwWaiting being unset only becomes visible once the
    6275             :          * unlink from the wait list has completed. Otherwise the target
    6276             :          * backend could be woken up for another reason and enqueue for a new
    6277             :          * lock - if that happens before the list unlink happens, the list
    6278             :          * would end up being corrupted.
    6279             :          *
    6280             :          * The barrier pairs with the LockBufHdr() when enqueuing for another
    6281             :          * lock.
    6282             :          */
    6283       41472 :         pg_write_barrier();
    6284       41472 :         waiter->lwWaiting = LW_WS_NOT_WAITING;
    6285       41472 :         PGSemaphoreUnlock(waiter->sem);
    6286             :     }
    6287       39356 : }
    6288             : 
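/*
 * Editor's note, a worked example of the wakeup policy implemented above
 * (derived from the loop in BufferLockWakeup(); illustrative only): with
 * wake_exclusive = true and a wait queue of [SHARE, SHARE, EXCLUSIVE, SHARE],
 * the two leading share waiters are woken and clear wake_exclusive, the
 * exclusive waiter is skipped, and the trailing share waiter is woken as
 * well.  Had the queue begun with the exclusive waiter, only that waiter
 * would be woken, since an exclusive waiter ends the scan.
 */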
    6289             : /*
    6290             :  * Compute the value to subtract from the buffer state when releasing a lock
    6291             :  * held in `mode`.
    6292             :  *
    6293             :  * This is separated from BufferLockUnlock() as we want to combine the lock
    6294             :  * release with other atomic operations when possible, leading to the lock
    6295             :  * release being done in multiple places, each needing to compute what to
    6296             :  * subtract from the lock state.
    6297             :  */
    6298             : static inline uint64
    6299   166455268 : BufferLockReleaseSub(BufferLockMode mode)
    6300             : {
    6301             :     /*
    6302             :      * Turns out that a switch() leads gcc to generate sufficiently worse code
    6303             :      * for this to show up in profiles...
    6304             :      */
    6305   166455268 :     if (mode == BUFFER_LOCK_EXCLUSIVE)
    6306    50996268 :         return BM_LOCK_VAL_EXCLUSIVE;
    6307   115459000 :     else if (mode == BUFFER_LOCK_SHARE_EXCLUSIVE)
    6308           0 :         return BM_LOCK_VAL_SHARE_EXCLUSIVE;
    6309             :     else
    6310             :     {
    6311             :         Assert(mode == BUFFER_LOCK_SHARE);
    6312   115459000 :         return BM_LOCK_VAL_SHARED;
    6313             :     }
    6314             : 
    6315             :     return 0;                   /* keep compiler quiet */
    6316             : }
    6317             : 
    6318             : /*
    6319             :  * Handle work that needs to be done after releasing a lock that was held in
    6320             :  * `mode`, where `lockstate` is the result of the atomic operation modifying
    6321             :  * the state variable.
    6322             :  *
    6323             :  * This is separated from BufferLockUnlock() as we want to combine the lock
    6324             :  * release with other atomic operations when possible, leading to the lock
    6325             :  * release being done in multiple places.
    6326             :  */
    6327             : static void
    6328   166455268 : BufferLockProcessRelease(BufferDesc *buf_hdr, BufferLockMode mode, uint64 lockstate)
    6329             : {
    6330   166455268 :     bool        check_waiters = false;
    6331   166455268 :     bool        wake_exclusive = false;
    6332             : 
    6333             :     /* nobody else can have that kind of lock */
    6334             :     Assert(!(lockstate & BM_LOCK_VAL_EXCLUSIVE));
    6335             : 
    6336             :     /*
    6337             :      * If we're still waiting for backends to get scheduled, don't wake them
    6338             :      * up again. Otherwise check if we need to look through the waitqueue to
    6339             :      * wake other backends.
    6340             :      */
    6341   166455268 :     if ((lockstate & BM_LOCK_HAS_WAITERS) &&
    6342      157026 :         !(lockstate & BM_LOCK_WAKE_IN_PROGRESS))
    6343             :     {
    6344       78356 :         if ((lockstate & BM_LOCK_MASK) == 0)
    6345             :         {
    6346             :             /*
    6347             :              * We released a lock and the lock was, in that moment, free. We
    6348             :              * therefore can wake waiters for any kind of lock.
    6349             :              */
    6350       39356 :             check_waiters = true;
    6351       39356 :             wake_exclusive = true;
    6352             :         }
    6353       39000 :         else if (mode == BUFFER_LOCK_SHARE_EXCLUSIVE)
    6354             :         {
    6355             :             /*
    6356             :              * We released the lock, but another backend still holds a lock.
    6357             :              * We can't have released an exclusive lock, as there couldn't
    6358             :              * have been other lock holders. If we released a share lock, no
    6359             :              * waiters need to be woken up, as there must be other share
    6360             :              * lockers. However, if we held a share-exclusive lock, another
    6361             :              * backend now could acquire a share-exclusive lock.
    6362             :              */
    6363           0 :             check_waiters = true;
    6364           0 :             wake_exclusive = false;
    6365             :         }
    6366             :     }
    6367             : 
    6368             :     /*
    6369             :      * As waking up waiters requires the spinlock to be acquired, only do so
    6370             :      * if necessary.
    6371             :      */
    6372   166455268 :     if (check_waiters)
    6373       39356 :         BufferLockWakeup(buf_hdr, wake_exclusive);
    6374   166455268 : }
    6375             : 
    6376             : /*
    6377             :  * BufferLockHeldByMeInMode - test whether my process holds the content lock
    6378             :  * in the specified mode
    6379             :  *
    6380             :  * This is meant as debug support only.
    6381             :  */
    6382             : static bool
    6383           0 : BufferLockHeldByMeInMode(BufferDesc *buf_hdr, BufferLockMode mode)
    6384             : {
    6385             :     PrivateRefCountEntry *entry =
    6386           0 :         GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf_hdr), false);
    6387             : 
    6388           0 :     if (!entry)
    6389           0 :         return false;
    6390             :     else
    6391           0 :         return entry->data.lockmode == mode;
    6392             : }
    6393             : 
    6394             : /*
    6395             :  * BufferLockHeldByMe - test whether my process holds the content lock in any
    6396             :  * mode
    6397             :  *
    6398             :  * This is meant as debug support only.
    6399             :  */
    6400             : static bool
    6401           0 : BufferLockHeldByMe(BufferDesc *buf_hdr)
    6402             : {
    6403             :     PrivateRefCountEntry *entry =
    6404           0 :         GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf_hdr), false);
    6405             : 
    6406           0 :     if (!entry)
    6407           0 :         return false;
    6408             :     else
    6409           0 :         return entry->data.lockmode != BUFFER_LOCK_UNLOCK;
    6410             : }
    6411             : 
    6412             : /*
    6413             :  * Release the content lock for the buffer.
    6414             :  */
    6415             : void
    6416   175818856 : UnlockBuffer(Buffer buffer)
    6417             : {
    6418             :     BufferDesc *buf_hdr;
    6419             : 
    6420             :     Assert(BufferIsPinned(buffer));
    6421   175818856 :     if (BufferIsLocal(buffer))
    6422     9977878 :         return;                 /* local buffers need no lock */
    6423             : 
    6424   165840978 :     buf_hdr = GetBufferDescriptor(buffer - 1);
    6425   165840978 :     BufferLockUnlock(buffer, buf_hdr);
    6426             : }
    6427             : 
    6428             : /*
    6429             :  * Acquire the content lock for the buffer.
    6430             :  */
    6431             : void
    6432   172451016 : LockBufferInternal(Buffer buffer, BufferLockMode mode)
    6433             : {
    6434             :     BufferDesc *buf_hdr;
    6435             : 
    6436             :     /*
    6437             :      * We can't wait if we haven't got a PGPROC.  This should only occur
    6438             :      * during bootstrap or shared memory initialization.  Put an Assert here
    6439             :      * to catch unsafe coding practices.
    6440             :      */
    6441             :     Assert(!(MyProc == NULL && IsUnderPostmaster));
    6442             : 
    6443             :     /* handled in LockBuffer() wrapper */
    6444             :     Assert(mode != BUFFER_LOCK_UNLOCK);
    6445             : 
    6446             :     Assert(BufferIsPinned(buffer));
    6447   172451016 :     if (BufferIsLocal(buffer))
    6448     9824254 :         return;                 /* local buffers need no lock */
    6449             : 
    6450   162626762 :     buf_hdr = GetBufferDescriptor(buffer - 1);
    6451             : 
    6452             :     /*
    6453             :      * Test the most frequent lock modes first. While a switch (mode) would be
    6454             :      * nice, at least gcc generates considerably worse code for it.
    6455             :      *
    6456             :      * Call BufferLockAcquire() with a constant argument for mode, to generate
    6457             :      * more efficient code for the different lock modes.
    6458             :      */
    6459   162626762 :     if (mode == BUFFER_LOCK_SHARE)
    6460   114295650 :         BufferLockAcquire(buffer, buf_hdr, BUFFER_LOCK_SHARE);
    6461    48331112 :     else if (mode == BUFFER_LOCK_EXCLUSIVE)
    6462    48331112 :         BufferLockAcquire(buffer, buf_hdr, BUFFER_LOCK_EXCLUSIVE);
    6463           0 :     else if (mode == BUFFER_LOCK_SHARE_EXCLUSIVE)
    6464           0 :         BufferLockAcquire(buffer, buf_hdr, BUFFER_LOCK_SHARE_EXCLUSIVE);
    6465             :     else
    6466           0 :         elog(ERROR, "unrecognized buffer lock mode: %d", mode);
    6467             : }
    6468             : 
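/*
 * Editor's illustration (not part of bufmgr.c): a minimal sketch of the
 * caller-side pattern that funnels into LockBufferInternal() above.  The
 * function name is hypothetical; only the bufmgr calls themselves are real.
 */
static void
example_read_one_page(Relation rel, BlockNumber blkno)
{
    Buffer      buf = ReadBuffer(rel, blkno);

    /* a share lock suffices for read-only access to the page contents */
    LockBuffer(buf, BUFFER_LOCK_SHARE);

    /* ... inspect BufferGetPage(buf) here ... */

    /* drops the content lock and the pin in one call */
    UnlockReleaseBuffer(buf);
}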
    6469             : /*
    6470             :  * Acquire the content lock for the buffer, but only if we don't have to wait.
    6471             :  *
    6472             :  * This assumes the caller wants BUFFER_LOCK_EXCLUSIVE mode.
    6473             :  */
    6474             : bool
    6475     2796502 : ConditionalLockBuffer(Buffer buffer)
    6476             : {
    6477             :     BufferDesc *buf;
    6478             : 
    6479             :     Assert(BufferIsPinned(buffer));
    6480     2796502 :     if (BufferIsLocal(buffer))
    6481      129374 :         return true;            /* act as though we got it */
    6482             : 
    6483     2667128 :     buf = GetBufferDescriptor(buffer - 1);
    6484             : 
    6485     2667128 :     return BufferLockConditional(buffer, buf, BUFFER_LOCK_EXCLUSIVE);
    6486             : }
    6487             : 
    6488             : /*
    6489             :  * Verify that this backend is pinning the buffer exactly once.
    6490             :  *
    6491             :  * NOTE: Like in BufferIsPinned(), what we check here is that *this* backend
    6492             :  * holds a pin on the buffer.  We do not care whether some other backend does.
    6493             :  */
    6494             : void
    6495     4856636 : CheckBufferIsPinnedOnce(Buffer buffer)
    6496             : {
    6497     4856636 :     if (BufferIsLocal(buffer))
    6498             :     {
    6499        1582 :         if (LocalRefCount[-buffer - 1] != 1)
    6500           0 :             elog(ERROR, "incorrect local pin count: %d",
    6501             :                  LocalRefCount[-buffer - 1]);
    6502             :     }
    6503             :     else
    6504             :     {
    6505     4855054 :         if (GetPrivateRefCount(buffer) != 1)
    6506           0 :             elog(ERROR, "incorrect local pin count: %d",
    6507             :                  GetPrivateRefCount(buffer));
    6508             :     }
    6509     4856636 : }
    6510             : 
    6511             : /*
    6512             :  * LockBufferForCleanup - lock a buffer in preparation for deleting items
    6513             :  *
    6514             :  * Items may be deleted from a disk page only when the caller (a) holds an
    6515             :  * exclusive lock on the buffer and (b) has observed that no other backend
    6516             :  * holds a pin on the buffer.  If there is a pin, then the other backend
    6517             :  * might have a pointer into the buffer (for example, a heapscan reference
    6518             :  * to an item --- see README for more details).  It's OK if a pin is added
    6519             :  * after the cleanup starts, however; the newly-arrived backend will be
    6520             :  * unable to look at the page until we release the exclusive lock.
    6521             :  *
    6522             :  * To implement this protocol, a would-be deleter must pin the buffer and
    6523             :  * then call LockBufferForCleanup().  LockBufferForCleanup() is similar to
    6524             :  * LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE), except that it loops until
    6525             :  * it has successfully observed pin count = 1.
    6526             :  */
    6527             : void
    6528       47868 : LockBufferForCleanup(Buffer buffer)
    6529             : {
    6530             :     BufferDesc *bufHdr;
    6531       47868 :     TimestampTz waitStart = 0;
    6532       47868 :     bool        waiting = false;
    6533       47868 :     bool        logged_recovery_conflict = false;
    6534             : 
    6535             :     Assert(BufferIsPinned(buffer));
    6536             :     Assert(PinCountWaitBuf == NULL);
    6537             : 
    6538       47868 :     CheckBufferIsPinnedOnce(buffer);
    6539             : 
    6540             :     /*
    6541             :      * We do not yet need to be worried about in-progress AIOs holding a pin,
    6542             :      * as we, so far, only support doing reads via AIO and this function can
    6543             :      * only be called once the buffer is valid (i.e. no read can be in
    6544             :      * flight).
    6545             :      */
    6546             : 
    6547             :     /* Nobody else to wait for */
    6548       47868 :     if (BufferIsLocal(buffer))
    6549          32 :         return;
    6550             : 
    6551       47836 :     bufHdr = GetBufferDescriptor(buffer - 1);
    6552             : 
    6553             :     for (;;)
    6554         178 :     {
    6555             :         uint64      buf_state;
    6556       48014 :         uint64      unset_bits = 0;
    6557             : 
    6558             :         /* Try to acquire lock */
    6559       48014 :         LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
    6560       48014 :         buf_state = LockBufHdr(bufHdr);
    6561             : 
    6562             :         Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
    6563       48014 :         if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
    6564             :         {
    6565             :             /* Successfully acquired exclusive lock with pincount 1 */
    6566       47836 :             UnlockBufHdr(bufHdr);
    6567             : 
    6568             :             /*
    6569             :              * Emit the log message if recovery conflict on buffer pin was
    6570             :              * resolved but the startup process waited longer than
    6571             :              * deadlock_timeout for it.
    6572             :              */
    6573       47836 :             if (logged_recovery_conflict)
    6574           4 :                 LogRecoveryConflict(RECOVERY_CONFLICT_BUFFERPIN,
    6575             :                                     waitStart, GetCurrentTimestamp(),
    6576             :                                     NULL, false);
    6577             : 
    6578       47836 :             if (waiting)
    6579             :             {
    6580             :                 /* reset ps display to remove the suffix if we added one */
    6581           4 :                 set_ps_display_remove_suffix();
    6582           4 :                 waiting = false;
    6583             :             }
    6584       47836 :             return;
    6585             :         }
    6586             :         /* Failed, so mark myself as waiting for pincount 1 */
    6587         178 :         if (buf_state & BM_PIN_COUNT_WAITER)
    6588             :         {
    6589           0 :             UnlockBufHdr(bufHdr);
    6590           0 :             LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
    6591           0 :             elog(ERROR, "multiple backends attempting to wait for pincount 1");
    6592             :         }
    6593         178 :         bufHdr->wait_backend_pgprocno = MyProcNumber;
    6594         178 :         PinCountWaitBuf = bufHdr;
    6595         178 :         UnlockBufHdrExt(bufHdr, buf_state,
    6596             :                         BM_PIN_COUNT_WAITER, 0,
    6597             :                         0);
    6598         178 :         LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
    6599             : 
    6600             :         /* Wait to be signaled by UnpinBuffer() */
    6601         178 :         if (InHotStandby)
    6602             :         {
    6603          18 :             if (!waiting)
    6604             :             {
    6605             :                 /* adjust the process title to indicate that it's waiting */
    6606           4 :                 set_ps_display_suffix("waiting");
    6607           4 :                 waiting = true;
    6608             :             }
    6609             : 
    6610             :             /*
    6611             :              * Emit the log message if the startup process is waiting longer
    6612             :              * than deadlock_timeout for recovery conflict on buffer pin.
    6613             :              *
    6614             :              * Skip this the first time through, because the startup process
    6615             :              * has not started waiting yet in that case; the wait start
    6616             :              * timestamp is set after this logic.
    6617             :              */
    6618          18 :             if (waitStart != 0 && !logged_recovery_conflict)
    6619             :             {
    6620           6 :                 TimestampTz now = GetCurrentTimestamp();
    6621             : 
    6622           6 :                 if (TimestampDifferenceExceeds(waitStart, now,
    6623             :                                                DeadlockTimeout))
    6624             :                 {
    6625           4 :                     LogRecoveryConflict(RECOVERY_CONFLICT_BUFFERPIN,
    6626             :                                         waitStart, now, NULL, true);
    6627           4 :                     logged_recovery_conflict = true;
    6628             :                 }
    6629             :             }
    6630             : 
    6631             :             /*
    6632             :              * Set the wait start timestamp if logging is enabled and first
    6633             :              * time through.
    6634             :              */
    6635          18 :             if (log_recovery_conflict_waits && waitStart == 0)
    6636           4 :                 waitStart = GetCurrentTimestamp();
    6637             : 
    6638             :             /* Publish the bufid that Startup process waits on */
    6639          18 :             SetStartupBufferPinWaitBufId(buffer - 1);
    6640             :             /* Set alarm and then wait to be signaled by UnpinBuffer() */
    6641          18 :             ResolveRecoveryConflictWithBufferPin();
    6642             :             /* Reset the published bufid */
    6643          18 :             SetStartupBufferPinWaitBufId(-1);
    6644             :         }
    6645             :         else
    6646         160 :             ProcWaitForSignal(WAIT_EVENT_BUFFER_CLEANUP);
    6647             : 
    6648             :         /*
    6649             :          * Remove the flag marking us as a waiter. Normally this will not be set
    6650             :          * anymore, but ProcWaitForSignal() can return for other signals as
    6651             :          * well.  We take care to only reset the flag if we're the waiter, as
    6652             :          * theoretically another backend could have started waiting. That's
    6653             :          * impossible with the current usages due to table level locking, but
    6654             :          * better be safe.
    6655             :          */
    6656         178 :         buf_state = LockBufHdr(bufHdr);
    6657         178 :         if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
    6658          14 :             bufHdr->wait_backend_pgprocno == MyProcNumber)
    6659          14 :             unset_bits |= BM_PIN_COUNT_WAITER;
    6660             : 
    6661         178 :         UnlockBufHdrExt(bufHdr, buf_state,
    6662             :                         0, unset_bits,
    6663             :                         0);
    6664             : 
    6665         178 :         PinCountWaitBuf = NULL;
    6666             :         /* Loop back and try again */
    6667             :     }
    6668             : }
    6669             : 
    6670             : /*
    6671             :  * Check called from ProcessRecoveryConflictInterrupts() when Startup process
    6672             :  * requests cancellation of all pin holders that are blocking it.
    6673             :  */
    6674             : bool
    6675           6 : HoldingBufferPinThatDelaysRecovery(void)
    6676             : {
    6677           6 :     int         bufid = GetStartupBufferPinWaitBufId();
    6678             : 
    6679             :     /*
    6680             :      * If we get woken slowly then it's possible that the Startup process was
    6681             :      * already woken by other backends before we got here. It's also possible
    6682             :      * that we get here via multiple interrupts or interrupts at inappropriate
    6683             :      * times, so make sure we do nothing if the bufid is not set.
    6684             :      */
    6685           6 :     if (bufid < 0)
    6686           2 :         return false;
    6687             : 
    6688           4 :     if (GetPrivateRefCount(bufid + 1) > 0)
    6689           4 :         return true;
    6690             : 
    6691           0 :     return false;
    6692             : }
    6693             : 
    6694             : /*
    6695             :  * ConditionalLockBufferForCleanup - as above, but don't wait to get the lock
    6696             :  *
    6697             :  * We won't loop, but just check once to see if the pin count is OK.  If
    6698             :  * not, return false with no lock held.
    6699             :  */
    6700             : bool
    6701      909148 : ConditionalLockBufferForCleanup(Buffer buffer)
    6702             : {
    6703             :     BufferDesc *bufHdr;
    6704             :     uint64      buf_state,
    6705             :                 refcount;
    6706             : 
    6707             :     Assert(BufferIsValid(buffer));
    6708             : 
    6709             :     /* see AIO related comment in LockBufferForCleanup() */
    6710             : 
    6711      909148 :     if (BufferIsLocal(buffer))
    6712             :     {
    6713        1612 :         refcount = LocalRefCount[-buffer - 1];
    6714             :         /* There should be exactly one pin */
    6715             :         Assert(refcount > 0);
    6716        1612 :         if (refcount != 1)
    6717          42 :             return false;
    6718             :         /* Nobody else to wait for */
    6719        1570 :         return true;
    6720             :     }
    6721             : 
    6722             :     /* There should be exactly one local pin */
    6723      907536 :     refcount = GetPrivateRefCount(buffer);
    6724             :     Assert(refcount);
    6725      907536 :     if (refcount != 1)
    6726         564 :         return false;
    6727             : 
    6728             :     /* Try to acquire lock */
    6729      906972 :     if (!ConditionalLockBuffer(buffer))
    6730          58 :         return false;
    6731             : 
    6732      906914 :     bufHdr = GetBufferDescriptor(buffer - 1);
    6733      906914 :     buf_state = LockBufHdr(bufHdr);
    6734      906914 :     refcount = BUF_STATE_GET_REFCOUNT(buf_state);
    6735             : 
    6736             :     Assert(refcount > 0);
    6737      906914 :     if (refcount == 1)
    6738             :     {
    6739             :         /* Successfully acquired exclusive lock with pincount 1 */
    6740      906376 :         UnlockBufHdr(bufHdr);
    6741      906376 :         return true;
    6742             :     }
    6743             : 
    6744             :     /* Failed, so release the lock */
    6745         538 :     UnlockBufHdr(bufHdr);
    6746         538 :     LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
    6747         538 :     return false;
    6748             : }
    6749             : 
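/*
 * Editor's illustration (not part of bufmgr.c): a hedged sketch of how an
 * opportunistic caller might use ConditionalLockBufferForCleanup().  The
 * function name and body are hypothetical; the caller is assumed to already
 * hold a pin on the buffer.
 */
static void
example_try_page_cleanup(Buffer buf)
{
    if (ConditionalLockBufferForCleanup(buf))
    {
        /* exclusive lock held and pin count was 1: cleanup is safe */
        /* ... prune or defragment the page here ... */
        LockBuffer(buf, BUFFER_LOCK_UNLOCK);
    }
    /* otherwise simply skip the cleanup rather than waiting */
}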
    6750             : /*
    6751             :  * IsBufferCleanupOK - as above, but we already have the lock
    6752             :  *
    6753             :  * Check whether it's OK to perform cleanup on a buffer we've already
    6754             :  * locked.  If we observe that the pin count is 1, our exclusive lock
    6755             :  * happens to be a cleanup lock, and we can proceed with anything that
    6756             :  * would have been allowable had we sought a cleanup lock originally.
    6757             :  */
    6758             : bool
    6759        4038 : IsBufferCleanupOK(Buffer buffer)
    6760             : {
    6761             :     BufferDesc *bufHdr;
    6762             :     uint64      buf_state;
    6763             : 
    6764             :     Assert(BufferIsValid(buffer));
    6765             : 
    6766             :     /* see AIO related comment in LockBufferForCleanup() */
    6767             : 
    6768        4038 :     if (BufferIsLocal(buffer))
    6769             :     {
    6770             :         /* There should be exactly one pin */
    6771           0 :         if (LocalRefCount[-buffer - 1] != 1)
    6772           0 :             return false;
    6773             :         /* Nobody else to wait for */
    6774           0 :         return true;
    6775             :     }
    6776             : 
    6777             :     /* There should be exactly one local pin */
    6778        4038 :     if (GetPrivateRefCount(buffer) != 1)
    6779           0 :         return false;
    6780             : 
    6781        4038 :     bufHdr = GetBufferDescriptor(buffer - 1);
    6782             : 
    6783             :     /* caller must hold exclusive lock on buffer */
    6784             :     Assert(BufferIsLockedByMeInMode(buffer, BUFFER_LOCK_EXCLUSIVE));
    6785             : 
    6786        4038 :     buf_state = LockBufHdr(bufHdr);
    6787             : 
    6788             :     Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
    6789        4038 :     if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
    6790             :     {
    6791             :         /* pincount is OK. */
    6792        4038 :         UnlockBufHdr(bufHdr);
    6793        4038 :         return true;
    6794             :     }
    6795             : 
    6796           0 :     UnlockBufHdr(bufHdr);
    6797           0 :     return false;
    6798             : }
    6799             : 
    6800             : 
    6801             : /*
    6802             :  *  Functions for buffer I/O handling
    6803             :  *
    6804             :  *  Note that these are used only for shared buffers, not local ones.
    6805             :  */
    6806             : 
    6807             : /*
    6808             :  * WaitIO -- Block until the IO_IN_PROGRESS flag on 'buf' is cleared.
    6809             :  */
    6810             : static void
    6811        4504 : WaitIO(BufferDesc *buf)
    6812             : {
    6813        4504 :     ConditionVariable *cv = BufferDescriptorGetIOCV(buf);
    6814             : 
    6815        4504 :     ConditionVariablePrepareToSleep(cv);
    6816             :     for (;;)
    6817        4454 :     {
    6818             :         uint64      buf_state;
    6819             :         PgAioWaitRef iow;
    6820             : 
    6821             :         /*
    6822             :          * It may not be necessary to acquire the spinlock to check the flag
    6823             :          * here, but since this test is essential for correctness, we'd better
    6824             :          * play it safe.
    6825             :          */
    6826        8958 :         buf_state = LockBufHdr(buf);
    6827             : 
    6828             :         /*
    6829             :          * Copy the wait reference while holding the spinlock. This protects
    6830             :          * against a concurrent TerminateBufferIO() in another backend from
    6831             :          * clearing the wref while it's being read.
    6832             :          */
    6833        8958 :         iow = buf->io_wref;
    6834        8958 :         UnlockBufHdr(buf);
    6835             : 
    6836             :         /* no IO in progress, we don't need to wait */
    6837        8958 :         if (!(buf_state & BM_IO_IN_PROGRESS))
    6838        4504 :             break;
    6839             : 
    6840             :         /*
    6841             :          * The buffer has asynchronous IO in progress, wait for it to
    6842             :          * complete.
    6843             :          */
    6844        4454 :         if (pgaio_wref_valid(&iow))
    6845             :         {
    6846        3924 :             pgaio_wref_wait(&iow);
    6847             : 
    6848             :             /*
    6849             :              * The AIO subsystem internally uses condition variables and thus
    6850             :              * might remove this backend from the BufferDesc's CV. While that
    6851             :              * wouldn't cause a correctness issue (the first CV sleep just
    6852             :              * immediately returns if not already registered), it seems worth
    6853             :              * avoiding unnecessary loop iterations by re-registering here,
    6854             :              * just as we do at the start of the function.
    6855             :              */
    6856        3924 :             ConditionVariablePrepareToSleep(cv);
    6857        3924 :             continue;
    6858             :         }
    6859             : 
    6860             :         /* wait on BufferDesc->cv, e.g. for concurrent synchronous IO */
    6861         530 :         ConditionVariableSleep(cv, WAIT_EVENT_BUFFER_IO);
    6862             :     }
    6863        4504 :     ConditionVariableCancelSleep();
    6864        4504 : }
    6865             : 
    6866             : /*
    6867             :  * StartBufferIO: begin I/O on this buffer
    6868             :  *  (Assumptions)
    6869             :  *  My process is executing no IO on this buffer
    6870             :  *  The buffer is Pinned
    6871             :  *
    6872             :  * In some scenarios multiple backends could attempt the same I/O operation
    6873             :  * concurrently.  If someone else has already started I/O on this buffer then
    6874             :  * we will wait for completion of the IO using WaitIO().
    6875             :  *
    6876             :  * Input operations are only attempted on buffers that are not BM_VALID,
    6877             :  * and output operations only on buffers that are BM_VALID and BM_DIRTY,
    6878             :  * so we can always tell if the work is already done.
    6879             :  *
    6880             :  * Returns true if we successfully marked the buffer as I/O busy,
    6881             :  * false if someone else already did the work.
    6882             :  *
    6883             :  * If nowait is true, then we don't wait for an I/O to be finished by another
    6884             :  * backend.  In that case, false indicates either that the I/O was already
    6885             :  * finished, or is still in progress.  This is useful for callers that want to
    6886             :  * find out if they can perform the I/O as part of a larger operation, without
    6887             :  * waiting for the answer or distinguishing the reasons why not.
    6888             :  */
    6889             : bool
    6890     5130324 : StartBufferIO(BufferDesc *buf, bool forInput, bool nowait)
    6891             : {
    6892             :     uint64      buf_state;
    6893             : 
    6894     5130324 :     ResourceOwnerEnlarge(CurrentResourceOwner);
    6895             : 
    6896             :     for (;;)
    6897             :     {
    6898     5134818 :         buf_state = LockBufHdr(buf);
    6899             : 
    6900     5134818 :         if (!(buf_state & BM_IO_IN_PROGRESS))
    6901     5130316 :             break;
    6902        4502 :         UnlockBufHdr(buf);
    6903        4502 :         if (nowait)
    6904           8 :             return false;
    6905        4494 :         WaitIO(buf);
    6906             :     }
    6907             : 
    6908             :     /* Once we get here, there is definitely no I/O active on this buffer */
    6909             : 
    6910             :     /* Check if someone else already did the I/O */
    6911     5130316 :     if (forInput ? (buf_state & BM_VALID) : !(buf_state & BM_DIRTY))
    6912             :     {
    6913        4742 :         UnlockBufHdr(buf);
    6914        4742 :         return false;
    6915             :     }
    6916             : 
    6917     5125574 :     UnlockBufHdrExt(buf, buf_state,
    6918             :                     BM_IO_IN_PROGRESS, 0,
    6919             :                     0);
    6920             : 
    6921     5125574 :     ResourceOwnerRememberBufferIO(CurrentResourceOwner,
    6922             :                                   BufferDescriptorGetBuffer(buf));
    6923             : 
    6924     5125574 :     return true;
    6925             : }
    6926             : 
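/*
 * Editor's illustration (not part of bufmgr.c): a hedged sketch of how the
 * StartBufferIO()/TerminateBufferIO() pair brackets a synchronous write of a
 * shared buffer.  The helper name is hypothetical and the actual write is
 * elided; the caller is assumed to hold a pin and a content lock, and the
 * flag arguments follow the conventions documented here and in
 * TerminateBufferIO() below.
 */
static void
example_write_buffer(BufferDesc *buf_hdr)
{
    /* false means somebody else already wrote the buffer out */
    if (!StartBufferIO(buf_hdr, false /* forInput */ , false /* nowait */ ))
        return;

    /* ... perform the actual write of the page here ... */

    /*
     * Success: clear BM_DIRTY (unless the page was re-dirtied meanwhile),
     * set no extra flag bits, forget the I/O in the current resource owner,
     * and don't release an AIO-held reference.
     */
    TerminateBufferIO(buf_hdr, true, 0, true, false);
}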
    6927             : /*
    6928             :  * TerminateBufferIO: release a buffer we were doing I/O on
    6929             :  *  (Assumptions)
    6930             :  *  My process is executing IO for the buffer
    6931             :  *  BM_IO_IN_PROGRESS bit is set for the buffer
    6932             :  *  The buffer is Pinned
    6933             :  *
    6934             :  * If clear_dirty is true and BM_JUST_DIRTIED is not set, we clear the
    6935             :  * buffer's BM_DIRTY flag.  This is appropriate when terminating a
    6936             :  * successful write.  The check on BM_JUST_DIRTIED is necessary to avoid
    6937             :  * marking the buffer clean if it was re-dirtied while we were writing.
    6938             :  *
    6939             :  * set_flag_bits gets ORed into the buffer's flags.  It must include
    6940             :  * BM_IO_ERROR in a failure case.  For successful completion it could
    6941             :  * be 0, or BM_VALID if we just finished reading in the page.
    6942             :  *
    6943             :  * If forget_owner is true, we release the buffer I/O from the current
    6944             :  * resource owner. (forget_owner=false is used when the resource owner itself
    6945             :  * is being released)
    6946             :  */
    6947             : void
    6948     4846310 : TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint64 set_flag_bits,
    6949             :                   bool forget_owner, bool release_aio)
    6950             : {
    6951             :     uint64      buf_state;
    6952     4846310 :     uint64      unset_flag_bits = 0;
    6953     4846310 :     int         refcount_change = 0;
    6954             : 
    6955     4846310 :     buf_state = LockBufHdr(buf);
    6956             : 
    6957             :     Assert(buf_state & BM_IO_IN_PROGRESS);
    6958     4846310 :     unset_flag_bits |= BM_IO_IN_PROGRESS;
    6959             : 
    6960             :     /* Clear earlier errors; if this IO failed, it'll be marked again */
    6961     4846310 :     unset_flag_bits |= BM_IO_ERROR;
    6962             : 
    6963     4846310 :     if (clear_dirty && !(buf_state & BM_JUST_DIRTIED))
    6964     1151160 :         unset_flag_bits |= BM_DIRTY | BM_CHECKPOINT_NEEDED;
    6965             : 
    6966     4846310 :     if (release_aio)
    6967             :     {
    6968             :         /* release ownership by the AIO subsystem */
    6969             :         Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
    6970     2666002 :         refcount_change = -1;
    6971     2666002 :         pgaio_wref_clear(&buf->io_wref);
    6972             :     }
    6973             : 
    6974     4846310 :     buf_state = UnlockBufHdrExt(buf, buf_state,
    6975             :                                 set_flag_bits, unset_flag_bits,
    6976             :                                 refcount_change);
    6977             : 
    6978     4846310 :     if (forget_owner)
    6979     2180266 :         ResourceOwnerForgetBufferIO(CurrentResourceOwner,
    6980             :                                     BufferDescriptorGetBuffer(buf));
    6981             : 
    6982     4846310 :     ConditionVariableBroadcast(BufferDescriptorGetIOCV(buf));
    6983             : 
    6984             :     /*
    6985             :      * Support LockBufferForCleanup()
    6986             :      *
    6987             :      * We may have just released the last pin other than the waiter's. In most
    6988             :      * cases, this backend holds another pin on the buffer. But, if, for
    6989             :      * example, this backend is completing an IO issued by another backend, it
    6990             :      * may be time to wake the waiter.
    6991             :      */
    6992     4846310 :     if (release_aio && (buf_state & BM_PIN_COUNT_WAITER))
    6993           0 :         WakePinCountWaiter(buf);
    6994     4846310 : }
    6995             : 
    6996             : /*
    6997             :  * AbortBufferIO: Clean up active buffer I/O after an error.
    6998             :  *
    6999             :  *  All LWLocks & content locks we might have held have been released, but we
    7000             :  *  haven't yet released buffer pins, so the buffer is still pinned.
    7001             :  *
    7002             :  *  If I/O was in progress, we always set BM_IO_ERROR, even though it's
    7003             :  *  possible the error condition wasn't related to the I/O.
    7004             :  *
    7005             :  *  Note: this does not remove the buffer I/O from the resource owner.
    7006             :  *  That's correct when we're releasing the whole resource owner, but
    7007             :  *  beware if you use this in other contexts.
    7008             :  */
    7009             : static void
    7010          30 : AbortBufferIO(Buffer buffer)
    7011             : {
    7012          30 :     BufferDesc *buf_hdr = GetBufferDescriptor(buffer - 1);
    7013             :     uint64      buf_state;
    7014             : 
    7015          30 :     buf_state = LockBufHdr(buf_hdr);
    7016             :     Assert(buf_state & (BM_IO_IN_PROGRESS | BM_TAG_VALID));
    7017             : 
    7018          30 :     if (!(buf_state & BM_VALID))
    7019             :     {
    7020             :         Assert(!(buf_state & BM_DIRTY));
    7021          30 :         UnlockBufHdr(buf_hdr);
    7022             :     }
    7023             :     else
    7024             :     {
    7025             :         Assert(buf_state & BM_DIRTY);
    7026           0 :         UnlockBufHdr(buf_hdr);
    7027             : 
    7028             :         /* Issue notice if this is not the first failure... */
    7029           0 :         if (buf_state & BM_IO_ERROR)
    7030             :         {
    7031             :             /* Buffer is pinned, so we can read tag without spinlock */
    7032           0 :             ereport(WARNING,
    7033             :                     (errcode(ERRCODE_IO_ERROR),
    7034             :                      errmsg("could not write block %u of %s",
    7035             :                             buf_hdr->tag.blockNum,
    7036             :                             relpathperm(BufTagGetRelFileLocator(&buf_hdr->tag),
    7037             :                                         BufTagGetForkNum(&buf_hdr->tag)).str),
    7038             :                      errdetail("Multiple failures --- write error might be permanent.")));
    7039             :         }
    7040             :     }
    7041             : 
    7042          30 :     TerminateBufferIO(buf_hdr, false, BM_IO_ERROR, false, false);
    7043          30 : }
    7044             : 
    7045             : /*
    7046             :  * Error context callback for errors occurring during shared buffer writes.
    7047             :  */
    7048             : static void
    7049          94 : shared_buffer_write_error_callback(void *arg)
    7050             : {
    7051          94 :     BufferDesc *bufHdr = (BufferDesc *) arg;
    7052             : 
    7053             :     /* Buffer is pinned, so we can read the tag without locking the spinlock */
    7054          94 :     if (bufHdr != NULL)
    7055         188 :         errcontext("writing block %u of relation \"%s\"",
    7056             :                    bufHdr->tag.blockNum,
    7057          94 :                    relpathperm(BufTagGetRelFileLocator(&bufHdr->tag),
    7058             :                                BufTagGetForkNum(&bufHdr->tag)).str);
    7059          94 : }
    7060             : 
    7061             : /*
    7062             :  * Error context callback for errors occurring during local buffer writes.
    7063             :  */
    7064             : static void
    7065           0 : local_buffer_write_error_callback(void *arg)
    7066             : {
    7067           0 :     BufferDesc *bufHdr = (BufferDesc *) arg;
    7068             : 
    7069           0 :     if (bufHdr != NULL)
    7070           0 :         errcontext("writing block %u of relation \"%s\"",
    7071             :                    bufHdr->tag.blockNum,
    7072           0 :                    relpathbackend(BufTagGetRelFileLocator(&bufHdr->tag),
    7073             :                                   MyProcNumber,
    7074             :                                   BufTagGetForkNum(&bufHdr->tag)).str);
    7075           0 : }
    7076             : 
    7077             : /*
    7078             :  * RelFileLocator qsort/bsearch comparator; see RelFileLocatorEquals.
    7079             :  */
    7080             : static int
    7081    19581582 : rlocator_comparator(const void *p1, const void *p2)
    7082             : {
    7083    19581582 :     RelFileLocator n1 = *(const RelFileLocator *) p1;
    7084    19581582 :     RelFileLocator n2 = *(const RelFileLocator *) p2;
    7085             : 
    7086    19581582 :     if (n1.relNumber < n2.relNumber)
    7087    19507674 :         return -1;
    7088       73908 :     else if (n1.relNumber > n2.relNumber)
    7089       70882 :         return 1;
    7090             : 
    7091        3026 :     if (n1.dbOid < n2.dbOid)
    7092           0 :         return -1;
    7093        3026 :     else if (n1.dbOid > n2.dbOid)
    7094           0 :         return 1;
    7095             : 
    7096        3026 :     if (n1.spcOid < n2.spcOid)
    7097           0 :         return -1;
    7098        3026 :     else if (n1.spcOid > n2.spcOid)
    7099           0 :         return 1;
    7100             :     else
    7101        3026 :         return 0;
    7102             : }
    7103             : 
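As a usage sketch, the comparator plugs directly into qsort() and bsearch(); the helper below is hypothetical and assumes the caller owns the array:

    /* Illustration only: sort an array of RelFileLocators, then probe it. */
    static bool
    example_locator_member(RelFileLocator *locators, int n, RelFileLocator key)
    {
        qsort(locators, n, sizeof(RelFileLocator), rlocator_comparator);

        return bsearch(&key, locators, n, sizeof(RelFileLocator),
                       rlocator_comparator) != NULL;
    }
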
    7104             : /*
    7105             :  * Lock buffer header - set BM_LOCKED in buffer state.
    7106             :  */
    7107             : uint64
    7108    65573176 : LockBufHdr(BufferDesc *desc)
    7109             : {
    7110             :     uint64      old_buf_state;
    7111             : 
    7112             :     Assert(!BufferIsLocal(BufferDescriptorGetBuffer(desc)));
    7113             : 
    7114             :     while (true)
    7115             :     {
    7116             :         /*
    7117             :          * Always try once to acquire the lock directly, without setting up
    7118             :          * the spin-delay infrastructure. The work necessary for that shows up
    7119             :          * the spin-delay infrastructure. The setup work for that shows up
    7120             :          * in profiles and is rarely needed.
    7121    65577314 :         old_buf_state = pg_atomic_fetch_or_u64(&desc->state, BM_LOCKED);
    7122    65577314 :         if (likely(!(old_buf_state & BM_LOCKED)))
    7123    65573176 :             break;              /* got lock */
    7124             : 
    7125             :         /* and then spin without atomic operations until lock is released */
    7126             :         {
    7127             :             SpinDelayStatus delayStatus;
    7128             : 
    7129        4138 :             init_local_spin_delay(&delayStatus);
    7130             : 
    7131       16962 :             while (old_buf_state & BM_LOCKED)
    7132             :             {
    7133       12824 :                 perform_spin_delay(&delayStatus);
    7134       12824 :                 old_buf_state = pg_atomic_read_u64(&desc->state);
    7135             :             }
    7136        4138 :             finish_spin_delay(&delayStatus);
    7137             :         }
    7138             : 
    7139             :         /*
    7140             :          * Retry. Obviously, the lock might already have been re-acquired by
    7141             :          * the time we attempt to get it again.
    7142             :          */
    7143             :     }
    7144             : 
    7145    65573176 :     return old_buf_state | BM_LOCKED;
    7146             : }
    7147             : 
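The usual calling pattern is a short critical section: inspect the state returned by LockBufHdr() and release the spinlock either unchanged or with flag/refcount changes applied atomically. A minimal sketch (hypothetical helper; the flag chosen is just an example):

    /* Illustration only: conditionally set a flag under the header spinlock. */
    static void
    example_flag_if_dirty(BufferDesc *desc)
    {
        uint64      buf_state;

        buf_state = LockBufHdr(desc);

        if (buf_state & BM_DIRTY)
            UnlockBufHdrExt(desc, buf_state, BM_CHECKPOINT_NEEDED, 0, 0);
        else
            UnlockBufHdr(desc);
    }
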
    7148             : /*
    7149             :  * Wait until the BM_LOCKED flag isn't set anymore and return the buffer's
    7150             :  * state at that point.
    7151             :  *
    7152             :  * Obviously the buffer could be locked by the time the value is returned, so
    7153             :  * this is primarily useful in CAS-style loops.
    7154             :  */
    7155             : pg_noinline uint64
    7156        1534 : WaitBufHdrUnlocked(BufferDesc *buf)
    7157             : {
    7158             :     SpinDelayStatus delayStatus;
    7159             :     uint64      buf_state;
    7160             : 
    7161        1534 :     init_local_spin_delay(&delayStatus);
    7162             : 
    7163        1534 :     buf_state = pg_atomic_read_u64(&buf->state);
    7164             : 
    7165        9410 :     while (buf_state & BM_LOCKED)
    7166             :     {
    7167        7876 :         perform_spin_delay(&delayStatus);
    7168        7876 :         buf_state = pg_atomic_read_u64(&buf->state);
    7169             :     }
    7170             : 
    7171        1534 :     finish_spin_delay(&delayStatus);
    7172             : 
    7173        1534 :     return buf_state;
    7174             : }
    7175             : 
    7176             : /*
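A sketch of the CAS-style loop this is designed for, loosely modeled on the unlocked pin fast path; the helper name is hypothetical and the state change (bumping the refcount) is only an example:

    /* Illustration only: lock-free state update with spinlock back-off. */
    static void
    example_cas_update(BufferDesc *buf)
    {
        uint64      old_buf_state = pg_atomic_read_u64(&buf->state);
        uint64      buf_state;

        for (;;)
        {
            /* wait, without further atomics, while the header is spinlocked */
            if (old_buf_state & BM_LOCKED)
                old_buf_state = WaitBufHdrUnlocked(buf);

            buf_state = old_buf_state + BUF_REFCOUNT_ONE;

            /* on failure, old_buf_state is refreshed, so simply retry */
            if (pg_atomic_compare_exchange_u64(&buf->state, &old_buf_state,
                                               buf_state))
                break;
        }
    }
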
    7177             :  * BufferTag comparator.
    7178             :  */
    7179             : static inline int
    7180           0 : buffertag_comparator(const BufferTag *ba, const BufferTag *bb)
    7181             : {
    7182             :     int         ret;
    7183             :     RelFileLocator rlocatora;
    7184             :     RelFileLocator rlocatorb;
    7185             : 
    7186           0 :     rlocatora = BufTagGetRelFileLocator(ba);
    7187           0 :     rlocatorb = BufTagGetRelFileLocator(bb);
    7188             : 
    7189           0 :     ret = rlocator_comparator(&rlocatora, &rlocatorb);
    7190             : 
    7191           0 :     if (ret != 0)
    7192           0 :         return ret;
    7193             : 
    7194           0 :     if (BufTagGetForkNum(ba) < BufTagGetForkNum(bb))
    7195           0 :         return -1;
    7196           0 :     if (BufTagGetForkNum(ba) > BufTagGetForkNum(bb))
    7197           0 :         return 1;
    7198             : 
    7199           0 :     if (ba->blockNum < bb->blockNum)
    7200           0 :         return -1;
    7201           0 :     if (ba->blockNum > bb->blockNum)
    7202           0 :         return 1;
    7203             : 
    7204           0 :     return 0;
    7205             : }
    7206             : 
    7207             : /*
    7208             :  * Comparator determining the writeout order in a checkpoint.
    7209             :  *
    7210             :  * It is important that tablespaces are compared first; the logic balancing
    7211             :  * writes between tablespaces relies on it.
    7212             :  */
    7213             : static inline int
    7214     6023498 : ckpt_buforder_comparator(const CkptSortItem *a, const CkptSortItem *b)
    7215             : {
    7216             :     /* compare tablespace */
    7217     6023498 :     if (a->tsId < b->tsId)
    7218       20086 :         return -1;
    7219     6003412 :     else if (a->tsId > b->tsId)
    7220       52398 :         return 1;
    7221             :     /* compare relation */
    7222     5951014 :     if (a->relNumber < b->relNumber)
    7223     1675358 :         return -1;
    7224     4275656 :     else if (a->relNumber > b->relNumber)
    7225     1618090 :         return 1;
    7226             :     /* compare fork */
    7227     2657566 :     else if (a->forkNum < b->forkNum)
    7228      122060 :         return -1;
    7229     2535506 :     else if (a->forkNum > b->forkNum)
    7230      121660 :         return 1;
    7231             :     /* compare block number */
    7232     2413846 :     else if (a->blockNum < b->blockNum)
    7233     1181006 :         return -1;
    7234     1232840 :     else if (a->blockNum > b->blockNum)
    7235     1159450 :         return 1;
    7236             :     /* equal page IDs are unlikely, but not impossible */
    7237       73390 :     return 0;
    7238             : }
    7239             : 
    7240             : /*
    7241             :  * Comparator for a Min-Heap over the per-tablespace checkpoint completion
    7242             :  * progress.
    7243             :  */
    7244             : static int
    7245      491664 : ts_ckpt_progress_comparator(Datum a, Datum b, void *arg)
    7246             : {
    7247      491664 :     CkptTsStatus *sa = (CkptTsStatus *) DatumGetPointer(a);
    7248      491664 :     CkptTsStatus *sb = (CkptTsStatus *) DatumGetPointer(b);
    7249             : 
    7250             :     /* we want a min-heap, so return 1 when a < b */
    7251      491664 :     if (sa->progress < sb->progress)
    7252      445014 :         return 1;
    7253       46650 :     else if (sa->progress == sb->progress)
    7254        1510 :         return 0;
    7255             :     else
    7256       45140 :         return -1;
    7257             : }
    7258             : 
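A sketch of how this comparator drives a min-heap of per-tablespace progress, in the spirit of the checkpoint write loop; the helper name is hypothetical and the progress increment is arbitrary:

    /* Illustration only: keep the least-advanced tablespace at the heap top. */
    static void
    example_ts_heap(CkptTsStatus *per_ts_stat, int num_spaces)
    {
        binaryheap *ts_heap;
        CkptTsStatus *ts_stat;

        ts_heap = binaryheap_allocate(num_spaces,
                                      ts_ckpt_progress_comparator,
                                      NULL);
        for (int i = 0; i < num_spaces; i++)
            binaryheap_add_unordered(ts_heap, PointerGetDatum(&per_ts_stat[i]));
        binaryheap_build(ts_heap);

        /* the top is always the tablespace that has made the least progress */
        ts_stat = (CkptTsStatus *) DatumGetPointer(binaryheap_first(ts_heap));

        /* after doing some of its work, record the progress and re-sift */
        ts_stat->progress += 1.0;
        binaryheap_replace_first(ts_heap, PointerGetDatum(ts_stat));
    }
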
    7259             : /*
    7260             :  * Initialize a writeback context, discarding potential previous state.
    7261             :  *
    7262             :  * *max_pending is a pointer instead of an immediate value, so the coalesce
    7263             :  * limits can easily be changed by the GUC mechanism, and so calling code does
    7264             :  * not have to check the current configuration. A value of 0 means that no
    7265             :  * writeback control will be performed.
    7266             :  */
    7267             : void
    7268        5622 : WritebackContextInit(WritebackContext *context, int *max_pending)
    7269             : {
    7270             :     Assert(*max_pending <= WRITEBACK_MAX_PENDING_FLUSHES);
    7271             : 
    7272        5622 :     context->max_pending = max_pending;
    7273        5622 :     context->nr_pending = 0;
    7274        5622 : }
    7275             : 
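The intended lifecycle is: initialize once per write pass with a pointer to the relevant GUC, schedule tags as buffers get written, and issue whatever is still pending at the end of the pass. A minimal sketch (hypothetical helper; checkpoint_flush_after is the limit the checkpointer uses):

    /* Illustration only: one writeback-control pass over a list of tags. */
    static void
    example_writeback_pass(BufferTag *tags, int ntags)
    {
        WritebackContext wb_context;

        WritebackContextInit(&wb_context, &checkpoint_flush_after);

        for (int i = 0; i < ntags; i++)
            ScheduleBufferTagForWriteback(&wb_context, IOCONTEXT_NORMAL,
                                          &tags[i]);

        /* hint the kernel about anything still pending */
        IssuePendingWritebacks(&wb_context, IOCONTEXT_NORMAL);
    }
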
    7276             : /*
    7277             :  * Add buffer to list of pending writeback requests.
    7278             :  */
    7279             : void
    7280     1144360 : ScheduleBufferTagForWriteback(WritebackContext *wb_context, IOContext io_context,
    7281             :                               BufferTag *tag)
    7282             : {
    7283             :     PendingWriteback *pending;
    7284             : 
    7285             :     /*
    7286             :      * As pg_flush_data() doesn't do anything with fsync disabled, there's no
    7287             :      * point in tracking writebacks in that case.
    7288             :      */
    7289     1144360 :     if (io_direct_flags & IO_DIRECT_DATA ||
    7290     1143320 :         !enableFsync)
    7291     1144358 :         return;
    7292             : 
    7293             :     /*
    7294             :      * Add buffer to the pending writeback array, unless writeback control is
    7295             :      * disabled.
    7296             :      */
    7297           2 :     if (*wb_context->max_pending > 0)
    7298             :     {
    7299             :         Assert(*wb_context->max_pending <= WRITEBACK_MAX_PENDING_FLUSHES);
    7300             : 
    7301           0 :         pending = &wb_context->pending_writebacks[wb_context->nr_pending++];
    7302             : 
    7303           0 :         pending->tag = *tag;
    7304             :     }
    7305             : 
    7306             :     /*
    7307             :      * Perform pending flushes if the writeback limit is exceeded. This
    7308             :      * includes the case where previously an item has been added, but control
    7309             :      * is now disabled.
    7310             :      */
    7311           2 :     if (wb_context->nr_pending >= *wb_context->max_pending)
    7312           2 :         IssuePendingWritebacks(wb_context, io_context);
    7313             : }
    7314             : 
    7315             : #define ST_SORT sort_pending_writebacks
    7316             : #define ST_ELEMENT_TYPE PendingWriteback
    7317             : #define ST_COMPARE(a, b) buffertag_comparator(&a->tag, &b->tag)
    7318             : #define ST_SCOPE static
    7319             : #define ST_DEFINE
    7320             : #include "lib/sort_template.h"
    7321             : 
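The block above instantiates lib/sort_template.h, which emits a static sorting routine named sort_pending_writebacks() specialized for PendingWriteback and the comparator above. A hypothetical second instantiation for another element type (not present in bufmgr.c) would look the same:

    /* Illustration only: a template instantiation for plain BufferTag arrays. */
    #define ST_SORT sort_buffer_tags
    #define ST_ELEMENT_TYPE BufferTag
    #define ST_COMPARE(a, b) buffertag_comparator(a, b)
    #define ST_SCOPE static
    #define ST_DEFINE
    #include "lib/sort_template.h"
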
    7322             : /*
    7323             :  * Issue all pending writeback requests, previously scheduled with
    7324             :  * ScheduleBufferTagForWriteback, to the OS.
    7325             :  *
    7326             :  * Because this is only used to improve the OS's IO scheduling, we try never to
    7327             :  * error out - it's just a hint.
    7328             :  */
    7329             : void
    7330        2202 : IssuePendingWritebacks(WritebackContext *wb_context, IOContext io_context)
    7331             : {
    7332             :     instr_time  io_start;
    7333             :     int         i;
    7334             : 
    7335        2202 :     if (wb_context->nr_pending == 0)
    7336        2202 :         return;
    7337             : 
    7338             :     /*
    7339             :      * Executing the writes in order can make them a lot faster, and allows us to
    7340             :      * merge writeback requests to consecutive blocks into larger writebacks.
    7341             :      */
    7342           0 :     sort_pending_writebacks(wb_context->pending_writebacks,
    7343           0 :                             wb_context->nr_pending);
    7344             : 
    7345           0 :     io_start = pgstat_prepare_io_time(track_io_timing);
    7346             : 
    7347             :     /*
    7348             :      * Coalesce neighbouring writes, but nothing else. For that we iterate
    7349             :      * through the now-sorted array of pending flushes, and look forward to
    7350             :      * find all neighbouring (or identical) writes.
    7351             :      */
    7352           0 :     for (i = 0; i < wb_context->nr_pending; i++)
    7353             :     {
    7354             :         PendingWriteback *cur;
    7355             :         PendingWriteback *next;
    7356             :         SMgrRelation reln;
    7357             :         int         ahead;
    7358             :         BufferTag   tag;
    7359             :         RelFileLocator currlocator;
    7360           0 :         Size        nblocks = 1;
    7361             : 
    7362           0 :         cur = &wb_context->pending_writebacks[i];
    7363           0 :         tag = cur->tag;
    7364           0 :         currlocator = BufTagGetRelFileLocator(&tag);
    7365             : 
    7366             :         /*
    7367             :          * Peek ahead, into following writeback requests, to see if they can
    7368             :          * be combined with the current one.
    7369             :          */
    7370           0 :         for (ahead = 0; i + ahead + 1 < wb_context->nr_pending; ahead++)
    7371             :         {
    7372             : 
    7373           0 :             next = &wb_context->pending_writebacks[i + ahead + 1];
    7374             : 
    7375             :             /* different file, stop */
    7376           0 :             if (!RelFileLocatorEquals(currlocator,
    7377           0 :                                       BufTagGetRelFileLocator(&next->tag)) ||
    7378           0 :                 BufTagGetForkNum(&cur->tag) != BufTagGetForkNum(&next->tag))
    7379             :                 break;
    7380             : 
    7381             :             /* ok, block queued twice, skip */
    7382           0 :             if (cur->tag.blockNum == next->tag.blockNum)
    7383           0 :                 continue;
    7384             : 
    7385             :             /* only merge consecutive writes */
    7386           0 :             if (cur->tag.blockNum + 1 != next->tag.blockNum)
    7387           0 :                 break;
    7388             : 
    7389           0 :             nblocks++;
    7390           0 :             cur = next;
    7391             :         }
    7392             : 
    7393           0 :         i += ahead;
    7394             : 
    7395             :         /* and finally tell the kernel to write the data to storage */
    7396           0 :         reln = smgropen(currlocator, INVALID_PROC_NUMBER);
    7397           0 :         smgrwriteback(reln, BufTagGetForkNum(&tag), tag.blockNum, nblocks);
    7398             :     }
    7399             : 
    7400             :     /*
    7401             :      * Assume that writeback requests are only issued for buffers containing
    7402             :      * blocks of permanent relations.
    7403             :      */
    7404           0 :     pgstat_count_io_op_time(IOOBJECT_RELATION, io_context,
    7405           0 :                             IOOP_WRITEBACK, io_start, wb_context->nr_pending, 0);
    7406             : 
    7407           0 :     wb_context->nr_pending = 0;
    7408             : }
    7409             : 
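A worked example of the coalescing above, with made-up relations and block numbers:

    /*
     * Illustration only.  Suppose that, after sorting, the pending array holds
     * these tags for one fork of relation A plus one block of relation B:
     *
     *      A:10, A:11, A:11, A:12, B:3
     *
     * The loop folds the duplicate A:11 into the run and issues just two
     * writeback requests:
     *
     *      smgrwriteback(A, fork, 10, 3)       -- covers A:10 .. A:12
     *      smgrwriteback(B, fork, 3, 1)
     */
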
    7410             : /* ResourceOwner callbacks */
    7411             : 
    7412             : static void
    7413          30 : ResOwnerReleaseBufferIO(Datum res)
    7414             : {
    7415          30 :     Buffer      buffer = DatumGetInt32(res);
    7416             : 
    7417          30 :     AbortBufferIO(buffer);
    7418          30 : }
    7419             : 
    7420             : static char *
    7421           0 : ResOwnerPrintBufferIO(Datum res)
    7422             : {
    7423           0 :     Buffer      buffer = DatumGetInt32(res);
    7424             : 
    7425           0 :     return psprintf("lost track of buffer IO on buffer %d", buffer);
    7426             : }
    7427             : 
    7428             : /*
    7429             :  * Release buffer as part of resource owner cleanup. This will only be called
    7430             :  * if the buffer is pinned. If this backend held the content lock at the time
    7431             :  * of the error, we also need to release that (note that it is not possible to
    7432             :  * hold a content lock without a pin).
    7433             :  */
    7434             : static void
    7435       15292 : ResOwnerReleaseBuffer(Datum res)
    7436             : {
    7437       15292 :     Buffer      buffer = DatumGetInt32(res);
    7438             : 
    7439             :     /* Like ReleaseBuffer, but don't call ResourceOwnerForgetBuffer */
    7440       15292 :     if (!BufferIsValid(buffer))
    7441           0 :         elog(ERROR, "bad buffer ID: %d", buffer);
    7442             : 
    7443       15292 :     if (BufferIsLocal(buffer))
    7444        6066 :         UnpinLocalBufferNoOwner(buffer);
    7445             :     else
    7446             :     {
    7447             :         PrivateRefCountEntry *ref;
    7448             : 
    7449        9226 :         ref = GetPrivateRefCountEntry(buffer, false);
    7450             : 
    7451             :         /* not having a private refcount would imply resowner corruption */
    7452             :         Assert(ref != NULL);
    7453             : 
    7454             :         /*
    7455             :          * If the buffer was locked at the time of the resowner release,
    7456             :          * release the lock now. This should only happen after errors.
    7457             :          */
    7458        9226 :         if (ref->data.lockmode != BUFFER_LOCK_UNLOCK)
    7459             :         {
    7460         216 :             BufferDesc *buf = GetBufferDescriptor(buffer - 1);
    7461             : 
    7462         216 :             HOLD_INTERRUPTS();  /* match the upcoming RESUME_INTERRUPTS */
    7463         216 :             BufferLockUnlock(buffer, buf);
    7464             :         }
    7465             : 
    7466        9226 :         UnpinBufferNoOwner(GetBufferDescriptor(buffer - 1));
    7467             :     }
    7468       15292 : }
    7469             : 
    7470             : static char *
    7471           0 : ResOwnerPrintBuffer(Datum res)
    7472             : {
    7473           0 :     return DebugPrintBufferRefcount(DatumGetInt32(res));
    7474             : }
    7475             : 
    7476             : /*
    7477             :  * Helper function to evict an unpinned buffer when its buffer header lock
    7478             :  * has already been acquired.
    7479             :  */
    7480             : static bool
    7481        4286 : EvictUnpinnedBufferInternal(BufferDesc *desc, bool *buffer_flushed)
    7482             : {
    7483             :     uint64      buf_state;
    7484             :     bool        result;
    7485             : 
    7486        4286 :     *buffer_flushed = false;
    7487             : 
    7488        4286 :     buf_state = pg_atomic_read_u64(&(desc->state));
    7489             :     Assert(buf_state & BM_LOCKED);
    7490             : 
    7491        4286 :     if ((buf_state & BM_VALID) == 0)
    7492             :     {
    7493           0 :         UnlockBufHdr(desc);
    7494           0 :         return false;
    7495             :     }
    7496             : 
    7497             :     /* Check that it's not pinned already. */
    7498        4286 :     if (BUF_STATE_GET_REFCOUNT(buf_state) > 0)
    7499             :     {
    7500           0 :         UnlockBufHdr(desc);
    7501           0 :         return false;
    7502             :     }
    7503             : 
    7504        4286 :     PinBuffer_Locked(desc);     /* releases spinlock */
    7505             : 
    7506             :     /* If it was dirty, try to clean it once. */
    7507        4286 :     if (buf_state & BM_DIRTY)
    7508             :     {
    7509        1946 :         FlushUnlockedBuffer(desc, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
    7510        1946 :         *buffer_flushed = true;
    7511             :     }
    7512             : 
    7513             :     /* This will return false if it becomes dirty or someone else pins it. */
    7514        4286 :     result = InvalidateVictimBuffer(desc);
    7515             : 
    7516        4286 :     UnpinBuffer(desc);
    7517             : 
    7518        4286 :     return result;
    7519             : }
    7520             : 
    7521             : /*
    7522             :  * Try to evict the current block in a shared buffer.
    7523             :  *
    7524             :  * This function is intended for testing/development use only!
    7525             :  *
    7526             :  * To succeed, the buffer must not be pinned on entry, so if the caller had a
    7527             :  * particular block in mind, it might already have been replaced by some other
    7528             :  * block by the time this function runs.  It's also unpinned on return, so the
    7529             :  * buffer might be occupied again by the time control is returned, potentially
    7530             :  * even by the same block.  This inherent raciness without other interlocking
    7531             :  * makes the function unsuitable for non-testing usage.
    7532             :  *
    7533             :  * *buffer_flushed is set to true if the buffer was dirty and has been
    7534             :  * flushed, false otherwise.  However, *buffer_flushed=true does not
    7535             :  * necessarily mean that we flushed the buffer, it could have been flushed by
    7536             :  * necessarily mean that we flushed the buffer; it could have been flushed by
    7537             :  *
    7538             :  * Returns true if the buffer was valid and it has now been made invalid.
    7539             :  * Returns false if it wasn't valid, if it couldn't be evicted due to a pin,
    7540             :  * or if the buffer becomes dirty again while we're trying to write it out.
    7541             :  */
    7542             : bool
    7543         280 : EvictUnpinnedBuffer(Buffer buf, bool *buffer_flushed)
    7544             : {
    7545             :     BufferDesc *desc;
    7546             : 
    7547             :     Assert(BufferIsValid(buf) && !BufferIsLocal(buf));
    7548             : 
    7549             :     /* Make sure we can pin the buffer. */
    7550         280 :     ResourceOwnerEnlarge(CurrentResourceOwner);
    7551         280 :     ReservePrivateRefCountEntry();
    7552             : 
    7553         280 :     desc = GetBufferDescriptor(buf - 1);
    7554         280 :     LockBufHdr(desc);
    7555             : 
    7556         280 :     return EvictUnpinnedBufferInternal(desc, buffer_flushed);
    7557             : }
    7558             : 
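A sketch of a testing-only caller, in the spirit of the buffer-eviction SQL functions built on top of this; the helper name is hypothetical:

    /* Illustration only: report what happened when evicting a single buffer. */
    static void
    example_evict_report(Buffer buf)
    {
        bool        flushed;

        if (EvictUnpinnedBuffer(buf, &flushed))
            elog(NOTICE, "buffer %d evicted%s", buf,
                 flushed ? " (flushed)" : "");
        else
            elog(NOTICE, "buffer %d could not be evicted", buf);
    }
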
    7559             : /*
    7560             :  * Try to evict all the shared buffers.
    7561             :  *
    7562             :  * This function is intended for testing/development use only! See
    7563             :  * EvictUnpinnedBuffer().
    7564             :  *
    7565             :  * The buffers_* parameters are mandatory and indicate the total count of
    7566             :  * buffers that:
    7567             :  * - buffers_evicted - were evicted
    7568             :  * - buffers_flushed - were flushed
    7569             :  * - buffers_skipped - could not be evicted
    7570             :  */
    7571             : void
    7572           2 : EvictAllUnpinnedBuffers(int32 *buffers_evicted, int32 *buffers_flushed,
    7573             :                         int32 *buffers_skipped)
    7574             : {
    7575           2 :     *buffers_evicted = 0;
    7576           2 :     *buffers_skipped = 0;
    7577           2 :     *buffers_flushed = 0;
    7578             : 
    7579       32770 :     for (int buf = 1; buf <= NBuffers; buf++)
    7580             :     {
    7581       32768 :         BufferDesc *desc = GetBufferDescriptor(buf - 1);
    7582             :         uint64      buf_state;
    7583             :         bool        buffer_flushed;
    7584             : 
    7585       32768 :         CHECK_FOR_INTERRUPTS();
    7586             : 
    7587       32768 :         buf_state = pg_atomic_read_u64(&desc->state);
    7588       32768 :         if (!(buf_state & BM_VALID))
    7589       28762 :             continue;
    7590             : 
    7591        4006 :         ResourceOwnerEnlarge(CurrentResourceOwner);
    7592        4006 :         ReservePrivateRefCountEntry();
    7593             : 
    7594        4006 :         LockBufHdr(desc);
    7595             : 
    7596        4006 :         if (EvictUnpinnedBufferInternal(desc, &buffer_flushed))
    7597        4006 :             (*buffers_evicted)++;
    7598             :         else
    7599           0 :             (*buffers_skipped)++;
    7600             : 
    7601        4006 :         if (buffer_flushed)
    7602        1908 :             (*buffers_flushed)++;
    7603             :     }
    7604           2 : }
    7605             : 
    7606             : /*
    7607             :  * Try to evict all the shared buffers containing provided relation's pages.
    7608             :  *
    7609             :  * This function is intended for testing/development use only! See
    7610             :  * EvictUnpinnedBuffer().
    7611             :  *
    7612             :  * The caller must hold at least AccessShareLock on the relation to prevent
    7613             :  * the relation from being dropped.
    7614             :  *
    7615             :  * The buffers_* parameters are mandatory and indicate the total count of
    7616             :  * buffers that:
    7617             :  * - buffers_evicted - were evicted
    7618             :  * - buffers_flushed - were flushed
    7619             :  * - buffers_skipped - could not be evicted
    7620             :  */
    7621             : void
    7622           2 : EvictRelUnpinnedBuffers(Relation rel, int32 *buffers_evicted,
    7623             :                         int32 *buffers_flushed, int32 *buffers_skipped)
    7624             : {
    7625             :     Assert(!RelationUsesLocalBuffers(rel));
    7626             : 
    7627           2 :     *buffers_skipped = 0;
    7628           2 :     *buffers_evicted = 0;
    7629           2 :     *buffers_flushed = 0;
    7630             : 
    7631       32770 :     for (int buf = 1; buf <= NBuffers; buf++)
    7632             :     {
    7633       32768 :         BufferDesc *desc = GetBufferDescriptor(buf - 1);
    7634       32768 :         uint64      buf_state = pg_atomic_read_u64(&(desc->state));
    7635             :         bool        buffer_flushed;
    7636             : 
    7637       32768 :         CHECK_FOR_INTERRUPTS();
    7638             : 
    7639             :         /* An unlocked precheck should be safe and saves some cycles. */
    7640       32768 :         if ((buf_state & BM_VALID) == 0 ||
    7641          54 :             !BufTagMatchesRelFileLocator(&desc->tag, &rel->rd_locator))
    7642       32768 :             continue;
    7643             : 
    7644             :         /* Make sure we can pin the buffer. */
    7645           0 :         ResourceOwnerEnlarge(CurrentResourceOwner);
    7646           0 :         ReservePrivateRefCountEntry();
    7647             : 
    7648           0 :         buf_state = LockBufHdr(desc);
    7649             : 
    7650             :         /* recheck, could have changed without the lock */
    7651           0 :         if ((buf_state & BM_VALID) == 0 ||
    7652           0 :             !BufTagMatchesRelFileLocator(&desc->tag, &rel->rd_locator))
    7653             :         {
    7654           0 :             UnlockBufHdr(desc);
    7655           0 :             continue;
    7656             :         }
    7657             : 
    7658           0 :         if (EvictUnpinnedBufferInternal(desc, &buffer_flushed))
    7659           0 :             (*buffers_evicted)++;
    7660             :         else
    7661           0 :             (*buffers_skipped)++;
    7662             : 
    7663           0 :         if (buffer_flushed)
    7664           0 :             (*buffers_flushed)++;
    7665             :     }
    7666           2 : }
    7667             : 
    7668             : /*
    7669             :  * Helper function to mark an unpinned buffer dirty when its buffer header
    7670             :  * lock has already been acquired.
    7671             :  */
    7672             : static bool
    7673          72 : MarkDirtyUnpinnedBufferInternal(Buffer buf, BufferDesc *desc,
    7674             :                                 bool *buffer_already_dirty)
    7675             : {
    7676             :     uint64      buf_state;
    7677          72 :     bool        result = false;
    7678             : 
    7679          72 :     *buffer_already_dirty = false;
    7680             : 
    7681          72 :     buf_state = pg_atomic_read_u64(&(desc->state));
    7682             :     Assert(buf_state & BM_LOCKED);
    7683             : 
    7684          72 :     if ((buf_state & BM_VALID) == 0)
    7685             :     {
    7686           2 :         UnlockBufHdr(desc);
    7687           2 :         return false;
    7688             :     }
    7689             : 
    7690             :     /* Check that it's not pinned already. */
    7691          70 :     if (BUF_STATE_GET_REFCOUNT(buf_state) > 0)
    7692             :     {
    7693           0 :         UnlockBufHdr(desc);
    7694           0 :         return false;
    7695             :     }
    7696             : 
    7697             :     /* Pin the buffer and then release the buffer spinlock */
    7698          70 :     PinBuffer_Locked(desc);
    7699             : 
    7700             :     /* If it was not already dirty, mark it as dirty. */
    7701          70 :     if (!(buf_state & BM_DIRTY))
    7702             :     {
    7703          34 :         BufferLockAcquire(buf, desc, BUFFER_LOCK_EXCLUSIVE);
    7704          34 :         MarkBufferDirty(buf);
    7705          34 :         result = true;
    7706          34 :         BufferLockUnlock(buf, desc);
    7707             :     }
    7708             :     else
    7709          36 :         *buffer_already_dirty = true;
    7710             : 
    7711          70 :     UnpinBuffer(desc);
    7712             : 
    7713          70 :     return result;
    7714             : }
    7715             : 
    7716             : /*
    7717             :  * Try to mark the provided shared buffer as dirty.
    7718             :  *
    7719             :  * This function is intended for testing/development use only!
    7720             :  *
    7721             :  * Same as EvictUnpinnedBuffer(), but with a MarkBufferDirty() call inside.
    7722             :  *
    7723             :  * The buffer_already_dirty parameter is mandatory and indicates whether the buffer
    7724             :  * could not be dirtied because it is already dirty.
    7725             :  *
    7726             :  * Returns true if the buffer has successfully been marked as dirty.
    7727             :  */
    7728             : bool
    7729           2 : MarkDirtyUnpinnedBuffer(Buffer buf, bool *buffer_already_dirty)
    7730             : {
    7731             :     BufferDesc *desc;
    7732           2 :     bool        buffer_dirtied = false;
    7733             : 
    7734             :     Assert(!BufferIsLocal(buf));
    7735             : 
    7736             :     /* Make sure we can pin the buffer. */
    7737           2 :     ResourceOwnerEnlarge(CurrentResourceOwner);
    7738           2 :     ReservePrivateRefCountEntry();
    7739             : 
    7740           2 :     desc = GetBufferDescriptor(buf - 1);
    7741           2 :     LockBufHdr(desc);
    7742             : 
    7743           2 :     buffer_dirtied = MarkDirtyUnpinnedBufferInternal(buf, desc, buffer_already_dirty);
    7744             :     /* Both can not be true at the same time */
    7745             :     Assert(!(buffer_dirtied && *buffer_already_dirty));
    7746             : 
    7747           2 :     return buffer_dirtied;
    7748             : }
    7749             : 
    7750             : /*
    7751             :  * Try to mark all the shared buffers containing provided relation's pages as
    7752             :  * dirty.
    7753             :  *
    7754             :  * This function is intended for testing/development use only! See
    7755             :  * MarkDirtyUnpinnedBuffer().
    7756             :  *
    7757             :  * The buffers_* parameters are mandatory and indicate the total count of
    7758             :  * buffers that:
    7759             :  * - buffers_dirtied - were dirtied
    7760             :  * - buffers_already_dirty - were already dirty
    7761             :  * - buffers_skipped - could not be dirtied for a reason other than the
    7762             :  * buffer already being dirty.
    7763             :  */
    7764             : void
    7765           2 : MarkDirtyRelUnpinnedBuffers(Relation rel,
    7766             :                             int32 *buffers_dirtied,
    7767             :                             int32 *buffers_already_dirty,
    7768             :                             int32 *buffers_skipped)
    7769             : {
    7770             :     Assert(!RelationUsesLocalBuffers(rel));
    7771             : 
    7772           2 :     *buffers_dirtied = 0;
    7773           2 :     *buffers_already_dirty = 0;
    7774           2 :     *buffers_skipped = 0;
    7775             : 
    7776       32770 :     for (int buf = 1; buf <= NBuffers; buf++)
    7777             :     {
    7778       32768 :         BufferDesc *desc = GetBufferDescriptor(buf - 1);
    7779       32768 :         uint64      buf_state = pg_atomic_read_u64(&(desc->state));
    7780             :         bool        buffer_already_dirty;
    7781             : 
    7782       32768 :         CHECK_FOR_INTERRUPTS();
    7783             : 
    7784             :         /* An unlocked precheck should be safe and saves some cycles. */
    7785       32768 :         if ((buf_state & BM_VALID) == 0 ||
    7786          54 :             !BufTagMatchesRelFileLocator(&desc->tag, &rel->rd_locator))
    7787       32768 :             continue;
    7788             : 
    7789             :         /* Make sure we can pin the buffer. */
    7790           0 :         ResourceOwnerEnlarge(CurrentResourceOwner);
    7791           0 :         ReservePrivateRefCountEntry();
    7792             : 
    7793           0 :         buf_state = LockBufHdr(desc);
    7794             : 
    7795             :         /* recheck, could have changed without the lock */
    7796           0 :         if ((buf_state & BM_VALID) == 0 ||
    7797           0 :             !BufTagMatchesRelFileLocator(&desc->tag, &rel->rd_locator))
    7798             :         {
    7799           0 :             UnlockBufHdr(desc);
    7800           0 :             continue;
    7801             :         }
    7802             : 
    7803           0 :         if (MarkDirtyUnpinnedBufferInternal(buf, desc, &buffer_already_dirty))
    7804           0 :             (*buffers_dirtied)++;
    7805           0 :         else if (buffer_already_dirty)
    7806           0 :             (*buffers_already_dirty)++;
    7807             :         else
    7808           0 :             (*buffers_skipped)++;
    7809             :     }
    7810           2 : }
    7811             : 
    7812             : /*
    7813             :  * Try to mark all the shared buffers as dirty.
    7814             :  *
    7815             :  * This function is intended for testing/development use only! See
    7816             :  * MarkDirtyUnpinnedBuffer().
    7817             :  *
    7818             :  * See MarkDirtyRelUnpinnedBuffers() above for details about the buffers_*
    7819             :  * parameters.
    7820             :  */
    7821             : void
    7822           2 : MarkDirtyAllUnpinnedBuffers(int32 *buffers_dirtied,
    7823             :                             int32 *buffers_already_dirty,
    7824             :                             int32 *buffers_skipped)
    7825             : {
    7826           2 :     *buffers_dirtied = 0;
    7827           2 :     *buffers_already_dirty = 0;
    7828           2 :     *buffers_skipped = 0;
    7829             : 
    7830       32770 :     for (int buf = 1; buf <= NBuffers; buf++)
    7831             :     {
    7832       32768 :         BufferDesc *desc = GetBufferDescriptor(buf - 1);
    7833             :         uint64      buf_state;
    7834             :         bool        buffer_already_dirty;
    7835             : 
    7836       32768 :         CHECK_FOR_INTERRUPTS();
    7837             : 
    7838       32768 :         buf_state = pg_atomic_read_u64(&desc->state);
    7839       32768 :         if (!(buf_state & BM_VALID))
    7840       32698 :             continue;
    7841             : 
    7842          70 :         ResourceOwnerEnlarge(CurrentResourceOwner);
    7843          70 :         ReservePrivateRefCountEntry();
    7844             : 
    7845          70 :         LockBufHdr(desc);
    7846             : 
    7847          70 :         if (MarkDirtyUnpinnedBufferInternal(buf, desc, &buffer_already_dirty))
    7848          34 :             (*buffers_dirtied)++;
    7849          36 :         else if (buffer_already_dirty)
    7850          36 :             (*buffers_already_dirty)++;
    7851             :         else
    7852           0 :             (*buffers_skipped)++;
    7853             :     }
    7854           2 : }
    7855             : 
    7856             : /*
    7857             :  * Generic implementation of the AIO handle staging callback for readv/writev
    7858             :  * on local/shared buffers.
    7859             :  *
    7860             :  * Each readv/writev can target multiple buffers. The buffers have already
    7861             :  * been registered with the IO handle.
    7862             :  *
    7863             :  * To make the IO ready for execution ("staging"), we need to ensure that the
    7864             :  * targeted buffers are in an appropriate state while the IO is ongoing. For
    7865             :  * that the AIO subsystem needs to have its own buffer pin; otherwise an error
    7866             :  * in this backend could lead to this backend's buffer pin being released as
    7867             :  * part of error handling, which in turn could lead to the buffer being
    7868             :  * replaced while IO is ongoing.
    7869             :  */
    7870             : static pg_attribute_always_inline void
    7871     2616866 : buffer_stage_common(PgAioHandle *ioh, bool is_write, bool is_temp)
    7872             : {
    7873             :     uint64     *io_data;
    7874             :     uint8       handle_data_len;
    7875             :     PgAioWaitRef io_ref;
    7876     2616866 :     BufferTag   first PG_USED_FOR_ASSERTS_ONLY = {0};
    7877             : 
    7878     2616866 :     io_data = pgaio_io_get_handle_data(ioh, &handle_data_len);
    7879             : 
    7880     2616866 :     pgaio_io_get_wref(ioh, &io_ref);
    7881             : 
    7882             :     /* iterate over all buffers affected by the vectored readv/writev */
    7883     5578950 :     for (int i = 0; i < handle_data_len; i++)
    7884             :     {
    7885     2962084 :         Buffer      buffer = (Buffer) io_data[i];
    7886     2962084 :         BufferDesc *buf_hdr = is_temp ?
    7887       16818 :             GetLocalBufferDescriptor(-buffer - 1)
    7888     2962084 :             : GetBufferDescriptor(buffer - 1);
    7889             :         uint64      buf_state;
    7890             : 
    7891             :         /*
    7892             :          * Check that all the buffers are actually ones that could conceivably
    7893             :          * be done in one IO, i.e. are sequential. This is the last
    7894             :          * buffer-aware code before the IO is actually executed, and confusion
    7895             :          * about which buffers an IO targets can be hard to debug, so it is
    7896             :          * worth doing extra-paranoid checks here.
    7897             :          */
    7898     2962084 :         if (i == 0)
    7899     2616866 :             first = buf_hdr->tag;
    7900             :         else
    7901             :         {
    7902             :             Assert(buf_hdr->tag.relNumber == first.relNumber);
    7903             :             Assert(buf_hdr->tag.blockNum == first.blockNum + i);
    7904             :         }
    7905             : 
    7906     2962084 :         if (is_temp)
    7907       16818 :             buf_state = pg_atomic_read_u64(&buf_hdr->state);
    7908             :         else
    7909     2945266 :             buf_state = LockBufHdr(buf_hdr);
    7910             : 
    7911             :         /* verify the buffer is in the expected state */
    7912             :         Assert(buf_state & BM_TAG_VALID);
    7913             :         if (is_write)
    7914             :         {
    7915             :             Assert(buf_state & BM_VALID);
    7916             :             Assert(buf_state & BM_DIRTY);
    7917             :         }
    7918             :         else
    7919             :         {
    7920             :             Assert(!(buf_state & BM_VALID));
    7921             :             Assert(!(buf_state & BM_DIRTY));
    7922             :         }
    7923             : 
    7924             :         /* temp buffers don't use BM_IO_IN_PROGRESS */
    7925     2962084 :         if (!is_temp)
    7926             :             Assert(buf_state & BM_IO_IN_PROGRESS);
    7927             : 
    7928             :         Assert(BUF_STATE_GET_REFCOUNT(buf_state) >= 1);
    7929             : 
    7930             :         /*
    7931             :          * Reflect that the buffer is now owned by the AIO subsystem.
    7932             :          *
    7933             :          * For local buffers: This can't be done just via LocalRefCount, as
    7934             :          * one might initially think, because this backend could error out
    7935             :          * while AIO is still in progress, releasing all of this backend's
    7936             :          * own pins.
    7937             :          *
    7938             :          * This pin is released again in TerminateBufferIO().
    7939             :          */
    7940     2962084 :         buf_hdr->io_wref = io_ref;
    7941             : 
    7942     2962084 :         if (is_temp)
    7943             :         {
    7944       16818 :             buf_state += BUF_REFCOUNT_ONE;
    7945       16818 :             pg_atomic_unlocked_write_u64(&buf_hdr->state, buf_state);
    7946             :         }
    7947             :         else
    7948     2945266 :             UnlockBufHdrExt(buf_hdr, buf_state, 0, 0, 1);
    7949             : 
    7950             :         /*
    7951             :          * Ensure the content lock that prevents buffer modifications while
    7952             :          * the buffer is being written out is not released early due to an
    7953             :          * error.
    7954             :          */
    7955     2962084 :         if (is_write && !is_temp)
    7956             :         {
    7957             :             Assert(BufferLockHeldByMe(buf_hdr));
    7958             : 
    7959             :             /*
    7960             :              * Lock is now owned by AIO subsystem.
    7961             :              */
    7962           0 :             BufferLockDisown(buffer, buf_hdr);
    7963             :         }
    7964             : 
    7965             :         /*
    7966             :          * Stop tracking this buffer via the resowner - the AIO system now
    7967             :          * keeps track.
    7968             :          */
    7969     2962084 :         if (!is_temp)
    7970     2945266 :             ResourceOwnerForgetBufferIO(CurrentResourceOwner, buffer);
    7971             :     }
    7972     2616866 : }
    7973             : 
    7974             : /*
    7975             :  * Decode readv errors as encoded by buffer_readv_encode_error().
    7976             :  */
    7977             : static inline void
    7978         698 : buffer_readv_decode_error(PgAioResult result,
    7979             :                           bool *zeroed_any,
    7980             :                           bool *ignored_any,
    7981             :                           uint8 *zeroed_or_error_count,
    7982             :                           uint8 *checkfail_count,
    7983             :                           uint8 *first_off)
    7984             : {
    7985         698 :     uint32      rem_error = result.error_data;
    7986             : 
    7987             :     /* see static asserts in buffer_readv_encode_error */
    7988             : #define READV_COUNT_BITS    7
    7989             : #define READV_COUNT_MASK    ((1 << READV_COUNT_BITS) - 1)
    7990             : 
    7991         698 :     *zeroed_any = rem_error & 1;
    7992         698 :     rem_error >>= 1;
    7993             : 
    7994         698 :     *ignored_any = rem_error & 1;
    7995         698 :     rem_error >>= 1;
    7996             : 
    7997         698 :     *zeroed_or_error_count = rem_error & READV_COUNT_MASK;
    7998         698 :     rem_error >>= READV_COUNT_BITS;
    7999             : 
    8000         698 :     *checkfail_count = rem_error & READV_COUNT_MASK;
    8001         698 :     rem_error >>= READV_COUNT_BITS;
    8002             : 
    8003         698 :     *first_off = rem_error & READV_COUNT_MASK;
    8004         698 :     rem_error >>= READV_COUNT_BITS;
    8005         698 : }
    8006             : 
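A worked example of the packing this unpacks, with illustrative values:

    /*
     * Illustration only.  With READV_COUNT_BITS = 7, an error_data value of
     * 0x30209 = (3 << 16) | (1 << 9) | (2 << 2) | 1 decodes as:
     *
     *      bit 0           zeroed_any              = 1
     *      bit 1           ignored_any             = 0
     *      bits 2..8       zeroed_or_error_count   = 2
     *      bits 9..15      checkfail_count         = 1
     *      bits 16..22     first_off               = 3
     */
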
    8007             : /*
    8008             :  * Helper to encode errors for buffer_readv_complete()
    8009             :  *
    8010             :  * Errors are encoded as follows:
    8011             :  * - bit 0 indicates whether any page was zeroed (1) or not (0)
    8012             :  * - bit 1 indicates whether any checksum failure was ignored (1) or not (0)
    8013             :  * - next READV_COUNT_BITS bits indicate the number of errored or zeroed pages
    8014             :  * - next READV_COUNT_BITS bits indicate the number of checksum failures
    8015             :  * - next READV_COUNT_BITS bits indicate the offset of the first page that
    8016             :  *   errored or was zeroed or, if there were no errors/zeroes, of the first
    8017             :  *   ignored checksum failure
    8018             :  */
    8019             : static inline void
    8020         384 : buffer_readv_encode_error(PgAioResult *result,
    8021             :                           bool is_temp,
    8022             :                           bool zeroed_any,
    8023             :                           bool ignored_any,
    8024             :                           uint8 error_count,
    8025             :                           uint8 zeroed_count,
    8026             :                           uint8 checkfail_count,
    8027             :                           uint8 first_error_off,
    8028             :                           uint8 first_zeroed_off,
    8029             :                           uint8 first_ignored_off)
    8030             : {
    8031             : 
    8032         384 :     uint8       shift = 0;
    8033         384 :     uint8       zeroed_or_error_count =
    8034             :         error_count > 0 ? error_count : zeroed_count;
    8035             :     uint8       first_off;
    8036             : 
    8037             :     StaticAssertDecl(PG_IOV_MAX <= 1 << READV_COUNT_BITS,
    8038             :                      "PG_IOV_MAX is bigger than reserved space for error data");
    8039             :     StaticAssertDecl((1 + 1 + 3 * READV_COUNT_BITS) <= PGAIO_RESULT_ERROR_BITS,
    8040             :                      "PGAIO_RESULT_ERROR_BITS is insufficient for buffer_readv");
    8041             : 
    8042             :     /*
    8043             :      * We only have space to encode one offset, but luckily that's good
    8044             :      * enough: an errored page is more interesting than a zeroed one, which
    8045             :      * in turn is more interesting than one with an ignored checksum failure.
    8046             :      */
    8047         384 :     if (error_count > 0)
    8048         188 :         first_off = first_error_off;
    8049         196 :     else if (zeroed_count > 0)
    8050         160 :         first_off = first_zeroed_off;
    8051             :     else
    8052          36 :         first_off = first_ignored_off;
    8053             : 
    8054             :     Assert(!zeroed_any || error_count == 0);
    8055             : 
    8056         384 :     result->error_data = 0;
    8057             : 
    8058         384 :     result->error_data |= zeroed_any << shift;
    8059         384 :     shift += 1;
    8060             : 
    8061         384 :     result->error_data |= ignored_any << shift;
    8062         384 :     shift += 1;
    8063             : 
    8064         384 :     result->error_data |= ((uint32) zeroed_or_error_count) << shift;
    8065         384 :     shift += READV_COUNT_BITS;
    8066             : 
    8067         384 :     result->error_data |= ((uint32) checkfail_count) << shift;
    8068         384 :     shift += READV_COUNT_BITS;
    8069             : 
    8070         384 :     result->error_data |= ((uint32) first_off) << shift;
    8071         384 :     shift += READV_COUNT_BITS;
    8072             : 
    8073         384 :     result->id = is_temp ? PGAIO_HCB_LOCAL_BUFFER_READV :
    8074             :         PGAIO_HCB_SHARED_BUFFER_READV;
    8075             : 
    8076         384 :     if (error_count > 0)
    8077         188 :         result->status = PGAIO_RS_ERROR;
    8078             :     else
    8079         196 :         result->status = PGAIO_RS_WARNING;
    8080             : 
    8081             :     /*
    8082             :      * The encoding is complicated enough to warrant cross-checking it against
    8083             :      * the decode function.
    8084             :      */
    8085             : #ifdef USE_ASSERT_CHECKING
    8086             :     {
    8087             :         bool        zeroed_any_2,
    8088             :                     ignored_any_2;
    8089             :         uint8       zeroed_or_error_count_2,
    8090             :                     checkfail_count_2,
    8091             :                     first_off_2;
    8092             : 
    8093             :         buffer_readv_decode_error(*result,
    8094             :                                   &zeroed_any_2, &ignored_any_2,
    8095             :                                   &zeroed_or_error_count_2,
    8096             :                                   &checkfail_count_2,
    8097             :                                   &first_off_2);
    8098             :         Assert(zeroed_any == zeroed_any_2);
    8099             :         Assert(ignored_any == ignored_any_2);
    8100             :         Assert(zeroed_or_error_count == zeroed_or_error_count_2);
    8101             :         Assert(checkfail_count == checkfail_count_2);
    8102             :         Assert(first_off == first_off_2);
    8103             :     }
    8104             : #endif
    8105             : 
    8106             : #undef READV_COUNT_BITS
    8107             : #undef READV_COUNT_MASK
    8108         384 : }
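
As a cross-check of the layout documented above, here is a standalone sketch (not part of bufmgr.c; it packs a plain uint32_t instead of a PgAioResult and assumes READV_COUNT_BITS = 7, matching the definition above) that encodes one hypothetical combination and shows where each field lands.

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define READV_COUNT_BITS 7
#define READV_COUNT_MASK ((1 << READV_COUNT_BITS) - 1)

/* Pack the fields in the same order as buffer_readv_encode_error(). */
static uint32_t
encode(bool zeroed_any, bool ignored_any,
       uint8_t zeroed_or_error_count, uint8_t checkfail_count, uint8_t first_off)
{
    uint32_t    error_data = 0;
    uint8_t     shift = 0;

    error_data |= (uint32_t) zeroed_any << shift;
    shift += 1;
    error_data |= (uint32_t) ignored_any << shift;
    shift += 1;
    error_data |= (uint32_t) zeroed_or_error_count << shift;
    shift += READV_COUNT_BITS;
    error_data |= (uint32_t) checkfail_count << shift;
    shift += READV_COUNT_BITS;
    error_data |= (uint32_t) first_off << shift;

    return error_data;
}

int
main(void)
{
    /* two pages zeroed, no ignored checksum failures, first zeroed page at offset 3 */
    uint32_t    ed = encode(true, false, 2, 0, 3);

    assert(ed == 0x30009);      /* 1 | (2 << 2) | (3 << 16) */

    /* decoding mirrors the shifts: bit 0, bit 1, then three 7-bit fields */
    printf("zeroed_any=%u count=%u first_off=%u\n",
           (unsigned) (ed & 1),
           (unsigned) ((ed >> 2) & READV_COUNT_MASK),
           (unsigned) ((ed >> 16) & READV_COUNT_MASK));
    return 0;
}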
    8109             : 
    8110             : /*
    8111             :  * Helper for AIO readv completion callbacks, supporting both shared and temp
    8112             :  * buffers. Gets called once for each buffer in a multi-page read.
    8113             :  */
    8114             : static pg_attribute_always_inline void
    8115     2682820 : buffer_readv_complete_one(PgAioTargetData *td, uint8 buf_off, Buffer buffer,
    8116             :                           uint8 flags, bool failed, bool is_temp,
    8117             :                           bool *buffer_invalid,
    8118             :                           bool *failed_checksum,
    8119             :                           bool *ignored_checksum,
    8120             :                           bool *zeroed_buffer)
    8121             : {
    8122     2682820 :     BufferDesc *buf_hdr = is_temp ?
    8123       16818 :         GetLocalBufferDescriptor(-buffer - 1)
    8124     2682820 :         : GetBufferDescriptor(buffer - 1);
    8125     2682820 :     BufferTag   tag = buf_hdr->tag;
    8126     2682820 :     char       *bufdata = BufferGetBlock(buffer);
    8127             :     uint64      set_flag_bits;
    8128             :     int         piv_flags;
    8129             : 
    8130             :     /* check that the buffer is in the expected state for a read */
    8131             : #ifdef USE_ASSERT_CHECKING
    8132             :     {
    8133             :         uint64      buf_state = pg_atomic_read_u64(&buf_hdr->state);
    8134             : 
    8135             :         Assert(buf_state & BM_TAG_VALID);
    8136             :         Assert(!(buf_state & BM_VALID));
    8137             :         /* temp buffers don't use BM_IO_IN_PROGRESS */
    8138             :         if (!is_temp)
    8139             :             Assert(buf_state & BM_IO_IN_PROGRESS);
    8140             :         Assert(!(buf_state & BM_DIRTY));
    8141             :     }
    8142             : #endif
    8143             : 
    8144     2682820 :     *buffer_invalid = false;
    8145     2682820 :     *failed_checksum = false;
    8146     2682820 :     *ignored_checksum = false;
    8147     2682820 :     *zeroed_buffer = false;
    8148             : 
    8149             :     /*
    8150             :      * We ask PageIsVerified() to only log the message about checksum errors,
    8151             :      * as the completion might be run in any backend (or in an IO worker). We
    8152             :      * report checksum errors in buffer_readv_report().
    8153             :      */
    8154     2682820 :     piv_flags = PIV_LOG_LOG;
    8155             : 
    8156             :     /* use the definer's flags; local GUCs like zero_damaged_pages may differ */
    8157     2682820 :     if (flags & READ_BUFFERS_IGNORE_CHECKSUM_FAILURES)
    8158          76 :         piv_flags |= PIV_IGNORE_CHECKSUM_FAILURE;
    8159             : 
    8160             :     /* Check for garbage data. */
    8161     2682820 :     if (!failed)
    8162             :     {
    8163             :         /*
    8164             :          * If the buffer is not currently pinned by this backend, e.g. because
    8165             :          * we're completing this IO after an error, the buffer data will have
    8166             :          * been marked as inaccessible when the buffer was unpinned. The AIO
    8167             :          * subsystem holds a pin, but that doesn't prevent the buffer from
    8168             :          * having been marked as inaccessible. The completion might also be
    8169             :          * executed in a different process.
    8170             :          */
    8171             : #ifdef USE_VALGRIND
    8172             :         if (!BufferIsPinned(buffer))
    8173             :             VALGRIND_MAKE_MEM_DEFINED(bufdata, BLCKSZ);
    8174             : #endif
    8175             : 
    8176     2682762 :         if (!PageIsVerified((Page) bufdata, tag.blockNum, piv_flags,
    8177             :                             failed_checksum))
    8178             :         {
    8179         192 :             if (flags & READ_BUFFERS_ZERO_ON_ERROR)
    8180             :             {
    8181          92 :                 memset(bufdata, 0, BLCKSZ);
    8182          92 :                 *zeroed_buffer = true;
    8183             :             }
    8184             :             else
    8185             :             {
    8186         100 :                 *buffer_invalid = true;
    8187             :                 /* mark buffer as having failed */
    8188         100 :                 failed = true;
    8189             :             }
    8190             :         }
    8191     2682570 :         else if (*failed_checksum)
    8192          24 :             *ignored_checksum = true;
    8193             : 
    8194             :         /* undo what we did above */
    8195             : #ifdef USE_VALGRIND
    8196             :         if (!BufferIsPinned(buffer))
    8197             :             VALGRIND_MAKE_MEM_NOACCESS(bufdata, BLCKSZ);
    8198             : #endif
    8199             : 
    8200             :         /*
    8201             :          * Immediately log a message about the invalid page, but only to the
    8202             :          * server log. We log immediately because this may be executed in a
    8203             :          * different backend than the one that originated the request, and
    8204             :          * because the originator might not process the query result promptly
    8205             :          * (it may be busy with another part of query processing) or at all
    8206             :          * (e.g. if it was cancelled or errored out because another IO also
    8207             :          * failed). The definer of the IO will emit an ERROR or WARNING when
    8208             :          * it processes the IO's results.
    8210             :          *
    8211             :          * To avoid duplicating the code to emit these log messages, we reuse
    8212             :          * buffer_readv_report().
    8213             :          */
    8214     2682762 :         if (*buffer_invalid || *failed_checksum || *zeroed_buffer)
    8215             :         {
    8216         216 :             PgAioResult result_one = {0};
    8217             : 
    8218         216 :             buffer_readv_encode_error(&result_one, is_temp,
    8219         216 :                                       *zeroed_buffer,
    8220         216 :                                       *ignored_checksum,
    8221         216 :                                       *buffer_invalid,
    8222         216 :                                       *zeroed_buffer ? 1 : 0,
    8223         216 :                                       *failed_checksum ? 1 : 0,
    8224             :                                       buf_off, buf_off, buf_off);
    8225         216 :             pgaio_result_report(result_one, td, LOG_SERVER_ONLY);
    8226             :         }
    8227             :     }
    8228             : 
    8229             :     /* Terminate I/O and set BM_VALID. */
    8230     2682820 :     set_flag_bits = failed ? BM_IO_ERROR : BM_VALID;
    8231     2682820 :     if (is_temp)
    8232       16818 :         TerminateLocalBufferIO(buf_hdr, false, set_flag_bits, true);
    8233             :     else
    8234     2666002 :         TerminateBufferIO(buf_hdr, false, set_flag_bits, false, true);
    8235             : 
    8236             :     /*
    8237             :      * Call the BUFFER_READ_DONE tracepoint in the callback, even though the
    8238             :      * callback may not be executed in the same backend that called
    8239             :      * BUFFER_READ_START. The alternative would be to defer calling the
    8240             :      * tracepoint to a later point (e.g. the local completion callback for
    8241             :      * shared buffer reads), which seems even less helpful.
    8242             :      */
    8243             :     TRACE_POSTGRESQL_BUFFER_READ_DONE(tag.forkNum,
    8244             :                                       tag.blockNum,
    8245             :                                       tag.spcOid,
    8246             :                                       tag.dbOid,
    8247             :                                       tag.relNumber,
    8248             :                                       is_temp ? MyProcNumber : INVALID_PROC_NUMBER,
    8249             :                                       false);
    8250     2682820 : }
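
The descriptor lookup at the top of buffer_readv_complete_one() relies on the Buffer numbering convention: zero is InvalidBuffer, positive values index shared buffers, and negative values index backend-local (temp) buffers. Below is a minimal standalone sketch of that mapping (not part of bufmgr.c; the helper name is invented for illustration).

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

typedef int Buffer;             /* 0 = invalid, >0 = shared, <0 = local */

/* Hypothetical helper mirroring the index arithmetic used above. */
static int
descriptor_index(Buffer buffer, bool *is_local)
{
    assert(buffer != 0);        /* 0 is InvalidBuffer */
    if (buffer < 0)
    {
        *is_local = true;
        return -buffer - 1;     /* cf. GetLocalBufferDescriptor(-buffer - 1) */
    }
    *is_local = false;
    return buffer - 1;          /* cf. GetBufferDescriptor(buffer - 1) */
}

int
main(void)
{
    bool        is_local;

    assert(descriptor_index(1, &is_local) == 0 && !is_local);  /* first shared slot */
    assert(descriptor_index(-1, &is_local) == 0 && is_local);  /* first local slot */
    printf("shared buffer 42 -> descriptor slot %d\n",
           descriptor_index(42, &is_local));
    return 0;
}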
    8251             : 
    8252             : /*
    8253             :  * Perform completion handling of a single AIO read. This read may cover
    8254             :  * multiple blocks / buffers.
    8255             :  *
    8256             :  * Shared between shared and local buffers, to reduce code duplication.
    8257             :  */
    8258             : static pg_attribute_always_inline PgAioResult
    8259     2406098 : buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result,
    8260             :                       uint8 cb_data, bool is_temp)
    8261             : {
    8262     2406098 :     PgAioResult result = prior_result;
    8263     2406098 :     PgAioTargetData *td = pgaio_io_get_target_data(ioh);
    8264     2406098 :     uint8       first_error_off = 0;
    8265     2406098 :     uint8       first_zeroed_off = 0;
    8266     2406098 :     uint8       first_ignored_off = 0;
    8267     2406098 :     uint8       error_count = 0;
    8268     2406098 :     uint8       zeroed_count = 0;
    8269     2406098 :     uint8       ignored_count = 0;
    8270     2406098 :     uint8       checkfail_count = 0;
    8271             :     uint64     *io_data;
    8272             :     uint8       handle_data_len;
    8273             : 
    8274             :     if (is_temp)
    8275             :     {
    8276             :         Assert(td->smgr.is_temp);
    8277             :         Assert(pgaio_io_get_owner(ioh) == MyProcNumber);
    8278             :     }
    8279             :     else
    8280             :         Assert(!td->smgr.is_temp);
    8281             : 
    8282             :     /*
    8283             :      * Iterate over all the buffers affected by this IO and call the
    8284             :      * per-buffer completion function for each buffer.
    8285             :      */
    8286     2406098 :     io_data = pgaio_io_get_handle_data(ioh, &handle_data_len);
    8287     5088918 :     for (uint8 buf_off = 0; buf_off < handle_data_len; buf_off++)
    8288             :     {
    8289     2682820 :         Buffer      buf = io_data[buf_off];
    8290             :         bool        failed;
    8291     2682820 :         bool        failed_verification = false;
    8292     2682820 :         bool        failed_checksum = false;
    8293     2682820 :         bool        zeroed_buffer = false;
    8294     2682820 :         bool        ignored_checksum = false;
    8295             : 
    8296             :         Assert(BufferIsValid(buf));
    8297             : 
    8298             :         /*
    8299             :          * If the entire I/O failed at a lower level, each buffer needs to be
    8300             :          * marked as failed. In the case of a partial read, the first few
    8301             :          * buffers may be ok.
    8302             :          */
    8303     2682820 :         failed =
    8304     2682820 :             prior_result.status == PGAIO_RS_ERROR
    8305     2682820 :             || prior_result.result <= buf_off;
    8306             : 
    8307     2682820 :         buffer_readv_complete_one(td, buf_off, buf, cb_data, failed, is_temp,
    8308             :                                   &failed_verification,
    8309             :                                   &failed_checksum,
    8310             :                                   &ignored_checksum,
    8311             :                                   &zeroed_buffer);
    8312             : 
    8313             :         /*
    8314             :          * Track information about the number of different kinds of error
    8315             :          * conditions across all pages, as there can be multiple pages failing
    8316             :          * verification as part of one IO.
    8317             :          */
    8318     2682820 :         if (failed_verification && !zeroed_buffer && error_count++ == 0)
    8319          88 :             first_error_off = buf_off;
    8320     2682820 :         if (zeroed_buffer && zeroed_count++ == 0)
    8321          68 :             first_zeroed_off = buf_off;
    8322     2682820 :         if (ignored_checksum && ignored_count++ == 0)
    8323          20 :             first_ignored_off = buf_off;
    8324     2682820 :         if (failed_checksum)
    8325          64 :             checkfail_count++;
    8326             :     }
    8327             : 
    8328             :     /*
    8329             :      * If the smgr read succeeded, fully or partially, but page verification
    8330             :      * failed for some of the pages, adjust the IO's result state accordingly.
    8331             :      */
    8332     2406098 :     if (prior_result.status != PGAIO_RS_ERROR &&
    8333     2405992 :         (error_count > 0 || ignored_count > 0 || zeroed_count > 0))
    8334             :     {
    8335         168 :         buffer_readv_encode_error(&result, is_temp,
    8336             :                                   zeroed_count > 0, ignored_count > 0,
    8337             :                                   error_count, zeroed_count, checkfail_count,
    8338             :                                   first_error_off, first_zeroed_off,
    8339             :                                   first_ignored_off);
    8340         168 :         pgaio_result_report(result, td, DEBUG1);
    8341             :     }
    8342             : 
    8343             :     /*
    8344             :      * For shared relations this reporting is done in
    8345             :      * shared_buffer_readv_complete_local().
    8346             :      */
    8347     2406098 :     if (is_temp && checkfail_count > 0)
    8348           4 :         pgstat_report_checksum_failures_in_db(td->smgr.rlocator.dbOid,
    8349             :                                               checkfail_count);
    8350             : 
    8351     2406098 :     return result;
    8352             : }
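
To make the partial-read handling in the loop above concrete, here is a standalone sketch (not part of bufmgr.c; the status names are stand-ins) that assumes, consistent with the check on prior_result.result, that the result field holds the number of blocks actually read: for a four-block readv that only read two blocks, offsets 0 and 1 complete normally and offsets 2 and 3 are marked failed.

#include <stdbool.h>
#include <stdio.h>

/* Simplified stand-ins for the PGAIO_RS_* status values, for illustration only. */
typedef enum {RS_OK, RS_PARTIAL, RS_ERROR} rs_status;

int
main(void)
{
    rs_status   status = RS_PARTIAL;        /* lower layer reported a short read */
    int         result = 2;                 /* blocks actually read */
    int         handle_data_len = 4;        /* blocks requested */

    for (int buf_off = 0; buf_off < handle_data_len; buf_off++)
    {
        /* same per-buffer classification as in buffer_readv_complete() */
        bool        failed = (status == RS_ERROR) || (result <= buf_off);

        printf("block offset %d: %s\n", buf_off, failed ? "failed" : "ok");
    }
    return 0;
}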
    8353             : 
    8354             : /*
    8355             :  * AIO error reporting callback for aio_shared_buffer_readv_cb and
    8356             :  * aio_local_buffer_readv_cb.
    8357             :  *
    8358             :  * The error is encoded / decoded in buffer_readv_encode_error() /
    8359             :  * buffer_readv_decode_error().
    8360             :  */
    8361             : static void
    8362         544 : buffer_readv_report(PgAioResult result, const PgAioTargetData *td,
    8363             :                     int elevel)
    8364             : {
    8365         544 :     int         nblocks = td->smgr.nblocks;
    8366         544 :     BlockNumber first = td->smgr.blockNum;
    8367         544 :     BlockNumber last = first + nblocks - 1;
    8368         544 :     ProcNumber  errProc =
    8369         544 :         td->smgr.is_temp ? MyProcNumber : INVALID_PROC_NUMBER;
    8370             :     RelPathStr  rpath =
    8371         544 :         relpathbackend(td->smgr.rlocator, errProc, td->smgr.forkNum);
    8372             :     bool        zeroed_any,
    8373             :                 ignored_any;
    8374             :     uint8       zeroed_or_error_count,
    8375             :                 checkfail_count,
    8376             :                 first_off;
    8377             :     uint8       affected_count;
    8378             :     const char *msg_one,
    8379             :                *msg_mult,
    8380             :                *det_mult,
    8381             :                *hint_mult;
    8382             : 
    8383         544 :     buffer_readv_decode_error(result, &zeroed_any, &ignored_any,
    8384             :                               &zeroed_or_error_count,
    8385             :                               &checkfail_count,
    8386             :                               &first_off);
    8387             : 
    8388             :     /*
    8389             :      * Treat a read that had both zeroed buffers *and* ignored checksums as a
    8390             :      * special case; it's too irregular to be emitted the same way as the
    8391             :      * other cases.
    8392             :      */
    8393         544 :     if (zeroed_any && ignored_any)
    8394             :     {
    8395             :         Assert(zeroed_any && ignored_any);
    8396             :         Assert(nblocks > 1); /* same block can't be both zeroed and ignored */
    8397             :         Assert(result.status != PGAIO_RS_ERROR);
    8398           8 :         affected_count = zeroed_or_error_count;
    8399             : 
    8400           8 :         ereport(elevel,
    8401             :                 errcode(ERRCODE_DATA_CORRUPTED),
    8402             :                 errmsg("zeroing %u page(s) and ignoring %u checksum failure(s) among blocks %u..%u of relation \"%s\"",
    8403             :                        affected_count, checkfail_count, first, last, rpath.str),
    8404             :                 affected_count > 1 ?
    8405             :                 errdetail("Block %u held the first zeroed page.",
    8406             :                           first + first_off) : 0,
    8407             :                 errhint_plural("See server log for details about the other %d invalid block.",
    8408             :                                "See server log for details about the other %d invalid blocks.",
    8409             :                                affected_count + checkfail_count - 1,
    8410             :                                affected_count + checkfail_count - 1));
    8411           8 :         return;
    8412             :     }
    8413             : 
    8414             :     /*
    8415             :      * The other messages are highly repetitive. To avoid duplicating a long
    8416             :      * and complicated ereport(), gather the translated format strings
    8417             :      * separately and then do one common ereport.
    8418             :      */
    8419         536 :     if (result.status == PGAIO_RS_ERROR)
    8420             :     {
    8421             :         Assert(!zeroed_any);    /* can't have invalid pages when zeroing them */
    8422         272 :         affected_count = zeroed_or_error_count;
    8423         272 :         msg_one = _("invalid page in block %u of relation \"%s\"");
    8424         272 :         msg_mult = _("%u invalid pages among blocks %u..%u of relation \"%s\"");
    8425         272 :         det_mult = _("Block %u held the first invalid page.");
    8426         272 :         hint_mult = _("See server log for the other %u invalid block(s).");
    8427             :     }
    8428         264 :     else if (zeroed_any && !ignored_any)
    8429             :     {
    8430         216 :         affected_count = zeroed_or_error_count;
    8431         216 :         msg_one = _("invalid page in block %u of relation \"%s\"; zeroing out page");
    8432         216 :         msg_mult = _("zeroing out %u invalid pages among blocks %u..%u of relation \"%s\"");
    8433         216 :         det_mult = _("Block %u held the first zeroed page.");
    8434         216 :         hint_mult = _("See server log for the other %u zeroed block(s).");
    8435             :     }
    8436          48 :     else if (!zeroed_any && ignored_any)
    8437             :     {
    8438          48 :         affected_count = checkfail_count;
    8439          48 :         msg_one = _("ignoring checksum failure in block %u of relation \"%s\"");
    8440          48 :         msg_mult = _("ignoring %u checksum failures among blocks %u..%u of relation \"%s\"");
    8441          48 :         det_mult = _("Block %u held the first ignored page.");
    8442          48 :         hint_mult = _("See server log for the other %u ignored block(s).");
    8443             :     }
    8444             :     else
    8445           0 :         pg_unreachable();
    8446             : 
    8447         536 :     ereport(elevel,
    8448             :             errcode(ERRCODE_DATA_CORRUPTED),
    8449             :             affected_count == 1 ?
    8450             :             errmsg_internal(msg_one, first + first_off, rpath.str) :
    8451             :             errmsg_internal(msg_mult, affected_count, first, last, rpath.str),
    8452             :             affected_count > 1 ? errdetail_internal(det_mult, first + first_off) : 0,
    8453             :             affected_count > 1 ? errhint_internal(hint_mult, affected_count - 1) : 0);
    8454             : }
    8455             : 
    8456             : static void
    8457     2613254 : shared_buffer_readv_stage(PgAioHandle *ioh, uint8 cb_data)
    8458             : {
    8459     2613254 :     buffer_stage_common(ioh, false, false);
    8460     2613254 : }
    8461             : 
    8462             : static PgAioResult
    8463     2402486 : shared_buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result,
    8464             :                              uint8 cb_data)
    8465             : {
    8466     2402486 :     return buffer_readv_complete(ioh, prior_result, cb_data, false);
    8467             : }
    8468             : 
    8469             : /*
    8470             :  * We need a backend-local completion callback for shared buffers, to be able
    8471             :  * to report checksum errors correctly. Unfortunately that can only safely
    8472             :  * happen if the reporting backend has previously called
    8473             :  * pgstat_prepare_report_checksum_failure(), which we can only guarantee in
    8474             :  * the backend that started the IO. Hence this callback.
    8475             :  */
    8476             : static PgAioResult
    8477     2613254 : shared_buffer_readv_complete_local(PgAioHandle *ioh, PgAioResult prior_result,
    8478             :                                    uint8 cb_data)
    8479             : {
    8480             :     bool        zeroed_any,
    8481             :                 ignored_any;
    8482             :     uint8       zeroed_or_error_count,
    8483             :                 checkfail_count,
    8484             :                 first_off;
    8485             : 
    8486     2613254 :     if (prior_result.status == PGAIO_RS_OK)
    8487     2613100 :         return prior_result;
    8488             : 
    8489         154 :     buffer_readv_decode_error(prior_result,
    8490             :                               &zeroed_any,
    8491             :                               &ignored_any,
    8492             :                               &zeroed_or_error_count,
    8493             :                               &checkfail_count,
    8494             :                               &first_off);
    8495             : 
    8496         154 :     if (checkfail_count)
    8497             :     {
    8498          48 :         PgAioTargetData *td = pgaio_io_get_target_data(ioh);
    8499             : 
    8500          48 :         pgstat_report_checksum_failures_in_db(td->smgr.rlocator.dbOid,
    8501             :                                               checkfail_count);
    8502             :     }
    8503             : 
    8504         154 :     return prior_result;
    8505             : }
    8506             : 
    8507             : static void
    8508        3612 : local_buffer_readv_stage(PgAioHandle *ioh, uint8 cb_data)
    8509             : {
    8510        3612 :     buffer_stage_common(ioh, false, true);
    8511        3612 : }
    8512             : 
    8513             : static PgAioResult
    8514        3612 : local_buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result,
    8515             :                             uint8 cb_data)
    8516             : {
    8517        3612 :     return buffer_readv_complete(ioh, prior_result, cb_data, true);
    8518             : }
    8519             : 
    8520             : /* readv callback is passed READ_BUFFERS_* flags as callback data */
    8521             : const PgAioHandleCallbacks aio_shared_buffer_readv_cb = {
    8522             :     .stage = shared_buffer_readv_stage,
    8523             :     .complete_shared = shared_buffer_readv_complete,
    8524             :     /* need a local callback to report checksum failures */
    8525             :     .complete_local = shared_buffer_readv_complete_local,
    8526             :     .report = buffer_readv_report,
    8527             : };
    8528             : 
    8529             : /* readv callback is passed READ_BUFFERS_* flags as callback data */
    8530             : const PgAioHandleCallbacks aio_local_buffer_readv_cb = {
    8531             :     .stage = local_buffer_readv_stage,
    8532             : 
    8533             :     /*
    8534             :      * Note that this, in contrast to the shared-buffers case, uses
    8535             :      * complete_local, as only the issuing backend has access to the required
    8536             :      * data structures. This matters because the IO completion may be
    8537             :      * consumed incidentally by another backend.
    8538             :      */
    8539             :     .complete_local = local_buffer_readv_complete,
    8540             :     .report = buffer_readv_report,
    8541             : };
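
A standalone sketch (not part of bufmgr.c or the AIO subsystem; the types and names below are invented for illustration) of the design split encoded in the two callback tables above: the shared completion step may run in whichever process reaps the IO, while the local completion step runs only in the defining backend, which is the one prepared to report checksum failures to the stats system.

#include <stdio.h>

/* Invented stand-in for a callback table like PgAioHandleCallbacks. */
typedef struct demo_callbacks
{
    void        (*complete_shared) (void);  /* may run in any process */
    void        (*complete_local) (void);   /* runs only in the defining backend */
} demo_callbacks;

static void
demo_complete_shared(void)
{
    printf("shared completion: verify pages, terminate buffer IO\n");
}

static void
demo_complete_local(void)
{
    printf("local completion: report checksum failures for this backend's IO\n");
}

int
main(void)
{
    demo_callbacks cb = {demo_complete_shared, demo_complete_local};

    cb.complete_shared();       /* e.g. executed by an IO worker */
    cb.complete_local();        /* executed by the backend that defined the IO */
    return 0;
}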

Generated by: LCOV version 1.16